From ccb42f2458669aaac84a661091b05a59bb781197 Mon Sep 17 00:00:00 2001 From: adilger Date: Mon, 6 Jan 2003 22:22:15 +0000 Subject: [PATCH] Merge b_md to HEAD for 0.5.19 release. Fixes a _huge_ number of bugs: - Fully reactivate OST imports after reconnection (512, others) - Make sure client sees our -ENOTCONN from mds_handle (513 - partial) - More graceful error handling for truncating on dead OST (515) - Don't error out unless we're actually accessing dead stripes (474) - Fix garbage sizes when stripes are missing (410) - LRU counters were broken, causing constant lock purge (433, 432) - garbage on read from stripes with failed OSTs (441) - mark OSCs as active before reconnecting during recovery (438) - lov_enqueue and lov_cancel need to handle inactive OSTs (403) - lfind did not preserve OST order in output (443) - symlinks cause hung clients, incorrect data (439) - stop dereferencing request after dropping refcount (457) - don't LASSERT(spin_is_locked) on non-SMP (455) - fixes for many rename() bugs - fstat didn't correctly synchronize attributes (399) - server must handle lock cancellation during blocking AST prep (487) - bulk descriptors were free()d too soon (511) - fix paths in lconf, which would load incorrect modules (451, 507) - fix confusing lconf 'host not found' error message (386) - fix lock order deadlock on OST (O/R i_sem before journal ops, 478) - fix race condition in mdc_blocking_ast() for inode access (526) - fix lov_unpackmd() unpacking wrong number of stripes (537) - fix lov_set_osc_active() marking wrong OSC inactive (440) - fix bad lstripe lov_unpackmd() assertion (fix layering too) (527) - fix multiple writes of stripe MD to MDS (358, maybe 519) - fix lstripe in several ways (kernel side) (527) - fix request leak in ldlm_cli_enqueue (262) - incorrect OSC was marked inactive after OST failure - call mds_fs_cleanup before unmounting filesystem (524) - fix races between taking ns_lock and ldlm_lock_change_resource - fix races updating LOV 
export open file list - fix lov_enqueue error path, avoid decref-ing bad lock handle - fix recovery NULL deref in ldlm_cli_cancel_unused - fix some DLM races by using new hash table for lock handles (419) - permit the client to specify desired inodes, at replay - duplicate requests when we queue them for replay reintegration - fix last_rcvd offset calculation - sync after each recovered transaction, so we always make progress - never, not always, ERESTART requests without transnos - store the lov_desc in the MDS, so we don't depend on getlovinfo to set it - skip replay if the MDS says that the client is already connected - don't check for a recovery-enabled export to match lctl's UUID - don't INC_USE_COUNT for phantom exports - don't crash when cleaning up phantom exports (567) - don't double-finish or set replay data for errored mdc_open requests - abort requests when they time out, so we don't get old replies - send/receive replies for AST messages again - if the client says that it doesn't have the lock, cancel it on the server - if we timeout during I/O, don't try to cancel an in-use lock; instead mark it as destroyed, it will all work out when decref is called - fix module use counts (22, 581) * protocol changes - ASTs now expect a reply (server cancels lock on error reply) --- lustre/.cvsignore | 1 + lustre/ChangeLog | 38 +- lustre/Makefile.am | 6 +- lustre/cobd/.cvsignore | 3 + lustre/cobd/Makefile.am | 15 + lustre/cobd/cache_obd.c | 329 +++++++ lustre/cobd/lproc_cache.c | 95 ++ lustre/conf/.cvsignore | 2 + lustre/conf/Makefile.am | 13 + lustre/conf/lustre.dtd | 111 +++ lustre/conf/lustre2ldif.xsl | 212 +++++ lustre/conf/slapd-lustre.conf | 12 + lustre/conf/top.ldif | 4 + lustre/configure.in | 1 + lustre/extN/Makefile.am | 15 +- lustre/extN/extN-noread.diff | 225 +++++ lustre/extN/extN-wantedi.diff | 163 ++++ lustre/include/linux/lustre_dlm.h | 73 +- lustre/include/linux/lustre_export.h | 7 + lustre/include/linux/lustre_ha.h | 6 +- 
lustre/include/linux/lustre_idl.h | 52 +- lustre/include/linux/lustre_lib.h | 22 +- lustre/include/linux/lustre_lite.h | 2 +- lustre/include/linux/lustre_mds.h | 7 +- lustre/include/linux/lustre_net.h | 26 +- lustre/include/linux/obd.h | 51 +- lustre/include/linux/obd_cache.h | 13 + lustre/include/linux/obd_class.h | 16 +- lustre/include/linux/obd_ost.h | 4 +- lustre/include/linux/obd_ptlbd.h | 30 + lustre/include/linux/obd_support.h | 10 +- lustre/ldlm/l_lock.c | 4 + lustre/ldlm/ldlm_extent.c | 7 +- lustre/ldlm/ldlm_lock.c | 197 ++-- lustre/ldlm/ldlm_lockd.c | 120 ++- lustre/ldlm/ldlm_request.c | 85 +- lustre/ldlm/ldlm_resource.c | 31 +- lustre/ldlm/ldlm_test.c | 19 +- lustre/lib/Makefile.am | 2 +- lustre/lib/client.c | 28 +- lustre/lib/obd_pack.c | 12 +- lustre/lib/simple.c | 23 +- lustre/lib/target.c | 46 +- lustre/llite/Makefile.am | 7 +- lustre/llite/commit_callback.c | 30 +- lustre/llite/dir.c | 66 +- lustre/llite/file.c | 621 ++++++------- lustre/llite/namei.c | 12 +- lustre/llite/recover.c | 5 +- lustre/llite/rw.c | 21 +- lustre/llite/super.c | 70 +- lustre/llite/super25.c | 75 +- lustre/llite/symlink.c | 2 +- lustre/lov/lov_obd.c | 417 +++++---- lustre/lov/lov_pack.c | 184 +++- lustre/lov/lproc_lov.c | 104 +-- lustre/mdc/Makefile.am | 4 +- lustre/mdc/mdc_request.c | 200 +++- lustre/mds/Makefile.am | 4 +- lustre/mds/handler.c | 363 ++++---- lustre/mds/lproc_mds.c | 81 +- lustre/mds/mds_fs.c | 78 +- lustre/mds/mds_lov.c | 28 + lustre/mds/mds_reint.c | 33 +- lustre/obdclass/Makefile.am | 14 +- lustre/obdclass/class_obd.c | 54 +- lustre/obdclass/fsfilt.c | 3 - lustre/obdclass/fsfilt_extN.c | 89 +- lustre/obdclass/fsfilt_reiserfs.c | 193 ++++ lustre/obdclass/genops.c | 86 +- lustre/obdclass/lprocfs_status.c | 27 +- lustre/{lib/ll_pack.c => obdclass/statfs_pack.c} | 12 +- lustre/obdecho/echo.c | 52 +- lustre/obdecho/echo_client.c | 16 +- lustre/obdfilter/Makefile.am | 5 +- lustre/obdfilter/filter.c | 83 +- lustre/osc/Makefile.am | 4 +- lustre/osc/osc_request.c | 
139 +-- lustre/ost/Makefile.am | 4 +- lustre/ost/ost_handler.c | 109 ++- lustre/patches/.cvsignore | 8 - lustre/ptlbd/.cvsignore | 3 + lustre/ptlbd/Makefile.am | 14 + lustre/ptlbd/blk.c | 247 +++++ lustre/ptlbd/client.c | 142 +++ lustre/ptlbd/main.c | 70 ++ lustre/ptlbd/rpc.c | 550 +++++++++++ lustre/ptlbd/server.c | 154 ++++ lustre/ptlrpc/client.c | 113 ++- lustre/ptlrpc/niobuf.c | 34 +- lustre/ptlrpc/recovd.c | 9 +- lustre/ptlrpc/recover.c | 43 +- lustre/ptlrpc/rpc.c | 48 +- lustre/ptlrpc/service.c | 14 +- lustre/tests/.cvsignore | 2 + lustre/tests/Makefile.am | 9 +- lustre/tests/createmany.c | 68 +- lustre/tests/echo.sh | 47 + lustre/tests/llecho.sh | 46 +- lustre/tests/llechocleanup.sh | 13 +- lustre/tests/llmount.sh | 4 +- lustre/tests/llmountcleanup.sh | 17 +- lustre/tests/local.sh | 8 +- lustre/tests/lov.sh | 12 +- lustre/tests/lovstripe.c | 164 ---- lustre/tests/sanity.sh | 35 +- lustre/tests/statmany.c | 214 +++++ lustre/tests/uml.sh | 21 +- lustre/tests/wantedi.c | 48 + lustre/utils/automatic-reconnect-sample | 34 + lustre/utils/lconf.in | 1055 ++++++++++++++-------- lustre/utils/lctl.c | 1 + lustre/utils/lfind.c | 23 +- lustre/utils/lmc | 150 ++- lustre/utils/lstripe.c | 26 +- lustre/utils/lustre.dtd | 110 --- lustre/utils/obd.c | 75 +- lustre/utils/obdctl.h | 1 + 118 files changed, 6488 insertions(+), 2577 deletions(-) create mode 100644 lustre/cobd/.cvsignore create mode 100644 lustre/cobd/Makefile.am create mode 100644 lustre/cobd/cache_obd.c create mode 100644 lustre/cobd/lproc_cache.c create mode 100644 lustre/conf/.cvsignore create mode 100644 lustre/conf/Makefile.am create mode 100644 lustre/conf/lustre.dtd create mode 100644 lustre/conf/lustre2ldif.xsl create mode 100644 lustre/conf/slapd-lustre.conf create mode 100644 lustre/conf/top.ldif create mode 100644 lustre/extN/extN-noread.diff create mode 100644 lustre/extN/extN-wantedi.diff create mode 100644 lustre/include/linux/obd_cache.h create mode 100644 lustre/include/linux/obd_ptlbd.h create 
mode 100644 lustre/obdclass/fsfilt_reiserfs.c rename lustre/{lib/ll_pack.c => obdclass/statfs_pack.c} (88%) delete mode 100644 lustre/patches/.cvsignore create mode 100644 lustre/ptlbd/.cvsignore create mode 100644 lustre/ptlbd/Makefile.am create mode 100644 lustre/ptlbd/blk.c create mode 100644 lustre/ptlbd/client.c create mode 100644 lustre/ptlbd/main.c create mode 100644 lustre/ptlbd/rpc.c create mode 100644 lustre/ptlbd/server.c create mode 100755 lustre/tests/echo.sh delete mode 100644 lustre/tests/lovstripe.c create mode 100644 lustre/tests/statmany.c create mode 100644 lustre/tests/wantedi.c create mode 100755 lustre/utils/automatic-reconnect-sample delete mode 100644 lustre/utils/lustre.dtd diff --git a/lustre/.cvsignore b/lustre/.cvsignore index 111b232..34373dd 100644 --- a/lustre/.cvsignore +++ b/lustre/.cvsignore @@ -12,3 +12,4 @@ TAGS lustre*.tar.gz cscope.files cscope.out +autom4te-2.53.cache diff --git a/lustre/ChangeLog b/lustre/ChangeLog index fc930e2d..41e712f 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -1,4 +1,5 @@ -TBA +2003-01-06 Andreas Dilger + * version v0_5_19 * bug fixes - Fully reactivate OST imports after reconnection (512, others) - Make sure client sees our -ENOTCONN from mds_handle (513 - partial) @@ -20,6 +21,41 @@ TBA - fix paths in lconf, which would load incorrect modules (451, 507) - fix confusing lconf 'host not found' error message (386) - fix lock order deadlock on OST (O/R i_sem before journal ops, 478) + - fix race condition in mdc_blocking_ast() for inode access (526) + - fix lov_unpackmd() unpacking wrong number of stripes (537) + - fix lov_set_osc_active() marking wrong OSC inactive (440) + - fix bad lstripe lov_unpackmd() assertion (fix layering too) (527) + - fix multiple writes of stripe MD to MDS (358, maybe 519) + - fix lstripe in several ways (kernel side) (527) + - fix request leak in ldlm_cli_enqueue (262) + - incorrect OSC was marked inactive after OST failure + - call mds_fs_cleanup before unmounting 
filesystem (524) + - fix races between taking ns_lock and ldlm_lock_change_resource + - fix races updating LOV export open file list + - fix lov_enqueue error path, avoid decref-ing bad lock handle + - fix recovery NULL deref in ldlm_cli_cancel_unused + - fix some DLM races by using new hash table for lock handles (419) + - permit the client to specify desired inodes, at replay + - duplicate requests when we queue them for replay reintegration + - fix last_rcvd offset calculation + - sync after each recovered transaction, so we always make progress + - never, not always, ERESTART requests without transnos + - store the lov_desc in the MDS, so we don't depend on getlovinfo to + set it + - skip replay if the MDS says that the client is already connected + - don't check for a recovery-enabled export to match lctl's UUID + - don't INC_USE_COUNT for phantom exports + - don't crash when cleaning up phantom exports (567) + - don't double-finish or set replay data for errored mdc_open requests + - abort requests when they time out, so we don't get old replies + - send/receive replies for AST messages again + - if the client says that it doesn't have the lock, cancel it on the + server + - if we timeout during I/O, don't try to cancel an in-use lock; instead + mark it as destroyed, it will all work out when decref is called + - fix module use counts (22, 581) + * protocol changes + - ASTs now expect a reply (server cancels lock on error reply) 2002-12-02 Andreas Dilger * version v0_5_18 diff --git a/lustre/Makefile.am b/lustre/Makefile.am index 6e9281d..b0d8dd3 100644 --- a/lustre/Makefile.am +++ b/lustre/Makefile.am @@ -12,8 +12,8 @@ DIRS24 = extN mds endif # NOTE: keep extN before mds and obdfilter -SUBDIRS = $(DIRS24) obdclass utils ptlrpc ldlm lib obdfilter mdc osc ost llite -SUBDIRS+= obdecho lov tests doc scripts +SUBDIRS = $(DIRS24) obdclass utils ptlrpc ldlm lib obdfilter mdc osc ost llite +SUBDIRS+= obdecho lov cobd ptlbd tests doc scripts conf DIST_SUBDIRS = 
$(SUBDIRS) EXTRA_DIST = BUGS FDL Rules include archdep.m4 @@ -28,4 +28,4 @@ dist-hook: include $(top_srcdir)/Rules rpms: dist Makefile - rpm -ta $(distdir).tar.gz + rpmbuild -ta $(distdir).tar.gz diff --git a/lustre/cobd/.cvsignore b/lustre/cobd/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lustre/cobd/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git a/lustre/cobd/Makefile.am b/lustre/cobd/Makefile.am new file mode 100644 index 0000000..781c6ce --- /dev/null +++ b/lustre/cobd/Makefile.am @@ -0,0 +1,15 @@ +# Copyright (C) 2002 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +DEFS= + +MODULE = cobd +modulefs_DATA = cobd.o +EXTRA_PROGRAMS = cobd +LINX= + +cobd_SOURCES = cache_obd.c lproc_cache.c $(LINX) + +include $(top_srcdir)/Rules diff --git a/lustre/cobd/cache_obd.c b/lustre/cobd/cache_obd.c new file mode 100644 index 0000000..ac921d8 --- /dev/null +++ b/lustre/cobd/cache_obd.c @@ -0,0 +1,329 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2001, 2002 Cluster File Systems, Inc. + * + * This code is issued under the GNU General Public License. 
+ * See the file COPYING in this distribution + */ + +#define DEBUG_SUBSYSTEM S_COBD + +#include +#include +#include +#include +#include +#include + +extern struct lprocfs_vars status_var_nm_1[]; +extern struct lprocfs_vars status_class_var[]; + +static int +cobd_attach (struct obd_device *dev, obd_count len, void *data) +{ + return (lprocfs_reg_obd (dev, status_var_nm_1, dev)); +} + +static int +cobd_detach (struct obd_device *dev) +{ + return (lprocfs_dereg_obd (dev)); +} + +static int +cobd_setup (struct obd_device *dev, obd_count len, void *buf) +{ + struct obd_ioctl_data *data = (struct obd_ioctl_data *)buf; + struct cache_obd *cobd = &dev->u.cobd; + struct obd_device *target; + struct obd_device *cache; + int rc; + + if (data->ioc_inlbuf1 == NULL || + data->ioc_inlbuf2 == NULL) + return (-EINVAL); + + target = class_uuid2obd (data->ioc_inlbuf1); + cache = class_uuid2obd (data->ioc_inlbuf2); + if (target == NULL || + cache == NULL) + return (-EINVAL); + + /* don't bother checking attached/setup; + * obd_connect() should, and it can change underneath us */ + + rc = obd_connect (&cobd->cobd_target, target, NULL, NULL, NULL); + if (rc != 0) + return (rc); + + rc = obd_connect (&cobd->cobd_cache, cache, NULL, NULL, NULL); + if (rc != 0) + goto fail_0; + + return (0); + + fail_0: + obd_disconnect (&cobd->cobd_target); + return (rc); +} + +static int +cobd_cleanup (struct obd_device *dev) +{ + struct cache_obd *cobd = &dev->u.cobd; + int rc; + + if (!list_empty (&dev->obd_exports)) + return (-EBUSY); + + rc = obd_disconnect (&cobd->cobd_cache); + if (rc != 0) + CERROR ("error %d disconnecting cache\n", rc); + + rc = obd_disconnect (&cobd->cobd_target); + if (rc != 0) + CERROR ("error %d disconnecting target\n", rc); + + return (0); +} + +static int +cobd_connect (struct lustre_handle *conn, struct obd_device *obd, + obd_uuid_t cluuid, struct recovd_obd *recovd, + ptlrpc_recovery_cb_t recover) +{ + int rc = class_connect (conn, obd, cluuid); + + CERROR ("rc %d\n", 
rc); + return (rc); +} + +static int +cobd_disconnect (struct lustre_handle *conn) +{ + int rc = class_disconnect (conn); + + CERROR ("rc %d\n", rc); + return (rc); +} + +static int +cobd_get_info(struct lustre_handle *conn, obd_count keylen, + void *key, obd_count *vallen, void **val) +{ + struct obd_device *obd = class_conn2obd(conn); + struct cache_obd *cobd; + + if (obd == NULL) { + CERROR("invalid client "LPX64"\n", conn->addr); + return -EINVAL; + } + + cobd = &obd->u.cobd; + + /* intercept cache utilisation info? */ + + return (obd_get_info (&cobd->cobd_target, + keylen, key, vallen, val)); +} + +static int +cobd_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) +{ + struct obd_device *obd = class_conn2obd(conn); + struct cache_obd *cobd; + + if (obd == NULL) { + CERROR("invalid client "LPX64"\n", conn->addr); + return -EINVAL; + } + + cobd = &obd->u.cobd; + return (obd_statfs (&cobd->cobd_target, osfs)); +} + +static int +cobd_getattr(struct lustre_handle *conn, struct obdo *oa, + struct lov_stripe_md *lsm) +{ + struct obd_device *obd = class_conn2obd(conn); + struct cache_obd *cobd; + + if (obd == NULL) { + CERROR("invalid client "LPX64"\n", conn->addr); + return -EINVAL; + } + + cobd = &obd->u.cobd; + return (obd_getattr (&cobd->cobd_target, oa, lsm)); +} + +static int +cobd_open(struct lustre_handle *conn, struct obdo *oa, + struct lov_stripe_md *lsm) +{ + struct obd_device *obd = class_conn2obd(conn); + struct cache_obd *cobd; + + if (obd == NULL) { + CERROR("invalid client "LPX64"\n", conn->addr); + return -EINVAL; + } + + cobd = &obd->u.cobd; + return (obd_open (&cobd->cobd_target, oa, lsm)); +} + +static int +cobd_close(struct lustre_handle *conn, struct obdo *oa, + struct lov_stripe_md *lsm) +{ + struct obd_device *obd = class_conn2obd(conn); + struct cache_obd *cobd; + + if (obd == NULL) { + CERROR("invalid client "LPX64"\n", conn->addr); + return -EINVAL; + } + + cobd = &obd->u.cobd; + return (obd_close (&cobd->cobd_target, oa, lsm)); +} 
+ +static int +cobd_preprw(int cmd, struct lustre_handle *conn, + int objcount, struct obd_ioobj *obj, + int niocount, struct niobuf_remote *nb, + struct niobuf_local *res, void **desc_private) +{ + struct obd_device *obd = class_conn2obd(conn); + struct cache_obd *cobd; + + if (obd == NULL) { + CERROR("invalid client "LPX64"\n", conn->addr); + return -EINVAL; + } + + if ((cmd & OBD_BRW_WRITE) != 0) + return -EOPNOTSUPP; + + cobd = &obd->u.cobd; + return (obd_preprw (cmd, &cobd->cobd_target, + objcount, obj, + niocount, nb, + res, desc_private)); +} + +static int +cobd_commitrw(int cmd, struct lustre_handle *conn, + int objcount, struct obd_ioobj *obj, + int niocount, struct niobuf_local *local, + void *desc_private) +{ + struct obd_device *obd = class_conn2obd(conn); + struct cache_obd *cobd; + + if (obd == NULL) { + CERROR("invalid client "LPX64"\n", conn->addr); + return -EINVAL; + } + + if ((cmd & OBD_BRW_WRITE) != 0) + return -EOPNOTSUPP; + + cobd = &obd->u.cobd; + return (obd_commitrw (cmd, &cobd->cobd_target, + objcount, obj, + niocount, local, + desc_private)); +} + +static inline int +cobd_brw(int cmd, struct lustre_handle *conn, + struct lov_stripe_md *lsm, obd_count oa_bufs, + struct brw_page *pga, struct obd_brw_set *set) +{ + struct obd_device *obd = class_conn2obd(conn); + struct cache_obd *cobd; + + if (obd == NULL) { + CERROR("invalid client "LPX64"\n", conn->addr); + return -EINVAL; + } + + if ((cmd & OBD_BRW_WRITE) != 0) + return -EOPNOTSUPP; + + cobd = &obd->u.cobd; + return (obd_brw (cmd, &cobd->cobd_target, + lsm, oa_bufs, pga, set)); +} + +static int +cobd_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, + void *karg, void *uarg) +{ + struct obd_device *obd = class_conn2obd(conn); + struct cache_obd *cobd; + + if (obd == NULL) { + CERROR("invalid client "LPX64"\n", conn->addr); + return -EINVAL; + } + + /* intercept? 
*/ + + cobd = &obd->u.cobd; + return (obd_iocontrol (cmd, &cobd->cobd_target, len, karg, uarg)); +} + +static struct obd_ops cobd_ops = { + o_owner: THIS_MODULE, + o_attach: cobd_attach, + o_detach: cobd_detach, + + o_setup: cobd_setup, + o_cleanup: cobd_cleanup, + + o_connect: cobd_connect, + o_disconnect: cobd_disconnect, + + o_get_info: cobd_get_info, + o_statfs: cobd_statfs, + + o_getattr: cobd_getattr, + o_open: cobd_open, + o_close: cobd_close, + o_preprw: cobd_preprw, + o_commitrw: cobd_commitrw, + o_brw: cobd_brw, + o_iocontrol: cobd_iocontrol, +}; + +static int __init +cobd_init (void) +{ + int rc; + + printk (KERN_INFO "Lustre Caching OBD driver\n"); + + rc = class_register_type (&cobd_ops, status_class_var, + OBD_CACHE_DEVICENAME); + return (rc); +} + +static void __exit +cobd_exit (void) +{ + class_unregister_type (OBD_CACHE_DEVICENAME); +} + +MODULE_AUTHOR("Cluster Filesystems Inc. "); +MODULE_DESCRIPTION("Lustre Caching OBD driver"); +MODULE_LICENSE("GPL"); + +module_init(cobd_init); +module_exit(cobd_exit); + + diff --git a/lustre/cobd/lproc_cache.c b/lustre/cobd/lproc_cache.c new file mode 100644 index 0000000..5adcaf8 --- /dev/null +++ b/lustre/cobd/lproc_cache.c @@ -0,0 +1,95 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ +#define DEBUG_SUBSYSTEM S_CLASS + +#include +#include + +/* + * Common STATUS namespace + */ + +static int rd_uuid (char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device* dev = (struct obd_device*)data; + + return (snprintf(page, count, "%s\n", dev->obd_uuid)); +} + +static int rd_target (char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *dev = (struct obd_device*)data; + struct cache_obd *cobd = &dev->u.cobd; + struct lustre_handle *conn = &cobd->cobd_target; + struct obd_export *exp; + int rc; + + if ((dev->obd_flags & OBD_SET_UP) == 0) + rc = snprintf (page, count, "not set up\n"); + else { + exp = class_conn2export (conn); + LASSERT (exp != NULL); + rc = snprintf(page, count, "%s\n", exp->exp_obd->obd_uuid); + } + return (rc); +} + +static int rd_cache(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *dev = (struct obd_device*)data; + struct cache_obd *cobd = &dev->u.cobd; + struct lustre_handle *conn = &cobd->cobd_cache; + struct obd_export *exp; + int rc; + + if ((dev->obd_flags & OBD_SET_UP) == 0) + rc = snprintf (page, count, "not set up\n"); + else { + exp = class_conn2export (conn); + LASSERT (exp != NULL); + rc = snprintf(page, count, "%s\n", exp->exp_obd->obd_uuid); + } + return (rc); +} + +struct lprocfs_vars status_var_nm_1[] = { + {"status/uuid", rd_uuid, 0, 0}, + {"status/target_uuid", rd_target, 0, 0}, + {"status/cache_uuid", rd_cache, 0, 0}, + {0} +}; + +int rd_numrefs(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_type* class = (struct obd_type*)data; + + return (snprintf(page, count, "%d\n", class->typ_refcnt)); +} + +struct lprocfs_vars status_class_var[] = { + 
{"status/num_refs", rd_numrefs, 0, 0}, + {0} +}; diff --git a/lustre/conf/.cvsignore b/lustre/conf/.cvsignore new file mode 100644 index 0000000..282522d --- /dev/null +++ b/lustre/conf/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lustre/conf/Makefile.am b/lustre/conf/Makefile.am new file mode 100644 index 0000000..7f98129 --- /dev/null +++ b/lustre/conf/Makefile.am @@ -0,0 +1,13 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +EXTRA_DIST = lustre2ldif.xsl lustre.dtd lustre.schema slapd-lustre.conf +ldapconfdir = $(sysconfdir)/openldap +ldapschemadir = $(sysconfdir)/openldap/schema +ldapconf_SCRIPTS = slapd-lustre.conf +ldapschema_SCRIPTS = lustre.schema + +include $(top_srcdir)/Rules + diff --git a/lustre/conf/lustre.dtd b/lustre/conf/lustre.dtd new file mode 100644 index 0000000..73f7c95 --- /dev/null +++ b/lustre/conf/lustre.dtd @@ -0,0 +1,111 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lustre/conf/lustre2ldif.xsl b/lustre/conf/lustre2ldif.xsl new file mode 100644 index 0000000..f5d8098 --- /dev/null +++ b/lustre/conf/lustre2ldif.xsl @@ -0,0 +1,212 @@ + + + + +fs=lustre +config=,fs=lustre + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lustre/conf/slapd-lustre.conf b/lustre/conf/slapd-lustre.conf new file mode 100644 index 0000000..de89c76 --- /dev/null +++ b/lustre/conf/slapd-lustre.conf @@ -0,0 +1,12 @@ +####################################################################### +# lustre ldap config database +# $Id: slapd-lustre.conf,v 1.2 2003/01/06 22:17:53 adilger Exp $ +####################################################################### + +database ldbm +suffix "fs=lustre" +rootdn 
"cn=Manager,fs=lustre" +include /etc/openldap/schema/lustre.schema +rootpw secret +directory /var/lib/ldap/lustre +index objectClass eq, uuid eq diff --git a/lustre/conf/top.ldif b/lustre/conf/top.ldif new file mode 100644 index 0000000..8629444 --- /dev/null +++ b/lustre/conf/top.ldif @@ -0,0 +1,4 @@ +dn: fs=lustre +fs:lustre +objectClass: lustre +desc: Lustre Config diff --git a/lustre/configure.in b/lustre/configure.in index bd378b7..c172cd2 100644 --- a/lustre/configure.in +++ b/lustre/configure.in @@ -128,6 +128,7 @@ AC_SUBST(demodir) AC_OUTPUT(Makefile lib/Makefile ldlm/Makefile obdecho/Makefile ptlrpc/Makefile \ lov/Makefile osc/Makefile mdc/Makefile mds/Makefile ost/Makefile \ + cobd/Makefile ptlbd/Makefile conf/Makefile \ utils/Makefile utils/lconf tests/Makefile obdfilter/Makefile \ obdclass/Makefile llite/Makefile doc/Makefile scripts/Makefile \ scripts/lustre.spec extN/Makefile, chmod +x utils/lconf) diff --git a/lustre/extN/Makefile.am b/lustre/extN/Makefile.am index 33c6d07..5ad1642 100644 --- a/lustre/extN/Makefile.am +++ b/lustre/extN/Makefile.am @@ -16,7 +16,8 @@ EXTRA_PROGRAMS = extN EXTN_FIXES = patch-2.4.18-chaos22 #EXTN_FIXES = ext3-2.4.18-fixes.diff EXTNP = htree-ext3-2.4.18.diff linux-2.4.18ea-0.8.26.diff -EXTNP+= ext3-2.4.18-ino_sb_macro.diff extN-misc-fixup.diff +EXTNP+= ext3-2.4.18-ino_sb_macro.diff extN-misc-fixup.diff extN-noread.diff +EXTNP+= extN-wantedi.diff EXTNC = balloc.c bitmap.c dir.c file.c fsync.c ialloc.c inode.c ioctl.c EXTNC+= namei.c super.c symlink.c EXTNI = extN_fs.h extN_fs_i.h extN_fs_sb.h extN_jbd.h quotaops.h @@ -52,31 +53,27 @@ diff: $(RM) extN.patchT l='$(EXTNC)'; for f in $$l; do \ echo "$$f"; \ - (diff -u $(extN_orig)/$$f extN/$$f) >> extN.patchT; \ - test $$? -le 1 || exit 1; + (diff -u $(extN_orig)/$$f extN/$$f) >> extN.patchT; \ + test $$? -le 1 || exit 1; \ done l='$(EXTNI)'; for f in $$l; do \ echo "$$f"; \ (diff -u $(extN_include_orig)/$$f $(top_srcdir)/include/linux/$$f)>>extN.patchT;\ - test $$? 
-le 1 || exit 1; + test $$? -le 1 || exit 1; \ done l='$(EXTN_EXTRA)'; for f in $$l; do \ f=`echo "$$f" | sed 's%^fs/%%'`; \ echo "$$f"; \ (cd $(top_srcdir) && \ diff -u /dev/null $$f) >> extN.patchT; \ - test $$? -le 1 || exit 1; + test $$? -le 1 || exit 1; \ done mv -f extN.patchT $(top_builddir)/$(subdir)/extN.patch-$(RELEASE) echo "Don't forget to add $(srcdir)/extN.patch-$(RELEASE) to CVS!" - - .PHONY: diff # Just do the SUB transformation on all our source files. - - sed-stamp: $(RM) $@ rm -rf $(extN_orig) $(extN_include_orig) diff --git a/lustre/extN/extN-noread.diff b/lustre/extN/extN-noread.diff new file mode 100644 index 0000000..463516c --- /dev/null +++ b/lustre/extN/extN-noread.diff @@ -0,0 +1,225 @@ +diff -ru lustre-head/fs/extN/ialloc.c lustre/fs/extN/ialloc.c +--- lustre-head/fs/extN/ialloc.c Mon Dec 23 10:02:58 2002 ++++ lustre/fs/extN/ialloc.c Mon Dec 23 09:46:20 2002 +@@ -289,6 +289,37 @@ + } + + /* ++ * @block_group: block group of inode ++ * @offset: relative offset of inode within @block_group ++ * ++ * Check whether any of the inodes in this disk block are in use. ++ * ++ * Caller must be holding superblock lock (group/bitmap read lock in future). ++ */ ++int extN_itable_block_used(struct super_block *sb, unsigned int block_group, ++ int offset) ++{ ++ int bitmap_nr = load_inode_bitmap(sb, block_group); ++ int inodes_per_block; ++ unsigned long inum, iend; ++ struct buffer_head *ibitmap; ++ ++ if (bitmap_nr < 0) ++ return 1; ++ ++ inodes_per_block = sb->s_blocksize / EXTN_SB(sb)->s_inode_size; ++ inum = offset & ~(inodes_per_block - 1); ++ iend = inum + inodes_per_block; ++ ibitmap = EXTN_SB(sb)->s_inode_bitmap[bitmap_nr]; ++ for (; inum < iend; inum++) { ++ if (inum != offset && extN_test_bit(inum, ibitmap->b_data)) ++ return 1; ++ } ++ ++ return 0; ++} ++ ++/* + * There are two policies for allocating an inode. 
If the new inode is + * a directory, then a forward search is made for a block group with both + * free space and a low directory-to-inode ratio; if that fails, then of +@@ -312,6 +343,7 @@ + struct extN_group_desc * gdp; + struct extN_group_desc * tmp; + struct extN_super_block * es; ++ struct extN_iloc iloc; + int err = 0; + + /* Cannot create files in a deleted directory */ +@@ -505,7 +538,7 @@ + ei->i_prealloc_count = 0; + #endif + ei->i_block_group = i; +- ++ + if (ei->i_flags & EXTN_SYNC_FL) + inode->i_flags |= S_SYNC; + if (IS_SYNC(inode)) +@@ -514,9 +547,18 @@ + inode->i_generation = sbi->s_next_generation++; + + ei->i_state = EXTN_STATE_NEW; +- err = extN_mark_inode_dirty(handle, inode); ++ err = extN_get_inode_loc_new(inode, &iloc, 1); + if (err) goto fail; +- ++ BUFFER_TRACE(iloc->bh, "get_write_access"); ++ err = extN_journal_get_write_access(handle, iloc.bh); ++ if (err) { ++ brelse(iloc.bh); ++ iloc.bh = NULL; ++ goto fail; ++ } ++ err = extN_mark_iloc_dirty(handle, inode, &iloc); ++ if (err) goto fail; ++ + unlock_super (sb); + if(DQUOT_ALLOC_INODE(inode)) { + DQUOT_DROP(inode); +diff -ru lustre-head/fs/extN/inode.c lustre/fs/extN/inode.c +--- lustre-head/fs/extN/inode.c Mon Dec 23 10:02:58 2002 ++++ lustre/fs/extN/inode.c Mon Dec 23 09:50:25 2002 +@@ -2011,23 +1994,32 @@ + extN_journal_stop(handle, inode); + } + +-/* +- * extN_get_inode_loc returns with an extra refcount against the +- * inode's underlying buffer_head on success. +- */ ++extern int extN_itable_block_used(struct super_block *sb, ++ unsigned int block_group, ++ int offset); ++ ++#define NUM_INODE_PREREAD 16 + +-int extN_get_inode_loc (struct inode *inode, struct extN_iloc *iloc) ++/* ++ * extN_get_inode_loc returns with an extra refcount against the inode's ++ * underlying buffer_head on success. If this is for a new inode allocation ++ * (new is non-zero) then we may be able to optimize away the read if there ++ * are no other in-use inodes in this inode table block. 
If we need to do ++ * a read, then read in a whole chunk of blocks to avoid blocking again soon ++ * if we are doing lots of creates/updates. ++ */ ++int extN_get_inode_loc_new(struct inode *inode, struct extN_iloc *iloc, int new) + { + struct super_block *sb = inode->i_sb; + struct extN_sb_info *sbi = EXTN_SB(sb); +- struct buffer_head *bh = 0; ++ struct buffer_head *bh[NUM_INODE_PREREAD]; + unsigned long block; + unsigned long block_group; + unsigned long group_desc; + unsigned long desc; + unsigned long offset; + struct extN_group_desc * gdp; +- ++ + if ((inode->i_ino != EXTN_ROOT_INO && + inode->i_ino != EXTN_JOURNAL_INO && + inode->i_ino < EXTN_FIRST_INO(sb)) || +@@ -2042,38 +2034,86 @@ + } + group_desc = block_group >> sbi->s_desc_per_block_bits; + desc = block_group & (sbi->s_desc_per_block - 1); +- bh = sbi->s_group_desc[group_desc]; +- if (!bh) { ++ if (!sbi->s_group_desc[group_desc]) { + extN_error(sb, __FUNCTION__, "Descriptor not loaded"); + goto bad_inode; + } + +- gdp = (struct extN_group_desc *) bh->b_data; ++ gdp = (struct extN_group_desc *)(sbi->s_group_desc[group_desc]->b_data); ++ + /* + * Figure out the offset within the block group inode table + */ +- offset = ((inode->i_ino - 1) % sbi->s_inodes_per_group) * +- sbi->s_inode_size; ++ offset = ((inode->i_ino - 1) % sbi->s_inodes_per_group); ++ + block = le32_to_cpu(gdp[desc].bg_inode_table) + +- (offset >> EXTN_BLOCK_SIZE_BITS(sb)); +- if (!(bh = sb_bread(sb, block))) { +- extN_error (sb, __FUNCTION__, +- "unable to read inode block - " +- "inode=%lu, block=%lu", inode->i_ino, block); +- goto bad_inode; ++ (offset * sbi->s_inode_size >> EXTN_BLOCK_SIZE_BITS(sb)); ++ ++ bh[0] = sb_getblk(sb, block); ++ if (buffer_uptodate(bh[0])) ++ goto done; ++ ++ /* If we don't really need to read this block, and it isn't already ++ * in memory, then we just zero it out. Otherwise, we keep the ++ * current block contents (deleted inode data) for posterity. 
++ */ ++ if (new && !extN_itable_block_used(sb, block_group, offset)) { ++ lock_buffer(bh[0]); ++ memset(bh[0]->b_data, 0, bh[0]->b_size); ++ mark_buffer_uptodate(bh[0], 1); ++ unlock_buffer(bh[0]); ++ } else { ++ unsigned long block_end, itable_end; ++ int count = 1; ++ ++ itable_end = le32_to_cpu(gdp[desc].bg_inode_table) + ++ sbi->s_itb_per_group; ++ block_end = block + NUM_INODE_PREREAD; ++ if (block_end > itable_end) ++ block_end = itable_end; ++ ++ for (++block; block < block_end; block++) { ++ bh[count] = sb_getblk(sb, block); ++ if (count && (buffer_uptodate(bh[count]) || ++ buffer_locked(bh[count]))) { ++ __brelse(bh[count]); ++ } else ++ count++; ++ } ++ ++ ll_rw_block(READ, count, bh); ++ ++ /* Release all but the block we actually need (bh[0]) */ ++ while (--count > 0) ++ __brelse(bh[count]); ++ ++ wait_on_buffer(bh[0]); ++ if (!buffer_uptodate(bh[0])) { ++ extN_error(sb, __FUNCTION__, ++ "unable to read inode block - " ++ "inode=%lu, block=%lu", inode->i_ino, ++ bh[0]->b_blocknr); ++ goto bad_inode; ++ } + } +- offset &= (EXTN_BLOCK_SIZE(sb) - 1); ++ done: ++ offset = (offset * sbi->s_inode_size) & (EXTN_BLOCK_SIZE(sb) - 1); + +- iloc->bh = bh; +- iloc->raw_inode = (struct extN_inode *) (bh->b_data + offset); ++ iloc->bh = bh[0]; ++ iloc->raw_inode = (struct extN_inode *)(bh[0]->b_data + offset); + iloc->block_group = block_group; +- ++ + return 0; +- ++ + bad_inode: + return -EIO; + } + ++int extN_get_inode_loc(struct inode *inode, struct extN_iloc *iloc) ++{ ++ return extN_get_inode_loc_new(inode, iloc, 0); ++} ++ + void extN_read_inode(struct inode * inode) + { + struct extN_iloc iloc; diff --git a/lustre/extN/extN-wantedi.diff b/lustre/extN/extN-wantedi.diff new file mode 100644 index 0000000..3be559f --- /dev/null +++ b/lustre/extN/extN-wantedi.diff @@ -0,0 +1,163 @@ +--- lustre/extN-clean/namei.c 2002-12-30 05:56:09.000000000 -0500 ++++ lustre/extN/namei.c 2002-12-30 06:29:39.000000000 -0500 +@@ -1224,7 +1224,8 @@ + if (IS_SYNC(dir)) + handle->h_sync =
1; + +- inode = extN_new_inode (handle, dir, mode); ++ inode = extN_new_inode (handle, dir, mode, ++ (unsigned long)dentry->d_fsdata); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + inode->i_op = &extN_file_inode_operations; +@@ -1254,7 +1254,8 @@ + if (IS_SYNC(dir)) + handle->h_sync = 1; + +- inode = extN_new_inode (handle, dir, mode); ++ inode = extN_new_inode (handle, dir, mode, ++ (unsigned long)dentry->d_fsdata); + err = PTR_ERR(inode); + if (!IS_ERR(inode)) { + init_special_inode(inode, mode, rdev); +@@ -1286,7 +1286,8 @@ + if (IS_SYNC(dir)) + handle->h_sync = 1; + +- inode = extN_new_inode (handle, dir, S_IFDIR | mode); ++ inode = extN_new_inode (handle, dir, S_IFDIR | mode, ++ (unsigned long)dentry->d_fsdata); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +@@ -1680,7 +1681,8 @@ + if (IS_SYNC(dir)) + handle->h_sync = 1; + +- inode = extN_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); ++ inode = extN_new_inode (handle, dir, S_IFLNK|S_IRWXUGO, ++ (unsigned long)dentry->d_fsdata); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_stop; +--- lustre/extN-clean/ialloc.c 2002-12-28 23:56:42.000000000 -0500 ++++ lustre/extN/ialloc.c 2002-12-30 06:29:39.000000000 -0500 +@@ -329,8 +329,8 @@ + * For other inodes, search forward from the parent directory's block + * group to find a free inode. 
+ */ +-struct inode * extN_new_inode (handle_t *handle, +- const struct inode * dir, int mode) ++struct inode *extN_new_inode(handle_t *handle, const struct inode *dir, ++ int mode, unsigned long goal) + { + struct super_block * sb; + struct buffer_head * bh; +@@ -360,6 +361,38 @@ + + lock_super (sb); + es = sbi->s_es; ++ ++ if (goal) { ++ i = (goal - 1) / EXTN_INODES_PER_GROUP(sb); ++ j = (goal - 1) % EXTN_INODES_PER_GROUP(sb); ++ gdp = extN_get_group_desc(sb, i, &bh2); ++ ++ bitmap_nr = load_inode_bitmap (sb, i); ++ if (bitmap_nr < 0) ++ goto fail; ++ ++ bh = sbi->s_inode_bitmap[bitmap_nr]; ++ ++ BUFFER_TRACE(bh, "get_write_access"); ++ err = extN_journal_get_write_access(handle, bh); ++ if (err) goto fail; ++ ++ if (extN_set_bit(j, bh->b_data)) { ++ printk(KERN_ERR "goal inode %lu unavailable\n", goal); ++ /* Oh well, we tried. */ ++ goto repeat; ++ } ++ ++ BUFFER_TRACE(bh, "call extN_journal_dirty_metadata"); ++ err = extN_journal_dirty_metadata(handle, bh); ++ if (err) goto fail; ++ ++ /* We've shortcircuited the allocation system successfully, ++ * now finish filling in the inode.
++ */ ++ goto have_bit_and_group; ++ } ++ + repeat: + gdp = NULL; + i = 0; +@@ -474,6 +509,7 @@ + } + goto repeat; + } ++have_bit_and_group: + j += i * sbi->s_inodes_per_group + 1; + if (j < sbi->s_first_ino || j > le32_to_cpu(es->s_inodes_count)) { + extN_error (sb, "extN_new_inode", +--- lustre/extN-clean/ioctl.c 2002-12-28 23:56:42.000000000 -0500 ++++ lustre/extN/ioctl.c 2002-12-30 06:29:39.000000000 -0500 +@@ -24,6 +24,31 @@ + extN_debug ("cmd = %u, arg = %lu\n", cmd, arg); + + switch (cmd) { ++ case EXTN_IOC_CREATE_INUM: { ++ char name[32]; ++ struct dentry *dchild, *dparent; ++ int rc = 0; ++ ++ dparent = list_entry(inode->i_dentry.next, struct dentry, ++ d_alias); ++ snprintf(name, sizeof name, "%lu", arg); ++ dchild = lookup_one_len(name, dparent, strlen(name)); ++ if (dchild->d_inode) { ++ printk(KERN_ERR "%*s/%lu already exists (ino %lu)\n", ++ dparent->d_name.len, dparent->d_name.name, arg, ++ dchild->d_inode->i_ino); ++ rc = -EEXIST; ++ } else { ++ dchild->d_fsdata = (void *)arg; ++ rc = vfs_create(inode, dchild, 0644); ++ if (rc) ++ printk(KERN_ERR "vfs_create: %d\n", rc); ++ else if (dchild->d_inode->i_ino != arg) ++ rc = -EEXIST; ++ } ++ dput(dchild); ++ return rc; ++ } + case EXTN_IOC_GETFLAGS: + flags = ei->i_flags & EXTN_FL_USER_VISIBLE; + return put_user(flags, (int *) arg); +--- lustre/include/linux/extN_fs.h~ 2002-12-30 06:01:43.000000000 -0500 ++++ lustre/include/linux/extN_fs.h 2002-12-30 06:02:51.000000000 -0500 +@@ -200,6 +200,7 @@ + #define EXTN_IOC_SETFLAGS _IOW('f', 2, long) + #define EXTN_IOC_GETVERSION _IOR('f', 3, long) + #define EXTN_IOC_SETVERSION _IOW('f', 4, long) ++/* EXTN_IOC_CREATE_INUM at bottom of file (visible to kernel and user). 
*/ + #define EXTN_IOC_GETVERSION_OLD _IOR('v', 1, long) + #define EXTN_IOC_SETVERSION_OLD _IOW('v', 2, long) + #ifdef CONFIG_JBD_DEBUG +@@ -632,7 +633,8 @@ + extern int extN_sync_file (struct file *, struct dentry *, int); + + /* ialloc.c */ +-extern struct inode * extN_new_inode (handle_t *, const struct inode *, int); ++extern struct inode * extN_new_inode (handle_t *, const struct inode *, int, ++ unsigned long); + extern void extN_free_inode (handle_t *, struct inode *); + extern struct inode * extN_orphan_get (struct super_block *, ino_t); + extern unsigned long extN_count_free_inodes (struct super_block *); +@@ -714,4 +716,6 @@ + + #endif /* __KERNEL__ */ + ++#define EXTN_IOC_CREATE_INUM _IOW('f', 5, long) ++ + #endif /* _LINUX_EXTN_FS_H */ diff --git a/lustre/include/linux/lustre_dlm.h b/lustre/include/linux/lustre_dlm.h index c1382a9..e552dfd 100644 --- a/lustre/include/linux/lustre_dlm.h +++ b/lustre/include/linux/lustre_dlm.h @@ -145,7 +145,7 @@ typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock, typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, int flags); struct ldlm_lock { - __u64 l_random; + struct portals_handle l_handle; // must be first in the structure atomic_t l_refc; struct ldlm_resource *l_resource; struct ldlm_lock *l_parent; @@ -183,8 +183,9 @@ struct ldlm_lock { }; typedef int (*ldlm_res_compat)(struct ldlm_lock *child, struct ldlm_lock *new); -typedef int (*ldlm_res_policy)(struct ldlm_lock *lock, void *req_cookie, - ldlm_mode_t mode, int flags, void *data); +typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock *, + void *req_cookie, ldlm_mode_t mode, int flags, + void *data); #define LDLM_PLAIN 10 #define LDLM_EXTENT 11 @@ -246,22 +247,24 @@ extern char *ldlm_it2str(int it); do { \ if (lock->l_resource == NULL) { \ CDEBUG(D_DLMTRACE, "### " format \ - " ns: \?\? lock: %p lrc: %d/%d,%d mode: %s/%s " \ + " ns: \?\? lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s "\ "res: \?\? rrc=\?\? type: \?\?\? 
remote: "LPX64")\n" \ - , ## a, lock, lock->l_refc, lock->l_readers, \ - lock->l_writers, \ + , ## a, lock, lock->l_handle.h_cookie, \ + atomic_read(&lock->l_refc), \ + lock->l_readers, lock->l_writers, \ ldlm_lockname[lock->l_granted_mode], \ ldlm_lockname[lock->l_req_mode], \ - lock->l_remote_handle.addr); \ + lock->l_remote_handle.cookie); \ break; \ } \ if (lock->l_resource->lr_type == LDLM_EXTENT) { \ CDEBUG(D_DLMTRACE, "### " format \ - " ns: %s lock: %p lrc: %d/%d,%d mode: %s/%s res: "LPU64 \ - "/"LPU64" rrc: %d type: %s ["LPU64"->"LPU64"] remote: " \ - LPX64"\n" , ## a, \ + " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " \ + "res: "LPU64"/"LPU64" rrc: %d type: %s ["LPU64"->"LPU64\ + "] remote: "LPX64"\n" , ## a, \ lock->l_resource->lr_namespace->ns_name, lock, \ - lock->l_refc, lock->l_readers, lock->l_writers, \ + lock->l_handle.h_cookie, atomic_read(&lock->l_refc), \ + lock->l_readers, lock->l_writers, \ ldlm_lockname[lock->l_granted_mode], \ ldlm_lockname[lock->l_req_mode], \ lock->l_resource->lr_name[0], \ @@ -269,22 +272,24 @@ do { \ atomic_read(&lock->l_resource->lr_refcount), \ ldlm_typename[lock->l_resource->lr_type], \ lock->l_extent.start, lock->l_extent.end, \ - lock->l_remote_handle.addr); \ + lock->l_remote_handle.cookie); \ break; \ } \ { \ CDEBUG(D_DLMTRACE, "### " format \ - " ns: %s lock: %p lrc: %d/%d,%d mode: %s/%s res: "LPU64 \ - "/"LPU64" rrc: %d type: %s remote: "LPX64"\n" , ## a, \ - lock->l_resource->lr_namespace->ns_name, lock, \ - lock->l_refc, lock->l_readers, lock->l_writers, \ + " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " \ + "res: "LPU64"/"LPU64" rrc: %d type: %s remote: "LPX64 \ + "\n" , ## a, lock->l_resource->lr_namespace->ns_name, \ + lock, lock->l_handle.h_cookie, \ + atomic_read (&lock->l_refc), \ + lock->l_readers, lock->l_writers, \ ldlm_lockname[lock->l_granted_mode], \ ldlm_lockname[lock->l_req_mode], \ lock->l_resource->lr_name[0], \ lock->l_resource->lr_name[1], \ 
atomic_read(&lock->l_resource->lr_refcount), \ ldlm_typename[lock->l_resource->lr_type], \ - lock->l_remote_handle.addr); \ + lock->l_remote_handle.cookie); \ } \ } while (0) @@ -295,22 +300,25 @@ do { \ * Iterators. */ -#define LDLM_ITER_CONTINUE 0 /* keep iterating */ -#define LDLM_ITER_STOP 1 /* stop iterating */ +#define LDLM_ITER_CONTINUE 1 /* keep iterating */ +#define LDLM_ITER_STOP 0 /* stop iterating */ typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *); +typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *); int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, void *closure); int ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter, void *closure); +int ldlm_namespace_foreach_res(struct ldlm_namespace *ns, + ldlm_res_iterator_t iter, void *closure); int ldlm_replay_locks(struct obd_import *imp); /* ldlm_extent.c */ int ldlm_extent_compat(struct ldlm_lock *, struct ldlm_lock *); -int ldlm_extent_policy(struct ldlm_lock *, void *, ldlm_mode_t, int flags, - void *); +int ldlm_extent_policy(struct ldlm_namespace *, struct ldlm_lock *, void *, + ldlm_mode_t, int flags, void *); /* ldlm_lockd.c */ int ldlm_handle_enqueue(struct ptlrpc_request *req); @@ -319,19 +327,17 @@ int ldlm_handle_cancel(struct ptlrpc_request *req); int ldlm_del_waiting_lock(struct ldlm_lock *lock); /* ldlm_lock.c */ -void ldlm_register_intent(int (*arg)(struct ldlm_lock *lock, void *req_cookie, - ldlm_mode_t mode, int flags, void *data)); +void ldlm_register_intent(ldlm_res_policy arg); void ldlm_unregister_intent(void); void ldlm_lock2handle(struct ldlm_lock *lock, struct lustre_handle *lockh); -struct ldlm_lock *__ldlm_handle2lock(struct lustre_handle *, int strict, - int flags); +struct ldlm_lock *__ldlm_handle2lock(struct lustre_handle *, int flags); void ldlm_cancel_callback(struct ldlm_lock *); int ldlm_lock_set_data(struct lustre_handle *, void *data, int datalen); void ldlm_lock_remove_from_lru(struct ldlm_lock *); 
static inline struct ldlm_lock *ldlm_handle2lock(struct lustre_handle *h) { - return __ldlm_handle2lock(h, 1, 0); + return __ldlm_handle2lock(h, 0); } #define LDLM_LOCK_PUT(lock) \ @@ -363,18 +369,19 @@ ldlm_lock_create(struct ldlm_namespace *ns, struct lustre_handle *parent_lock_handle, __u64 *res_id, __u32 type, ldlm_mode_t mode, void *data, __u32 data_len); -ldlm_error_t ldlm_lock_enqueue(struct ldlm_lock *lock, void *cookie, - int cookie_len, int *flags, +ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *, struct ldlm_lock *, + void *cookie, int cookie_len, int *flags, ldlm_completion_callback completion, ldlm_blocking_callback blocking); struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, int *flags); void ldlm_lock_cancel(struct ldlm_lock *lock); void ldlm_cancel_locks_for_export(struct obd_export *export); -void ldlm_run_ast_work(struct list_head *rpc_list); +int ldlm_run_ast_work(struct list_head *rpc_list); void ldlm_reprocess_all(struct ldlm_resource *res); -void ldlm_lock_dump(struct ldlm_lock *lock); -void ldlm_lock_dump_handle(struct lustre_handle *); +void ldlm_reprocess_all_ns(struct ldlm_namespace *ns); +void ldlm_lock_dump(int level, struct ldlm_lock *lock); +void ldlm_lock_dump_handle(int level, struct lustre_handle *); /* ldlm_test.c */ int ldlm_test(struct obd_device *device, struct lustre_handle *connh); @@ -406,9 +413,11 @@ void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc); void ldlm_dump_all_namespaces(void); void ldlm_namespace_dump(struct ldlm_namespace *); void ldlm_resource_dump(struct ldlm_resource *); -int ldlm_lock_change_resource(struct ldlm_lock *, __u64 new_resid[3]); +int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *, + __u64 new_resid[3]); /* ldlm_request.c */ +int ldlm_expired_completion_wait(void *data); int ldlm_completion_ast(struct ldlm_lock *lock, int flags); int ldlm_cli_enqueue(struct lustre_handle *conn, struct ptlrpc_request *req, diff 
--git a/lustre/include/linux/lustre_export.h b/lustre/include/linux/lustre_export.h index ba9555c..342721c 100644 --- a/lustre/include/linux/lustre_export.h +++ b/lustre/include/linux/lustre_export.h @@ -18,9 +18,14 @@ #include struct lov_export_data { + spinlock_t led_lock; struct list_head led_open_head; }; +struct ost_export_data { + __u8 oed_uuid[37]; /* client UUID */ +}; + struct obd_export { __u64 exp_cookie; struct list_head exp_obd_chain; @@ -32,12 +37,14 @@ struct obd_export { struct mds_export_data eu_mds_data; struct filter_export_data eu_filter_data; struct lov_export_data eu_lov_data; + struct ost_export_data eu_ost_data; } u; }; #define exp_mds_data u.eu_mds_data #define exp_lov_data u.eu_lov_data #define exp_filter_data u.eu_filter_data +#define exp_ost_data u.eu_ost_data extern struct obd_export *class_conn2export(struct lustre_handle *conn); extern struct obd_device *class_conn2obd(struct lustre_handle *conn); diff --git a/lustre/include/linux/lustre_ha.h b/lustre/include/linux/lustre_ha.h index bfac4c3..87b0bf3 100644 --- a/lustre/include/linux/lustre_ha.h +++ b/lustre/include/linux/lustre_ha.h @@ -52,10 +52,12 @@ int recovd_setup(struct recovd_obd *mgr); int recovd_cleanup(struct recovd_obd *mgr); extern struct recovd_obd *ptlrpc_recovd; +struct ptlrpc_request; int ptlrpc_run_recovery_upcall(struct ptlrpc_connection *conn); -int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc); -int ptlrpc_replay(struct obd_import *imp, int send_last_flag); +int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc, + struct ptlrpc_request **reqptr); +int ptlrpc_replay(struct obd_import *imp); int ptlrpc_resend(struct obd_import *imp); void ptlrpc_free_committed(struct obd_import *imp); void ptlrpc_wake_delayed(struct obd_import *imp); diff --git a/lustre/include/linux/lustre_idl.h b/lustre/include/linux/lustre_idl.h index 0febd11..cc194ac14 100644 --- a/lustre/include/linux/lustre_idl.h +++ b/lustre/include/linux/lustre_idl.h @@ -72,6 +72,9 @@ 
typedef __u8 obd_uuid_t[37]; #define LDLM_CB_REPLY_PORTAL 16 #define LDLM_CANCEL_REQUEST_PORTAL 17 #define LDLM_CANCEL_REPLY_PORTAL 18 +#define PTLBD_REQUEST_PORTAL 19 +#define PTLBD_REPLY_PORTAL 20 +#define PTLBD_BULK_PORTAL 21 #define SVC_KILLED 1 #define SVC_EVENT 2 @@ -126,9 +129,12 @@ struct lustre_msg { #define MSG_OP_FLAG_SHIFT 16 /* Flags that apply to all requests are in the bottom 16 bits */ -#define MSG_GEN_FLAG_MASK 0x0000ffff -#define MSG_LAST_REPLAY 1 -#define MSG_RESENT 2 +#define MSG_GEN_FLAG_MASK 0x0000ffff +#define MSG_LAST_REPLAY 1 +#define MSG_RESENT 2 + +/* XXX horrible interim hack -- see bug 578 */ +#define MSG_REPLAY_IN_PROGRESS 4 static inline int lustre_msg_get_flags(struct lustre_msg *msg) { @@ -231,13 +237,11 @@ struct lov_object_id { /* per-child structure */ struct lov_mds_md { __u32 lmm_magic; - __u32 lmm_unused; /* was packed size of extended attribute */ __u64 lmm_object_id; /* lov object id */ - __u32 lmm_stripe_offset; /* starting stripe offset in lmd_objects */ - __u32 lmm_stripe_count; /* number of stipes in use for this object */ - __u64 lmm_stripe_size; /* size of the stripe */ - __u32 lmm_ost_count; /* how many OST idx are in this LOV md */ - __u32 lmm_stripe_pattern; /* per-lov object stripe pattern */ + __u32 lmm_stripe_size; /* size of the stripe */ + __u32 lmm_stripe_offset; /* starting stripe offset in lmm_objects */ + __u16 lmm_stripe_count; /* number of stipes in use for this object */ + __u16 lmm_ost_count; /* how many OST idx are in this LOV md */ struct lov_object_id lmm_objects[0]; }; @@ -334,6 +338,7 @@ struct ost_body { #define MDS_GETSTATUS 9 #define MDS_STATFS 10 #define MDS_GETLOVINFO 11 +#define MDS_GETATTR_NAME 12 #define REINT_SETATTR 1 #define REINT_CREATE 2 @@ -549,4 +554,33 @@ struct ldlm_reply { __u64 lock_policy_res1; __u64 lock_policy_res2; }; + +/* + * ptlbd, portal block device requests + */ +typedef enum { + PTLBD_QUERY = 200, + PTLBD_READ = 201, + PTLBD_WRITE = 202, +} ptlbd_cmd_t; + +struct 
ptlbd_op { + __u16 op_cmd; + __u16 op_lun; + __u16 op_niob_cnt; + __u16 op__padding; + __u32 op_block_cnt; +}; + +struct ptlbd_niob { + __u64 n_xid; + __u64 n_block_nr; + __u32 n_offset; + __u32 n_length; +}; + +struct ptlbd_rsp { + __u16 r_status; + __u16 r_error_cnt; +}; #endif diff --git a/lustre/include/linux/lustre_lib.h b/lustre/include/linux/lustre_lib.h index aa58c49..b1f9288 100644 --- a/lustre/include/linux/lustre_lib.h +++ b/lustre/include/linux/lustre_lib.h @@ -53,10 +53,13 @@ struct ptlrpc_request; struct obd_device; struct recovd_data; struct recovd_obd; +struct obd_export; #include int target_handle_connect(struct ptlrpc_request *req); int target_handle_disconnect(struct ptlrpc_request *req); +int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp, + char *cluuid); int client_obd_connect(struct lustre_handle *conn, struct obd_device *obd, obd_uuid_t cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover); @@ -138,17 +141,6 @@ static inline void ldlm_object2handle(void *object, struct lustre_handle *handle handle->addr = (__u64)(unsigned long)object; } -struct obd_statfs; -struct statfs; -void statfs_pack(struct obd_statfs *osfs, struct statfs *sfs); -void statfs_unpack(struct statfs *sfs, struct obd_statfs *osfs); -void obd_statfs_pack(struct obd_statfs *tgt, struct obd_statfs *src); -static inline void -obd_statfs_unpack(struct obd_statfs *tgt, struct obd_statfs *src) -{ - obd_statfs_pack(tgt, src); -} - #include /* @@ -408,11 +400,13 @@ static inline int obd_ioctl_getdata(char **buf, int *len, void *arg) } if (data->ioc_inllen2) { - data->ioc_inlbuf2 = &data->ioc_bulk[0] + size_round(data->ioc_inllen1); + data->ioc_inlbuf2 = &data->ioc_bulk[0] + + size_round(data->ioc_inllen1); } if (data->ioc_inllen3) { - data->ioc_inlbuf3 = &data->ioc_bulk[0] + size_round(data->ioc_inllen1) + + data->ioc_inlbuf3 = &data->ioc_bulk[0] + + size_round(data->ioc_inllen1) + size_round(data->ioc_inllen2); } @@ -426,7 +420,7 @@ static 
inline int obd_ioctl_getdata(char **buf, int *len, void *arg) #define OBD_IOC_CLEANUP _IO ('f', 103 ) #define OBD_IOC_DESTROY _IOW ('f', 104, long) #define OBD_IOC_PREALLOCATE _IOWR('f', 105, long) -#define OBD_IOC_DEC_USE_COUNT _IO ('f', 106 ) + #define OBD_IOC_SETATTR _IOW ('f', 107, long) #define OBD_IOC_GETATTR _IOR ('f', 108, long) #define OBD_IOC_READ _IOWR('f', 109, long) diff --git a/lustre/include/linux/lustre_lite.h b/lustre/include/linux/lustre_lite.h index a965bcb..deb9656 100644 --- a/lustre/include/linux/lustre_lite.h +++ b/lustre/include/linux/lustre_lite.h @@ -224,7 +224,7 @@ struct ldlm_lock; int ll_lock_callback(struct ldlm_lock *, struct ldlm_lock_desc *, void *data, __u32 data_len, int flag); int ll_size_lock(struct inode *, struct lov_stripe_md *, obd_off start, - int mode, struct lustre_handle **); + int mode, struct lustre_handle *); int ll_size_unlock(struct inode *, struct lov_stripe_md *, int mode, struct lustre_handle *); int ll_file_size(struct inode *inode, struct lov_stripe_md *md); diff --git a/lustre/include/linux/lustre_mds.h b/lustre/include/linux/lustre_mds.h index 558c10b..7a02dae 100644 --- a/lustre/include/linux/lustre_mds.h +++ b/lustre/include/linux/lustre_mds.h @@ -155,7 +155,7 @@ int mds_pack_md(struct mds_obd *mds, struct ptlrpc_request *req, /* mds/mds_fs.c */ int mds_fs_setup(struct obd_device *obddev, struct vfsmount *mnt); -void mds_fs_cleanup(struct obd_device *obddev); +int mds_fs_cleanup(struct obd_device *obddev); /* mdc/mdc_request.c */ int mdc_enqueue(struct lustre_handle *conn, int lock_type, @@ -167,8 +167,11 @@ int mdc_getlovinfo(struct obd_device *obd, struct lustre_handle *mdc_connh, struct ptlrpc_request **request); int mdc_getstatus(struct lustre_handle *conn, struct ll_fid *rootfid); int mdc_getattr(struct lustre_handle *conn, - obd_id ino, int type, unsigned long valid, size_t ea_size, + obd_id ino, int type, unsigned long valid, unsigned int ea_size, struct ptlrpc_request **request); +int 
mdc_getattr_name(struct lustre_handle *conn, struct inode *parent, + char *filename, int namelen, unsigned long valid, + unsigned int ea_size, struct ptlrpc_request **request); int mdc_setattr(struct lustre_handle *conn, struct inode *, struct iattr *iattr, struct ptlrpc_request **); int mdc_open(struct lustre_handle *conn, obd_id ino, int type, int flags, diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h index 142db3b..081492c 100644 --- a/lustre/include/linux/lustre_net.h +++ b/lustre/include/linux/lustre_net.h @@ -48,22 +48,28 @@ #define LDLM_NUM_THREADS 4 #define LDLM_NEVENTS 1024 -#define LDLM_NBUFS 20 -#define LDLM_BUFSIZE (32 * 1024) +#define LDLM_NBUFS 100 +#define LDLM_BUFSIZE (8 * 1024) #define LDLM_MAXREQSIZE 1024 #define MDT_NUM_THREADS 8 #define MDS_NEVENTS 1024 -#define MDS_NBUFS 20 -#define MDS_BUFSIZE (32 * 1024) +#define MDS_NBUFS 100 +#define MDS_BUFSIZE (8 * 1024) #define MDS_MAXREQSIZE 1024 #define OST_NUM_THREADS 6 #define OST_NEVENTS min(num_physpages / 16, 32768UL) -#define OST_NBUFS min(OST_NEVENTS / 128, 256UL) -#define OST_BUFSIZE ((OST_NEVENTS > 4096UL ? 128 : 32) * 1024) +#define OST_NBUFS min(OST_NEVENTS / 128, 1280UL) +#define OST_BUFSIZE ((OST_NEVENTS > 4096UL ? 32 : 8) * 1024) #define OST_MAXREQSIZE (8 * 1024) +#define PTLBD_NUM_THREADS 4 +#define PTLBD_NEVENTS 1024 +#define PTLBD_NBUFS 20 +#define PTLBD_BUFSIZE (32 * 1024) +#define PTLBD_MAXREQSIZE 1024 + #define CONN_INVALID 1 struct ptlrpc_connection { @@ -137,7 +143,6 @@ struct ptlrpc_request { __u64 rq_xid; int rq_level; - time_t rq_timeout; // void * rq_reply_handle; wait_queue_head_t rq_wait_for_rep; @@ -160,13 +165,14 @@ struct ptlrpc_request { #define DEBUG_REQ(level, req, fmt, args...) 
\ do { \ CDEBUG(level, \ - "@@@ " fmt " req x"LPD64"/t"LPD64" o%d->%s:%d lens %d/%d ref %d fl " \ - "%x\n" , ## args, req->rq_xid, req->rq_transno, \ + "@@@ " fmt " req@%p x"LPD64"/t"LPD64" o%d->%s:%d lens %d/%d ref %d fl " \ + "%x\n" , ## args, req, req->rq_xid, req->rq_reqmsg->transno, \ req->rq_reqmsg ? req->rq_reqmsg->opc : -1, \ req->rq_connection ? (char *)req->rq_connection->c_remote_uuid : "", \ (req->rq_import && req->rq_import->imp_client) ? \ req->rq_import->imp_client->cli_request_portal : -1, \ - req->rq_reqlen, req->rq_replen, req->rq_refcount, req->rq_flags); \ + req->rq_reqlen, req->rq_replen, \ + atomic_read (&req->rq_refcount), req->rq_flags); \ } while (0) struct ptlrpc_bulk_page { diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index 9612846..94ffd4f 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -12,20 +12,29 @@ struct lov_oinfo { /* per-child structure */ __u64 loi_id; /* object ID on the target OST */ - struct lustre_handle *loi_handle; /* handle for object on OST */ + struct lustre_handle *loi_handle; /* open file handle for obj on OST */ int loi_ost_idx; /* OST stripe index in lmd_objects array */ }; struct lov_stripe_md { - __u32 lsm_magic; __u64 lsm_object_id; /* lov object id */ - __u64 lsm_stripe_size; /* size of the stripe */ - __u32 lsm_stripe_pattern; /* per-lov object stripe pattern */ + __u32 lsm_magic; + __u32 lsm_stripe_size; /* size of the stripe */ int lsm_stripe_offset; /* offset of first stripe in lmd_objects */ int lsm_stripe_count; /* how many objects are being striped on */ struct lov_oinfo lsm_oinfo[0]; }; +#define IOC_OSC_TYPE 'h' +#define IOC_OSC_MIN_NR 20 +#define IOC_OSC_REGISTER_LOV _IOWR(IOC_OSC_TYPE, 20, struct obd_device *) +#define IOC_OSC_MAX_NR 50 + +#define IOC_MDC_TYPE 'i' +#define IOC_MDC_MIN_NR 20 +#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) +#define IOC_MDC_MAX_NR 50 + #ifdef __KERNEL__ # include # include @@ -46,9 +55,9 @@ struct 
obd_type { }; struct brw_page { - struct page *pg; - obd_size count; obd_off off; + struct page *pg; + int count; obd_flag flag; }; @@ -95,6 +104,7 @@ struct filter_obd { struct dentry *fo_dentry_O_mode[16]; spinlock_t fo_objidlock; /* protects fo_lastobjid increment */ __u64 fo_lastobjid; + __u64 fo_last_committed; struct file_operations *fo_fop; struct inode_operations *fo_iop; struct address_space_operations *fo_aops; @@ -115,11 +125,6 @@ struct client_obd { struct obd_device *cl_containing_lov; }; -#define IOC_OSC_TYPE 'h' -#define IOC_OSC_MIN_NR 20 -#define IOC_OSC_REGISTER_LOV _IOWR('h', 20, struct obd_device *) -#define IOC_OSC_MAX_NR 50 - struct mds_obd { struct ptlrpc_service *mds_service; @@ -146,6 +151,9 @@ struct mds_obd { struct list_head mds_delayed_reply_queue; spinlock_t mds_processing_task_lock; pid_t mds_processing_task; + + int mds_has_lov_desc; + struct lov_desc mds_lov_desc; }; struct ldlm_obd { @@ -169,6 +177,19 @@ struct echo_obd { atomic_t eo_write; }; +/* + * this struct does double-duty acting as either a client or + * server instance .. maybe not wise. 
+ */ +struct ptlbd_obd { + /* server's */ + struct ptlrpc_service *ptlbd_service; + /* client's */ + struct ptlrpc_client bd_client; + struct obd_import bd_import; + int refcount; /* XXX sigh */ +}; + struct recovd_obd { spinlock_t recovd_lock; struct list_head recovd_managed_items; /* items managed */ @@ -202,6 +223,11 @@ struct echo_client_obd { struct lustre_handle conn; /* the local connection to osc/lov */ }; +struct cache_obd { + struct lustre_handle cobd_target; /* local connection to target obd */ + struct lustre_handle cobd_cache; /* local connection to cache obd */ +}; + struct lov_tgt_desc { obd_uuid_t uuid; struct lustre_handle conn; @@ -260,6 +286,8 @@ struct obd_device { struct recovd_obd recovd; struct trace_obd trace; struct lov_obd lov; + struct cache_obd cobd; + struct ptlbd_obd ptlbd; #if 0 struct snap_obd snap; #endif @@ -270,6 +298,7 @@ struct obd_device { }; struct obd_ops { + struct module *o_owner; int (*o_iocontrol)(unsigned int cmd, struct lustre_handle *, int len, void *karg, void *uarg); int (*o_get_info)(struct lustre_handle *, obd_count keylen, void *key, diff --git a/lustre/include/linux/obd_cache.h b/lustre/include/linux/obd_cache.h new file mode 100644 index 0000000..e75b9f4 --- /dev/null +++ b/lustre/include/linux/obd_cache.h @@ -0,0 +1,13 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ + +#ifndef _OBD_CACHE_H__ +#define _OBD_CACHE_H__ + +#ifdef __KERNEL__ + +#define OBD_CACHE_DEVICENAME "cobd" + +#endif +#endif diff --git a/lustre/include/linux/obd_class.h b/lustre/include/linux/obd_class.h index 197de84..ed3eb99 100644 --- a/lustre/include/linux/obd_class.h +++ b/lustre/include/linux/obd_class.h @@ -673,8 +673,6 @@ static inline void iattr_from_obdo(struct iattr *attr, struct obdo *oa, static inline void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid) { -// if (valid & OBD_MD_FLID) -// dst->o_id = src->i_ino; if (valid & OBD_MD_FLATIME) 
dst->o_atime = src->i_atime; if (valid & OBD_MD_FLMTIME) @@ -710,8 +708,8 @@ static inline void obdo_from_inode(struct obdo *dst, struct inode *src, static inline void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid) { -// if (valid & OBD_MD_FLID) -// dst->i_ino = src->o_id; + valid &= src->o_valid; + if (valid & OBD_MD_FLATIME) dst->i_atime = src->o_atime; if (valid & OBD_MD_FLMTIME) @@ -847,7 +845,8 @@ int class_name2dev(char *name); int class_uuid2dev(char *uuid); struct obd_device *class_uuid2obd(char *uuid); struct obd_export *class_new_export(struct obd_device *obddev); -struct obd_type *class_nm_to_type(char* name); +struct obd_type *class_get_type(char *name); +void class_put_type(struct obd_type *type); void class_destroy_export(struct obd_export *exp); int class_connect(struct lustre_handle *conn, struct obd_device *obd, obd_uuid_t cluuid); @@ -866,6 +865,13 @@ static inline struct ptlrpc_connection *class_rd2conn(struct recovd_data *rd) return list_entry(rd, struct ptlrpc_connection, c_recovd_data); } +struct obd_statfs; +struct statfs; +void statfs_pack(struct obd_statfs *osfs, struct statfs *sfs); +void statfs_unpack(struct statfs *sfs, struct obd_statfs *osfs); +void obd_statfs_pack(struct obd_statfs *tgt, struct obd_statfs *src); +void obd_statfs_unpack(struct obd_statfs *tgt, struct obd_statfs *src); + #endif /* sysctl.c */ diff --git a/lustre/include/linux/obd_ost.h b/lustre/include/linux/obd_ost.h index e999451..5de0a25 100644 --- a/lustre/include/linux/obd_ost.h +++ b/lustre/include/linux/obd_ost.h @@ -36,7 +36,7 @@ void ost_pack_niobuf(void **tmp, __u64 offset, __u32 len, __u32 flags, __u32 xid); void ost_unpack_niobuf(void **tmp, struct niobuf_remote **nbp); -void ost_pack_ioo(void **tmp, struct lov_stripe_md *oa, int bufcnt); -void ost_unpack_ioo(void **tmp, struct obd_ioobj **ioop); +void ost_pack_ioo(struct obd_ioobj **ioop, struct lov_stripe_md *oa,int bufcnt); +void ost_unpack_ioo(struct obd_ioobj **tmp, struct obd_ioobj 
**ioop); #endif diff --git a/lustre/include/linux/obd_ptlbd.h b/lustre/include/linux/obd_ptlbd.h new file mode 100644 index 0000000..b4f9fe9 --- /dev/null +++ b/lustre/include/linux/obd_ptlbd.h @@ -0,0 +1,30 @@ +#ifndef _OBD_PTLBD_H +#define _OBD_PTLBD_H + +#include +/* + * Copyright (C) 2002 Cluster File Systems, Inc. + * + * This code is issued under the GNU General Public License. + * See the file COPYING in this distribution + */ + +#define OBD_PTLBD_SV_DEVICENAME "ptlbd_server" +#define OBD_PTLBD_CL_DEVICENAME "ptlbd_client" + +/* XXX maybe this isn't the best header to be dumping all this in.. */ + +extern int ptlbd_blk_init(void); +extern int ptlbd_cl_init(void); +extern int ptlbd_sv_init(void); + +extern void ptlbd_blk_exit(void); +extern void ptlbd_cl_exit(void); +extern void ptlbd_sv_exit(void); + +extern void ptlbd_blk_register(struct ptlbd_obd *ptlbd); +extern int ptlbd_send_req(struct ptlbd_obd *, ptlbd_cmd_t cmd, + struct buffer_head *); +extern int ptlbd_parse_req(struct ptlrpc_request *req); + +#endif diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h index f6c2770..e3e23f4 100644 --- a/lustre/include/linux/obd_support.h +++ b/lustre/include/linux/obd_support.h @@ -31,6 +31,7 @@ /* global variables */ extern atomic_t obd_memory; +extern int obd_memmax; extern unsigned long obd_fail_loc; extern unsigned long obd_timeout; extern char obd_recovery_upcall[128]; @@ -66,6 +67,7 @@ extern char obd_recovery_upcall[128]; #define OBD_FAIL_MDS_GETSTATUS_PACK 0x11c #define OBD_FAIL_MDS_STATFS_PACK 0x11d #define OBD_FAIL_MDS_STATFS_NET 0x11e +#define OBD_FAIL_MDS_GETATTR_NAME_NET 0x11f #define OBD_FAIL_OST 0x200 #define OBD_FAIL_OST_CONNECT_NET 0x201 @@ -156,13 +158,17 @@ do { \ int s = (size); \ (ptr) = lptr = kmalloc(s, GFP_KERNEL); \ if (lptr == NULL) { \ - CERROR("kmalloc of '" #ptr "' (%ld bytes) failed " \ + CERROR("kmalloc of '" #ptr "' (%d bytes) failed " \ "at %s:%d\n", s, __FILE__, __LINE__); \ } else { \ + int 
obd_curmem; \ memset(lptr, 0, s); \ atomic_add(s, &obd_memory); \ + obd_curmem = atomic_read(&obd_memory); \ + if (obd_curmem > obd_memmax) \ + obd_memmax = obd_curmem; \ CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %d at %p " \ - "(tot %d)\n", s, lptr, atomic_read(&obd_memory));\ + "(tot %d)\n", s, lptr, obd_curmem); \ } \ } while (0) diff --git a/lustre/ldlm/l_lock.c b/lustre/ldlm/l_lock.c index 680d4f0..e8ffd5b 100644 --- a/lustre/ldlm/l_lock.c +++ b/lustre/ldlm/l_lock.c @@ -65,6 +65,10 @@ void l_lock(struct lustre_lock *lock) owner = 1; spin_unlock(&lock->l_spin); + /* This is safe to increment outside the spinlock because we + * can only have 1 CPU running on the current task + * (i.e. l_owner == current), regardless of the number of CPUs. + */ if (owner) { ++lock->l_depth; } else { diff --git a/lustre/ldlm/ldlm_extent.c b/lustre/ldlm/ldlm_extent.c index 468eb2b..ae1153f 100644 --- a/lustre/ldlm/ldlm_extent.c +++ b/lustre/ldlm/ldlm_extent.c @@ -67,7 +67,8 @@ static void policy_internal(struct list_head *queue, struct ldlm_extent *req_ex, } /* apply the internal policy by walking all the lists */ -int ldlm_extent_policy(struct ldlm_lock *lock, void *req_cookie, +int ldlm_extent_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, + void *req_cookie, ldlm_mode_t mode, int flags, void *data) { struct ldlm_resource *res = lock->l_resource; @@ -79,11 +80,11 @@ int ldlm_extent_policy(struct ldlm_lock *lock, void *req_cookie, if (!res) LBUG(); - l_lock(&res->lr_namespace->ns_lock); + l_lock(&ns->ns_lock); policy_internal(&res->lr_granted, req_ex, &new_ex, mode); policy_internal(&res->lr_converting, req_ex, &new_ex, mode); policy_internal(&res->lr_waiting, req_ex, &new_ex, mode); - l_unlock(&res->lr_namespace->ns_lock); + l_unlock(&ns->ns_lock); memcpy(&lock->l_extent, &new_ex, sizeof(new_ex)); diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index 81b3b5d..a1220ab 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -25,7 +25,6 @@ 
#include #include -#include #include #include #include @@ -102,12 +101,13 @@ ldlm_res_compat ldlm_res_compat_table[] = { static ldlm_res_policy ldlm_intent_policy_func; -static int ldlm_plain_policy(struct ldlm_lock *lock, void *req_cookie, - ldlm_mode_t mode, int flags, void *data) +static int ldlm_plain_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, + void *req_cookie, ldlm_mode_t mode, int flags, + void *data) { if ((flags & LDLM_FL_HAS_INTENT) && ldlm_intent_policy_func) { - return ldlm_intent_policy_func(lock, req_cookie, mode, flags, - data); + return ldlm_intent_policy_func(ns, lock, req_cookie, mode, + flags, data); } return ELDLM_OK; @@ -186,6 +186,8 @@ void ldlm_lock_remove_from_lru(struct ldlm_lock *lock) EXIT; } +/* Only called with strict == 0 by recovery, to mark in-use locks as + * should-be-destroyed */ void ldlm_lock_destroy(struct ldlm_lock *lock) { ENTRY; @@ -194,16 +196,16 @@ void ldlm_lock_destroy(struct ldlm_lock *lock) if (!list_empty(&lock->l_children)) { LDLM_DEBUG(lock, "still has children (%p)!", lock->l_children.next); - ldlm_lock_dump(lock); + ldlm_lock_dump(D_ERROR, lock); LBUG(); } if (lock->l_readers || lock->l_writers) { LDLM_DEBUG(lock, "lock still has references"); - ldlm_lock_dump(lock); + ldlm_lock_dump(D_OTHER, lock); } if (!list_empty(&lock->l_res_link)) { - ldlm_lock_dump(lock); + ldlm_lock_dump(D_ERROR, lock); LBUG(); } @@ -217,6 +219,7 @@ void ldlm_lock_destroy(struct ldlm_lock *lock) list_del_init(&lock->l_export_chain); ldlm_lock_remove_from_lru(lock); + portals_handle_unhash(&lock->l_handle); #if 0 /* Wake anyone waiting for this lock */ @@ -257,7 +260,6 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_lock *parent, if (lock == NULL) RETURN(NULL); - get_random_bytes(&lock->l_random, sizeof(__u64)); lock->l_resource = ldlm_resource_getref(resource); atomic_set(&lock->l_refc, 2); @@ -279,12 +281,15 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_lock *parent, 
l_unlock(&parent->l_resource->lr_namespace->ns_lock); } + INIT_LIST_HEAD(&lock->l_handle.h_link); + portals_handle_hash(&lock->l_handle, lock_handle_addref); + RETURN(lock); } -int ldlm_lock_change_resource(struct ldlm_lock *lock, __u64 new_resid[3]) +int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock, + __u64 new_resid[3]) { - struct ldlm_namespace *ns = lock->l_resource->lr_namespace; struct ldlm_resource *oldres = lock->l_resource; ENTRY; @@ -321,66 +326,63 @@ int ldlm_lock_change_resource(struct ldlm_lock *lock, __u64 new_resid[3]) void ldlm_lock2handle(struct ldlm_lock *lock, struct lustre_handle *lockh) { - lockh->addr = (__u64) (unsigned long)lock; - lockh->cookie = lock->l_random; + //lockh->addr = (__u64)(unsigned long)lock; + memset(&lockh->addr, 0x69, sizeof(lockh->addr)); + lockh->cookie = lock->l_handle.h_cookie; } -/* - * if flags: atomically get the lock and set the flags. - * Return NULL if flag already set +/* if flags: atomically get the lock and set the flags. + * Return NULL if flag already set */ -struct ldlm_lock *__ldlm_handle2lock(struct lustre_handle *handle, int strict, - int flags) +struct ldlm_lock *__ldlm_handle2lock(struct lustre_handle *handle, int flags) { struct ldlm_lock *lock = NULL, *retval = NULL; ENTRY; LASSERT(handle); - if (!handle->addr) + lock = portals_handle2object(handle->cookie); + if (lock == NULL) RETURN(NULL); - lock = (struct ldlm_lock *)(unsigned long)(handle->addr); - if (!kmem_cache_validate(ldlm_lock_slab, (void *)lock)) { - //CERROR("bogus lock %p\n", lock); - GOTO(out2, retval); - } - - if (lock->l_random != handle->cookie) { - //CERROR("bogus cookie: lock %p has "LPX64" vs. 
handle "LPX64 - // "\n", lock, lock->l_random, handle->cookie); - GOTO(out2, NULL); - } - if (!lock->l_resource) { - CERROR("trying to lock bogus resource: lock %p\n", lock); - //LDLM_DEBUG(lock, "ldlm_handle2lock(%p)", lock); - GOTO(out2, retval); - } - if (!lock->l_resource->lr_namespace) { - CERROR("trying to lock bogus namespace: lock %p\n", lock); - //LDLM_DEBUG(lock, "ldlm_handle2lock(%p)", lock); - GOTO(out2, retval); - } + LASSERT(lock->l_resource != NULL); + LASSERT(lock->l_resource->lr_namespace != NULL); l_lock(&lock->l_resource->lr_namespace->ns_lock); - if (strict && lock->l_destroyed) { + + /* It's unlikely but possible that someone marked the lock as + * destroyed after we did handle2object on it */ + if (lock->l_destroyed) { CERROR("lock already destroyed: lock %p\n", lock); - //LDLM_DEBUG(lock, "ldlm_handle2lock(%p)", lock); - GOTO(out, NULL); + LDLM_LOCK_PUT(lock); + GOTO(out, retval); } - if (flags && (lock->l_flags & flags)) - GOTO(out, NULL); + if (flags && (lock->l_flags & flags)) { + LDLM_LOCK_PUT(lock); + GOTO(out, retval); + } if (flags) lock->l_flags |= flags; - retval = LDLM_LOCK_GET(lock); + retval = lock; EXIT; out: l_unlock(&lock->l_resource->lr_namespace->ns_lock); - out2: + return retval; +} + +struct ldlm_lock *ldlm_handle2lock_ns(struct ldlm_namespace *ns, + struct lustre_handle *handle) +{ + struct ldlm_lock *retval = NULL; + + l_lock(&ns->ns_lock); + retval = __ldlm_handle2lock(handle, 0); + l_unlock(&ns->ns_lock); + return retval; } @@ -456,7 +458,7 @@ int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode) { - struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0, 0); + struct ldlm_lock *lock = __ldlm_handle2lock(lockh, 0); struct ldlm_namespace *ns; ENTRY; @@ -466,10 +468,13 @@ void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode) LDLM_DEBUG(lock, "ldlm_lock_decref(%s)", ldlm_lockname[mode]); ns = lock->l_resource->lr_namespace; 
l_lock(&lock->l_resource->lr_namespace->ns_lock); - if (mode == LCK_NL || mode == LCK_CR || mode == LCK_PR) + if (mode == LCK_NL || mode == LCK_CR || mode == LCK_PR) { + LASSERT(lock->l_readers > 0); lock->l_readers--; - else + } else { + LASSERT(lock->l_writers > 0); lock->l_writers--; + } /* If we received a blocked AST and this was the last reference, * run the callback. */ @@ -493,8 +498,9 @@ void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode) ns->ns_nr_unused++; l_unlock(&lock->l_resource->lr_namespace->ns_lock); ldlm_cancel_lru(ns); - } else + } else { l_unlock(&lock->l_resource->lr_namespace->ns_lock); + } LDLM_LOCK_PUT(lock); /* matches the ldlm_lock_get in addref */ LDLM_LOCK_PUT(lock); /* matches the handle2lock above */ @@ -711,8 +717,8 @@ struct ldlm_lock *ldlm_lock_create(struct ldlm_namespace *ns, return lock; } -/* Must be called with lock->l_lock and lock->l_resource->lr_lock not held */ -ldlm_error_t ldlm_lock_enqueue(struct ldlm_lock * lock, +ldlm_error_t ldlm_lock_enqueue(struct ldlm_namespace *ns, + struct ldlm_lock *lock, void *cookie, int cookie_len, int *flags, ldlm_completion_callback completion, @@ -734,7 +740,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_lock * lock, if (!local && !(*flags & LDLM_FL_REPLAY) && (policy = ldlm_res_policy_table[res->lr_type])) { int rc; - rc = policy(lock, cookie, lock->l_req_mode, *flags, NULL); + rc = policy(ns, lock, cookie, lock->l_req_mode, *flags, NULL); if (rc == ELDLM_LOCK_CHANGED) { res = lock->l_resource; @@ -745,7 +751,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_lock * lock, } } - l_lock(&res->lr_namespace->ns_lock); + l_lock(&ns->ns_lock); if (local && lock->l_req_mode == lock->l_granted_mode) { /* The server returned a blocked lock, but it was granted before * we got a chance to actually enqueue it. 
We don't need to do @@ -767,7 +773,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_lock * lock, * FIXME (bug 268): Detect obvious lies by checking compatibility in * granted/converting queues. */ ldlm_resource_unlink_lock(lock); - if (local || (*flags & LDLM_FL_REPLAY)) { + if (local) { if (*flags & LDLM_FL_BLOCK_CONV) ldlm_resource_add_lock(res, res->lr_converting.prev, lock); @@ -776,6 +782,19 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_lock * lock, else ldlm_grant_lock(lock); GOTO(out, ELDLM_OK); + } else if (*flags & LDLM_FL_REPLAY) { + if (*flags & LDLM_FL_BLOCK_CONV) { + ldlm_resource_add_lock(res, res->lr_converting.prev, + lock); + GOTO(out, ELDLM_OK); + } else if (*flags & LDLM_FL_BLOCK_WAIT) { + ldlm_resource_add_lock(res, res->lr_waiting.prev, lock); + GOTO(out, ELDLM_OK); + } else if (*flags & LDLM_FL_BLOCK_GRANTED) { + ldlm_grant_lock(lock); + GOTO(out, ELDLM_OK); + } + /* If no flags, fall through to normal enqueue path. */ } /* FIXME: We may want to optimize by checking lr_most_restr */ @@ -798,7 +817,7 @@ ldlm_error_t ldlm_lock_enqueue(struct ldlm_lock * lock, ldlm_grant_lock(lock); EXIT; out: - l_unlock(&res->lr_namespace->ns_lock); + l_unlock(&ns->ns_lock); /* Don't set 'completion_ast' until here so that if the lock is granted * immediately we don't do an unnecessary completion call. 
*/ lock->l_completion_ast = completion; @@ -828,10 +847,10 @@ static int ldlm_reprocess_queue(struct ldlm_resource *res, RETURN(0); } -void ldlm_run_ast_work(struct list_head *rpc_list) +int ldlm_run_ast_work(struct list_head *rpc_list) { struct list_head *tmp, *pos; - int rc; + int rc, retval = 0; ENTRY; list_for_each_safe(tmp, pos, rpc_list) { @@ -844,20 +863,34 @@ void ldlm_run_ast_work(struct list_head *rpc_list) w->w_datalen, LDLM_CB_BLOCKING); else rc = w->w_lock->l_completion_ast(w->w_lock, w->w_flags); - if (rc) + if (rc == -ERESTART) + retval = rc; + else if (rc) CERROR("Failed AST - should clean & disconnect " "client\n"); LDLM_LOCK_PUT(w->w_lock); list_del(&w->w_list); OBD_FREE(w, sizeof(*w)); } - EXIT; + RETURN(retval); +} + +static int reprocess_one_queue(struct ldlm_resource *res, void *closure) +{ + ldlm_reprocess_all(res); + return LDLM_ITER_CONTINUE; +} + +void ldlm_reprocess_all_ns(struct ldlm_namespace *ns) +{ + (void)ldlm_namespace_foreach_res(ns, reprocess_one_queue, NULL); } /* Must be called with resource->lr_lock not taken. */ void ldlm_reprocess_all(struct ldlm_resource *res) { struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); + int rc; ENTRY; /* Local lock trees don't get reprocessed. */ @@ -866,6 +899,7 @@ void ldlm_reprocess_all(struct ldlm_resource *res) return; } + restart: l_lock(&res->lr_namespace->ns_lock); res->lr_tmp = &rpc_list; @@ -876,7 +910,9 @@ void ldlm_reprocess_all(struct ldlm_resource *res) res->lr_tmp = NULL; l_unlock(&res->lr_namespace->ns_lock); - ldlm_run_ast_work(&rpc_list); + rc = ldlm_run_ast_work(&rpc_list); + if (rc == -ERESTART) + goto restart; EXIT; } @@ -905,10 +941,12 @@ void ldlm_lock_cancel(struct ldlm_lock *lock) ns = res->lr_namespace; l_lock(&ns->ns_lock); + /* Please do not, no matter how tempting, remove this LBUG without + * talking to me first. 
-phik */ if (lock->l_readers || lock->l_writers) { LDLM_DEBUG(lock, "lock still has references"); - ldlm_lock_dump(lock); - //LBUG(); + ldlm_lock_dump(D_OTHER, lock); + LBUG(); } ldlm_cancel_callback(lock); @@ -1001,18 +1039,18 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, RETURN(res); } -void ldlm_lock_dump(struct ldlm_lock *lock) +void ldlm_lock_dump(int level, struct ldlm_lock *lock) { char ver[128]; - if (!(portal_debug & D_OTHER)) + if (!(portal_debug & level)) return; if (RES_VERSION_SIZE != 4) LBUG(); if (!lock) { - CDEBUG(D_OTHER, " NULL LDLM lock\n"); + CDEBUG(level, " NULL LDLM lock\n"); return; } @@ -1020,27 +1058,26 @@ void ldlm_lock_dump(struct ldlm_lock *lock) lock->l_version[0], lock->l_version[1], lock->l_version[2], lock->l_version[3]); - CDEBUG(D_OTHER, " -- Lock dump: %p (%s)\n", lock, ver); + CDEBUG(level, " -- Lock dump: %p (%s)\n", lock, ver); if (lock->l_export && lock->l_export->exp_connection) - CDEBUG(D_OTHER, " Node: NID %x (rhandle: "LPX64")\n", + CDEBUG(level, " Node: NID %x (rhandle: "LPX64")\n", lock->l_export->exp_connection->c_peer.peer_nid, - lock->l_remote_handle.addr); + lock->l_remote_handle.cookie); else - CDEBUG(D_OTHER, " Node: local\n"); - CDEBUG(D_OTHER, " Parent: %p\n", lock->l_parent); - CDEBUG(D_OTHER, " Resource: %p ("LPD64")\n", lock->l_resource, + CDEBUG(level, " Node: local\n"); + CDEBUG(level, " Parent: %p\n", lock->l_parent); + CDEBUG(level, " Resource: %p ("LPD64")\n", lock->l_resource, lock->l_resource->lr_name[0]); - CDEBUG(D_OTHER, " Requested mode: %d, granted mode: %d\n", + CDEBUG(level, " Requested mode: %d, granted mode: %d\n", (int)lock->l_req_mode, (int)lock->l_granted_mode); - CDEBUG(D_OTHER, " Readers: %u ; Writers; %u\n", + CDEBUG(level, " Readers: %u ; Writers; %u\n", lock->l_readers, lock->l_writers); if (lock->l_resource->lr_type == LDLM_EXTENT) - CDEBUG(D_OTHER, " Extent: %Lu -> %Lu\n", - (unsigned long long)lock->l_extent.start, - (unsigned long 
long)lock->l_extent.end); + CDEBUG(level, " Extent: "LPU64" -> "LPU64"\n", + lock->l_extent.start, lock->l_extent.end); } -void ldlm_lock_dump_handle(struct lustre_handle *lockh) +void ldlm_lock_dump_handle(int level, struct lustre_handle *lockh) { struct ldlm_lock *lock; @@ -1048,7 +1085,7 @@ void ldlm_lock_dump_handle(struct lustre_handle *lockh) if (lock == NULL) return; - ldlm_lock_dump(lock); + ldlm_lock_dump(D_OTHER, lock); LDLM_LOCK_PUT(lock); } diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index b76fbcd..d826db1 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -57,6 +57,7 @@ static void waiting_locks_callback(unsigned long unused) l_pending_chain); if (l->l_callback_timeout > jiffies) break; + CERROR("lock timer expired, lock %p\n", l); LDLM_DEBUG(l, "timer expired, recovering exp %p on conn %p", l->l_export, l->l_export->exp_connection); recovd_conn_fail(l->l_export->exp_connection); @@ -162,14 +163,26 @@ static int ldlm_server_blocking_ast(struct ldlm_lock *lock, memcpy(&body->lock_desc, desc, sizeof(*desc)); LDLM_DEBUG(lock, "server preparing blocking AST"); - req->rq_replen = 0; /* no reply needed */ + req->rq_replen = lustre_msg_size(0, NULL); ldlm_add_waiting_lock(lock); l_unlock(&lock->l_resource->lr_namespace->ns_lock); - (void)ptl_send_rpc(req); + req->rq_level = LUSTRE_CONN_RECOVD; + rc = ptlrpc_queue_wait(req); + if (rc == -ETIMEDOUT || rc == -EINTR) { + ldlm_expired_completion_wait(lock); + } else if (rc) { + CERROR("client returned %d from blocking AST for lock %p\n", + req->rq_status, lock); + LDLM_DEBUG(lock, "client returned error %d from blocking AST", + req->rq_status); + ldlm_lock_cancel(lock); + /* Server-side AST functions are called from ldlm_reprocess_all, + * which needs to be told to please restart its reprocessing. 
*/ + rc = -ERESTART; + } - /* not waiting for reply */ ptlrpc_req_finished(req); RETURN(rc); @@ -199,11 +212,22 @@ static int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags) ldlm_lock2desc(lock, &body->lock_desc); LDLM_DEBUG(lock, "server preparing completion AST"); - req->rq_replen = 0; /* no reply needed */ - - (void)ptl_send_rpc(req); - - /* not waiting for reply */ + req->rq_replen = lustre_msg_size(0, NULL); + + req->rq_level = LUSTRE_CONN_RECOVD; + rc = ptlrpc_queue_wait(req); + if (rc == -ETIMEDOUT || rc == -EINTR) { + ldlm_expired_completion_wait(lock); + } else if (rc) { + CERROR("client returned %d from completion AST for lock %p\n", + req->rq_status, lock); + LDLM_DEBUG(lock, "client returned error %d from completion AST", + req->rq_status); + ldlm_lock_cancel(lock); + /* Server-side AST functions are called from ldlm_reprocess_all, + * which needs to be told to please restart its reprocessing. */ + rc = -ERESTART; + } ptlrpc_req_finished(req); RETURN(rc); @@ -265,8 +289,8 @@ int ldlm_handle_enqueue(struct ptlrpc_request *req) &lock->l_export->exp_ldlm_data.led_held_locks); l_unlock(&lock->l_resource->lr_namespace->ns_lock); - err = ldlm_lock_enqueue(lock, cookie, cookielen, &flags, - ldlm_server_completion_ast, + err = ldlm_lock_enqueue(obddev->obd_namespace, lock, cookie, cookielen, + &flags, ldlm_server_completion_ast, ldlm_server_blocking_ast); if (err != ELDLM_OK) GOTO(out, err); @@ -384,7 +408,11 @@ int ldlm_handle_cancel(struct ptlrpc_request *req) RETURN(0); } -static int ldlm_handle_bl_callback(struct ptlrpc_request *req) +struct ldlm_lock *ldlm_handle2lock_ns(struct ldlm_namespace *ns, + struct lustre_handle *handle); + +static int ldlm_handle_bl_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns) { struct ldlm_request *dlm_req; struct ldlm_lock *lock; @@ -395,11 +423,11 @@ static int ldlm_handle_bl_callback(struct ptlrpc_request *req) dlm_req = lustre_msg_buf(req->rq_reqmsg, 0); - lock = 
ldlm_handle2lock(&dlm_req->lock_handle1); + lock = ldlm_handle2lock_ns(ns, &dlm_req->lock_handle1); if (!lock) { CERROR("blocking callback on lock "LPX64" - lock disappeared\n", - dlm_req->lock_handle1.addr); - RETURN(0); + dlm_req->lock_handle1.cookie); + RETURN(-EINVAL); } LDLM_DEBUG(lock, "client blocking AST callback handler START"); @@ -426,7 +454,8 @@ static int ldlm_handle_bl_callback(struct ptlrpc_request *req) RETURN(0); } -static int ldlm_handle_cp_callback(struct ptlrpc_request *req) +static int ldlm_handle_cp_callback(struct ptlrpc_request *req, + struct ldlm_namespace *ns) { struct list_head ast_list = LIST_HEAD_INIT(ast_list); struct ldlm_request *dlm_req; @@ -437,16 +466,16 @@ static int ldlm_handle_cp_callback(struct ptlrpc_request *req) dlm_req = lustre_msg_buf(req->rq_reqmsg, 0); - lock = ldlm_handle2lock(&dlm_req->lock_handle1); + lock = ldlm_handle2lock_ns(ns, &dlm_req->lock_handle1); if (!lock) { CERROR("completion callback on lock "LPX64" - lock " - "disappeared\n", dlm_req->lock_handle1.addr); - RETURN(0); + "disappeared\n", dlm_req->lock_handle1.cookie); + RETURN(-EINVAL); } LDLM_DEBUG(lock, "client completion callback handler START"); - l_lock(&lock->l_resource->lr_namespace->ns_lock); + l_lock(&ns->ns_lock); /* If we receive the completion AST before the actual enqueue returned, * then we might need to switch lock modes, resources, or extents. 
*/ @@ -461,14 +490,14 @@ static int ldlm_handle_cp_callback(struct ptlrpc_request *req) if (memcmp(dlm_req->lock_desc.l_resource.lr_name, lock->l_resource->lr_name, sizeof(__u64) * RES_NAME_SIZE) != 0) { - ldlm_lock_change_resource(lock, + ldlm_lock_change_resource(ns, lock, dlm_req->lock_desc.l_resource.lr_name); LDLM_DEBUG(lock, "completion AST, new resource"); } lock->l_resource->lr_tmp = &ast_list; ldlm_grant_lock(lock); lock->l_resource->lr_tmp = NULL; - l_unlock(&lock->l_resource->lr_namespace->ns_lock); + l_unlock(&ns->ns_lock); LDLM_DEBUG(lock, "callback handler finished, about to run_ast_work"); LDLM_LOCK_PUT(lock); @@ -481,12 +510,13 @@ static int ldlm_handle_cp_callback(struct ptlrpc_request *req) static int ldlm_callback_handler(struct ptlrpc_request *req) { + struct ldlm_namespace *ns; int rc; ENTRY; rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen); if (rc) { - CERROR("lustre_ldlm: Invalid request: %d\n", rc); + CERROR("Invalid request: %d\n", rc); RETURN(rc); } @@ -501,32 +531,44 @@ static int ldlm_callback_handler(struct ptlrpc_request *req) dlm_req = lustre_msg_buf(req->rq_reqmsg, 0); CERROR("--> lock addr: "LPX64", cookie: "LPX64"\n", dlm_req->lock_handle1.addr,dlm_req->lock_handle1.cookie); - CERROR("--> ignoring this error as a temporary workaround! 
" - "beware!\n"); - //RETURN(-ENOTCONN); + RETURN(-ENOTCONN); } + LASSERT(req->rq_export != NULL); + LASSERT(req->rq_export->exp_obd != NULL); + ns = req->rq_export->exp_obd->obd_namespace; + LASSERT(ns != NULL); + switch (req->rq_reqmsg->opc) { case LDLM_BL_CALLBACK: CDEBUG(D_INODE, "blocking ast\n"); OBD_FAIL_RETURN(OBD_FAIL_LDLM_BL_CALLBACK, 0); - rc = ldlm_handle_bl_callback(req); - RETURN(rc); + rc = ldlm_handle_bl_callback(req, ns); + break; case LDLM_CP_CALLBACK: CDEBUG(D_INODE, "completion ast\n"); OBD_FAIL_RETURN(OBD_FAIL_LDLM_CP_CALLBACK, 0); - rc = ldlm_handle_cp_callback(req); - RETURN(rc); - + rc = ldlm_handle_cp_callback(req, ns); + break; default: CERROR("invalid opcode %d\n", req->rq_reqmsg->opc); RETURN(-EINVAL); } + req->rq_status = rc; + if (rc) { + ptlrpc_error(req->rq_svc, req); + } else { + rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, + &req->rq_repmsg); + if (rc) + RETURN(rc); + ptlrpc_reply(req->rq_svc, req); + } + RETURN(0); } - static int ldlm_cancel_handler(struct ptlrpc_request *req) { int rc; @@ -539,11 +581,14 @@ static int ldlm_cancel_handler(struct ptlrpc_request *req) } if (req->rq_export == NULL) { + struct ldlm_request *dlm_req; CERROR("operation %d with bad export (ptl req %d/rep %d)\n", req->rq_reqmsg->opc, req->rq_request_portal, req->rq_reply_portal); CERROR("--> export addr: "LPX64", cookie: "LPX64"\n", req->rq_reqmsg->addr, req->rq_reqmsg->cookie); + dlm_req = lustre_msg_buf(req->rq_reqmsg, 0); + ldlm_lock_dump_handle(D_ERROR, &dlm_req->lock_handle1); CERROR("--> ignoring this error as a temporary workaround! 
" "beware!\n"); //RETURN(-ENOTCONN); @@ -568,7 +613,6 @@ static int ldlm_cancel_handler(struct ptlrpc_request *req) RETURN(0); } - static int ldlm_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, void *karg, void *uarg) { @@ -579,7 +623,7 @@ static int ldlm_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, if (_IOC_TYPE(cmd) != IOC_LDLM_TYPE || _IOC_NR(cmd) < IOC_LDLM_MIN_NR || _IOC_NR(cmd) > IOC_LDLM_MAX_NR) { - CDEBUG(D_IOCTL, "invalid ioctl (type %ld, nr %ld, size %ld)\n", + CDEBUG(D_IOCTL, "invalid ioctl (type %d, nr %d, size %d)\n", _IOC_TYPE(cmd), _IOC_NR(cmd), _IOC_SIZE(cmd)); RETURN(-EINVAL); } @@ -619,11 +663,9 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf) if (ldlm_already_setup) RETURN(-EALREADY); - MOD_INC_USE_COUNT; - rc = ldlm_proc_setup(obddev); if (rc != 0) - GOTO(out_dec, rc); + RETURN(rc); ldlm->ldlm_cb_service = ptlrpc_init_svc(LDLM_NEVENTS, LDLM_NBUFS, LDLM_BUFSIZE, @@ -689,8 +731,6 @@ static int ldlm_setup(struct obd_device *obddev, obd_count len, void *buf) out_proc: ldlm_proc_cleanup(obddev); - out_dec: - MOD_DEC_USE_COUNT; return rc; } @@ -711,7 +751,6 @@ static int ldlm_cleanup(struct obd_device *obddev) ldlm_proc_cleanup(obddev); ldlm_already_setup = 0; - MOD_DEC_USE_COUNT; RETURN(0); } @@ -723,6 +762,7 @@ static int ldlm_connect(struct lustre_handle *conn, struct obd_device *src, } struct obd_ops ldlm_obd_ops = { + o_owner: THIS_MODULE, o_iocontrol: ldlm_iocontrol, o_setup: ldlm_setup, o_cleanup: ldlm_cleanup, @@ -798,7 +838,9 @@ EXPORT_SYMBOL(ldlm_namespace_dump); EXPORT_SYMBOL(ldlm_cancel_locks_for_export); EXPORT_SYMBOL(ldlm_replay_locks); EXPORT_SYMBOL(ldlm_resource_foreach); +EXPORT_SYMBOL(ldlm_reprocess_all_ns); EXPORT_SYMBOL(ldlm_namespace_foreach); +EXPORT_SYMBOL(ldlm_namespace_foreach_res); EXPORT_SYMBOL(l_lock); EXPORT_SYMBOL(l_unlock); diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index 7a972b9..b71dd20 100644 --- a/lustre/ldlm/ldlm_request.c 
+++ b/lustre/ldlm/ldlm_request.c @@ -30,7 +30,7 @@ static int interrupted_completion_wait(void *data) RETURN(1); } -static int expired_completion_wait(void *data) +int ldlm_expired_completion_wait(void *data) { struct ldlm_lock *lock = data; struct ptlrpc_connection *conn; @@ -48,6 +48,7 @@ static int expired_completion_wait(void *data) LDLM_DEBUG(lock, "timed out waiting for completion"); CERROR("lock %p timed out from %s\n", lock, conn->c_remote_uuid); + ldlm_lock_dump(D_ERROR, lock); class_signal_connection_failure(conn); } RETURN(0); @@ -56,7 +57,7 @@ static int expired_completion_wait(void *data) int ldlm_completion_ast(struct ldlm_lock *lock, int flags) { struct l_wait_info lwi = - LWI_TIMEOUT_INTR(obd_timeout * HZ, expired_completion_wait, + LWI_TIMEOUT_INTR(obd_timeout * HZ, ldlm_expired_completion_wait, interrupted_completion_wait, lock); int rc = 0; ENTRY; @@ -75,7 +76,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, int flags) LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, " "sleeping"); - ldlm_lock_dump(lock); + ldlm_lock_dump(D_OTHER, lock); ldlm_reprocess_all(lock->l_resource); noreproc: @@ -131,7 +132,7 @@ static int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, ldlm_lock2handle(lock, lockh); lock->l_connh = NULL; - err = ldlm_lock_enqueue(lock, cookie, cookielen, flags, completion, + err = ldlm_lock_enqueue(ns, lock, cookie, cookielen, flags, completion, blocking); if (err != ELDLM_OK) GOTO(out, err); @@ -243,7 +244,7 @@ int ldlm_cli_enqueue(struct lustre_handle *connh, /* FIXME: if we've already received a completion AST, this will * LBUG! 
*/ ldlm_lock_destroy(lock); - GOTO(out, rc); + GOTO(out_req, rc); } reply = lustre_msg_buf(req->rq_repmsg, 0); @@ -282,28 +283,28 @@ int ldlm_cli_enqueue(struct lustre_handle *connh, (long)reply->lock_resource_name[0], (long)lock->l_resource->lr_name[0]); - ldlm_lock_change_resource(lock, + ldlm_lock_change_resource(ns, lock, reply->lock_resource_name); if (lock->l_resource == NULL) { LBUG(); - RETURN(-ENOMEM); + GOTO(out_req, rc = -ENOMEM); } LDLM_DEBUG(lock, "client-side enqueue, new resource"); } } if (!is_replay) { - rc = ldlm_lock_enqueue(lock, cookie, cookielen, flags, + rc = ldlm_lock_enqueue(ns, lock, cookie, cookielen, flags, completion, blocking); if (lock->l_completion_ast) lock->l_completion_ast(lock, *flags); } - if (!req_passed_in) - ptlrpc_req_finished(req); - LDLM_DEBUG(lock, "client-side enqueue END"); EXIT; + out_req: + if (!req_passed_in) + ptlrpc_req_finished(req); out: LDLM_LOCK_PUT(lock); out_nolock: @@ -437,7 +438,7 @@ int ldlm_cli_cancel(struct lustre_handle *lockh) ENTRY; /* concurrent cancels on the same handle can happen */ - lock = __ldlm_handle2lock(lockh, 0, LDLM_FL_CANCELING); + lock = __ldlm_handle2lock(lockh, LDLM_FL_CANCELING); if (lock == NULL) RETURN(0); @@ -620,6 +621,9 @@ int ldlm_cli_cancel_unused(struct ldlm_namespace *ns, __u64 *res_id, int i; ENTRY; + if (ns == NULL) + RETURN(ELDLM_OK); + if (res_id) RETURN(ldlm_cli_cancel_unused_resource(ns, res_id, flags)); @@ -698,11 +702,22 @@ static int ldlm_iter_helper(struct ldlm_lock *lock, void *closure) return helper->iter(lock, helper->closure); } +static int ldlm_res_iter_helper(struct ldlm_resource *res, void *closure) +{ + return ldlm_resource_foreach(res, ldlm_iter_helper, closure); +} + int ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter, void *closure) { - int i, rc = LDLM_ITER_CONTINUE; struct iter_helper_data helper = { iter: iter, closure: closure }; + return ldlm_namespace_foreach_res(ns, ldlm_res_iter_helper, &helper); +} + +int 
ldlm_namespace_foreach_res(struct ldlm_namespace *ns, + ldlm_res_iterator_t iter, void *closure) +{ + int i, rc = LDLM_ITER_CONTINUE; l_lock(&ns->ns_lock); for (i = 0; i < RES_HASH_SIZE; i++) { @@ -712,8 +727,7 @@ int ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter, list_entry(tmp, struct ldlm_resource, lr_hash); ldlm_resource_getref(res); - rc = ldlm_resource_foreach(res, ldlm_iter_helper, - &helper); + rc = iter(res, closure); ldlm_resource_putref(res); if (rc == LDLM_ITER_STOP) GOTO(out, rc); @@ -735,22 +749,44 @@ static int ldlm_chain_lock_for_replay(struct ldlm_lock *lock, void *closure) return LDLM_ITER_CONTINUE; } -static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock, - int last) +static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) { struct ptlrpc_request *req; struct ldlm_request *body; struct ldlm_reply *reply; int rc, size; - int flags = LDLM_FL_REPLAY; - - flags |= lock->l_flags & - (LDLM_FL_BLOCK_GRANTED|LDLM_FL_BLOCK_CONV|LDLM_FL_BLOCK_WAIT); - + int flags; + + /* + * If granted mode matches the requested mode, this lock is granted. + * + * If they differ, but we have a granted mode, then we were granted + * one mode and now want another: ergo, converting. + * + * If we haven't been granted anything and are on a resource list, + * then we're blocked/waiting. + * + * If we haven't been granted anything and we're NOT on a resource list, + * then we haven't got a reply yet and don't have a known disposition. + * This happens whenever a lock enqueue is the request that triggers + * recovery. 
+ */ + if (lock->l_granted_mode == lock->l_req_mode) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_GRANTED; + else if (lock->l_granted_mode) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_CONV; + else if (!list_empty(&lock->l_res_link)) + flags = LDLM_FL_REPLAY | LDLM_FL_BLOCK_WAIT; + else + flags = LDLM_FL_REPLAY; + size = sizeof(*body); req = ptlrpc_prep_req(imp, LDLM_ENQUEUE, 1, &size, NULL); if (!req) RETURN(-ENOMEM); + + /* We're part of recovery, so don't wait for it. */ + req->rq_level = LUSTRE_CONN_RECOVD; body = lustre_msg_buf(req->rq_reqmsg, 0); ldlm_lock2desc(lock, &body->lock_desc); @@ -760,9 +796,6 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock, size = sizeof(*reply); req->rq_replen = lustre_msg_size(1, &size); - if (last) - req->rq_reqmsg->flags |= MSG_LAST_REPLAY; - LDLM_DEBUG(lock, "replaying lock:"); rc = ptlrpc_queue_wait(req); if (rc != ELDLM_OK) @@ -792,7 +825,7 @@ int ldlm_replay_locks(struct obd_import *imp) list_for_each_safe(pos, next, &list) { lock = list_entry(pos, struct ldlm_lock, l_pending_chain); - rc = replay_one_lock(imp, lock, (next == &list)); + rc = replay_one_lock(imp, lock); if (rc) break; /* or try to do the rest? */ } diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index d1f5b61..e5960bd 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -134,7 +134,9 @@ out_ns: extern struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); -/* If 'local_only' is true, don't try to tell the server, just cleanup. */ +/* If 'local_only' is true, don't try to tell the server, just cleanup. + * This is currently only used for recovery, and we make certain assumptions + * as a result--notably, that we shouldn't cancel locks with refs. 
-phil */ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, int local_only) { @@ -147,6 +149,18 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, lock = list_entry(tmp, struct ldlm_lock, l_res_link); LDLM_LOCK_GET(lock); + if (local_only && (lock->l_readers || lock->l_writers)) { + /* This is a little bit gross, but much better than the + * alternative: pretend that we got a blocking AST from + * the server, so that when the lock is decref'd, it + * will go away ... */ + lock->l_flags |= LDLM_FL_CBPENDING; + /* ... without sending a CANCEL message. */ + lock->l_flags |= LDLM_FL_CANCELING; + LDLM_LOCK_PUT(lock); + continue; + } + /* At shutdown time, don't call the cancellation callback */ lock->l_flags |= LDLM_FL_CANCEL; @@ -170,12 +184,18 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, } LDLM_LOCK_PUT(lock); } + EXIT; } int ldlm_namespace_cleanup(struct ldlm_namespace *ns, int local_only) { int i; + if (ns == NULL) { + CDEBUG(D_INFO, "NULL ns, skipping cleanup\n"); + return ELDLM_OK; + } + l_lock(&ns->ns_lock); for (i = 0; i < RES_HASH_SIZE; i++) { struct list_head *tmp, *pos; @@ -431,7 +451,8 @@ void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head, l_lock(&res->lr_namespace->ns_lock); ldlm_resource_dump(res); - ldlm_lock_dump(lock); + CDEBUG(D_OTHER, "About to grant this lock:\n"); + ldlm_lock_dump(D_OTHER, lock); LASSERT(list_empty(&lock->l_res_link)); @@ -510,20 +531,20 @@ void ldlm_resource_dump(struct ldlm_resource *res) list_for_each(tmp, &res->lr_granted) { struct ldlm_lock *lock; lock = list_entry(tmp, struct ldlm_lock, l_res_link); - ldlm_lock_dump(lock); + ldlm_lock_dump(D_OTHER, lock); } CDEBUG(D_OTHER, "Converting locks:\n"); list_for_each(tmp, &res->lr_converting) { struct ldlm_lock *lock; lock = list_entry(tmp, struct ldlm_lock, l_res_link); - ldlm_lock_dump(lock); + ldlm_lock_dump(D_OTHER, lock); } CDEBUG(D_OTHER, "Waiting locks:\n"); 
list_for_each(tmp, &res->lr_waiting) { struct ldlm_lock *lock; lock = list_entry(tmp, struct ldlm_lock, l_res_link); - ldlm_lock_dump(lock); + ldlm_lock_dump(D_OTHER, lock); } } diff --git a/lustre/ldlm/ldlm_test.c b/lustre/ldlm/ldlm_test.c index ce7a73d..b34c9ab 100644 --- a/lustre/ldlm/ldlm_test.c +++ b/lustre/ldlm/ldlm_test.c @@ -172,7 +172,7 @@ int ldlm_test_basics(struct obd_device *obddev) lock1 = ldlm_lock_create(ns, NULL, res_id, LDLM_PLAIN, LCK_CR, NULL, 0); if (lock1 == NULL) LBUG(); - err = ldlm_lock_enqueue(lock1, NULL, 0, &flags, + err = ldlm_lock_enqueue(ns, lock1, NULL, 0, &flags, ldlm_completion_ast, ldlm_blocking_ast); if (err != ELDLM_OK) LBUG(); @@ -180,7 +180,7 @@ int ldlm_test_basics(struct obd_device *obddev) lock = ldlm_lock_create(ns, NULL, res_id, LDLM_PLAIN, LCK_EX, NULL, 0); if (lock == NULL) LBUG(); - err = ldlm_lock_enqueue(lock, NULL, 0, &flags, + err = ldlm_lock_enqueue(ns, lock, NULL, 0, &flags, ldlm_completion_ast, ldlm_blocking_ast); if (err != ELDLM_OK) LBUG(); @@ -222,7 +222,8 @@ int ldlm_test_extents(struct obd_device *obddev) 0); if (lock1 == NULL) LBUG(); - err = ldlm_lock_enqueue(lock1, &ext1, sizeof(ext1), &flags, NULL, NULL); + err = ldlm_lock_enqueue(ns, lock1, &ext1, sizeof(ext1), &flags, NULL, + NULL); if (err != ELDLM_OK) LBUG(); if (!(flags & LDLM_FL_LOCK_CHANGED)) @@ -231,7 +232,8 @@ int ldlm_test_extents(struct obd_device *obddev) flags = 0; lock2 = ldlm_lock_create(ns, NULL, res_id, LDLM_EXTENT, LCK_PR, NULL, 0); - err = ldlm_lock_enqueue(lock2, &ext2, sizeof(ext2), &flags, NULL, NULL); + err = ldlm_lock_enqueue(ns, lock2, &ext2, sizeof(ext2), &flags, NULL, + NULL); if (err != ELDLM_OK) LBUG(); if (!(flags & LDLM_FL_LOCK_CHANGED)) @@ -241,7 +243,7 @@ int ldlm_test_extents(struct obd_device *obddev) lock = ldlm_lock_create(ns, NULL, res_id, LDLM_EXTENT, LCK_EX, NULL, 0); if (lock == NULL) LBUG(); - err = ldlm_lock_enqueue(lock, &ext3, sizeof(ext3), &flags, + err = ldlm_lock_enqueue(ns, lock, &ext3, sizeof(ext3), 
&flags, NULL, NULL); if (err != ELDLM_OK) LBUG(); @@ -293,7 +295,7 @@ static int ldlm_test_network(struct obd_device *obddev, CERROR("ldlm_cli_convert: %d\n", err); lock = ldlm_handle2lock(&lockh1); - ldlm_lock_dump(lock); + ldlm_lock_dump(D_OTHER, lock); ldlm_lock_put(lock); /* Need to decrement old mode. Don't bother incrementing new @@ -432,6 +434,7 @@ static int ldlm_do_convert(void) static int ldlm_test_main(void *data) { struct ldlm_test_thread *thread = data; + unsigned long flags; ENTRY; lock_kernel(); @@ -440,10 +443,10 @@ static int ldlm_test_main(void *data) sigfillset(&current->blocked); recalc_sigpending(); #else - spin_lock_irq(&current->sigmask_lock); + spin_lock_irqsave(&current->sigmask_lock, flags); sigfillset(&current->blocked); recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irqrestore(&current->sigmask_lock, flags); #endif sprintf(current->comm, "ldlm_test"); diff --git a/lustre/lib/Makefile.am b/lustre/lib/Makefile.am index da31808..1bcc388 100644 --- a/lustre/lib/Makefile.am +++ b/lustre/lib/Makefile.am @@ -1,4 +1,4 @@ -EXTRA_DIST = mds_updates.c obd_pack.c ll_pack.c simple.c +EXTRA_DIST = mds_updates.c obd_pack.c simple.c EXTRA_DIST += client.c target.c include $(top_srcdir)/Rules diff --git a/lustre/lib/client.c b/lustre/lib/client.c index 03fa4e2..5bf0d4a 100644 --- a/lustre/lib/client.c +++ b/lustre/lib/client.c @@ -44,12 +44,12 @@ struct obd_device *client_tgtuuid2obd(char *tgtuuid) { int i; - for (i=0; i < MAX_OBD_DEVICES; i++) { + for (i = 0; i < MAX_OBD_DEVICES; i++) { struct obd_device *obd = &obd_dev[i]; if ((strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0) || (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0)) { struct client_obd *cli = &obd->u.cli; - if (strncmp(tgtuuid, cli->cl_target_uuid, + if (strncmp(tgtuuid, cli->cl_target_uuid, sizeof(cli->cl_target_uuid)) == 0) return obd; } @@ -107,7 +107,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) imp->imp_connection = 
ptlrpc_uuid_to_connection(server_uuid); if (!imp->imp_connection) RETURN(-ENOENT); - + INIT_LIST_HEAD(&imp->imp_replay_list); INIT_LIST_HEAD(&imp->imp_sending_list); INIT_LIST_HEAD(&imp->imp_delayed_list); @@ -120,7 +120,6 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) cli->cl_max_mds_easize = sizeof(struct lov_mds_md); - MOD_INC_USE_COUNT; RETURN(0); } @@ -131,7 +130,6 @@ int client_obd_cleanup(struct obd_device * obddev) ptlrpc_cleanup_client(&obd->cl_import); ptlrpc_put_connection(obd->cl_import.imp_connection); - MOD_DEC_USE_COUNT; return 0; } @@ -150,12 +148,10 @@ int client_obd_connect(struct lustre_handle *conn, struct obd_device *obd, ENTRY; down(&cli->cl_sem); - MOD_INC_USE_COUNT; rc = class_connect(conn, obd, cluuid); - if (rc) { - MOD_DEC_USE_COUNT; + if (rc) GOTO(out_sem, rc); - } + cli->cl_conn_count++; if (cli->cl_conn_count > 1) GOTO(out_sem, rc); @@ -217,7 +213,6 @@ out_ldlm: out_disco: cli->cl_conn_count--; class_disconnect(conn); - MOD_DEC_USE_COUNT; } } out_sem: @@ -251,20 +246,20 @@ int client_obd_disconnect(struct lustre_handle *conn) cli->cl_conn_count--; if (cli->cl_conn_count) - GOTO(out_disco, rc = 0); + GOTO(out_no_disconnect, rc = 0); ldlm_namespace_free(obd->obd_namespace); obd->obd_namespace = NULL; request = ptlrpc_prep_req(&cli->cl_import, rq_opc, 0, NULL, NULL); if (!request) - GOTO(out_disco, rc = -ENOMEM); - + GOTO(out_req, rc = -ENOMEM); + request->rq_replen = lustre_msg_size(0, NULL); /* Process disconnects even if we're waiting for recovery. 
*/ request->rq_level = LUSTRE_CONN_RECOVD; - + rc = ptlrpc_queue_wait(request); if (rc) GOTO(out_req, rc); @@ -273,12 +268,11 @@ int client_obd_disconnect(struct lustre_handle *conn) out_req: if (request) ptlrpc_req_finished(request); - out_disco: + list_del_init(&cli->cl_import.imp_chain); + out_no_disconnect: err = class_disconnect(conn); if (!rc && err) rc = err; - list_del_init(&cli->cl_import.imp_chain); - MOD_DEC_USE_COUNT; out_sem: up(&cli->cl_sem); RETURN(rc); diff --git a/lustre/lib/obd_pack.c b/lustre/lib/obd_pack.c index 8b3c33a..a03d2bb 100644 --- a/lustre/lib/obd_pack.c +++ b/lustre/lib/obd_pack.c @@ -27,21 +27,21 @@ #include #include -void ost_pack_ioo(void **tmp, struct lov_stripe_md *lsm, int bufcnt) +void ost_pack_ioo(struct obd_ioobj **tmp, struct lov_stripe_md *lsm,int bufcnt) { struct obd_ioobj *ioo = *tmp; - char *c = *tmp; + void *p = *tmp; ioo->ioo_id = HTON__u64(lsm->lsm_object_id); ioo->ioo_gr = HTON__u64(0); ioo->ioo_type = HTON__u32(S_IFREG); ioo->ioo_bufcnt = HTON__u32(bufcnt); - *tmp = c + sizeof(*ioo); + *tmp = p + sizeof(*ioo); } -void ost_unpack_ioo(void **tmp, struct obd_ioobj **ioop) +void ost_unpack_ioo(struct obd_ioobj **tmp, struct obd_ioobj **ioop) { - char *c = *tmp; + void *p = *tmp; struct obd_ioobj *ioo = *tmp; *ioop = *tmp; @@ -49,7 +49,7 @@ void ost_unpack_ioo(void **tmp, struct obd_ioobj **ioop) ioo->ioo_gr = NTOH__u64(ioo->ioo_gr); ioo->ioo_type = NTOH__u32(ioo->ioo_type); ioo->ioo_bufcnt = NTOH__u32(ioo->ioo_bufcnt); - *tmp = c + sizeof(*ioo); + *tmp = p + sizeof(*ioo); } void ost_pack_niobuf(void **tmp, __u64 offset, __u32 len, __u32 flags, diff --git a/lustre/lib/simple.c b/lustre/lib/simple.c index cb4ccda..73a4383 100644 --- a/lustre/lib/simple.c +++ b/lustre/lib/simple.c @@ -28,12 +28,9 @@ #ifdef OBD_CTXT_DEBUG /* Debugging check only needed during development */ -#define ASSERT_CTXT_MAGIC(magic) do { if ((magic) != OBD_RUN_CTXT_MAGIC) { \ - CERROR("bad ctxt magic\n"); LBUG(); } } while(0) -#define 
ASSERT_NOT_KERNEL_CTXT(msg) do { if (segment_eq(get_fs(), get_ds())) { \ - CERROR(msg); LBUG(); } } while(0) -#define ASSERT_KERNEL_CTXT(msg) do { if (!segment_eq(get_fs(), get_ds())) { \ - CERROR(msg); LBUG(); } } while(0) +#define ASSERT_CTXT_MAGIC(magic) LASSERT((magic) == OBD_RUN_CTXT_MAGIC) +#define ASSERT_NOT_KERNEL_CTXT(msg) LASSERT(!segment_eq(get_fs(), get_ds())) +#define ASSERT_KERNEL_CTXT(msg) LASSERT(segment_eq(get_fs(), get_ds())) #else #define ASSERT_CTXT_MAGIC(magic) do {} while(0) #define ASSERT_NOT_KERNEL_CTXT(msg) do {} while(0) @@ -56,6 +53,8 @@ void push_ctxt(struct obd_run_ctxt *save, struct obd_run_ctxt *new_ctx, */ save->fs = get_fs(); + LASSERT(atomic_read(&current->fs->pwd->d_count)); + LASSERT(atomic_read(&new_ctx->pwd->d_count)); save->pwd = dget(current->fs->pwd); save->pwdmnt = mntget(current->fs->pwdmnt); @@ -218,14 +217,19 @@ int lustre_fread(struct file *file, char *str, int len, loff_t *off) */ int lustre_fwrite(struct file *file, const char *str, int len, loff_t *off) { + ENTRY; ASSERT_KERNEL_CTXT("kernel doing write outside kernel context\n"); - if (!file || !file->f_op || !off) + if (!file) + RETURN(-ENOENT); + if (!file->f_op) RETURN(-ENOSYS); + if (!off) + RETURN(-EINVAL); if (!file->f_op->write) RETURN(-EROFS); - return file->f_op->write(file, str, len, off); + RETURN(file->f_op->write(file, str, len, off)); } /* @@ -234,9 +238,10 @@ int lustre_fwrite(struct file *file, const char *str, int len, loff_t *off) */ int lustre_fsync(struct file *file) { + ENTRY; ASSERT_KERNEL_CTXT("kernel doing sync outside kernel context\n"); if (!file || !file->f_op || !file->f_op->fsync) RETURN(-ENOSYS); - return file->f_op->fsync(file, file->f_dentry, 0); + RETURN(file->f_op->fsync(file, file->f_dentry, 0)); } diff --git a/lustre/lib/target.c b/lustre/lib/target.c index 141e155..3889f1c 100644 --- a/lustre/lib/target.c +++ b/lustre/lib/target.c @@ -32,6 +32,42 @@ #include #include +int target_handle_reconnect(struct lustre_handle *conn, struct 
obd_export *exp, + char *cluuid) +{ + if (exp->exp_connection) { + struct lustre_handle *hdl; + hdl = &exp->exp_ldlm_data.led_import.imp_handle; + /* Might be a re-connect after a partition. */ + if (!memcmp(conn, hdl, sizeof *conn)) { + CERROR("%s reconnecting\n", cluuid); + conn->addr = (__u64) (unsigned long)exp; + conn->cookie = exp->exp_cookie; + RETURN(EALREADY); + } else { + CERROR("%s reconnecting from %s, " + "handle mismatch (ours "LPX64"/"LPX64", " + "theirs "LPX64"/"LPX64")\n", cluuid, + exp->exp_connection->c_remote_uuid, hdl->addr, + hdl->cookie, conn->addr, conn->cookie); + /* XXX disconnect them here? */ + memset(conn, 0, sizeof *conn); + /* This is a little scary, but right now we build this + * file separately into each server module, so I won't + * go _immediately_ to hell. + */ + RETURN(-EALREADY); + } + } + + conn->addr = (__u64) (unsigned long)exp; + conn->cookie = exp->exp_cookie; + CDEBUG(D_INFO, "existing export for UUID '%s' at %p\n", cluuid, exp); + CDEBUG(D_IOCTL,"connect: addr %Lx cookie %Lx\n", + (long long)conn->addr, (long long)conn->cookie); + RETURN(0); +} + int target_handle_connect(struct ptlrpc_request *req) { struct obd_device *target; @@ -73,6 +109,9 @@ int target_handle_connect(struct ptlrpc_request *req) if (rc && rc != EALREADY) GOTO(out, rc); + /* If all else goes well, this is our RPC return code. */ + req->rq_status = rc; + rc = lustre_pack_msg(0, NULL, NULL, &req->rq_replen, &req->rq_repmsg); if (rc) GOTO(out, rc); @@ -100,10 +139,15 @@ int target_handle_connect(struct ptlrpc_request *req) dlmimp->imp_handle.addr = req->rq_reqmsg->addr; dlmimp->imp_handle.cookie = req->rq_reqmsg->cookie; dlmimp->imp_obd = /* LDLM! 
*/ NULL; + dlmimp->imp_recover = NULL; + INIT_LIST_HEAD(&dlmimp->imp_replay_list); + INIT_LIST_HEAD(&dlmimp->imp_sending_list); + INIT_LIST_HEAD(&dlmimp->imp_delayed_list); spin_lock_init(&dlmimp->imp_lock); dlmimp->imp_level = LUSTRE_CONN_FULL; out: - req->rq_status = rc; + if (rc) + req->rq_status = rc; RETURN(rc); } diff --git a/lustre/llite/Makefile.am b/lustre/llite/Makefile.am index 071c0fd..c536a0a 100644 --- a/lustre/llite/Makefile.am +++ b/lustre/llite/Makefile.am @@ -9,13 +9,8 @@ MODULE = llite modulefs_DATA = llite.o EXTRA_PROGRAMS = llite -LINX= ll_pack.c - llite_SOURCES = dcache.c commit_callback.c super.c rw.c super25.c -llite_SOURCES += file.c dir.c sysctl.c symlink.c $(LINX) +llite_SOURCES += file.c dir.c sysctl.c symlink.c llite_SOURCES += recover.c namei.c lproc_llite.c -ll_pack.c: - test -e ll_pack.c || ln -sf $(top_srcdir)/lib/ll_pack.c . - include $(top_srcdir)/Rules diff --git a/lustre/llite/commit_callback.c b/lustre/llite/commit_callback.c index e5a595a..a62716b 100644 --- a/lustre/llite/commit_callback.c +++ b/lustre/llite/commit_callback.c @@ -1,10 +1,10 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * The daemon that causes completed but not committed transactions + * The daemon that causes completed but not committed transactions * on the MDS to be flushed periodically when they are committed. - * A gratuitous getattr RPC is made to the MDS to discover the - * last committed record. + * A gratuitous getattr RPC is made to the MDS to discover the + * last committed record. 
* * Lustre High Availability Daemon * @@ -37,32 +37,32 @@ static int ll_commitcbd_check_event(struct ll_sb_info *sbi) { - int rc = 0; + int rc = 0; ENTRY; - spin_lock(&sbi->ll_commitcbd_lock); - if (sbi->ll_commitcbd_flags & LL_COMMITCBD_STOPPING) { + spin_lock(&sbi->ll_commitcbd_lock); + if (sbi->ll_commitcbd_flags & LL_COMMITCBD_STOPPING) GOTO(out, rc = 1); - } + EXIT; out: spin_unlock(&sbi->ll_commitcbd_lock); - RETURN(rc); + return rc; } static int ll_commitcbd_main(void *arg) { struct ll_sb_info *sbi = (struct ll_sb_info *)arg; - + unsigned long flags; ENTRY; lock_kernel(); daemonize(); #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) - spin_lock_irq(&current->sigmask_lock); + spin_lock_irqsave(&current->sigmask_lock, flags); sigfillset(&current->blocked); our_recalc_sigpending(current); - spin_unlock_irq(&current->sigmask_lock); + spin_unlock_irqrestore(&current->sigmask_lock, flags); #else sigfillset(&current->blocked); our_recalc_sigpending(current); @@ -80,19 +80,19 @@ static int ll_commitcbd_main(void *arg) /* And now, loop forever on requests */ while (1) { - wait_event(sbi->ll_commitcbd_waitq, + wait_event(sbi->ll_commitcbd_waitq, ll_commitcbd_check_event(sbi)); spin_lock(&sbi->ll_commitcbd_lock); if (sbi->ll_commitcbd_flags & LL_COMMITCBD_STOPPING) { spin_unlock(&sbi->ll_commitcbd_lock); - CERROR("lustre_commitd quitting\n"); + CERROR("lustre_commitd quitting\n"); EXIT; break; } schedule_timeout(sbi->ll_commitcbd_timeout); - CERROR("commit callback daemon woken up - FIXME\n"); + CERROR("commit callback daemon woken up - FIXME\n"); spin_unlock(&sbi->ll_commitcbd_lock); } @@ -116,7 +116,7 @@ int ll_commitcbd_setup(struct ll_sb_info *sbi) CERROR("cannot start thread\n"); RETURN(rc); } - wait_event(sbi->ll_commitcbd_ctl_waitq, + wait_event(sbi->ll_commitcbd_ctl_waitq, sbi->ll_commitcbd_flags & LL_COMMITCBD_RUNNING); RETURN(0); } diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index ab9596f..921eea2 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -91,7 +91,7 @@ static int 
ll_dir_readpage(struct file *file, struct page *page) unlock_page(page); RETURN(rc); } - ldlm_lock_dump_handle(&lockh); + ldlm_lock_dump_handle(D_OTHER, &lockh); if (PageUptodate(page)) { CERROR("Explain this please?\n"); @@ -745,7 +745,69 @@ not_empty: return 0; } +static int ll_dir_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct obd_ioctl_data *data; + ENTRY; + + switch(cmd) { + case IOC_MDC_LOOKUP: { + struct ptlrpc_request *request = NULL; + char *buf = NULL; + char *filename; + int namelen, rc, err, len = 0; + int ea_size = 0; // obd_size_wiremd(&sbi->ll_osc_conn, NULL); + unsigned long valid; + + rc = obd_ioctl_getdata(&buf, &len, (void *)arg); + if (rc) + RETURN(rc); + data = (void *)buf; + + filename = data->ioc_inlbuf1; + namelen = data->ioc_inllen1; + + if (namelen < 1) { + CERROR("IOC_MDC_LOOKUP missing filename\n"); + GOTO(out, rc = -EINVAL); + } + + valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE; + rc = mdc_getattr_name(&sbi->ll_mdc_conn, inode, filename, + namelen, valid, ea_size, &request); + if (rc < 0) { + CERROR("mdc_getattr_name: %d\n", rc); + GOTO(out, rc); + } else { + struct mds_body *body; + body = lustre_msg_buf(request->rq_repmsg, 0); + /* surely there's a better way -phik */ + data->ioc_obdo1.o_mode = body->mode; + data->ioc_obdo1.o_uid = body->uid; + data->ioc_obdo1.o_gid = body->gid; + } + + err = copy_to_user((void *)arg, buf, len); + if (err) + GOTO(out_req, rc = -EFAULT); + + EXIT; + out_req: + ptlrpc_req_finished(request); + out: + OBD_FREE(buf, len); + return rc; + } + default: + CERROR("unrecognized ioctl %#x\n", cmd); + RETURN(-ENOTTY); + } +} + struct file_operations ll_dir_operations = { read: generic_read_dir, - readdir: ll_readdir + readdir: ll_readdir, + ioctl: ll_dir_ioctl }; diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 87c9012..6b37d99 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -33,9 +33,96 @@ 
int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc); extern int ll_setattr(struct dentry *de, struct iattr *attr); -int ll_create_objects(struct super_block *sb, obd_id id, uid_t uid, gid_t gid, - struct lov_stripe_md **lsmp) +static int ll_mdc_open(struct lustre_handle *mdc_conn, struct inode *inode, + struct file *file, struct lov_mds_md *lmm, int lmm_size) { + struct ptlrpc_request *req = NULL; + struct ll_file_data *fd; + int rc; + ENTRY; + + LASSERT(!file->private_data); + + fd = kmem_cache_alloc(ll_file_data_slab, SLAB_KERNEL); + if (!fd) + RETURN(-ENOMEM); + + memset(fd, 0, sizeof(*fd)); + fd->fd_mdshandle.addr = (__u64)(unsigned long)file; + get_random_bytes(&fd->fd_mdshandle.cookie, + sizeof(fd->fd_mdshandle.cookie)); + + rc = mdc_open(mdc_conn, inode->i_ino, S_IFREG | inode->i_mode, + file->f_flags, lmm, lmm_size, &fd->fd_mdshandle, &req); + + /* This is the "reply" refcount. */ + ptlrpc_req_finished(req); + + if (rc) + GOTO(out_fd, rc); + + fd->fd_req = req; + file->private_data = fd; + + if (!fd->fd_mdshandle.addr || + fd->fd_mdshandle.addr == (__u64)(unsigned long)file) { + CERROR("hmm, mdc_open didn't assign fd_mdshandle?\n"); + /* XXX handle this how, abort or is it non-fatal? 
*/ + } + + file->f_flags &= ~O_LOV_DELAY_CREATE; + RETURN(0); + +out_fd: + fd->fd_mdshandle.cookie = DEAD_HANDLE_MAGIC; + kmem_cache_free(ll_file_data_slab, fd); + + return -abs(rc); +} + +static int ll_mdc_close(struct lustre_handle *mdc_conn, struct inode *inode, + struct file *file) +{ + struct ll_file_data *fd = file->private_data; + struct ptlrpc_request *req = NULL; + unsigned long flags; + struct obd_import *imp = fd->fd_req->rq_import; + int rc; + + /* Complete the open request and remove it from replay list */ + DEBUG_REQ(D_HA, fd->fd_req, "matched open req %p", fd->fd_req); + rc = mdc_close(&ll_i2sbi(inode)->ll_mdc_conn, inode->i_ino, + inode->i_mode, &fd->fd_mdshandle, &req); + + if (rc) + CERROR("inode %lu close failed: rc = %d\n", inode->i_ino, rc); + ptlrpc_req_finished(req); + + spin_lock_irqsave(&imp->imp_lock, flags); + if (fd->fd_req->rq_transno) { + /* This caused an EA to be written, need to replay as a normal + * transaction now. Our reference is now effectively owned + * by the imp_replay_list, and we'll be committed just like + * other transno-having requests now. + */ + fd->fd_req->rq_flags &= ~PTL_RPC_FL_REPLAY; + spin_unlock_irqrestore(&imp->imp_lock, flags); + } else { + /* No transno means that we can just drop our ref. 
*/ + spin_unlock_irqrestore(&imp->imp_lock, flags); + ptlrpc_req_finished(fd->fd_req); + } + fd->fd_mdshandle.cookie = DEAD_HANDLE_MAGIC; + file->private_data = NULL; + kmem_cache_free(ll_file_data_slab, fd); + + return -abs(rc); +} + +static int ll_osc_open(struct lustre_handle *conn, struct inode *inode, + struct file *file, struct lov_stripe_md *lsm) +{ + struct ll_file_data *fd; struct obdo *oa; int rc; ENTRY; @@ -43,231 +130,245 @@ int ll_create_objects(struct super_block *sb, obd_id id, uid_t uid, gid_t gid, oa = obdo_alloc(); if (!oa) RETURN(-ENOMEM); + oa->o_id = lsm->lsm_object_id; + oa->o_mode = S_IFREG; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE | + OBD_MD_FLBLOCKS; + rc = obd_open(conn, oa, lsm); + if (rc) + GOTO(out, rc); - oa->o_mode = S_IFREG | 0600; - oa->o_id = id; - oa->o_uid = uid; - oa->o_gid = gid; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE | - OBD_MD_FLUID | OBD_MD_FLGID; - rc = obd_create(ll_s2obdconn(sb), oa, lsmp); - obdo_free(oa); + obdo_to_inode(inode, oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); - if (!rc) - LASSERT(*lsmp && (*lsmp)->lsm_object_id); + fd = file->private_data; + obd_oa2handle(&fd->fd_osthandle, oa); + + atomic_inc(&ll_i2info(inode)->lli_open_count); +out: + obdo_free(oa); RETURN(rc); } -static int ll_file_open(struct inode *inode, struct file *file) +/* Caller must hold lli_open_sem to protect lli->lli_smd from changing and + * duplicate objects from being created. We only install lsm to lli_smd if + * the mdc open was successful (hence stored stripe MD on MDS), otherwise + * other nodes could try to create different objects for the same file. 
+ */ +static int ll_create_open_obj(struct lustre_handle *conn, struct inode *inode, + struct file *file, struct lov_stripe_md *lsm) { - struct ll_sb_info *sbi = ll_i2sbi(inode); struct ll_inode_info *lli = ll_i2info(inode); - struct lustre_handle *conn = ll_i2obdconn(inode); - struct ptlrpc_request *req = NULL; - struct ll_file_data *fd; - struct obdo *oa; - struct lov_stripe_md *lsm; struct lov_mds_md *lmm = NULL; int lmm_size = 0; - int rc = 0; + struct obdo *oa; + int rc, err; ENTRY; - LASSERT(!file->private_data); - - lsm = lli->lli_smd; + oa = obdo_alloc(); + if (!oa) + RETURN(-ENOMEM); - /* delayed create of object (intent created inode) */ - /* XXX object needs to be cleaned up if mdc_open fails */ - /* XXX error handling appropriate here? */ - if (lsm == NULL) { - if (file->f_flags & O_LOV_DELAY_CREATE) { - CDEBUG(D_INODE, "delaying object creation\n"); - RETURN(0); - } - down(&lli->lli_open_sem); - /* Check to see if we lost the race */ - if (!lli->lli_smd) - rc = ll_create_objects(inode->i_sb, inode->i_ino, 0, 0, - &lli->lli_smd); - up(&lli->lli_open_sem); - if (rc) - RETURN(rc); + oa->o_mode = S_IFREG | 0600; + oa->o_id = inode->i_ino; + /* Keep these 0 for now, because chown/chgrp does not change the + * ownership on the OST, and we don't want to allow BA OST NFS + * users to access these objects by mistake. + */ + oa->o_uid = 0; + oa->o_gid = 0; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE | + OBD_MD_FLUID | OBD_MD_FLGID; - lsm = lli->lli_smd; + rc = obd_create(conn, oa, &lsm); + if (rc) { + CERROR("error creating objects for inode %lu: rc = %d\n", + inode->i_ino, rc); + GOTO(out_oa, rc); } - /* XXX We should only send this to MDS if we just created these - * objects, except we also need to handle the user-stripe case. 
- */ - rc = obd_packmd(conn, &lmm, lli->lli_smd); + LASSERT(lsm && lsm->lsm_object_id); + rc = obd_packmd(conn, &lmm, lsm); if (rc < 0) - GOTO(out, rc); + GOTO(out_destroy, rc); lmm_size = rc; - fd = kmem_cache_alloc(ll_file_data_slab, SLAB_KERNEL); - if (!fd) { - if (lmm) - obd_free_wiremd(conn, &lmm); - GOTO(out, rc = -ENOMEM); - } - memset(fd, 0, sizeof(*fd)); + rc = ll_mdc_open(&ll_i2sbi(inode)->ll_mdc_conn,inode,file,lmm,lmm_size); - fd->fd_mdshandle.addr = (__u64)(unsigned long)file; - get_random_bytes(&fd->fd_mdshandle.cookie, - sizeof(fd->fd_mdshandle.cookie)); - rc = mdc_open(&sbi->ll_mdc_conn, inode->i_ino, S_IFREG | inode->i_mode, - file->f_flags, lmm, lmm_size, &fd->fd_mdshandle, &req); - if (lmm) - obd_free_wiremd(conn, &lmm); - fd->fd_req = req; + obd_free_wiremd(conn, &lmm); - /* This is the "reply" refcount. */ - ptlrpc_req_finished(req); - if (rc) - GOTO(out_req, -abs(rc)); - if (!fd->fd_mdshandle.addr || - fd->fd_mdshandle.addr == (__u64)(unsigned long)file) { - CERROR("hmm, mdc_open didn't assign fd_mdshandle?\n"); - /* XXX handle this how, abort or is it non-fatal? */ + /* If we couldn't complete mdc_open() and store the stripe MD on the + * MDS, we need to destroy the objects now or they will be leaked. 
+ */ + if (rc) { + CERROR("error MDS opening %lu with delayed create: rc %d\n", + inode->i_ino, rc); + GOTO(out_destroy, rc); } + lli->lli_smd = lsm; - oa = obdo_alloc(); - if (!oa) - GOTO(out_mdc, rc = -EINVAL); + EXIT; +out_oa: + obdo_free(oa); + return rc; +out_destroy: + obdo_from_inode(oa, inode, OBD_MD_FLTYPE); oa->o_id = lsm->lsm_object_id; - oa->o_mode = S_IFREG; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE | - OBD_MD_FLBLOCKS; - rc = obd_open(ll_i2obdconn(inode), oa, lsm); - obdo_to_inode(inode, oa, oa->o_valid & (OBD_MD_FLSIZE|OBD_MD_FLBLOCKS)); + oa->o_valid |= OBD_MD_FLID; + err = obd_destroy(conn, oa, lsm); + obd_free_memmd(conn, &lsm); + if (err) + CERROR("error uncreating inode %lu objects: rc %d\n", + inode->i_ino, err); + goto out_oa; +} - obd_oa2handle(&fd->fd_osthandle, oa); - obdo_free(oa); +/* Open a file, and (for the very first open) create objects on the OSTs at + * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object + * creation or open until ll_lov_setstripe() ioctl is called. We grab + * lli_open_sem to ensure no other process will create objects, send the + * stripe MD to the MDS, or try to destroy the objects if that fails. + * + * If we already have the stripe MD locally, we don't request it in + * mdc_open() by passing a lmm_size = 0. + * + * It is up to the application to ensure no other processes open this file + * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be + * used. We might be able to avoid races of that sort by getting lli_open_sem + * before returning in the O_LOV_DELAY_CREATE case and dropping it here + * or in ll_file_release(), but I'm not sure that is desirable/necessary. 
+ */ +static int ll_file_open(struct inode *inode, struct file *file) +{ + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + struct lustre_handle *conn = ll_i2obdconn(inode); + struct lov_stripe_md *lsm; + int rc = 0; + ENTRY; - if (rc) - GOTO(out_mdc, rc = -abs(rc)); + lsm = lli->lli_smd; + if (lsm == NULL) { + if (file->f_flags & O_LOV_DELAY_CREATE) { + CDEBUG(D_INODE, "delaying object creation\n"); + RETURN(0); + } - atomic_inc(&lli->lli_open_count); + down(&lli->lli_open_sem); + if (!lli->lli_smd) { + rc = ll_create_open_obj(conn, inode, file, NULL); + up(&lli->lli_open_sem); + } else { + CERROR("stripe already set on ino %lu\n", inode->i_ino); + up(&lli->lli_open_sem); + rc = ll_mdc_open(&sbi->ll_mdc_conn, inode, file,NULL,0); + } + lsm = lli->lli_smd; + } else + rc = ll_mdc_open(&sbi->ll_mdc_conn, inode, file, NULL, 0); - file->private_data = fd; + if (rc) + RETURN(rc); + rc = ll_osc_open(conn, inode, file, lsm); + if (rc) + GOTO(out_close, rc); RETURN(0); -out_mdc: - mdc_close(&sbi->ll_mdc_conn, inode->i_ino, - S_IFREG, &fd->fd_mdshandle, &req); -out_req: - ptlrpc_req_finished(req); /* once for an early "commit" */ -//out_fd: - fd->fd_mdshandle.cookie = DEAD_HANDLE_MAGIC; - kmem_cache_free(ll_file_data_slab, fd); -out: +out_close: + ll_mdc_close(&sbi->ll_mdc_conn, inode, file); return rc; } int ll_size_lock(struct inode *inode, struct lov_stripe_md *lsm, obd_off start, - int mode, struct lustre_handle **lockhs_p) + int mode, struct lustre_handle *lockh) { struct ll_sb_info *sbi = ll_i2sbi(inode); struct ldlm_extent extent; - struct lustre_handle *lockhs = NULL; - int rc, flags = 0, stripe_count; + int rc, flags = 0; ENTRY; - if (sbi->ll_flags & LL_SBI_NOLCK) { - *lockhs_p = NULL; + /* XXX phil: can we do this? won't it screw the file size up? 
*/ + if (sbi->ll_flags & LL_SBI_NOLCK) RETURN(0); - } - - stripe_count = lsm->lsm_stripe_count; - if (!stripe_count) - stripe_count = 1; - - OBD_ALLOC(lockhs, stripe_count * sizeof(*lockhs)); - if (lockhs == NULL) - RETURN(-ENOMEM); extent.start = start; extent.end = OBD_OBJECT_EOF; rc = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT, &extent, sizeof(extent), mode, &flags, ll_lock_callback, - inode, sizeof(*inode), lockhs); - if (rc != ELDLM_OK) { - CERROR("lock enqueue: %d\n", rc); - OBD_FREE(lockhs, stripe_count * sizeof(*lockhs)); - } else - *lockhs_p = lockhs; + inode, sizeof(*inode), lockh); RETURN(rc); } int ll_size_unlock(struct inode *inode, struct lov_stripe_md *lsm, int mode, - struct lustre_handle *lockhs) + struct lustre_handle *lockh) { struct ll_sb_info *sbi = ll_i2sbi(inode); - int rc, stripe_count; + int rc; ENTRY; + /* XXX phil: can we do this? won't it screw the file size up? */ if (sbi->ll_flags & LL_SBI_NOLCK) RETURN(0); - if (lockhs == NULL) { - LBUG(); - RETURN(-EINVAL); - } - - rc = obd_cancel(&sbi->ll_osc_conn, lsm, mode, lockhs); + rc = obd_cancel(&sbi->ll_osc_conn, lsm, mode, lockh); if (rc != ELDLM_OK) { CERROR("lock cancel: %d\n", rc); LBUG(); } - stripe_count = lsm->lsm_stripe_count; - if (!stripe_count) - stripe_count = 1; - - OBD_FREE(lockhs, stripe_count * sizeof(*lockhs)); RETURN(rc); } int ll_file_size(struct inode *inode, struct lov_stripe_md *lsm) { struct ll_sb_info *sbi = ll_i2sbi(inode); - struct lustre_handle *lockhs; + //struct lustre_handle lockh = { 0, 0 }; struct obdo oa; - int err, rc; + //int err; + int rc; ENTRY; LASSERT(lsm); LASSERT(sbi); - rc = ll_size_lock(inode, lsm, 0, LCK_PR, &lockhs); + /* XXX do not yet need size lock - OST size always correct (sync write) + rc = ll_size_lock(inode, lsm, 0, LCK_PR, &lockh); if (rc != ELDLM_OK) { CERROR("lock enqueue: %d\n", rc); RETURN(rc); } + */ memset(&oa, 0, sizeof oa); oa.o_id = lsm->lsm_object_id; oa.o_mode = S_IFREG; oa.o_valid = 
OBD_MD_FLID|OBD_MD_FLTYPE|OBD_MD_FLSIZE|OBD_MD_FLBLOCKS; rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm); - if (!rc) - obdo_to_inode(inode, &oa, - oa.o_valid & ~(OBD_MD_FLTYPE | OBD_MD_FLMODE)); - - err = ll_size_unlock(inode, lsm, LCK_PR, lockhs); + if (!rc) { + obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); + CDEBUG(D_INODE, LPX64" size %Lu/%Lu\n", + lsm->lsm_object_id, inode->i_size, inode->i_size); + } + /* XXX do not need size lock, because OST size always correct (sync write) + err = ll_size_unlock(inode, lsm, LCK_PR, &lockh); if (err != ELDLM_OK) { CERROR("lock cancel: %d\n", err); - LBUG(); + if (!rc) + rc = err; } + */ RETURN(rc); } +/* While this returns an error code, fput() the caller does not, so we need + * to make every effort to clean up all of our state here. Also, applications + * rarely check close errors and even if an error is returned they will not + * re-try the close call. + */ static int ll_file_release(struct inode *inode, struct file *file) { - struct ptlrpc_request *req = NULL; struct ll_file_data *fd; struct obdo oa; struct ll_sb_info *sbi = ll_i2sbi(inode); @@ -278,93 +379,34 @@ static int ll_file_release(struct inode *inode, struct file *file) ENTRY; fd = (struct ll_file_data *)file->private_data; - if (!fd) { - LASSERT(file->f_flags & O_LOV_DELAY_CREATE); - GOTO(out, rc = 0); - } + if (!fd) /* no process opened the file after an mcreate */ + RETURN(rc = 0); memset(&oa, 0, sizeof(oa)); oa.o_id = lsm->lsm_object_id; oa.o_mode = S_IFREG; oa.o_valid = OBD_MD_FLTYPE | OBD_MD_FLID; obd_handle2oa(&oa, &fd->fd_osthandle); - rc = obd_close(ll_i2obdconn(inode), &oa, lsm); + rc = obd_close(&sbi->ll_osc_conn, &oa, lsm); if (rc) - GOTO(out_mdc, rc = -abs(rc)); - -#if 0 -#error "This should only be done on the node that already has the EOF lock" -#error "and only in the case where the file size actually changed. 
For now" -#error "we don't care about the size on the MDS, since we never use it (the" -#error "OST always has the authoritative size and we don't even use the MDS." - /* If this fails and we goto out_fd, the file size on the MDS is out of - * date. Is that a big deal? */ - if (file->f_mode & FMODE_WRITE) { - struct lustre_handle *lockhs; - - rc = ll_size_lock(inode, lsm, 0, LCK_PR, &lockhs); - if (rc) - GOTO(out_mdc, -abs(rc)); - - oa.o_id = lsm->lsm_object_id; - oa.o_mode = S_IFREG; - oa.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE | - OBD_MD_FLBLOCKS; - rc = obd_getattr(&sbi->ll_osc_conn, &oa, lsm); - if (!rc) { - struct iattr attr; - attr.ia_valid = (ATTR_MTIME | ATTR_CTIME | ATTR_ATIME | - ATTR_SIZE); - attr.ia_mtime = inode->i_mtime; - attr.ia_ctime = inode->i_ctime; - attr.ia_atime = inode->i_atime; - attr.ia_size = oa.o_size; - - inode->i_blocks = oa.o_blocks; - - /* XXX: this introduces a small race that we should - * evaluate */ - rc = ll_inode_setattr(inode, &attr, 0); - } - rc2 = ll_size_unlock(inode, lli->lli_smd, LCK_PR, lockhs); - if (rc2) { - CERROR("lock cancel: %d\n", rc); - LBUG(); - if (!rc) - rc = rc2; - } - } -#endif + CERROR("inode %lu object close failed: rc = %d\n", + inode->i_ino, rc); -out_mdc: - rc2 = mdc_close(&sbi->ll_mdc_conn, inode->i_ino, - S_IFREG, &fd->fd_mdshandle, &req); - ptlrpc_req_finished(req); - if (rc2) { - if (!rc) - rc = -abs(rc2); - GOTO(out_fd, rc); - } - DEBUG_REQ(D_HA, fd->fd_req, "matched open for this close: "); - ptlrpc_req_finished(fd->fd_req); + rc2 = ll_mdc_close(&sbi->ll_mdc_conn, inode, file); + if (rc2 && !rc) + rc = rc2; if (atomic_dec_and_test(&lli->lli_open_count)) { CDEBUG(D_INFO, "last close, cancelling unused locks\n"); - rc = obd_cancel_unused(ll_i2obdconn(inode), lsm, 0); - if (rc) + rc2 = obd_cancel_unused(&sbi->ll_osc_conn, lsm, 0); + if (rc2 && !rc) { + rc = rc2; CERROR("obd_cancel_unused: %d\n", rc); - } else { + } + } else CDEBUG(D_INFO, "not last close, not cancelling unused locks\n"); 
- } - - EXIT; -out_fd: - fd->fd_mdshandle.cookie = DEAD_HANDLE_MAGIC; - file->private_data = NULL; - kmem_cache_free(ll_file_data_slab, fd); -out: - return rc; + RETURN(rc); } static inline void ll_remove_suid(struct inode *inode) @@ -401,7 +443,7 @@ int ll_lock_callback(struct ldlm_lock *lock, struct ldlm_lock_desc *new, void *data, __u32 data_len, int flag) { struct inode *inode = data; - struct lustre_handle lockh; + struct lustre_handle lockh = { 0, 0 }; int rc; ENTRY; @@ -438,7 +480,7 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, struct ll_file_data *fd = (struct ll_file_data *)filp->private_data; struct inode *inode = filp->f_dentry->d_inode; struct ll_sb_info *sbi = ll_i2sbi(inode); - struct lustre_handle *lockhs = NULL; + struct lustre_handle lockh = { 0, 0 }; struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; int flags = 0; ldlm_error_t err; @@ -449,17 +491,13 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, * call us */ retval = ll_file_size(inode, lsm); if (retval < 0) { - CERROR("ll_file_size: %d\n", retval); + CERROR("ll_file_size: "LPSZ"\n", retval); RETURN(retval); } if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) && !(sbi->ll_flags & LL_SBI_NOLCK)) { struct ldlm_extent extent; - OBD_ALLOC(lockhs, lsm->lsm_stripe_count * sizeof(*lockhs)); - if (!lockhs) - RETURN(-ENOMEM); - extent.start = *ppos; extent.end = *ppos + count; CDEBUG(D_INFO, "Locking inode %lu, start "LPU64" end "LPU64"\n", @@ -468,15 +506,14 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, err = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT, &extent, sizeof(extent), LCK_PR, &flags, ll_lock_callback, inode, sizeof(*inode), - lockhs); + &lockh); if (err != ELDLM_OK) { - OBD_FREE(lockhs, lsm->lsm_stripe_count*sizeof(*lockhs)); CERROR("lock enqueue: err: %d\n", err); RETURN(err); } } - CDEBUG(D_INFO, "Reading inode %lu, %d bytes, offset %Ld\n", + CDEBUG(D_INFO, "Reading inode %lu, "LPSZ" bytes, offset 
%Ld\n", inode->i_ino, count, *ppos); retval = generic_file_read(filp, buf, count, ppos); @@ -485,15 +522,13 @@ static ssize_t ll_file_read(struct file *filp, char *buf, size_t count, if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) && !(sbi->ll_flags & LL_SBI_NOLCK)) { - err = obd_cancel(&sbi->ll_osc_conn, lsm, LCK_PR, lockhs); + err = obd_cancel(&sbi->ll_osc_conn, lsm, LCK_PR, &lockh); if (err != ELDLM_OK) { CERROR("lock cancel: err: %d\n", err); retval = err; } } - if (lockhs) - OBD_FREE(lockhs, lsm->lsm_stripe_count * sizeof(*lockhs)); RETURN(retval); } @@ -506,7 +541,7 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) struct ll_file_data *fd = (struct ll_file_data *)file->private_data; struct inode *inode = file->f_dentry->d_inode; struct ll_sb_info *sbi = ll_i2sbi(inode); - struct lustre_handle *lockhs = NULL, *eof_lockhs = NULL; + struct lustre_handle lockh = { 0, 0 }, eof_lockh = { 0, 0 }; struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; int flags = 0; ldlm_error_t err; @@ -520,7 +555,7 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) if (!oa) RETURN(-ENOMEM); - err = ll_size_lock(inode, lsm, 0, LCK_PW, &eof_lockhs); + err = ll_size_lock(inode, lsm, 0, LCK_PW, &eof_lockh); if (err) { obdo_free(oa); RETURN(err); @@ -545,9 +580,6 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) && !(sbi->ll_flags & LL_SBI_NOLCK)) { struct ldlm_extent extent; - OBD_ALLOC(lockhs, lsm->lsm_stripe_count * sizeof(*lockhs)); - if (!lockhs) - GOTO(out_eof, retval = -ENOMEM); extent.start = *ppos; extent.end = *ppos + count; CDEBUG(D_INFO, "Locking inode %lu, start "LPU64" end "LPU64"\n", @@ -556,35 +588,31 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) err = obd_enqueue(&sbi->ll_osc_conn, lsm, NULL, LDLM_EXTENT, &extent, sizeof(extent), LCK_PW, &flags, ll_lock_callback, inode, sizeof(*inode), - lockhs); + &lockh); if 
(err != ELDLM_OK) { CERROR("lock enqueue: err: %d\n", err); - GOTO(out_free, retval = err); + GOTO(out_eof, retval = err); } } - CDEBUG(D_INFO, "Writing inode %lu, %ld bytes, offset "LPD64"\n", - inode->i_ino, (long)count, *ppos); + CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n", + inode->i_ino, count, *ppos); retval = generic_file_write(file, buf, count, ppos); if (!(fd->fd_flags & LL_FILE_IGNORE_LOCK) || sbi->ll_flags & LL_SBI_NOLCK) { - err = obd_cancel(&sbi->ll_osc_conn, lsm, LCK_PW, lockhs); + err = obd_cancel(&sbi->ll_osc_conn, lsm, LCK_PW, &lockh); if (err != ELDLM_OK) { CERROR("lock cancel: err: %d\n", err); - GOTO(out_free, retval = err); + GOTO(out_eof, retval = err); } } EXIT; - out_free: - if (lockhs) - OBD_FREE(lockhs, lsm->lsm_stripe_count * sizeof(*lockhs)); - out_eof: if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND) { - err = ll_size_unlock(inode, lsm, LCK_PW, eof_lockhs); + err = ll_size_unlock(inode, lsm, LCK_PW, &eof_lockh); if (err && !retval) retval = err; } @@ -592,121 +620,54 @@ ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) return retval; } -/* Retrieve object striping information. - * - * @arg is a pointer to a user struct with one or more of the fields set to - * indicate the application preference: lmm_stripe_count, lmm_stripe_size, - * lmm_stripe_offset, and lmm_stripe_pattern. lmm_magic must be LOV_MAGIC. 
- */ static int ll_lov_setstripe(struct inode *inode, struct file *file, unsigned long arg) { struct ll_inode_info *lli = ll_i2info(inode); - struct lov_mds_md *lmm = NULL, *lmmu = (void *)arg; - struct lustre_handle *conn = ll_i2obdconn(inode); + struct lustre_handle *conn; + struct lov_stripe_md *lsm; int rc; + ENTRY; - rc = obd_alloc_wiremd(conn, &lmm); - if (rc < 0) - RETURN(rc); - - rc = copy_from_user(lmm, lmmu, sizeof(*lmm)); - if (rc) - GOTO(out_free, rc = -EFAULT); + down(&lli->lli_open_sem); + lsm = lli->lli_smd; + if (lsm) { + up(&lli->lli_open_sem); + CERROR("stripe already set for ino %lu\n", inode->i_ino); + /* If we haven't already done the open, do so now */ + if (file->f_flags & O_LOV_DELAY_CREATE) { + int rc2 = ll_file_open(inode, file); + if (rc2) + RETURN(rc2); + } - if (lmm->lmm_magic != LOV_MAGIC) { - CERROR("bad LOV magic %X\n", lmm->lmm_magic); - GOTO(out_free, rc = -EINVAL); + RETURN(-EALREADY); } - down(&lli->lli_open_sem); - if (lli->lli_smd) { - CERROR("striping data already set for %lu\n", inode->i_ino); - GOTO(out_lov_up, rc = -EPERM); - } - rc = obd_unpackmd(conn, &lli->lli_smd, lmm); - if (rc < 0) { - CERROR("error setting LOV striping on %lu: rc = %d\n", - inode->i_ino, rc); - GOTO(out_lov_up, rc); - } + conn = ll_i2obdconn(inode); + + rc = obd_iocontrol(LL_IOC_LOV_SETSTRIPE, conn, 0, &lsm, (void *)arg); + if (!rc) + rc = ll_create_open_obj(conn, inode, file, lsm); + up(&lli->lli_open_sem); - rc = ll_create_objects(inode->i_sb, inode->i_ino, 0, 0, &lli->lli_smd); if (rc) { - obd_free_memmd(conn, &lli->lli_smd); - } else { - file->f_flags &= ~O_LOV_DELAY_CREATE; - rc = ll_file_open(inode, file); + obd_free_memmd(conn, &lsm); + RETURN(rc); } -out_lov_up: - up(&lli->lli_open_sem); -out_free: - obd_free_wiremd(conn, &lmm); - return rc; + rc = ll_osc_open(conn, inode, file, lli->lli_smd); + RETURN(rc); } -/* Retrieve object striping information. 
- * - * @arg is a pointer to a user struct with lmm_ost_count indicating - * the maximum number of OST indices which will fit in the user buffer. - * lmm_magic must be LOV_MAGIC. - */ static int ll_lov_getstripe(struct inode *inode, unsigned long arg) { - struct lov_mds_md lmm, *lmmu = (void *)arg, *lmmk = NULL; struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; struct lustre_handle *conn = ll_i2obdconn(inode); - int ost_count, rc, lmm_size; if (!lsm) RETURN(-ENODATA); - rc = copy_from_user(&lmm, lmmu, sizeof(lmm)); - if (rc) - RETURN(-EFAULT); - - if (lmm.lmm_magic != LOV_MAGIC) - RETURN(-EINVAL); - - if (lsm->lsm_stripe_count == 0) - ost_count = 1; - else { - struct obd_device *obd = class_conn2obd(conn); - struct lov_obd *lov = &obd->u.lov; - ost_count = lov->desc.ld_tgt_count; - } - - /* XXX we _could_ check if indices > user lmm_ost_count are zero */ - if (lmm.lmm_ost_count < ost_count) - RETURN(-EOVERFLOW); - - rc = obd_packmd(conn, &lmmk, lsm); - if (rc < 0) - RETURN(rc); - - lmm_size = rc; - - /* LOV STACKING layering violation to make LOV/OSC return same data */ - if (lsm->lsm_stripe_count == 0) { - struct lov_object_id *loi; - - loi = (void *)lmmu + offsetof(typeof(*lmmu), lmm_objects); - rc = copy_to_user(loi, &lsm->lsm_object_id, sizeof(*loi)); - if (rc) { - lmm_size = 0; - rc = -EFAULT; - } else { - lmmk->lmm_magic = LOV_MAGIC; - lmmk->lmm_ost_count = lmmk->lmm_stripe_count = 1; - } - } - - if (lmm_size && copy_to_user(lmmu, lmmk, lmm_size)) - rc = -EFAULT; - - obd_free_wiremd(conn, &lmmk); - - RETURN(rc); + return obd_iocontrol(LL_IOC_LOV_GETSTRIPE, conn, 0, lsm, (void *)arg); } int ll_file_ioctl(struct inode *inode, struct file *file, unsigned int cmd, @@ -822,7 +783,7 @@ static int ll_inode_revalidate(struct dentry *dentry) rc = mdc_getattr(&sbi->ll_mdc_conn, inode->i_ino, inode->i_mode, valid, datalen, &req); if (rc) { - CERROR("failure %d inode "LPX64"\n", rc, inode->i_ino); + CERROR("failure %d inode %lu\n", rc, inode->i_ino); 
ptlrpc_req_finished(req); RETURN(-abs(rc)); } diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index 54a81a4..81a5aad 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -287,7 +287,9 @@ int ll_intent_lock(struct inode *parent, struct dentry **de, GOTO(out, flag = LL_LOOKUP_POSITIVE); } - /* Do a getattr now that we have the lock */ + /* Do a getattr now that we have the lock, and fetch the + * up-to-date stripe MD at the same time. + */ valid = OBD_MD_FLNOTOBD; if (it->it_op == IT_READLINK) { datalen = mds_body->size; @@ -340,7 +342,6 @@ int ll_intent_lock(struct inode *parent, struct dentry **de, } } - EXIT; out: if (intent_finish != NULL) { rc = intent_finish(flag, request, de, it, offset, ino); @@ -485,17 +486,18 @@ static struct dentry *ll_lookup2(struct inode *parent, struct dentry *dentry, { struct dentry *save = dentry; int rc; + ENTRY; rc = ll_intent_lock(parent, &dentry, it, lookup2_finish); if (rc < 0) { CERROR("ll_intent_lock: %d\n", rc); - return ERR_PTR(rc); + RETURN(ERR_PTR(rc)); } if (dentry == save) - return NULL; + RETURN(NULL); else - return dentry; + RETURN(dentry); } static struct inode *ll_create_node(struct inode *dir, const char *name, diff --git a/lustre/llite/recover.c b/lustre/llite/recover.c index 3310c34..4c7ad42 100644 --- a/lustre/llite/recover.c +++ b/lustre/llite/recover.c @@ -35,9 +35,10 @@ int ll_recover(struct recovd_data *rd, int phase) list_entry(tmp, struct obd_import, imp_chain); if (phase == PTLRPC_RECOVD_PHASE_PREPARE) { - spin_lock(&imp->imp_lock); + unsigned long flags; + spin_lock_irqsave(&imp->imp_lock, flags); imp->imp_level = LUSTRE_CONN_RECOVD; - spin_unlock(&imp->imp_lock); + spin_unlock_irqrestore(&imp->imp_lock, flags); } imp->imp_recover(imp, phase); } diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 7f486fb..e1402d1 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -107,6 +107,16 @@ static int ll_brw(int cmd, struct inode *inode, struct page *page, int create) else pg.count = 
PAGE_SIZE; + CDEBUG(D_PAGE, "%s %d bytes ino %lu at "LPU64"/"LPX64"\n", + cmd & OBD_BRW_WRITE ? "write" : "read", pg.count, inode->i_ino, + pg.off, pg.off); + if (pg.count == 0) { + CERROR("ZERO COUNT: ino %lu: size %p:%Lu(%p:%Lu) idx %lu off " + LPU64"\n", + inode->i_ino, inode, inode->i_size, page->mapping->host, + page->mapping->host->i_size, page->index, pg.off); + } + pg.flag = create ? OBD_BRW_CREATE : 0; set->brw_callback = ll_brw_sync_wait; @@ -160,7 +170,7 @@ void ll_truncate(struct inode *inode) { struct obdo oa = {0}; struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; - struct lustre_handle *lockhs = NULL; + struct lustre_handle lockh = { 0, 0 }; int err; ENTRY; @@ -174,10 +184,10 @@ void ll_truncate(struct inode *inode) oa.o_mode = inode->i_mode; oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE; - CDEBUG(D_INFO, "calling punch for "LPX64" (all bytes after "LPD64")\n", + CDEBUG(D_INFO, "calling punch for "LPX64" (all bytes after %Lu)\n", oa.o_id, inode->i_size); - err = ll_size_lock(inode, lsm, inode->i_size, LCK_PW, &lockhs); + err = ll_size_lock(inode, lsm, inode->i_size, LCK_PW, &lockh); if (err) { CERROR("ll_size_lock failed: %d\n", err); return; @@ -191,7 +201,7 @@ void ll_truncate(struct inode *inode) else obdo_to_inode(inode, &oa, oa.o_valid); - err = ll_size_unlock(inode, lsm, LCK_PW, lockhs); + err = ll_size_unlock(inode, lsm, LCK_PW, &lockh); if (err) CERROR("ll_size_unlock failed: %d\n", err); @@ -280,6 +290,7 @@ static int ll_commit_write(struct file *file, struct page *page, pg.pg = page; pg.count = to; + /* XXX make the starting offset "from" */ pg.off = (((obd_off)page->index) << PAGE_SHIFT); pg.flag = create ? 
OBD_BRW_CREATE : 0; @@ -292,7 +303,7 @@ static int ll_commit_write(struct file *file, struct page *page, if (!PageLocked(page)) LBUG(); - CDEBUG(D_INODE, "commit_page writing (off "LPD64"), count "LPD64"\n", + CDEBUG(D_INODE, "commit_page writing (off "LPD64"), count %d\n", pg.off, pg.count); set->brw_callback = ll_brw_sync_wait; diff --git a/lustre/llite/super.c b/lustre/llite/super.c index cb3ae90..73b6ea5 100644 --- a/lustre/llite/super.c +++ b/lustre/llite/super.c @@ -95,8 +95,8 @@ static void ll_options(char *options, char **ost, char **mds, int *flags) #define log2(n) ffz(~(n)) #endif -static struct super_block * ll_read_super(struct super_block *sb, - void *data, int silent) +static struct super_block *ll_read_super(struct super_block *sb, + void *data, int silent) { struct inode *root = 0; struct obd_device *obd; @@ -112,13 +112,10 @@ static struct super_block * ll_read_super(struct super_block *sb, class_uuid_t uuid; ENTRY; - MOD_INC_USE_COUNT; OBD_ALLOC(sbi, sizeof(*sbi)); - if (!sbi) { - MOD_DEC_USE_COUNT; + if (!sbi) RETURN(NULL); - } INIT_LIST_HEAD(&sbi->ll_conn_chain); INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list); @@ -238,7 +235,6 @@ out_mdc: out_free: OBD_FREE(sbi, sizeof(*sbi)); - MOD_DEC_USE_COUNT; goto out_dev; } /* ll_read_super */ @@ -275,7 +271,6 @@ static void ll_put_super(struct super_block *sb) OBD_FREE(sbi, sizeof(*sbi)); - MOD_DEC_USE_COUNT; EXIT; } /* ll_put_super */ @@ -300,16 +295,16 @@ static void ll_clear_inode(struct inode *inode) } } - if (atomic_read(&inode->i_count) == 0) { - char *symlink_name = lli->lli_symlink_name; + if (atomic_read(&inode->i_count) != 0) + CERROR("clearing in-use inode %lu: count = %d\n", + inode->i_ino, atomic_read(&inode->i_count)); - if (lli->lli_smd) - obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd); + if (lli->lli_smd) + obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd); - if (symlink_name) { - OBD_FREE(symlink_name, strlen(symlink_name) + 1); - lli->lli_symlink_name = NULL; - } + if 
(lli->lli_symlink_name) { + OBD_FREE(lli->lli_symlink_name,strlen(lli->lli_symlink_name)+1); + lli->lli_symlink_name = NULL; } EXIT; @@ -323,8 +318,9 @@ static void ll_delete_inode(struct inode *inode) struct obdo *oa; struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; + /* mcreate with no open */ if (!lsm) - GOTO(out, -EINVAL); + GOTO(out, 0); if (lsm->lsm_object_id == 0) { CERROR("This really happens\n"); @@ -337,13 +333,13 @@ static void ll_delete_inode(struct inode *inode) GOTO(out, -ENOMEM); oa->o_id = lsm->lsm_object_id; - oa->o_mode = inode->i_mode; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLEASIZE | OBD_MD_FLTYPE; + obdo_from_inode(oa, inode, OBD_MD_FLID | OBD_MD_FLTYPE); err = obd_destroy(ll_i2obdconn(inode), oa, lsm); obdo_free(oa); - CDEBUG(D_SUPER, "obd destroy of objid "LPX64" error %d\n", - lsm->lsm_object_id, err); + if (err) + CDEBUG(D_SUPER, "obd destroy objid "LPX64" error %d\n", + lsm->lsm_object_id, err); } out: clear_inode(inode); @@ -386,18 +382,23 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc) { struct ptlrpc_request *request = NULL; struct ll_sb_info *sbi = ll_i2sbi(inode); - int err; - + int err = 0; ENTRY; /* change incore inode */ ll_attr2inode(inode, attr, do_trunc); - err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, &request); - if (err) - CERROR("mdc_setattr fails (%d)\n", err); + /* Don't send size changes to MDS to avoid "fast EA" problems, and + * also avoid a pointless RPC (we get file size from OST anyways). 
+ */ + attr->ia_valid &= ~ATTR_SIZE; + if (attr->ia_valid) { + err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, &request); + if (err) + CERROR("mdc_setattr fails (%d)\n", err); - ptlrpc_req_finished(request); + ptlrpc_req_finished(request); + } RETURN(err); } @@ -503,7 +504,6 @@ static void ll_read_inode2(struct inode *inode, void *opaque) /* core attributes first */ ll_update_inode(inode, body); - //if (body->valid & OBD_MD_FLEASIZE) LASSERT(!lli->lli_smd); if (lic && lic->lic_lmm) obd_unpackmd(ll_i2obdconn(inode), &lli->lli_smd, lic->lic_lmm); @@ -515,8 +515,7 @@ static void ll_read_inode2(struct inode *inode, void *opaque) rc = ll_file_size(inode, lli->lli_smd); if (rc) { CERROR("ll_file_size: %d\n", rc); - /* FIXME: need to somehow prevent inode creation */ - LBUG(); + ll_clear_inode(inode); make_bad_inode(inode); } } @@ -548,8 +547,8 @@ static inline void invalidate_request_list(struct list_head *req_list) list_for_each_safe(tmp, n, req_list) { struct ptlrpc_request *req = list_entry(tmp, struct ptlrpc_request, rq_list); - CERROR("invalidating req xid "LPD64" op %d to %s:%d\n", - (unsigned long long)req->rq_xid, req->rq_reqmsg->opc, + CERROR("invalidating req xid "LPU64" op %d to %s:%d\n", + req->rq_xid, req->rq_reqmsg->opc, req->rq_connection->c_remote_uuid, req->rq_import->imp_client->cli_request_portal); req->rq_flags |= PTL_RPC_FL_ERR; @@ -591,8 +590,11 @@ struct super_operations ll_super_operations = umount_begin: ll_umount_begin }; -struct file_system_type lustre_lite_fs_type = { - "lustre_lite", 0, ll_read_super, NULL +static struct file_system_type lustre_lite_fs_type = { + name: "lustre_lite", + fs_flags: 0, + read_super: ll_read_super, + owner: THIS_MODULE, }; static int __init init_lustre_lite(void) diff --git a/lustre/llite/super25.c b/lustre/llite/super25.c index cd6544a..557d715 100644 --- a/lustre/llite/super25.c +++ b/lustre/llite/super25.c @@ -114,13 +114,10 @@ static int ll_fill_super(struct super_block *sb, void *data, int silent) 
class_uuid_t uuid; ENTRY; - MOD_INC_USE_COUNT; OBD_ALLOC(sbi, sizeof(*sbi)); - if (!sbi) { - MOD_DEC_USE_COUNT; + if (!sbi) RETURN(-ENOMEM); - } INIT_LIST_HEAD(&sbi->ll_conn_chain); generate_random_uuid(uuid); @@ -238,7 +235,6 @@ out_mdc: out_free: OBD_FREE(sbi, sizeof(*sbi)); - MOD_DEC_USE_COUNT; goto out_dev; } /* ll_fill_super */ @@ -272,25 +268,45 @@ static void ll_put_super(struct super_block *sb) obd_disconnect(&sbi->ll_mdc_conn); OBD_FREE(sbi, sizeof(*sbi)); - MOD_DEC_USE_COUNT; EXIT; } /* ll_put_super */ static void ll_clear_inode(struct inode *inode) { + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + int rc; ENTRY; - if (atomic_read(&inode->i_count) == 0) { - struct ll_inode_info *lli = ll_i2info(inode); - char *symlink_name = lli->lli_symlink_name; +#warning "Is there a reason we don't do this in 2.5, but we do in 2.4?" +#if 0 + rc = mdc_cancel_unused(&sbi->ll_mdc_conn, inode, LDLM_FL_NO_CALLBACK); + if (rc < 0) { + CERROR("mdc_cancel_unused: %d\n", rc); + /* XXX FIXME do something dramatic */ + } - if (lli->lli_smd) - obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd); - if (symlink_name) { - OBD_FREE(symlink_name, strlen(symlink_name) + 1); - lli->lli_symlink_name = NULL; + if (lli->lli_smd) { + rc = obd_cancel_unused(&sbi->ll_osc_conn, lli->lli_smd, 0); + if (rc < 0) { + CERROR("obd_cancel_unused: %d\n", rc); + /* XXX FIXME do something dramatic */ } } +#endif + + if (atomic_read(&inode->i_count) != 0) + CERROR("clearing in-use inode %lu: count = %d\n", + inode->i_ino, atomic_read(&inode->i_count)); + + if (lli->lli_smd) + obd_free_memmd(&sbi->ll_osc_conn, &lli->lli_smd); + + if (lli->lli_symlink_name) { + OBD_FREE(lli->lli_symlink_name,strlen(lli->lli_symlink_name)+1); + lli->lli_symlink_name = NULL; + } + EXIT; } @@ -302,8 +318,9 @@ static void ll_delete_inode(struct inode *inode) struct obdo *oa; struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; + /* mcreate with no open */ if (!lsm) - GOTO(out, 
-EINVAL); + GOTO(out, 0); if (lsm->lsm_object_id == 0) { CERROR("This really happens\n"); @@ -317,12 +334,13 @@ static void ll_delete_inode(struct inode *inode) oa->o_id = lsm->lsm_object_id; oa->o_mode = inode->i_mode; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLEASIZE | OBD_MD_FLTYPE; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE; err = obd_destroy(ll_i2obdconn(inode), oa, lsm); obdo_free(oa); - CDEBUG(D_SUPER, "obd destroy of objid "LPX64" error %d\n", - lsm->lsm_object_id, err); + if (err) + CDEBUG(D_SUPER, "obd destroy objid "LPX64" error %d\n", + lsm->lsm_object_id, err); } out: clear_inode(inode); @@ -365,18 +383,24 @@ int ll_inode_setattr(struct inode *inode, struct iattr *attr, int do_trunc) { struct ptlrpc_request *request = NULL; struct ll_sb_info *sbi = ll_i2sbi(inode); - int err; + int err = 0; ENTRY; /* change incore inode */ ll_attr2inode(inode, attr, do_trunc); - err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, &request); - if (err) - CERROR("mdc_setattr fails (%d)\n", err); + /* Don't send size changes to MDS to avoid "fast EA" problems, and + * also avoid a pointless RPC (we get file size from OST anyways). 
+ */ + attr->ia_valid &= ~ATTR_SIZE; + if (attr->ia_valid) { + err = mdc_setattr(&sbi->ll_mdc_conn, inode, attr, &request); + if (err) + CERROR("mdc_setattr fails (%d)\n", err); - ptlrpc_req_finished(request); + ptlrpc_req_finished(request); + } RETURN(err); } @@ -482,7 +506,6 @@ int ll_read_inode2(struct inode *inode, void *opaque) /* core attributes first */ ll_update_inode(inode, body); - //if (body->valid & OBD_MD_FLEASIZE) LASSERT(!lli->lli_smd); if (lic && lic->lic_lmm) obd_unpackmd(ll_i2obdconn(inode), &lli->lli_smd, lic->lic_lmm); @@ -492,9 +515,9 @@ int ll_read_inode2(struct inode *inode, void *opaque) rc = ll_file_size(inode, lli->lli_smd); if (rc) { CERROR("ll_file_size: %d\n", rc); - /* FIXME: need to somehow prevent inode creation */ - LBUG(); + ll_clear_inode(inode); make_bad_inode(inode); + RETURN(rc); } } diff --git a/lustre/llite/symlink.c b/lustre/llite/symlink.c index ef86d58..5be4717 100644 --- a/lustre/llite/symlink.c +++ b/lustre/llite/symlink.c @@ -89,7 +89,7 @@ static int ll_follow_link(struct dentry *dentry, struct nameidata *nd, struct inode *inode = dentry->d_inode; struct ll_inode_info *lli = ll_i2info(inode); struct ptlrpc_request *request; - int op, mode, rc; + int op = 0, mode = 0, rc; char *symname; ENTRY; diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index fe5aad4..7135743 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -21,6 +21,7 @@ #include #include #include +#include /* for LL_IOC_LOV_[GS]ETSTRIPE */ #include #include #include @@ -42,10 +43,19 @@ struct lov_file_handles { struct lustre_handle *lfh_handles; }; +struct lov_lock_handles { + __u64 llh_cookie; + struct lustre_handle llh_handles[0]; +}; + extern int lov_packmd(struct lustre_handle *conn, struct lov_mds_md **lmm, struct lov_stripe_md *lsm); extern int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsm, struct lov_mds_md *lmm); +extern int lov_setstripe(struct lustre_handle *conn, + struct lov_stripe_md **lsmp, struct 
lov_mds_md *lmmu); +extern int lov_getstripe(struct lustre_handle *conn, struct lov_mds_md *lmmu, + struct lov_stripe_md *lsm); /* obd methods */ int lov_attach(struct obd_device *dev, obd_count len, void *data) @@ -72,10 +82,9 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, int rc, rc2, i; ENTRY; - MOD_INC_USE_COUNT; rc = class_connect(conn, obd, cluuid); if (rc) - GOTO(out_dec, rc); + RETURN(rc); /* We don't want to actually do the underlying connections more than * once, so keep track. */ @@ -84,6 +93,7 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, RETURN(0); exp = class_conn2export(conn); + spin_lock_init(&exp->exp_lov_data.led_lock); INIT_LIST_HEAD(&exp->exp_lov_data.led_open_head); /* retrieve LOV metadata from MDS */ @@ -159,7 +169,6 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, for (i = 0; i < desc->ld_tgt_count; i++) { struct obd_device *tgt = client_tgtuuid2obd(uuidarray[i]); - int rc2; if (!tgt) { CERROR("Target %s not attached\n", uuidarray[i]); @@ -174,26 +183,20 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, rc = obd_connect(&lov->tgts[i].conn, tgt, NULL, recovd, recover); - /* Register even if connect failed, so that we get reactivation - * notices. - */ - rc2 = obd_iocontrol(IOC_OSC_REGISTER_LOV, &lov->tgts[i].conn, - sizeof(struct obd_device *), obd, NULL); - if (rc2) { - CERROR("Target %s REGISTER_LOV error %d\n", - uuidarray[i], rc2); - GOTO(out_disc, rc2); + if (rc) { + CERROR("Target %s connect error %d\n", uuidarray[i], + rc); + GOTO(out_disc, rc); } - - /* But mark failed-connect OSCs as inactive! 
*/ + + rc = obd_iocontrol(IOC_OSC_REGISTER_LOV, &lov->tgts[i].conn, + sizeof(struct obd_device *), obd, NULL); if (rc) { - CDEBUG(D_INFO, "Target %s connect error %d\n", + CERROR("Target %s REGISTER_LOV error %d\n", uuidarray[i], rc); - LASSERT(lov->tgts[i].active == 0); - rc = 0; - continue; + GOTO(out_disc, rc); } - + desc->ld_active_tgt_count++; lov->tgts[i].active = 1; } @@ -205,6 +208,7 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, RETURN(rc); out_disc: + i--; /* skip failed-connect OSC */ while (i-- > 0) { desc->ld_active_tgt_count--; lov->tgts[i].active = 0; @@ -216,8 +220,6 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, OBD_FREE(lov->tgts, lov->bufsize); out_conn: class_disconnect(conn); - out_dec: - MOD_DEC_USE_COUNT; goto out; } @@ -256,6 +258,7 @@ static int lov_disconnect(struct lustre_handle *conn) lov->tgts = NULL; exp = class_conn2export(conn); + spin_lock(&exp->exp_lov_data.led_lock); list_for_each_safe(p, n, &exp->exp_lov_data.led_open_head) { /* XXX close these, instead of just discarding them? 
*/ struct lov_file_handles *lfh; @@ -267,11 +270,10 @@ static int lov_disconnect(struct lustre_handle *conn) lfh->lfh_count * sizeof(*lfh->lfh_handles)); kmem_cache_free(lov_file_cache, lfh); } + spin_unlock(&exp->exp_lov_data.led_lock); out_local: rc = class_disconnect(conn); - if (!rc) - MOD_DEC_USE_COUNT; return rc; } @@ -286,6 +288,7 @@ static int lov_set_osc_active(struct lov_obd *lov, obd_uuid_t uuid, int activate) { struct obd_device *obd; + struct lov_tgt_desc *tgt; int i, rc = 0; ENTRY; @@ -293,27 +296,31 @@ static int lov_set_osc_active(struct lov_obd *lov, obd_uuid_t uuid, lov, uuid, activate); spin_lock(&lov->lov_lock); - for (i = 0; i < lov->desc.ld_tgt_count; i++) - if (strncmp(uuid, lov->tgts[i].uuid, - sizeof(lov->tgts[i].uuid)) == 0) + for (i = 0, tgt = lov->tgts; i < lov->desc.ld_tgt_count; i++, tgt++) { + CDEBUG(D_INFO, "lov idx %d is %s conn "LPX64"\n", + i, tgt->uuid, tgt->conn.addr); + if (strncmp(uuid, tgt->uuid, sizeof(tgt->uuid)) == 0) break; + } if (i == lov->desc.ld_tgt_count) GOTO(out, rc = -EINVAL); - obd = class_conn2obd(&lov->tgts[i].conn); + obd = class_conn2obd(&tgt->conn); if (obd == NULL) { LBUG(); GOTO(out, rc = -ENOTCONN); } - CDEBUG(D_INFO, "Found OBD %p type %s\n", obd, obd->obd_type->typ_name); + CDEBUG(D_INFO, "Found OBD %s=%s device %d (%p) type %s at LOV idx %d\n", + obd->obd_name, obd->obd_uuid, obd->obd_minor, obd, + obd->obd_type->typ_name, i); if (strcmp(obd->obd_type->typ_name, "osc") != 0) { LBUG(); GOTO(out, rc = -EBADF); } - if (lov->tgts[i].active == activate) { + if (tgt->active == activate) { CDEBUG(D_INFO, "OBD %p already %sactive!\n", obd, activate ? "" : "in"); GOTO(out, rc = -EALREADY); @@ -321,7 +328,7 @@ static int lov_set_osc_active(struct lov_obd *lov, obd_uuid_t uuid, CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, activate ? 
"" : "in"); - lov->tgts[i].active = activate; + tgt->active = activate; if (activate) { /* * foreach(export) @@ -341,6 +348,7 @@ static int lov_set_osc_active(struct lov_obd *lov, obd_uuid_t uuid, lov->desc.ld_active_tgt_count--; } +#warning "FIXME: walk open files list for objects that need opening" EXIT; out: spin_unlock(&lov->lov_lock); @@ -400,7 +408,8 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa, struct lov_stripe_md *lsm; struct lov_oinfo *loi; struct obdo *tmp; - int ost_count, ost_idx = 1; + int ost_count, ost_idx; + int first = 1, obj_alloc = 0; int rc = 0, i; ENTRY; @@ -409,119 +418,111 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa, if (!export) RETURN(-EINVAL); - tmp = obdo_alloc(); - if (!tmp) - RETURN(-ENOMEM); - lov = &export->exp_obd->u.lov; if (!lov->desc.ld_active_tgt_count) RETURN(-EIO); - spin_lock(&lov->lov_lock); - ost_count = lov->desc.ld_tgt_count; + tmp = obdo_alloc(); + if (!tmp) + RETURN(-ENOMEM); lsm = *ea; - /* Can't create more stripes than we have targets (incl inactive). 
*/ - if (lsm && lsm->lsm_stripe_count > lov->desc.ld_tgt_count) - GOTO(out_tmp, rc = -EINVAL); - - /* Free the user lsm if it needs to be changed, to avoid memory leaks */ - if (!lsm || (lsm && - lsm->lsm_stripe_count > lov->desc.ld_active_tgt_count)) { - struct lov_stripe_md *lsm_new = NULL; - rc = obd_alloc_memmd(conn, &lsm_new); - if (rc < 0) { - spin_unlock(&lov->lov_lock); - if (lsm) - obd_free_memmd(conn, &lsm); + if (!lsm) { + rc = obd_alloc_memmd(conn, &lsm); + if (rc < 0) GOTO(out_tmp, rc); - } - if (lsm) { - LASSERT(lsm->lsm_magic == LOV_MAGIC); - CERROR("replace user LOV MD: stripes %u > %u active\n", - lsm->lsm_stripe_count, - lov->desc.ld_active_tgt_count); - lsm_new->lsm_stripe_offset = lsm->lsm_stripe_offset; - lsm_new->lsm_stripe_size = lsm->lsm_stripe_size; - lsm_new->lsm_stripe_pattern = lsm->lsm_stripe_pattern; - obd_free_memmd(conn, &lsm); - } - lsm = lsm_new; - ost_idx = 0; /* if lsm->lsm_stripe_offset is set yet */ + + rc = 0; lsm->lsm_magic = LOV_MAGIC; } + ost_count = lov->desc.ld_tgt_count; + LASSERT(oa->o_valid & OBD_MD_FLID); lsm->lsm_object_id = oa->o_id; if (!lsm->lsm_stripe_size) lsm->lsm_stripe_size = lov->desc.ld_default_stripe_size; - /* Because of 64-bit divide/mod operations only work with a 32-bit - * divisor in a 32-bit kernel, we cannot support a stripe width - * of 4GB or larger on 32-bit CPUs. 
- */ - if (lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL) { - CERROR("LOV: stripe width "LPU64"x%u > %lu on 32-bit system\n", - lsm->lsm_stripe_size, lsm->lsm_stripe_count, ~0UL); - spin_unlock(&lov->lov_lock); - GOTO(out_free, rc = -EINVAL); - } - - if (!ost_idx || lsm->lsm_stripe_offset >= ost_count) { + if (!*ea || lsm->lsm_stripe_offset >= ost_count) { int mult = lsm->lsm_object_id * lsm->lsm_stripe_count; int stripe_offset = mult % ost_count; int sub_offset = (mult / ost_count) % lsm->lsm_stripe_count; - lsm->lsm_stripe_offset = stripe_offset + sub_offset; - } - - /* Start with lsm_stripe_offset on an active OSC to avoid confusion */ - while (!lov->tgts[lsm->lsm_stripe_offset].active) - lsm->lsm_stripe_offset = (lsm->lsm_stripe_offset+1) % ost_count; - - /* Pick the OSTs before we release the lock */ - ost_idx = lsm->lsm_stripe_offset; - for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { - CDEBUG(D_INODE, "objid "LPX64"[%d] is ost_idx %d (uuid %s)\n", - lsm->lsm_object_id, i, ost_idx, lov->tgts[ost_idx].uuid); - loi->loi_ost_idx = ost_idx; - do { - ost_idx = (ost_idx + 1) % ost_count; - } while (!lov->tgts[ost_idx].active); - } - - spin_unlock(&lov->lov_lock); + ost_idx = stripe_offset + sub_offset; + } else + ost_idx = lsm->lsm_stripe_offset; CDEBUG(D_INODE, "allocating %d subobjs for objid "LPX64" at idx %d\n", - lsm->lsm_stripe_count,lsm->lsm_object_id,lsm->lsm_stripe_offset); + lsm->lsm_stripe_count, lsm->lsm_object_id, ost_idx); - for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { + loi = lsm->lsm_oinfo; + for (i = 0; i < ost_count; i++, ost_idx = (ost_idx + 1) % ost_count) { struct lov_stripe_md obj_md; struct lov_stripe_md *obj_mdp = &obj_md; + int err; - ost_idx = loi->loi_ost_idx; + if (lov->tgts[ost_idx].active == 0) { + CDEBUG(D_HA, "lov idx %d inactive\n", ost_idx); + continue; + } /* create data objects with "parent" OA */ memcpy(tmp, oa, sizeof(*tmp)); /* XXX: LOV STACKING: use real "obj_mdp" 
sub-data */ - rc = obd_create(&lov->tgts[ost_idx].conn, tmp, &obj_mdp); - if (rc) { - CERROR("error creating objid "LPX64" sub-object on " - "OST idx %d: rc = %d\n", oa->o_id, ost_idx, rc); - GOTO(out_cleanup, rc); + err = obd_create(&lov->tgts[ost_idx].conn, tmp, &obj_mdp); + if (err) { + if (lov->tgts[ost_idx].active) { + CERROR("error creating objid "LPX64" sub-object" + "on OST idx %d: rc = %d\n", + oa->o_id, ost_idx, err); + if (!rc) + rc = err; + } + continue; } loi->loi_id = tmp->o_id; + loi->loi_ost_idx = ost_idx; CDEBUG(D_INODE, "objid "LPX64" has subobj "LPX64" at idx %d\n", lsm->lsm_object_id, loi->loi_id, ost_idx); + + if (first) { + lsm->lsm_stripe_offset = ost_idx; + first = 0; + } + + ++obj_alloc; + ++loi; + + /* If we have allocated enough objects, we are OK */ + if (obj_alloc == lsm->lsm_stripe_count) { + rc = 0; + GOTO(out_done, rc); + } } + if (*ea) + GOTO(out_cleanup, rc); + else { + struct lov_stripe_md *lsm_new; + /* XXX LOV STACKING call into osc for sizes */ + int size = lov_stripe_md_size(obj_alloc); + + OBD_ALLOC(lsm_new, size); + if (!lsm_new) + GOTO(out_cleanup, rc = -ENOMEM); + memcpy(lsm_new, lsm, size); + /* XXX LOV STACKING call into osc for sizes */ + OBD_FREE(lsm, lov_stripe_md_size(lsm->lsm_stripe_count)); + lsm = lsm_new; + } + out_done: *ea = lsm; out_tmp: obdo_free(tmp); - RETURN(rc); + return rc; out_cleanup: while (i-- > 0) { @@ -538,7 +539,6 @@ static int lov_create(struct lustre_handle *conn, struct obdo *oa, oa->o_id, loi->loi_id, loi->loi_ost_idx, err); } - out_free: if (!*ea) obd_free_memmd(conn, &lsm); goto out_tmp; @@ -561,7 +561,7 @@ static int lov_destroy(struct lustre_handle *conn, struct obdo *oa, } if (lsm->lsm_magic != LOV_MAGIC) { - CERROR("LOV striping magic bad %#lx != %#lx\n", + CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); RETURN(-EINVAL); } @@ -576,6 +576,7 @@ static int lov_destroy(struct lustre_handle *conn, struct obdo *oa, for (i = 0,loi = lsm->lsm_oinfo; i < 
lsm->lsm_stripe_count; i++,loi++) { int err; if (lov->tgts[loi->loi_ost_idx].active == 0) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); /* Orphan clean up will (someday) fix this up. */ continue; } @@ -667,7 +668,7 @@ static int lov_getattr(struct lustre_handle *conn, struct obdo *oa, } if (lsm->lsm_magic != LOV_MAGIC) { - CERROR("LOV striping magic bad %#lx != %#lx\n", + CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); RETURN(-EINVAL); } @@ -680,14 +681,15 @@ static int lov_getattr(struct lustre_handle *conn, struct obdo *oa, if (oa->o_valid & OBD_MD_FLHANDLE) lfh = lov_handle2lfh(obdo_handle(oa)); + CDEBUG(D_INFO, "objid "LPX64": %ux%u byte stripes\n", + lsm->lsm_object_id, lsm->lsm_stripe_count, lsm->lsm_stripe_size); for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { int err; - if (loi->loi_id == 0) - continue; - - if (lov->tgts[loi->loi_ost_idx].active == 0) + if (lov->tgts[loi->loi_ost_idx].active == 0) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); continue; + } CDEBUG(D_INFO, "objid "LPX64"[%d] has subobj "LPX64" at idx " "%u\n", oa->o_id, i, loi->loi_id, loi->loi_ost_idx); @@ -739,7 +741,7 @@ static int lov_setattr(struct lustre_handle *conn, struct obdo *oa, } if (lsm->lsm_magic != LOV_MAGIC) { - CERROR("LOV striping magic bad %#lx != %#lx\n", + CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); RETURN(-EINVAL); } @@ -803,7 +805,7 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa, } if (lsm->lsm_magic != LOV_MAGIC) { - CERROR("LOV striping magic bad %#lx != %#lx\n", + CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); RETURN(-EINVAL); } @@ -829,6 +831,7 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa, for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { if (lov->tgts[loi->loi_ost_idx].active == 0) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); continue; } @@ -863,7 
+866,9 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa, handle->addr = (__u64)(unsigned long)lfh; handle->cookie = lfh->lfh_cookie; oa->o_valid |= OBD_MD_FLHANDLE; + spin_lock(&export->exp_lov_data.led_lock); list_add(&lfh->lfh_list, &export->exp_lov_data.led_open_head); + spin_unlock(&export->exp_lov_data.led_lock); out_tmp: obdo_free(tmp); @@ -914,7 +919,7 @@ static int lov_close(struct lustre_handle *conn, struct obdo *oa, } if (lsm->lsm_magic != LOV_MAGIC) { - CERROR("LOV striping magic bad %#lx != %#lx\n", + CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); RETURN(-EINVAL); } @@ -928,9 +933,11 @@ static int lov_close(struct lustre_handle *conn, struct obdo *oa, lov = &export->exp_obd->u.lov; for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { int err; - - if (lov->tgts[loi->loi_ost_idx].active == 0) + + if (lov->tgts[loi->loi_ost_idx].active == 0) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); continue; + } /* create data objects with "parent" OA */ memcpy(&tmp, oa, sizeof(tmp)); @@ -1029,7 +1036,7 @@ static int lov_punch(struct lustre_handle *conn, struct obdo *oa, } if (lsm->lsm_magic != LOV_MAGIC) { - CERROR("LOV striping magic bad %#lx != %#lx\n", + CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); RETURN(-EINVAL); } @@ -1048,6 +1055,7 @@ static int lov_punch(struct lustre_handle *conn, struct obdo *oa, if (starti == endi) continue; + /* create data objects with "parent" OA */ memcpy(&tmp, oa, sizeof(tmp)); tmp.o_id = loi->loi_id; @@ -1094,7 +1102,7 @@ static inline int lov_brw(int cmd, struct lustre_handle *conn, } if (lsm->lsm_magic != LOV_MAGIC) { - CERROR("LOV striping magic bad %#lx != %#lx\n", + CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); RETURN(-EINVAL); } @@ -1159,13 +1167,49 @@ static inline int lov_brw(int cmd, struct lustre_handle *conn, RETURN(rc); } +static struct lov_lock_handles *lov_newlockh(struct 
lov_stripe_md *lsm) +{ + struct lov_lock_handles *lov_lockh; + + OBD_ALLOC(lov_lockh, sizeof(*lov_lockh) + + sizeof(*lov_lockh->llh_handles) * lsm->lsm_stripe_count); + if (!lov_lockh) + return NULL; + + get_random_bytes(&lov_lockh->llh_cookie, sizeof(lov_lockh->llh_cookie)); + + return lov_lockh; +} + +/* We are only ever passed local lock handles here, so we do not need to + * validate (and we can't really because these structs are variable sized + * and therefore alloced, and not from a private slab). + * + * We just check because we can... + */ +static struct lov_lock_handles *lov_h2lovlockh(struct lustre_handle *handle) +{ + struct lov_lock_handles *lov_lockh = NULL; + + if (!handle || !handle->addr) + RETURN(NULL); + + lov_lockh = (struct lov_lock_handles *)(unsigned long)(handle->addr); + if (lov_lockh->llh_cookie != handle->cookie) + RETURN(NULL); + + return lov_lockh; +} + static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm, struct lustre_handle *parent_lock, __u32 type, void *cookie, int cookielen, __u32 mode, int *flags, void *cb, void *data, int datalen, - struct lustre_handle *lockhs) + struct lustre_handle *lockh) { struct obd_export *export = class_conn2export(conn); + struct lov_lock_handles *lov_lockh = NULL; + struct lustre_handle *lov_lockhp; struct lov_obd *lov; struct lov_oinfo *loi; struct lov_stripe_md submd; @@ -1178,7 +1222,7 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm, } if (lsm->lsm_magic != LOV_MAGIC) { - CERROR("LOV striping magic bad %#lx != %#lx\n", + CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); RETURN(-EINVAL); } @@ -1190,33 +1234,45 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm, if (!export || !export->exp_obd) RETURN(-ENODEV); - memset(lockhs, 0, sizeof(*lockhs) * lsm->lsm_stripe_count); + if (lsm->lsm_stripe_count > 1) { + lov_lockh = lov_newlockh(lsm); + if (!lov_lockh) + RETURN(-ENOMEM); + + lockh->addr = 
(__u64)(unsigned long)lov_lockh; + lockh->cookie = lov_lockh->llh_cookie; + lov_lockhp = lov_lockh->llh_handles; + } else + lov_lockhp = lockh; lov = &export->exp_obd->u.lov; - for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { + for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; + i++, loi++, lov_lockhp++) { struct ldlm_extent *extent = (struct ldlm_extent *)cookie; struct ldlm_extent sub_ext; - if (lov->tgts[loi->loi_ost_idx].active == 0) + if (lov->tgts[loi->loi_ost_idx].active == 0) { + CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); continue; + } *flags = 0; sub_ext.start = lov_stripe_offset(lsm, extent->start, i); sub_ext.end = lov_stripe_offset(lsm, extent->end, i); - if (sub_ext.start == sub_ext.end) + if (sub_ext.start == sub_ext.end /* || !active */) continue; + /* XXX LOV STACKING: submd should be from the subobj */ submd.lsm_object_id = loi->loi_id; - /* XXX submd should be that from the subobj, it should come - * opaquely from the LOV. 
- */ submd.lsm_stripe_count = 0; /* XXX submd is not fully initialized here */ *flags = 0; rc = obd_enqueue(&(lov->tgts[loi->loi_ost_idx].conn), &submd, parent_lock, type, &sub_ext, sizeof(sub_ext), - mode, flags, cb, data, datalen, &(lockhs[i])); + mode, flags, cb, data, datalen, lov_lockhp); // XXX add a lock debug statement here + if (rc) + memset(lov_lockhp, 0, sizeof(*lov_lockhp)); if (rc && lov->tgts[loi->loi_ost_idx].active) { CERROR("Error enqueue objid "LPX64" subobj "LPX64 " on OST idx %d: rc = %d\n", lsm->lsm_object_id, @@ -1224,33 +1280,47 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm, goto out_locks; } } - RETURN(0); - out_locks: - for (i--, loi = &lsm->lsm_oinfo[i]; i >= 0; i--, loi--) { +out_locks: + while (loi--, lov_lockhp--, i-- > 0) { + struct lov_stripe_md submd; int err; - - if (lov->tgts[loi->loi_ost_idx].active == 0) + + if (lov_lockhp->addr == 0 || + lov->tgts[loi->loi_ost_idx].active == 0) continue; + /* XXX LOV STACKING: submd should be from the subobj */ submd.lsm_object_id = loi->loi_id; submd.lsm_stripe_count = 0; err = obd_cancel(&lov->tgts[loi->loi_ost_idx].conn, &submd, - mode, &lockhs[i]); + mode, lov_lockhp); if (err) { - CERROR("Error cancelling objid "LPX64" subobj "LPX64 + CERROR("Error cancelling objid "LPX64 " on OST idx %d after enqueue error: rc = %d\n", loi->loi_id, loi->loi_ost_idx, err); } } + + if (lsm->lsm_stripe_count > 1) { + lov_lockh->llh_cookie = DEAD_HANDLE_MAGIC; + OBD_FREE(lov_lockh, sizeof(*lov_lockh) + + sizeof(*lov_lockh->llh_handles) * + lsm->lsm_stripe_count); + } + lockh->addr = 0; + lockh->cookie = DEAD_HANDLE_MAGIC; + RETURN(rc); } static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm, - __u32 mode, struct lustre_handle *lockhs) + __u32 mode, struct lustre_handle *lockh) { struct obd_export *export = class_conn2export(conn); + struct lov_lock_handles *lov_lockh = NULL; + struct lustre_handle *lov_lockhp; struct lov_obd *lov; struct lov_oinfo *loi; 
int rc = 0, i; @@ -1262,7 +1332,7 @@ static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm, } if (lsm->lsm_magic != LOV_MAGIC) { - CERROR("LOV striping magic bad %#lx != %#lx\n", + CERROR("LOV striping magic bad %#x != %#x\n", lsm->lsm_magic, LOV_MAGIC); RETURN(-EINVAL); } @@ -1270,29 +1340,55 @@ static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm, if (!export || !export->exp_obd) RETURN(-ENODEV); + LASSERT(lockh); + if (lsm->lsm_stripe_count > 1) { + lov_lockh = lov_h2lovlockh(lockh); + if (!lov_lockh) { + CERROR("LOV: invalid lov lock handle %p\n", lockh); + RETURN(-EINVAL); + } + + lov_lockhp = lov_lockh->llh_handles; + } else + lov_lockhp = lockh; + lov = &export->exp_obd->u.lov; - for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { + for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; + i++, loi++, lov_lockhp++ ) { struct lov_stripe_md submd; int err; - if (lov->tgts[loi->loi_ost_idx].active == 0) - continue; - - if (lockhs[i].addr == 0) + if (lov_lockhp->addr == 0) { + CDEBUG(D_HA, "lov idx %d no lock?\n", loi->loi_ost_idx); continue; + } + /* XXX LOV STACKING: submd should be from the subobj */ submd.lsm_object_id = loi->loi_id; submd.lsm_stripe_count = 0; err = obd_cancel(&lov->tgts[loi->loi_ost_idx].conn, &submd, - mode, &lockhs[i]); - if (err && lov->tgts[loi->loi_ost_idx].active) { - CERROR("Error cancel objid "LPX64" subobj "LPX64 - " on OST idx %d: rc = %d\n", lsm->lsm_object_id, - loi->loi_id, loi->loi_ost_idx, err); - if (!rc) - rc = err; + mode, lov_lockhp); + if (err) { + if (lov->tgts[loi->loi_ost_idx].active) { + CERROR("Error cancel objid "LPX64" subobj " + LPX64" on OST idx %d: rc = %d\n", + lsm->lsm_object_id, + loi->loi_id, loi->loi_ost_idx, err); + if (!rc) + rc = err; + } } } + + if (lsm->lsm_stripe_count > 1) { + lov_lockh->llh_cookie = DEAD_HANDLE_MAGIC; + OBD_FREE(lov_lockh, sizeof(*lov_lockh) + + sizeof(*lov_lockh->llh_handles) * + lsm->lsm_stripe_count); + } + 
lockh->addr = 0; + lockh->cookie = DEAD_HANDLE_MAGIC; + RETURN(rc); } @@ -1302,7 +1398,7 @@ static int lov_cancel_unused(struct lustre_handle *conn, struct obd_export *export = class_conn2export(conn); struct lov_obd *lov; struct lov_oinfo *loi; - int rc = 0, i, err; + int rc = 0, i; ENTRY; if (!lsm) { @@ -1316,6 +1412,7 @@ static int lov_cancel_unused(struct lustre_handle *conn, lov = &export->exp_obd->u.lov; for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { struct lov_stripe_md submd; + int err; submd.lsm_object_id = loi->loi_id; submd.lsm_stripe_count = 0; @@ -1352,12 +1449,14 @@ static int lov_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) for (i = 0; i < lov->desc.ld_tgt_count; i++) { int err; - if (!lov->tgts[i].active) + if (!lov->tgts[i].active) { + CDEBUG(D_HA, "lov idx %d inactive\n", i); continue; + } err = obd_statfs(&lov->tgts[i].conn, &lov_sfs); if (err) { - CERROR("Error statfs OSC %s idx %d: err = %d\n", + CERROR("Error statfs OSC %s i %d: err = %d\n", lov->tgts[i].uuid, i, err); if (!rc) rc = err; @@ -1389,7 +1488,6 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, { struct obd_device *obddev = class_conn2obd(conn); struct lov_obd *lov = &obddev->u.lov; - struct obd_ioctl_data *data = karg; int i, count = lov->desc.ld_tgt_count; int rc; @@ -1397,10 +1495,12 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, switch (cmd) { case IOC_LOV_SET_OSC_ACTIVE: { + struct obd_ioctl_data *data = karg; rc = lov_set_osc_active(lov,data->ioc_inlbuf1,data->ioc_offset); break; } case OBD_IOC_LOV_GET_CONFIG: { + struct obd_ioctl_data *data = karg; struct lov_tgt_desc *tgtdesc; struct lov_desc *desc; obd_uuid_t *uuidp; @@ -1437,13 +1537,21 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, OBD_FREE(buf, len); break; } + case LL_IOC_LOV_SETSTRIPE: + rc = lov_setstripe(conn, karg, uarg); + break; + case LL_IOC_LOV_GETSTRIPE: + rc = 
lov_getstripe(conn, karg, uarg); + break; default: if (count == 0) RETURN(-ENOTTY); rc = 0; for (i = 0; i < count; i++) { - int err = obd_iocontrol(cmd, &lov->tgts[i].conn, - len, karg, uarg); + int err; + + err = obd_iocontrol(cmd, &lov->tgts[i].conn, + len, karg, uarg); if (err && !rc) rc = err; } @@ -1453,6 +1561,7 @@ static int lov_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, } struct obd_ops lov_obd_ops = { + o_owner: THIS_MODULE, o_attach: lov_attach, o_detach: lov_detach, o_setup: lov_setup, diff --git a/lustre/lov/lov_pack.c b/lustre/lov/lov_pack.c index 247015c..3d4b4b8 100644 --- a/lustre/lov/lov_pack.c +++ b/lustre/lov/lov_pack.c @@ -27,10 +27,10 @@ #include #include #include +#include #include /* lov_packdesc() is in mds/mds_lov.c */ - void lov_unpackdesc(struct lov_desc *ld) { ld->ld_tgt_count = NTOH__u32(ld->ld_tgt_count); @@ -39,6 +39,28 @@ void lov_unpackdesc(struct lov_desc *ld) ld->ld_pattern = HTON__u32(ld->ld_pattern); } +void lov_dump_lmm(int level, struct lov_mds_md *lmm) +{ + struct lov_object_id *loi; + int idx; + + CDEBUG(level, "objid "LPX64", magic %#08x, ost_count %u\n", + lmm->lmm_object_id, lmm->lmm_magic, lmm->lmm_ost_count); + CDEBUG(level,"stripe_size %u, stripe_count %u, stripe_offset %u\n", + lmm->lmm_stripe_size, lmm->lmm_stripe_count, + lmm->lmm_stripe_offset); + for (idx = 0, loi = lmm->lmm_objects; idx < lmm->lmm_ost_count; + idx++, loi++) + CDEBUG(level, "ost idx %u subobj "LPX64"\n", idx, + loi->l_object_id); +} + +#define LMM_ASSERT(test) \ +do { \ + if (!(test)) lov_dump_lmm(D_ERROR, lmm); \ + LASSERT(test); /* so we know what assertion failed */ \ +} while(0) + /* Pack LOV object metadata for shipment to the MDS. 
* * XXX In the future, this will be enhanced to get the EA size from the @@ -60,12 +82,19 @@ int lov_packmd(struct lustre_handle *conn, struct lov_mds_md **lmmp, ENTRY; if (lsm) { + int i, max = 0; if (lsm->lsm_magic != LOV_MAGIC) { - CERROR("bad mem LOV MAGIC: %#08x != %#08x\n", + CERROR("bad mem LOV MAGIC: %#010x != %#010x\n", lsm->lsm_magic, LOV_MAGIC); RETURN(-EINVAL); } stripe_count = lsm->lsm_stripe_count; + + for (i = 0,loi = lsm->lsm_oinfo; i < stripe_count; i++,loi++) { + if (loi->loi_ost_idx > max) + max = loi->loi_ost_idx; + } + ost_count = max + 1; } /* XXX LOV STACKING call into osc for sizes */ @@ -93,14 +122,14 @@ int lov_packmd(struct lustre_handle *conn, struct lov_mds_md **lmmp, lmm->lmm_stripe_count = (stripe_count); if (!lsm) RETURN(lmm_size); + /* XXX endianness */ lmm->lmm_magic = (lsm->lsm_magic); lmm->lmm_object_id = (lsm->lsm_object_id); LASSERT(lsm->lsm_object_id); lmm->lmm_stripe_size = (lsm->lsm_stripe_size); - lmm->lmm_stripe_pattern = (lsm->lsm_stripe_pattern); lmm->lmm_stripe_offset = (lsm->lsm_stripe_offset); - lmm->lmm_ost_count = (lov->desc.ld_tgt_count); + lmm->lmm_ost_count = (ost_count); /* Only fill in the object ids which we are actually using. * Assumes lmm_objects is otherwise zero-filled. 
*/ @@ -113,6 +142,16 @@ int lov_packmd(struct lustre_handle *conn, struct lov_mds_md **lmmp, RETURN(lmm_size); } +static int lov_get_stripecnt(struct lov_obd *lov, int stripe_count) +{ + if (!stripe_count) + stripe_count = lov->desc.ld_default_stripe_count; + if (!stripe_count || stripe_count > lov->desc.ld_active_tgt_count) + stripe_count = lov->desc.ld_active_tgt_count; + + return stripe_count; +} + int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp, struct lov_mds_md *lmm) { @@ -120,9 +159,9 @@ int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp, struct lov_obd *lov = &obd->u.lov; struct lov_stripe_md *lsm; struct lov_oinfo *loi; - int ost_count = lov->desc.ld_active_tgt_count; + int ost_count; int ost_offset = 0; - int stripe_count = 0; + int stripe_count; int lsm_size; int i; ENTRY; @@ -135,12 +174,9 @@ int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp, RETURN(-EINVAL); } stripe_count = (lmm->lmm_stripe_count); - } - - if (!stripe_count) - stripe_count = lov->desc.ld_default_stripe_count; - if (!stripe_count || stripe_count > ost_count) - stripe_count = ost_count; + LASSERT(stripe_count); + } else + stripe_count = lov_get_stripecnt(lov, 0); /* XXX LOV STACKING call into osc for sizes */ lsm_size = lov_stripe_md_size(stripe_count); @@ -171,9 +207,12 @@ int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp, ost_offset = lsm->lsm_stripe_offset = (lmm->lmm_stripe_offset); lsm->lsm_magic = (lmm->lmm_magic); lsm->lsm_object_id = (lmm->lmm_object_id); - LASSERT(lsm->lsm_object_id); lsm->lsm_stripe_size = (lmm->lmm_stripe_size); - lsm->lsm_stripe_pattern = (lmm->lmm_stripe_pattern); + + ost_count = (lmm->lmm_ost_count); + + LMM_ASSERT(lsm->lsm_object_id); + LMM_ASSERT(ost_count); for (i = 0, loi = lsm->lsm_oinfo; i < ost_count; i++, ost_offset++) { ost_offset %= ost_count; @@ -181,13 +220,126 @@ int lov_unpackmd(struct lustre_handle *conn, struct lov_stripe_md **lsmp, if 
(!lmm->lmm_objects[ost_offset].l_object_id) continue; - LASSERT(loi - lsm->lsm_oinfo < stripe_count); + LMM_ASSERT(loi - lsm->lsm_oinfo < stripe_count); /* XXX LOV STACKING call down to osc_unpackmd() */ loi->loi_id = (lmm->lmm_objects[ost_offset].l_object_id); loi->loi_ost_idx = ost_offset; loi++; } - LASSERT(loi - lsm->lsm_oinfo == stripe_count); + LMM_ASSERT(loi - lsm->lsm_oinfo > 0); + LMM_ASSERT(loi - lsm->lsm_oinfo == stripe_count); RETURN(lsm_size); } + +/* Configure object striping information on a new file. + * + * @lmmu is a pointer to a user struct with one or more of the fields set to + * indicate the application preference: lmm_stripe_count, lmm_stripe_size, + * lmm_stripe_offset, and lmm_stripe_pattern. lmm_magic must be LOV_MAGIC. + * @lsmp is a pointer to an in-core stripe MD that needs to be filled in. + */ +int lov_setstripe(struct lustre_handle *conn, struct lov_stripe_md **lsmp, + struct lov_mds_md *lmmu) +{ + struct obd_device *obd = class_conn2obd(conn); + struct lov_obd *lov = &obd->u.lov; + struct lov_mds_md lmm; + struct lov_stripe_md *lsm; + int stripe_count; + int rc; + ENTRY; + + rc = copy_from_user(&lmm, lmmu, sizeof(lmm)); + if (rc) + RETURN(-EFAULT); + + if (lmm.lmm_magic != LOV_MAGIC) { + CERROR("bad wire LOV MAGIC: %#08x != %#08x\n", + lmm.lmm_magic, LOV_MAGIC); + RETURN(-EINVAL); + } + if (lmm.lmm_stripe_count > lov->desc.ld_tgt_count) { + CERROR("stripe count %d more than OST count %d\n", + (int)lmm.lmm_stripe_count, lov->desc.ld_tgt_count); + RETURN(-EINVAL); + } + if (lmm.lmm_stripe_offset >= lov->desc.ld_tgt_count) { + CERROR("stripe offset %d more than max OST index %d\n", + (int)lmm.lmm_stripe_count, lov->desc.ld_tgt_count); + RETURN(-EINVAL); + } + if (lmm.lmm_stripe_size & (PAGE_SIZE - 1)) { + CERROR("stripe size %u not multiple of %lu\n", + lmm.lmm_stripe_size, PAGE_SIZE); + RETURN(-EINVAL); + } + if (lmm.lmm_stripe_size * lmm.lmm_stripe_count > ~0UL) { + CERROR("stripe width %ux%u > %lu on 32-bit system\n", + 
lmm.lmm_stripe_size, (int)lmm.lmm_stripe_count, ~0UL); + RETURN(-EINVAL); + } + + stripe_count = lov_get_stripecnt(lov, lmm.lmm_stripe_count); + + /* XXX LOV STACKING call into osc for sizes */ + OBD_ALLOC(lsm, lov_stripe_md_size(stripe_count)); + if (!lsm) + RETURN(-ENOMEM); + + lsm->lsm_magic = LOV_MAGIC; + /* This is all validated in lov_create() */ + lsm->lsm_stripe_count = stripe_count; + lsm->lsm_stripe_offset = lmm.lmm_stripe_offset; + lsm->lsm_stripe_size = lmm.lmm_stripe_size; + + *lsmp = lsm; + + RETURN(rc); +} + +/* Retrieve object striping information. + * + * @lmmu is a pointer to an in-core struct with lmm_ost_count indicating + * the maximum number of OST indices which will fit in the user buffer. + * lmm_magic must be LOV_MAGIC. + */ +int lov_getstripe(struct lustre_handle *conn, struct lov_stripe_md *lsm, + struct lov_mds_md *lmmu) +{ + struct obd_device *obd = class_conn2obd(conn); + struct lov_obd *lov = &obd->u.lov; + struct lov_mds_md lmm, *lmmk = NULL; + int ost_count, rc, lmm_size; + ENTRY; + + if (!lsm) + RETURN(-ENODATA); + + rc = copy_from_user(&lmm, lmmu, sizeof(lmm)); + if (rc) + RETURN(-EFAULT); + + if (lmm.lmm_magic != LOV_MAGIC) + RETURN(-EINVAL); + + ost_count = lov->desc.ld_tgt_count; + + /* XXX we _could_ check if indices > user lmm_ost_count are zero */ + if (lmm.lmm_ost_count < ost_count) + RETURN(-EOVERFLOW); + + rc = lov_packmd(conn, &lmmk, lsm); + if (rc < 0) + RETURN(rc); + + lmm_size = rc; + rc = 0; + + if (lmm_size && copy_to_user(lmmu, lmmk, lmm_size)) + rc = -EFAULT; + + obd_free_wiremd(conn, &lmmk); + + RETURN(rc); +} diff --git a/lustre/lov/lproc_lov.c b/lustre/lov/lproc_lov.c index a68b57e..0812e00 100644 --- a/lustre/lov/lproc_lov.c +++ b/lustre/lov/lproc_lov.c @@ -28,139 +28,124 @@ * Common STATUS namespace */ -int rd_uuid(char* page, char **start, off_t off, int count, int *eof, +int rd_uuid(char *page, char **start, off_t off, int count, int *eof, void *data) { - int len = 0; struct obd_device* dev = (struct 
obd_device*)data; - len += snprintf(page, count, "%s\n", dev->obd_uuid); - return len; - - + return snprintf(page, count, "%s\n", dev->obd_uuid); } -int rd_stripesize(char* page, char **start, off_t off, int count, int *eof, + +int rd_stripesize(char *page, char **start, off_t off, int count, int *eof, void *data) { - struct obd_device* dev = (struct obd_device*)data; - int len = 0; - struct lov_obd* lov = &dev->u.lov; - len += snprintf(page, count, LPU64"\n", - (__u64)(lov->desc.ld_default_stripe_size)); - - return len; + struct obd_device *dev = (struct obd_device*)data; + struct lov_desc *desc = &dev->u.lov.desc; + + return snprintf(page, count, LPU64"\n", desc->ld_default_stripe_size); } -int rd_stripeoffset(char* page, char **start, off_t off, int count, int *eof, +int rd_stripeoffset(char *page, char **start, off_t off, int count, int *eof, void *data) { struct obd_device* dev = (struct obd_device*)data; - int len = 0; struct lov_obd* lov = &dev->u.lov; - len += snprintf(page, count, LPU64"\n", - lov->desc.ld_default_stripe_offset); - return len; + return snprintf(page, count, LPU64"\n", + lov->desc.ld_default_stripe_offset); } -int rd_stripetype(char* page, char **start, off_t off, int count, int *eof, +int rd_stripetype(char *page, char **start, off_t off, int count, int *eof, void *data) { struct obd_device* dev = (struct obd_device*)data; - int len = 0; struct lov_obd* lov = &dev->u.lov; - len += snprintf(page, count, LPU64"\n", - (__u64)(lov->desc.ld_pattern)); - return len; + return snprintf(page, count, "%u\n", lov->desc.ld_pattern); } -int rd_stripecount(char* page, char **start, off_t off, int count, int *eof, + +int rd_stripecount(char *page, char **start, off_t off, int count, int *eof, void *data) -{ +{ struct obd_device* dev = (struct obd_device*)data; - int len = 0; struct lov_obd* lov = &dev->u.lov; - len += snprintf(page, count, LPU64"\n", - (__u64)(lov->desc.ld_default_stripe_count)); - return len; + return snprintf(page, count, "%u\n", 
lov->desc.ld_default_stripe_count); } -int rd_numobd(char* page, char **start, off_t off, int count, int *eof, + +int rd_numobd(char *page, char **start, off_t off, int count, int *eof, void *data) -{ - struct obd_device* dev = (struct obd_device*)data; - int len = 0; - struct lov_obd* lov=&dev->u.lov; - len += snprintf(page, count, LPU64"\n", - (__u64)(lov->desc.ld_tgt_count)); - return len; +{ + struct obd_device *dev = (struct obd_device*)data; + struct lov_obd *lov = &dev->u.lov; + + return snprintf(page, count, "%u\n", lov->desc.ld_tgt_count); } -int rd_activeobd(char* page, char **start, off_t off, int count, int *eof, +int rd_activeobd(char *page, char **start, off_t off, int count, int *eof, void *data) -{ +{ struct obd_device* dev = (struct obd_device*)data; - int len = 0; struct lov_obd* lov = &dev->u.lov; - len += snprintf(page, count, LPU64"\n", - (__u64)(lov->desc.ld_active_tgt_count)); - return len; + return snprintf(page, count, "%u\n", lov->desc.ld_active_tgt_count); } -int rd_blksize(char* page, char **start, off_t off, int count, int *eof, +int rd_blksize(char *page, char **start, off_t off, int count, int *eof, void *data) { return 0; } -int rd_kbtotal(char* page, char **start, off_t off, int count, int *eof, +int rd_kbtotal(char *page, char **start, off_t off, int count, int *eof, void *data) { return 0; } -int rd_kbfree(char* page, char **start, off_t off, int count, int *eof, +int rd_kbfree(char *page, char **start, off_t off, int count, int *eof, void *data) { return 0; } -int rd_filestotal(char* page, char **start, off_t off, int count, int *eof, +int rd_filestotal(char *page, char **start, off_t off, int count, int *eof, void *data) { return 0; } -int rd_filesfree(char* page, char **start, off_t off, int count, int *eof, +int rd_filesfree(char* page, char **start, off_t off, int count, int *eof, void *data) { return 0; } -int rd_filegroups(char* page, char **start, off_t off, int count, int *eof, - void *data) +int rd_filegroups(char* page, 
char **start, off_t off, int count, int *eof, + void *data) { return 0; } -int rd_target(char* page, char **start, off_t off, int count, int *eof, +int rd_target(char *page, char **start, off_t off, int count, int *eof, void *data) { struct obd_device* dev = (struct obd_device*)data; int len = 0, i = 0; struct lov_obd* lov = &dev->u.lov; struct lov_tgt_desc* tgts = lov->tgts; - while(i < lov->desc.ld_tgt_count){ - len += snprintf(&page[len], count, "%d: %s\n", i, tgts->uuid); + while (i < lov->desc.ld_tgt_count) { + len += snprintf(&page[len], count - len, "%d: %s %sACTIVE\n", + i, tgts->uuid, tgts->active ? "" : "IN"); i++; tgts++; } - + return len; } + int rd_mdc(char* page, char **start, off_t off, int count, int *eof, void *data) { struct obd_device* dev = (struct obd_device*)data; @@ -186,16 +171,15 @@ struct lprocfs_vars status_var_nm_1[] = { {"status/kbytesfree", rd_kbfree, 0, 0}, {"status/target_obd", rd_target, 0, 0}, {"status/target_mdc", rd_mdc, 0, 0}, - {0} }; -int rd_numrefs(char* page, char **start, off_t off, int count, int *eof, + +int rd_numrefs(char *page, char **start, off_t off, int count, int *eof, void *data) { struct obd_type* class = (struct obd_type*)data; - int len = 0; - len += snprintf(page, count, "%d\n", class->typ_refcnt); - return len; + + return snprintf(page, count, "%d\n", class->typ_refcnt); } struct lprocfs_vars status_class_var[]={ diff --git a/lustre/mdc/Makefile.am b/lustre/mdc/Makefile.am index 8dd9175..1d9c099 100644 --- a/lustre/mdc/Makefile.am +++ b/lustre/mdc/Makefile.am @@ -9,11 +9,9 @@ MODULE = mdc modulefs_DATA = mdc.o EXTRA_PROGRAMS = mdc -LINX= mds_updates.c ll_pack.c client.c +LINX= mds_updates.c client.c mdc_SOURCES = mdc_request.c mdc_reint.c lproc_mdc.c $(LINX) -ll_pack.c: - test -e ll_pack.c || ln -sf $(top_srcdir)/lib/ll_pack.c . mds_updates.c: test -e mds_updates.c || ln -sf $(top_srcdir)/lib/mds_updates.c . 
client.c: diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index c856d10..a97cfb5c 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -37,24 +37,25 @@ extern int mds_queue_req(struct ptlrpc_request *); extern struct lprocfs_vars status_var_nm_1[]; extern struct lprocfs_vars status_class_var[]; -/* should become mdc_getinfo() */ -int mdc_getstatus(struct lustre_handle *conn, struct ll_fid *rootfid) +/* Helper that implements most of mdc_getstatus and signal_completed_replay. */ +static int send_getstatus(struct obd_import *imp, struct ll_fid *rootfid, + int level, int msg_flags) { struct ptlrpc_request *req; struct mds_body *body; int rc, size = sizeof(*body); ENTRY; - req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_GETSTATUS, 1, &size, - NULL); + req = ptlrpc_prep_req(imp, MDS_GETSTATUS, 1, &size, NULL); if (!req) GOTO(out, rc = -ENOMEM); body = lustre_msg_buf(req->rq_reqmsg, 0); - req->rq_level = LUSTRE_CONN_CON; + req->rq_level = level; req->rq_replen = lustre_msg_size(1, &size); - + mds_pack_req_body(req); + req->rq_reqmsg->flags |= msg_flags; rc = ptlrpc_queue_wait(req); if (!rc) { @@ -74,6 +75,13 @@ int mdc_getstatus(struct lustre_handle *conn, struct ll_fid *rootfid) return rc; } +/* should become mdc_getinfo() */ +int mdc_getstatus(struct lustre_handle *conn, struct ll_fid *rootfid) +{ + return send_getstatus(class_conn2cliimp(conn), rootfid, LUSTRE_CONN_CON, + 0); +} + int mdc_getlovinfo(struct obd_device *obd, struct lustre_handle *mdc_connh, struct ptlrpc_request **request) { @@ -104,9 +112,8 @@ int mdc_getlovinfo(struct obd_device *obd, struct lustre_handle *mdc_connh, RETURN(rc); } - int mdc_getattr(struct lustre_handle *conn, - obd_id ino, int type, unsigned long valid, size_t ea_size, + obd_id ino, int type, unsigned long valid, unsigned int ea_size, struct ptlrpc_request **request) { struct ptlrpc_request *req; @@ -130,7 +137,7 @@ int mdc_getattr(struct lustre_handle *conn, size[bufcount] = ea_size; bufcount++; 
body->size = ea_size; - CDEBUG(D_INODE, "reserving %d bytes for MD/symlink in packet\n", + CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n", ea_size); } req->rq_replen = lustre_msg_size(bufcount, size); @@ -150,6 +157,50 @@ int mdc_getattr(struct lustre_handle *conn, return rc; } +int mdc_getattr_name(struct lustre_handle *conn, struct inode *parent, + char *filename, int namelen, unsigned long valid, + unsigned int ea_size, struct ptlrpc_request **request) +{ + struct ptlrpc_request *req; + struct mds_body *body; + int rc, size[2] = {sizeof(*body), namelen}, bufcount = 1; + ENTRY; + + req = ptlrpc_prep_req(class_conn2cliimp(conn), MDS_GETATTR_NAME, 2, + size, NULL); + if (!req) + GOTO(out, rc = -ENOMEM); + + body = lustre_msg_buf(req->rq_reqmsg, 0); + ll_inode2fid(&body->fid1, parent); + body->valid = valid; + memcpy(lustre_msg_buf(req->rq_reqmsg, 1), filename, namelen); + + if (ea_size) { + size[1] = ea_size; + bufcount++; + body->size = ea_size; + CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n", + ea_size); + valid |= OBD_MD_FLEASIZE; + } + + req->rq_replen = lustre_msg_size(bufcount, size); + mds_pack_req_body(req); + + rc = ptlrpc_queue_wait(req); + + if (!rc) { + body = lustre_msg_buf(req->rq_repmsg, 0); + mds_unpack_body(body); + } + + EXIT; + out: + *request = req; + return rc; +} + void d_delete_aliases(struct inode *inode) { struct dentry *dentry = NULL; @@ -187,15 +238,19 @@ static int mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, break; case LDLM_CB_CANCELING: { /* Invalidate all dentries associated with this inode */ - struct inode *inode = data; - -#warning "FIXME: what tells us that 'inode' is valid at all?" - if (inode->i_state & I_FREEING) - break; + struct inode *inode; - LASSERT(inode != NULL); + LASSERT(data != NULL); LASSERT(data_len == sizeof(*inode)); + /* XXX what tells us that 'data' is a valid inode at all? + * we should probably validate the lock handle first? 
+ */ + inode = igrab(data); + + if (inode == NULL) /* inode->i_state & I_FREEING */ + break; + if (S_ISDIR(inode->i_mode)) { CDEBUG(D_INODE, "invalidating inode %lu\n", inode->i_ino); @@ -203,12 +258,10 @@ static int mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, ll_invalidate_inode_pages(inode); } - if (inode != inode->i_sb->s_root->d_inode) { - /* XXX should this igrab move up 12 lines? */ - LASSERT(igrab(inode) == inode); + if (inode != inode->i_sb->s_root->d_inode) d_delete_aliases(inode); - iput(inode); - } + + iput(inode); break; } default: @@ -225,11 +278,16 @@ void mdc_store_inode_generation(struct ptlrpc_request *req, int reqoff, struct mds_rec_create *rec = lustre_msg_buf(req->rq_reqmsg, reqoff); struct mds_body *body = lustre_msg_buf(req->rq_repmsg, repoff); - DEBUG_REQ(D_HA, req, "storing generation %x for ino "LPD64, - body->fid1.generation, body->fid1.id); memcpy(&rec->cr_replayfid, &body->fid1, sizeof rec->cr_replayfid); + DEBUG_REQ(D_HA, req, "storing generation %x for ino "LPD64, + rec->cr_replayfid.generation, rec->cr_replayfid.id); } +/* We always reserve enough space in the reply packet for a stripe MD, because + * we don't know in advance the file type. + * + * XXX we could get that from ext2_dir_entry_2 file_type + */ int mdc_enqueue(struct lustre_handle *conn, int lock_type, struct lookup_intent *it, int lock_mode, struct inode *dir, struct dentry *de, struct lustre_handle *lockh, @@ -408,7 +466,8 @@ int mdc_enqueue(struct lustre_handle *conn, int lock_type, &lockh2)) { /* We already have a lock; cancel the old one */ ldlm_lock_decref(lockh, lock_mode); - ldlm_cli_cancel(lockh); + /* FIXME: bug 563 */ + //ldlm_cli_cancel(lockh); memcpy(lockh, &lockh2, sizeof(lockh2)); } LDLM_LOCK_PUT(lock); @@ -459,6 +518,13 @@ static void mdc_replay_open(struct ptlrpc_request *req) memcpy(saved->fh, &body->handle, sizeof(body->handle)); } +/* If lmm is non-NULL and lmm_size is non-zero, the stripe MD is stored on + * the MDS. 
Otherwise, we have already read a copy from the MDS (probably + * during mdc_enqueue() and we do not need to send it to the MDS again. + * + * In the future (when we support the non-intent case) we need to be able + * to read the stripe MD from the MDS here (need to fix mds_open() too). + */ int mdc_open(struct lustre_handle *conn, obd_id ino, int type, int flags, struct lov_mds_md *lmm, int lmm_size, struct lustre_handle *fh, struct ptlrpc_request **request) @@ -469,9 +535,9 @@ int mdc_open(struct lustre_handle *conn, obd_id ino, int type, int flags, struct ptlrpc_request *req; ENTRY; - if (lmm && lmm_size) { + if (lmm_size) { bufcount = 3; - size[2] = size[1]; /* shuffle the spare data along */ + size[2] = size[1]; /* shuffle the replay data along */ size[1] = lmm_size; } @@ -487,12 +553,14 @@ int mdc_open(struct lustre_handle *conn, obd_id ino, int type, int flags, body->flags = HTON__u32(flags); memcpy(&body->handle, fh, sizeof(body->handle)); - if (lmm && lmm_size) { - CDEBUG(D_INODE, "sending %u bytes MD for ino "LPU64"\n", - lmm_size, ino); - lustre_msg_set_op_flags(req->rq_reqmsg, MDS_OPEN_HAS_EA); - memcpy(lustre_msg_buf(req->rq_reqmsg, 1), lmm, lmm_size); + if (lmm_size) { body->flags |= HTON__u32(OBD_MD_FLEASIZE); + if (lmm) { + CDEBUG(D_INODE, "sending %u bytes MD for ino "LPU64"\n", + lmm_size, ino); + lustre_msg_set_op_flags(req->rq_reqmsg,MDS_OPEN_HAS_EA); + memcpy(lustre_msg_buf(req->rq_reqmsg,1), lmm, lmm_size); + } } req->rq_replen = lustre_msg_size(1, size); @@ -502,12 +570,12 @@ int mdc_open(struct lustre_handle *conn, obd_id ino, int type, int flags, body = lustre_msg_buf(req->rq_repmsg, 0); mds_unpack_body(body); memcpy(fh, &body->handle, sizeof(*fh)); - } - /* If open is replayed, we need to fix up the fh. */ - req->rq_replay_cb = mdc_replay_open; - replay_data = lustre_msg_buf(req->rq_reqmsg, lmm ? 2 : 1); - replay_data->fh = fh; + /* If open is replayed, we need to fix up the fh. 
*/ + req->rq_replay_cb = mdc_replay_open; + replay_data = lustre_msg_buf(req->rq_reqmsg, lmm ? 2 : 1); + replay_data->fh = fh; + } EXIT; out: @@ -635,9 +703,19 @@ static int mdc_detach(struct obd_device *dev) return lprocfs_dereg_obd(dev); } +/* Send a mostly-dummy GETSTATUS request and indicate that we're done replay. */ +static int signal_completed_replay(struct obd_import *imp) +{ + struct ll_fid fid; + + return send_getstatus(imp, &fid, LUSTRE_CONN_RECOVD, MSG_LAST_REPLAY); +} + static int mdc_recover(struct obd_import *imp, int phase) { int rc; + unsigned long flags; + struct ptlrpc_request *req; ENTRY; switch(phase) { @@ -647,13 +725,30 @@ static int mdc_recover(struct obd_import *imp, int phase) RETURN(0); case PTLRPC_RECOVD_PHASE_RECOVER: reconnect: - rc = ptlrpc_reconnect_import(imp, MDS_CONNECT); + rc = ptlrpc_reconnect_import(imp, MDS_CONNECT, &req); + + /* We were still connected, just go about our business. */ if (rc == EALREADY) - RETURN(ptlrpc_replay(imp, 0)); - if (rc) + GOTO(skip_replay, rc); + + if (rc) { + ptlrpc_req_finished(req); RETURN(rc); + } + + /* We can't replay, which might be a problem. 
*/ + if (!(lustre_msg_get_flags(req->rq_repmsg) & + MSG_REPLAY_IN_PROGRESS)) { + if (phase != PTLRPC_RECOVD_PHASE_NOTCONN) { + CERROR("can't replay, invalidating\n"); + ldlm_namespace_cleanup(imp->imp_obd->obd_namespace, + 1); + ptlrpc_abort_inflight(imp); + } + goto skip_replay; + } - rc = ptlrpc_replay(imp, 0 /* no last flag*/); + rc = ptlrpc_replay(imp); if (rc) RETURN(rc); @@ -661,9 +756,16 @@ static int mdc_recover(struct obd_import *imp, int phase) if (rc) RETURN(rc); - spin_lock(&imp->imp_lock); + rc = signal_completed_replay(imp); + if (rc) + RETURN(rc); + + skip_replay: + ptlrpc_req_finished(req); + spin_lock_irqsave(&imp->imp_lock, flags); imp->imp_level = LUSTRE_CONN_FULL; - spin_unlock(&imp->imp_lock); + imp->imp_flags &= ~IMP_INVALID; + spin_unlock_irqrestore(&imp->imp_lock, flags); ptlrpc_wake_delayed(imp); @@ -693,13 +795,14 @@ static int mdc_connect(struct lustre_handle *conn, struct obd_device *obd, } struct obd_ops mdc_obd_ops = { - o_attach: mdc_attach, - o_detach: mdc_detach, - o_setup: client_obd_setup, - o_cleanup: client_obd_cleanup, - o_connect: mdc_connect, - o_disconnect: client_obd_disconnect, - o_statfs: mdc_statfs, + o_owner: THIS_MODULE, + o_attach: mdc_attach, + o_detach: mdc_detach, + o_setup: client_obd_setup, + o_cleanup: client_obd_cleanup, + o_connect: mdc_connect, + o_disconnect: client_obd_disconnect, + o_statfs: mdc_statfs }; static int __init ptlrpc_request_init(void) @@ -723,6 +826,7 @@ EXPORT_SYMBOL(mdc_getlovinfo); EXPORT_SYMBOL(mdc_enqueue); EXPORT_SYMBOL(mdc_cancel_unused); EXPORT_SYMBOL(mdc_getattr); +EXPORT_SYMBOL(mdc_getattr_name); EXPORT_SYMBOL(mdc_create); EXPORT_SYMBOL(mdc_unlink); EXPORT_SYMBOL(mdc_rename); diff --git a/lustre/mds/Makefile.am b/lustre/mds/Makefile.am index 6a0855e..12f06fc 100644 --- a/lustre/mds/Makefile.am +++ b/lustre/mds/Makefile.am @@ -10,10 +10,8 @@ MODULE = mds modulefs_DATA = mds.o EXTRA_PROGRAMS = mds -LINX= mds_updates.c simple.c ll_pack.c target.c +LINX= mds_updates.c simple.c target.c 
-ll_pack.c: - test -e ll_pack.c || ln -sf $(top_srcdir)/lib/ll_pack.c mds_updates.c: test -e mds_updates.c || ln -sf $(top_srcdir)/lib/mds_updates.c simple.c: diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index ea30d51..bfdad03 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -297,7 +297,9 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd, if (!conn || !obd || !cluuid) RETURN(-EINVAL); - MOD_INC_USE_COUNT; + /* lctl gets a backstage, all-access pass. */ + if (!strcmp(cluuid, "OBD_CLASS_UUID")) + goto dont_check_exports; spin_lock(&obd->obd_dev_lock); list_for_each(p, &obd->obd_exports) { @@ -308,41 +310,10 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd, continue; } if (!memcmp(cluuid, mcd->mcd_uuid, sizeof mcd->mcd_uuid)) { - /* XXX make handle-found-export a subroutine */ - LASSERT(exp->exp_obd == obd); - spin_unlock(&obd->obd_dev_lock); - if (exp->exp_connection) { - struct lustre_handle *hdl; - hdl = &exp->exp_ldlm_data.led_import.imp_handle; - /* Might be a re-connect after a partition. */ - if (!memcmp(conn, hdl, sizeof *conn)) { - CERROR("%s reconnecting\n", cluuid); - conn->addr = (__u64) (unsigned long)exp; - conn->cookie = exp->exp_cookie; - rc = EALREADY; - } else { - CERROR("%s reconnecting from %s, " - "handle mismatch (ours %Lx/%Lx, " - "theirs %Lx/%Lx)\n", cluuid, - exp->exp_connection-> - c_remote_uuid, hdl->addr, - hdl->cookie, conn->addr, - conn->cookie); - /* XXX disconnect them here? 
*/ - memset(conn, 0, sizeof *conn); - rc = -EALREADY; - } - MOD_DEC_USE_COUNT; - RETURN(rc); - } - conn->addr = (__u64) (unsigned long)exp; - conn->cookie = exp->exp_cookie; - CDEBUG(D_INFO, "existing export for UUID '%s' at %p\n", - cluuid, exp); - CDEBUG(D_IOCTL,"connect: addr %Lx cookie %Lx\n", - (long long)conn->addr, (long long)conn->cookie); - RETURN(0); + LASSERT(exp->exp_obd == obd); + + RETURN(target_handle_reconnect(conn, exp, cluuid)); } } spin_unlock(&obd->obd_dev_lock); @@ -350,10 +321,10 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd, if (obd->u.mds.mds_recoverable_clients != 0) { CERROR("denying connection for new client %s: in recovery\n", cluuid); - MOD_DEC_USE_COUNT; RETURN(-EBUSY); } + dont_check_exports: /* XXX There is a small race between checking the list and adding a * new connection for the same UUID, but the real threat (list * corruption when multiple different clients connect) is solved. @@ -366,7 +337,7 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd, */ rc = class_connect(conn, obd, cluuid); if (rc) - GOTO(out_dec, rc); + RETURN(rc); exp = class_conn2export(conn); LASSERT(exp); med = &exp->exp_mds_data; @@ -393,8 +364,6 @@ out_mcd: OBD_FREE(mcd, sizeof(*mcd)); out_export: class_disconnect(conn); -out_dec: - MOD_DEC_USE_COUNT; return rc; } @@ -427,11 +396,12 @@ static int mds_disconnect(struct lustre_handle *conn) list_for_each_safe(tmp, n, &med->med_open_head) { struct mds_file_data *mfd = list_entry(tmp, struct mds_file_data, mfd_list); + CERROR("force closing client file handle for %*s\n", + mfd->mfd_file->f_dentry->d_name.len, + mfd->mfd_file->f_dentry->d_name.name); rc = mds_close_mfd(mfd, med); - if (rc) { - /* XXX better diagnostics, with file path and stuff */ - CDEBUG(D_INODE, "Error %d closing mfd %p\n", rc, mfd); - } + if (rc) + CDEBUG(D_INODE, "Error closing file: %d\n", rc); } spin_unlock(&med->med_open_lock); @@ -439,8 +409,6 @@ static int mds_disconnect(struct 
lustre_handle *conn) mds_client_free(export); rc = class_disconnect(conn); - if (!rc) - MOD_DEC_USE_COUNT; RETURN(rc); } @@ -473,7 +441,7 @@ static int mds_getstatus(struct ptlrpc_request *req) if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_GETSTATUS_PACK)) { CERROR("mds: out of memory for message: size=%d\n", size); req->rq_status = -ENOMEM; - RETURN(0); + RETURN(-ENOMEM); } /* Flush any outstanding transactions to disk so the client will @@ -510,16 +478,17 @@ static int mds_getlovinfo(struct ptlrpc_request *req) if (rc) { CERROR("mds: out of memory for message: size=%d\n", size[1]); req->rq_status = -ENOMEM; - RETURN(0); + RETURN(-ENOMEM); } - desc = lustre_msg_buf(req->rq_repmsg, 0); - rc = mds_get_lovdesc(mds, desc); - if (rc) { - req->rq_status = rc; + if (!mds->mds_has_lov_desc) { + req->rq_status = -ENOENT; RETURN(0); } + desc = lustre_msg_buf(req->rq_repmsg, 0); + memcpy(desc, &mds->mds_lov_desc, sizeof *desc); + lov_packdesc(desc); tgt_count = le32_to_cpu(desc->ld_tgt_count); if (tgt_count * sizeof(obd_uuid_t) > streq->repbuf) { CERROR("too many targets, enlarge client buffers\n"); @@ -527,8 +496,6 @@ static int mds_getlovinfo(struct ptlrpc_request *req) RETURN(0); } - /* XXX the MDS should not really know about this */ - mds->mds_max_mdsize = lov_mds_md_size(tgt_count); rc = mds_get_lovtgts(mds, tgt_count, lustre_msg_buf(req->rq_repmsg, 1)); if (rc) { @@ -627,7 +594,7 @@ static int mds_getattr_internal(struct mds_obd *mds, struct dentry *dentry, mds_pack_inode2fid(&body->fid1, inode); mds_pack_inode2body(body, inode); - if (S_ISREG(inode->i_mode) /* && reqbody->valid & OBD_MD_FLEASIZE */) { + if (S_ISREG(inode->i_mode) && reqbody->valid & OBD_MD_FLEASIZE) { rc = mds_pack_md(mds, req, reply_off + 1, body, inode); } else if (S_ISLNK(inode->i_mode) && reqbody->valid & OBD_MD_LINKNAME) { char *symname = lustre_msg_buf(req->rq_repmsg, reply_off + 1); @@ -645,6 +612,58 @@ static int mds_getattr_internal(struct mds_obd *mds, struct dentry *dentry, RETURN(rc); } +static 
int mds_getattr_pack_msg(struct ptlrpc_request *req, struct inode *inode, + int offset) +{ + struct mds_obd *mds = mds_req2mds(req); + struct mds_body *body; + int rc = 0, size[2] = {sizeof(*body)}, bufcount = 1; + ENTRY; + + body = lustre_msg_buf(req->rq_reqmsg, offset); + + if (S_ISREG(inode->i_mode) && body->valid & OBD_MD_FLEASIZE) { + int rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0); + CDEBUG(D_INODE, "got %d bytes MD data for inode %lu\n", + rc, inode->i_ino); + if (rc < 0) { + if (rc != -ENODATA) + CERROR("error getting inode %lu MD: rc = %d\n", + inode->i_ino, rc); + size[bufcount] = 0; + } else if (rc > mds->mds_max_mdsize) { + size[bufcount] = 0; + CERROR("MD size %d larger than maximum possible %u\n", + rc, mds->mds_max_mdsize); + } else + size[bufcount] = rc; + bufcount++; + } else if (body->valid & OBD_MD_LINKNAME) { + size[bufcount] = MIN(inode->i_size + 1, body->size); + bufcount++; + CDEBUG(D_INODE, "symlink size: %Lu, reply space: "LPU64"\n", + inode->i_size + 1, body->size); + } + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_PACK)) { + CERROR("failed MDS_GETATTR_PACK test\n"); + req->rq_status = -ENOMEM; + GOTO(out, rc = -ENOMEM); + } + + rc = lustre_pack_msg(bufcount, size, NULL, &req->rq_replen, + &req->rq_repmsg); + if (rc) { + CERROR("out of memoryK\n"); + req->rq_status = rc; + GOTO(out, rc); + } + + EXIT; + out: + return(rc); +} + static int mds_getattr_name(int offset, struct ptlrpc_request *req) { struct mds_obd *mds = mds_req2mds(req); @@ -680,7 +699,7 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req) push_ctxt(&saved, &mds->mds_ctxt, &uc); de = mds_fid2dentry(mds, &body->fid1, NULL); if (IS_ERR(de)) { - GOTO(out_pre_de, rc = -ENOENT); + GOTO(out_pre_de, rc = PTR_ERR(de)); } dir = de->d_inode; @@ -703,29 +722,33 @@ static int mds_getattr_name(int offset, struct ptlrpc_request *req) GOTO(out_create_de, rc = -EIO); } } - ldlm_lock_dump_handle(&lockh); + ldlm_lock_dump_handle(D_OTHER, &lockh); 
down(&dir->i_sem); dchild = lookup_one_len(name, de, namelen - 1); + up(&dir->i_sem); if (IS_ERR(dchild)) { CDEBUG(D_INODE, "child lookup error %ld\n", PTR_ERR(dchild)); - up(&dir->i_sem); GOTO(out_create_dchild, rc = PTR_ERR(dchild)); + } else if (dchild->d_inode == NULL) { + GOTO(out_create_dchild, rc = -ENOENT); } + if (req->rq_repmsg == NULL) + mds_getattr_pack_msg(req, dchild->d_inode, offset); + rc = mds_getattr_internal(mds, dchild, req, body, offset); EXIT; out_create_dchild: l_dput(dchild); - up(&dir->i_sem); ldlm_lock_decref(&lockh, lock_mode); out_create_de: l_dput(de); out_pre_de: req->rq_status = rc; pop_ctxt(&saved, &mds->mds_ctxt, &uc); - return 0; + return rc; } static int mds_getattr(int offset, struct ptlrpc_request *req) @@ -733,10 +756,9 @@ static int mds_getattr(int offset, struct ptlrpc_request *req) struct mds_obd *mds = mds_req2mds(req); struct obd_run_ctxt saved; struct dentry *de; - struct inode *inode; struct mds_body *body; struct obd_ucred uc; - int rc = 0, size[2] = {sizeof(*body)}, bufcount = 1; + int rc = 0; ENTRY; body = lustre_msg_buf(req->rq_reqmsg, offset); @@ -750,49 +772,12 @@ static int mds_getattr(int offset, struct ptlrpc_request *req) GOTO(out_pop, PTR_ERR(de)); } - inode = de->d_inode; - if (S_ISREG(body->fid1.f_type)) { - int rc = fsfilt_get_md(req->rq_export->exp_obd, inode, NULL, 0); - CDEBUG(D_INODE, "got %d bytes MD data for inode %lu\n", - rc, inode->i_ino); - if (rc < 0) { - if (rc != -ENODATA) - CERROR("error getting inode %lu MD: rc = %d\n", - inode->i_ino, rc); - size[bufcount] = 0; - } else if (rc > mds->mds_max_mdsize) { - size[bufcount] = 0; - CERROR("MD size %d larger than maximum possible %u\n", - rc, mds->mds_max_mdsize); - } else - size[bufcount] = rc; - bufcount++; - } else if (body->valid & OBD_MD_LINKNAME) { - size[bufcount] = MIN(inode->i_size + 1, body->size); - bufcount++; - CDEBUG(D_INODE, "symlink size: %d, reply space: %d\n", - inode->i_size + 1, body->size); - } - - if 
(OBD_FAIL_CHECK(OBD_FAIL_MDS_GETATTR_PACK)) { - CERROR("failed MDS_GETATTR_PACK test\n"); - req->rq_status = -ENOMEM; - GOTO(out, rc = -ENOMEM); - } - - rc = lustre_pack_msg(bufcount, size, NULL, &req->rq_replen, - &req->rq_repmsg); - if (rc) { - CERROR("out of memoryK\n"); - req->rq_status = rc; - GOTO(out, rc); - } + rc = mds_getattr_pack_msg(req, de->d_inode, offset); req->rq_status = mds_getattr_internal(mds, de, req, body, 0); - EXIT; -out: l_dput(de); + EXIT; out_pop: pop_ctxt(&saved, &mds->mds_ctxt, &uc); return rc; @@ -871,7 +856,7 @@ static int mds_store_md(struct mds_obd *mds, struct ptlrpc_request *req, uc.ouc_cap = body->capability; push_ctxt(&saved, &mds->mds_ctxt, &uc); mds_start_transno(mds); - handle = fsfilt_start(obd, inode,FSFILT_OP_SETATTR); + handle = fsfilt_start(obd, inode, FSFILT_OP_SETATTR); if (IS_ERR(handle)) { rc = PTR_ERR(handle); mds_finish_transno(mds, handle, req, rc); @@ -1002,9 +987,10 @@ static int mds_close(struct ptlrpc_request *req) mfd = mds_handle2mfd(&body->handle); if (!mfd) { - CERROR("no handle for file close "LPD64 - ": addr "LPX64", cookie "LPX64"\n", - body->fid1.id, body->handle.addr, body->handle.cookie); + DEBUG_REQ(D_ERROR, req, "no handle for file close "LPD64 + ": addr "LPX64", cookie "LPX64"\n", + body->fid1.id, body->handle.addr, + body->handle.cookie); RETURN(-ESTALE); } @@ -1068,7 +1054,6 @@ static int mds_readpage(struct ptlrpc_request *req) /* to make this asynchronous make sure that the handling function doesn't send a reply when this function completes. 
Instead a callback function would send the reply */ - /* note: in case of an error, dentry_open puts dentry */ rc = mds_sendpage(req, file, body->size); filp_close(file, 0); @@ -1103,43 +1088,61 @@ static int check_for_next_transno(struct mds_obd *mds) struct ptlrpc_request *req; req = list_entry(mds->mds_recovery_queue.next, struct ptlrpc_request, rq_list); + LASSERT(req->rq_reqmsg->transno >= mds->mds_next_recovery_transno); return req->rq_reqmsg->transno == mds->mds_next_recovery_transno; } static void process_recovery_queue(struct mds_obd *mds) { struct ptlrpc_request *req; + ENTRY; for (;;) { spin_lock(&mds->mds_processing_task_lock); + LASSERT(mds->mds_processing_task == current->pid); req = list_entry(mds->mds_recovery_queue.next, struct ptlrpc_request, rq_list); if (req->rq_reqmsg->transno != mds->mds_next_recovery_transno) { spin_unlock(&mds->mds_processing_task_lock); + CDEBUG(D_HA, "Waiting for transno "LPD64" (1st is " + LPD64")\n", + mds->mds_next_recovery_transno, + req->rq_reqmsg->transno); wait_event(mds->mds_next_transno_waitq, check_for_next_transno(mds)); continue; } - list_del(&req->rq_list); + list_del_init(&req->rq_list); spin_unlock(&mds->mds_processing_task_lock); - DEBUG_REQ(D_HA, req, ""); - mds_handle(req); - - if (list_empty(&mds->mds_recovery_queue)) + DEBUG_REQ(D_ERROR, req, "processing: "); + (void)mds_handle(req); + mds_fsync_super(mds->mds_sb); + OBD_FREE(req, sizeof *req); + spin_lock(&mds->mds_processing_task_lock); + mds->mds_next_recovery_transno++; + if (list_empty(&mds->mds_recovery_queue)) { + mds->mds_processing_task = 0; + spin_unlock(&mds->mds_processing_task_lock); break; + } + spin_unlock(&mds->mds_processing_task_lock); } + EXIT; } static int queue_recovery_request(struct ptlrpc_request *req, struct mds_obd *mds) { struct list_head *tmp; - int inserted = 0, transno = req->rq_reqmsg->transno; + int inserted = 0; + __u64 transno = req->rq_reqmsg->transno; + struct ptlrpc_request *saved_req; if (!transno) { - 
DEBUG_REQ(D_HA, req, "not queueing"); + INIT_LIST_HEAD(&req->rq_list); + DEBUG_REQ(D_ERROR, req, "not queueing"); return 1; } @@ -1147,14 +1150,23 @@ static int queue_recovery_request(struct ptlrpc_request *req, if (mds->mds_processing_task == current->pid) { /* Processing the queue right now, don't re-add. */ + LASSERT(list_empty(&req->rq_list)); spin_unlock(&mds->mds_processing_task_lock); return 1; } + OBD_ALLOC(saved_req, sizeof *saved_req); + if (!saved_req) + LBUG(); + memcpy(saved_req, req, sizeof *req); + req = saved_req; + INIT_LIST_HEAD(&req->rq_list); + /* XXX O(n^2) */ list_for_each(tmp, &mds->mds_recovery_queue) { struct ptlrpc_request *reqiter = list_entry(tmp, struct ptlrpc_request, rq_list); + if (reqiter->rq_reqmsg->transno > transno) { list_add_tail(&req->rq_list, &reqiter->rq_list); inserted = 1; @@ -1162,16 +1174,17 @@ static int queue_recovery_request(struct ptlrpc_request *req, } } - if (!inserted) + if (!inserted) { list_add_tail(&req->rq_list, &mds->mds_recovery_queue); + } if (mds->mds_processing_task != 0) { /* Someone else is processing this queue, we'll leave it to * them. */ - spin_unlock(&mds->mds_processing_task_lock); if (transno == mds->mds_next_recovery_transno) wake_up(&mds->mds_next_transno_waitq); + spin_unlock(&mds->mds_processing_task_lock); return 0; } @@ -1191,10 +1204,10 @@ static int filter_recovery_request(struct ptlrpc_request *req, switch (req->rq_reqmsg->opc) { case MDS_CONNECT: case MDS_DISCONNECT: - case MDS_OPEN: *process = 1; RETURN(0); + case MDS_OPEN: case MDS_GETSTATUS: /* used in unmounting */ case MDS_REINT: case LDLM_ENQUEUE: @@ -1204,6 +1217,7 @@ static int filter_recovery_request(struct ptlrpc_request *req, default: DEBUG_REQ(D_ERROR, req, "not permitted during recovery"); *process = 0; + /* XXX what should we set rq_status to here? 
*/ RETURN(ptlrpc_error(req->rq_svc, req)); } } @@ -1211,7 +1225,9 @@ static int filter_recovery_request(struct ptlrpc_request *req, static int mds_queue_final_reply(struct ptlrpc_request *req, int rc) { struct mds_obd *mds = mds_req2mds(req); + struct ptlrpc_request *saved_req; + spin_lock(&mds->mds_processing_task_lock); if (rc) { /* Just like ptlrpc_error, but without the sending. */ lustre_pack_msg(0, NULL, NULL, &req->rq_replen, @@ -1219,22 +1235,29 @@ static int mds_queue_final_reply(struct ptlrpc_request *req, int rc) req->rq_type = PTL_RPC_MSG_ERR; } + LASSERT(list_empty(&req->rq_list)); + OBD_ALLOC(saved_req, sizeof *saved_req); + memcpy(saved_req, req, sizeof *saved_req); + req = saved_req; list_add(&req->rq_list, &mds->mds_delayed_reply_queue); if (--mds->mds_recoverable_clients == 0) { struct list_head *tmp, *n; - - CDEBUG(D_HA, + ldlm_reprocess_all_ns(req->rq_export->exp_obd->obd_namespace); + CDEBUG(D_ERROR, "all clients recovered, sending delayed replies\n"); list_for_each_safe(tmp, n, &mds->mds_delayed_reply_queue) { req = list_entry(tmp, struct ptlrpc_request, rq_list); - DEBUG_REQ(D_HA, req, "delayed:"); + DEBUG_REQ(D_ERROR, req, "delayed:"); ptlrpc_reply(req->rq_svc, req); + list_del(&req->rq_list); + OBD_FREE(req, sizeof *req); } } else { - CDEBUG(D_HA, "%d recoverable clients remain\n", + CERROR("%d recoverable clients remain\n", mds->mds_recoverable_clients); } + spin_unlock(&mds->mds_processing_task_lock); return 1; } @@ -1255,7 +1278,7 @@ int mds_handle(struct ptlrpc_request *req) rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen); if (rc || OBD_FAIL_CHECK(OBD_FAIL_MDS_HANDLE_UNPACK)) { - CERROR("lustre_mds: Invalid request\n"); + DEBUG_REQ(D_ERROR, req, "invalid request (%d)", rc); GOTO(out, rc); } @@ -1286,6 +1309,12 @@ int mds_handle(struct ptlrpc_request *req) mds = mds_req2mds(req); mds_fsync_super(mds->mds_sb); } + + /* Let the client know if it can replay. 
*/ + if (mds->mds_recoverable_clients) { + lustre_msg_add_flags(req->rq_repmsg, + MSG_REPLAY_IN_PROGRESS); + } break; case MDS_DISCONNECT: @@ -1295,7 +1324,8 @@ int mds_handle(struct ptlrpc_request *req) /* Make sure that last_rcvd is correct. */ if (!rc) mds_fsync_super(mds->mds_sb); - goto out; + req->rq_status = rc; + break; case MDS_GETSTATUS: DEBUG_REQ(D_INODE, req, "getstatus"); @@ -1314,6 +1344,12 @@ int mds_handle(struct ptlrpc_request *req) rc = mds_getattr(0, req); break; + case MDS_GETATTR_NAME: + DEBUG_REQ(D_INODE, req, "getattr_name"); + OBD_FAIL_RETURN(OBD_FAIL_MDS_GETATTR_NAME_NET, 0); + rc = mds_getattr_name(0, req); + break; + case MDS_STATFS: DEBUG_REQ(D_INODE, req, "statfs"); OBD_FAIL_RETURN(OBD_FAIL_MDS_STATFS_NET, 0); @@ -1387,14 +1423,15 @@ int mds_handle(struct ptlrpc_request *req) EXIT; - if (!rc) { + /* If we're DISCONNECTing, the mds_export_data is already freed */ + if (!rc && req->rq_reqmsg->opc != MDS_DISCONNECT) { struct mds_export_data *med = &req->rq_export->exp_mds_data; req->rq_repmsg->last_xid = HTON__u64(le64_to_cpu(med->med_mcd->mcd_last_xid)); req->rq_repmsg->last_committed = HTON__u64(mds->mds_last_committed); - CDEBUG(D_INFO, "last_rcvd ~%Lu, last_committed %Lu, xid %d\n", + CDEBUG(D_INFO, "last_transno %Lu, last_committed %Lu, xid %d\n", (unsigned long long)mds->mds_last_rcvd, (unsigned long long)mds->mds_last_committed, cpu_to_le32(req->rq_xid)); @@ -1408,14 +1445,14 @@ int mds_handle(struct ptlrpc_request *req) return mds_queue_final_reply(req, rc); } + /* XXX bug 578 */ /* MDS_CONNECT / EALREADY (note: not -EALREADY!) 
isn't an error */ if (rc && (req->rq_reqmsg->opc != MDS_CONNECT || rc != EALREADY)) { - CERROR("mds: processing error (opcode %d): %d\n", - req->rq_reqmsg->opc, rc); + DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc); ptlrpc_error(req->rq_svc, req); } else { - CDEBUG(D_NET, "sending reply\n"); + DEBUG_REQ(D_NET, req, "sending reply"); ptlrpc_reply(req->rq_svc, req); } return 0; @@ -1426,17 +1463,17 @@ int mds_handle(struct ptlrpc_request *req) * then the server last_rcvd value may be less than that of the clients. * This will alert us that we may need to do client recovery. * - * Assumes we are already in the server filesystem context. - * * Also assumes for mds_last_rcvd that we are not modifying it (no locking). */ int mds_update_server_data(struct mds_obd *mds) { struct mds_server_data *msd = mds->mds_server_data; struct file *filp = mds->mds_rcvd_filp; + struct obd_run_ctxt saved; loff_t off = 0; int rc; + push_ctxt(&saved, &mds->mds_ctxt, NULL); msd->msd_last_rcvd = cpu_to_le64(mds->mds_last_rcvd); msd->msd_mount_count = cpu_to_le64(mds->mds_mount_count); @@ -1447,8 +1484,8 @@ int mds_update_server_data(struct mds_obd *mds) if (rc != sizeof(*msd)) { CERROR("error writing MDS server data: rc = %d\n", rc); if (rc > 0) - RETURN(-EIO); - RETURN(rc); + rc = -EIO; + GOTO(out, rc); } #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) rc = fsync_dev(filp->f_dentry->d_inode->i_rdev); @@ -1458,25 +1495,9 @@ int mds_update_server_data(struct mds_obd *mds) if (rc) CERROR("error flushing MDS server data: rc = %d\n", rc); - return 0; -} - -/* Do recovery actions for the MDS */ -static int mds_recovery_complete(struct obd_device *obddev) -{ - struct mds_obd *mds = &obddev->u.mds; - struct obd_run_ctxt saved; - int rc; - - LASSERT(mds->mds_recoverable_clients == 0); - - /* This happens at the end when recovery is complete */ - ++mds->mds_mount_count; - push_ctxt(&saved, &mds->mds_ctxt, NULL); - rc = mds_update_server_data(mds); +out: pop_ctxt(&saved, &mds->mds_ctxt, NULL); 
- - return rc; + RETURN(rc); } /* mount the file system (secretly) */ @@ -1488,16 +1509,15 @@ static int mds_setup(struct obd_device *obddev, obd_count len, void *buf) int rc = 0; ENTRY; - MOD_INC_USE_COUNT; #ifdef CONFIG_DEV_RDONLY dev_clear_rdonly(2); #endif if (!data->ioc_inlbuf1 || !data->ioc_inlbuf2) - GOTO(err_dec, rc = -EINVAL); + RETURN(rc = -EINVAL); obddev->obd_fsops = fsfilt_get_ops(data->ioc_inlbuf2); if (IS_ERR(obddev->obd_fsops)) - GOTO(err_dec, rc = PTR_ERR(obddev->obd_fsops)); + RETURN(rc = PTR_ERR(obddev->obd_fsops)); mnt = do_kern_mount(data->ioc_inlbuf2, 0, data->ioc_inlbuf1, NULL); if (IS_ERR(mnt)) { @@ -1531,8 +1551,10 @@ static int mds_setup(struct obd_device *obddev, obd_count len, void *buf) spin_lock_init(&mds->mds_processing_task_lock); mds->mds_processing_task = 0; + mds->mds_has_lov_desc = 0; INIT_LIST_HEAD(&mds->mds_recovery_queue); INIT_LIST_HEAD(&mds->mds_delayed_reply_queue); + init_waitqueue_head(&mds->mds_next_transno_waitq); RETURN(0); @@ -1545,8 +1567,6 @@ err_put: lock_kernel(); err_ops: fsfilt_put_ops(obddev->obd_fsops); -err_dec: - MOD_DEC_USE_COUNT; RETURN(rc); } @@ -1554,24 +1574,14 @@ static int mds_cleanup(struct obd_device *obddev) { struct super_block *sb; struct mds_obd *mds = &obddev->u.mds; - struct obd_run_ctxt saved; ENTRY; sb = mds->mds_sb; if (!mds->mds_sb) RETURN(0); - push_ctxt(&saved, &mds->mds_ctxt, NULL); mds_update_server_data(mds); - - if (mds->mds_rcvd_filp) { - int rc = filp_close(mds->mds_rcvd_filp, 0); - mds->mds_rcvd_filp = NULL; - - if (rc) - CERROR("last_rcvd file won't close, rc=%d\n", rc); - } - pop_ctxt(&saved, &mds->mds_ctxt, NULL); + mds_fs_cleanup(obddev); unlock_kernel(); mntput(mds->mds_vfsmnt); @@ -1583,15 +1593,14 @@ static int mds_cleanup(struct obd_device *obddev) #ifdef CONFIG_DEV_RDONLY dev_clear_rdonly(2); #endif - mds_fs_cleanup(obddev); fsfilt_put_ops(obddev->obd_fsops); - MOD_DEC_USE_COUNT; RETURN(0); } -static int ldlm_intent_policy(struct ldlm_lock *lock, void *req_cookie, - 
ldlm_mode_t mode, int flags, void *data) +static int ldlm_intent_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, + void *req_cookie, ldlm_mode_t mode, int flags, + void *data) { struct ptlrpc_request *req = req_cookie; int rc = 0; @@ -1603,7 +1612,7 @@ static int ldlm_intent_policy(struct ldlm_lock *lock, void *req_cookie, if (req->rq_reqmsg->bufcount > 1) { /* an intent needs to be considered */ struct ldlm_intent *it = lustre_msg_buf(req->rq_reqmsg, 1); - struct mds_obd *mds= &req->rq_export->exp_obd->u.mds; + struct mds_obd *mds = &req->rq_export->exp_obd->u.mds; struct mds_body *mds_rep; struct ldlm_reply *rep; __u64 new_resid[3] = {0, 0, 0}, old_res; @@ -1692,8 +1701,9 @@ static int ldlm_intent_policy(struct ldlm_lock *lock, void *req_cookie, rep->lock_policy_res2 = req->rq_status; mds_rep = lustre_msg_buf(req->rq_repmsg, 1); - /* If the client is about to open a file that doesn't have an MD - * stripe record, it's going to need a write lock. */ + /* If the client is about to open a file that doesn't have an + * MD stripe record, it's going to need a write lock. 
+ */ if (it->opc & IT_OPEN && !(mds_rep->valid & OBD_MD_FLEASIZE)) { LDLM_DEBUG(lock, "open with no EA; returning PW lock"); lock->l_req_mode = LCK_PW; @@ -1711,7 +1721,7 @@ static int ldlm_intent_policy(struct ldlm_lock *lock, void *req_cookie, LBUG(); old_res = lock->l_resource->lr_name[0]; - ldlm_lock_change_resource(lock, new_resid); + ldlm_lock_change_resource(ns, lock, new_resid); if (lock->l_resource == NULL) { LBUG(); RETURN(-ENOMEM); @@ -1749,15 +1759,13 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf) int rc = 0; ENTRY; - MOD_INC_USE_COUNT; - mds->mds_service = ptlrpc_init_svc(MDS_NEVENTS, MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE, MDS_REQUEST_PORTAL, MDC_REPLY_PORTAL, "self", mds_handle, "mds"); if (!mds->mds_service) { CERROR("failed to start service\n"); - GOTO(err_dec, rc = -ENOMEM); + RETURN(rc = -ENOMEM); } for (i = 0; i < MDT_NUM_THREADS; i++) { @@ -1775,8 +1783,6 @@ static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf) err_thread: ptlrpc_stop_all_threads(mds->mds_service); ptlrpc_unregister_service(mds->mds_service); -err_dec: - MOD_DEC_USE_COUNT; RETURN(rc); } @@ -1789,7 +1795,6 @@ static int mdt_cleanup(struct obd_device *obddev) ptlrpc_stop_all_threads(mds->mds_service); ptlrpc_unregister_service(mds->mds_service); - MOD_DEC_USE_COUNT; RETURN(0); } @@ -1798,6 +1803,7 @@ extern int mds_iocontrol(unsigned int cmd, struct lustre_handle *conn, /* use obd ops to offer management infrastructure */ static struct obd_ops mds_obd_ops = { + o_owner: THIS_MODULE, o_attach: mds_attach, o_detach: mds_detach, o_connect: mds_connect, @@ -1808,6 +1814,7 @@ static struct obd_ops mds_obd_ops = { }; static struct obd_ops mdt_obd_ops = { + o_owner: THIS_MODULE, o_setup: mdt_setup, o_cleanup: mdt_cleanup, }; diff --git a/lustre/mds/lproc_mds.c b/lustre/mds/lproc_mds.c index 7028603..37c7bc8 100644 --- a/lustre/mds/lproc_mds.c +++ b/lustre/mds/lproc_mds.c @@ -25,68 +25,63 @@ #include #include -int rd_uuid(char* page, char 
**start, off_t off, int count, int *eof, +int rd_uuid(char *page, char **start, off_t off, int count, int *eof, void *data) { struct obd_device* temp = (struct obd_device*)data; - int len = 0; - len += snprintf(page, count, "%s\n", temp->obd_uuid); - return len; + return snprintf(page, count, "%s\n", temp->obd_uuid); } -int rd_blksize(char* page, char **start, off_t off, int count, int *eof, + +int rd_blksize(char *page, char **start, off_t off, int count, int *eof, void *data) { struct obd_device* temp = (struct obd_device*)data; struct mds_obd *mds = &temp->u.mds; struct statfs mystats; - int rc, len = 0; - + int rc; + rc = vfs_statfs(mds->mds_sb, &mystats); if (rc) { CERROR("mds: statfs failed: rc %d\n", rc); return 0; } - len += snprintf(page, count, LPU64"\n", (__u64)(mystats.f_bsize)); - return len; - + return snprintf(page, count, LPU64"\n", (__u64)(mystats.f_bsize)); } -int rd_kbtotal(char* page, char **start, off_t off, int count, int *eof, + +int rd_kbtotal(char *page, char **start, off_t off, int count, int *eof, void *data) { struct obd_device* temp = (struct obd_device*)data; struct mds_obd *mds = &temp->u.mds; struct statfs mystats; - int rc, len = 0; + int rc; __u32 blk_size; __u64 result; - + rc = vfs_statfs(mds->mds_sb, &mystats); if (rc) { CERROR("mds: statfs failed: rc %d\n", rc); return 0; } - + blk_size = mystats.f_bsize; blk_size >>= 10; result = mystats.f_blocks; - while(blk_size >>= 1){ + while(blk_size >>= 1) result <<= 1; - } - len += snprintf(page, count, LPU64"\n", result); - return len; - + + return snprintf(page, count, LPU64"\n", result); } -int rd_kbfree(char* page, char **start, off_t off, int count, int *eof, +int rd_kbfree(char *page, char **start, off_t off, int count, int *eof, void *data) { struct obd_device* temp = (struct obd_device*)data; struct mds_obd *mds = &temp->u.mds; struct statfs mystats; - int rc, len = 0; + int rc; __u32 blk_size; __u64 result; - rc = vfs_statfs(mds->mds_sb, &mystats); if (rc) { @@ -96,12 +91,10 @@ 
int rd_kbfree(char* page, char **start, off_t off, int count, int *eof, blk_size = mystats.f_bsize; blk_size >>= 10; result = mystats.f_blocks; - while(blk_size >>= 1){ + while (blk_size >>= 1) result <<= 1; - } - len += snprintf(page, count, LPU64"\n", result); - return len; - + + return snprintf(page, count, LPU64"\n", result); } int rd_fstype(char *page, char **start, off_t off, int count, int *eof, @@ -112,45 +105,41 @@ int rd_fstype(char *page, char **start, off_t off, int count, int *eof, return snprintf(page, count, "%s\n", obd->obd_fsops->fs_type); } -int rd_filestotal(char* page, char **start, off_t off, int count, int *eof, +int rd_filestotal(char *page, char **start, off_t off, int count, int *eof, void *data) { struct obd_device* temp = (struct obd_device*)data; struct mds_obd *mds = &temp->u.mds; struct statfs mystats; - int rc, len = 0; - + int rc; + rc = vfs_statfs(mds->mds_sb, &mystats); if (rc) { CERROR("mds: statfs failed: rc %d\n", rc); return 0; } - - len += snprintf(page, count, LPU64"\n", (__u64)(mystats.f_files)); - return len; - - + return snprintf(page, count, LPU64"\n", (__u64)(mystats.f_files)); } -int rd_filesfree(char* page, char **start, off_t off, int count, int *eof, - void *data) +int rd_filesfree(char *page, char **start, off_t off, int count, int *eof, + void *data) { struct obd_device* temp = (struct obd_device*)data; struct mds_obd *mds = &temp->u.mds; struct statfs mystats; int rc, len = 0; - + rc = vfs_statfs(mds->mds_sb, &mystats); if (rc) { CERROR("mds: statfs failed: rc %d\n", rc); return 0; } - + len += snprintf(page, count, LPU64"\n", (__u64)(mystats.f_ffree)); - return len; + return len; } -int rd_filegroups(char* page, char **start, off_t off, int count, int *eof, +int rd_filegroups(char *page, char **start, off_t off, int count, int *eof, void *data) { return 0; @@ -166,13 +155,13 @@ struct lprocfs_vars status_var_nm_1[]={ {"status/filegroups", rd_filegroups, 0, 0}, {0} }; -int rd_numrefs(char* page, char **start, 
off_t off, int count, int *eof, + +int rd_numrefs(char *page, char **start, off_t off, int count, int *eof, void *data) { - struct obd_type* class = (struct obd_type*)data; - int len = 0; - len += snprintf(page, count, "%d\n", class->typ_refcnt); - return len; + struct obd_type *class = (struct obd_type*)data; + + return snprintf(page, count, "%d\n", class->typ_refcnt); } struct lprocfs_vars status_class_var[]={ diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index 83201aa..3f6c420 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -88,6 +88,9 @@ int mds_client_add(struct mds_obd *mds, struct mds_export_data *med, int cl_off) RETURN(written); RETURN(-EIO); } + CDEBUG(D_INFO, "wrote client mcd at off %u (len %u)\n", + MDS_LR_CLIENT + (cl_off * MDS_LR_SIZE), + (unsigned int)sizeof(*med->med_mcd)); } return 0; } @@ -104,28 +107,27 @@ int mds_client_free(struct obd_export *exp) if (!med->med_mcd) RETURN(0); - CDEBUG(D_INFO, "freeing client at offset %d with UUID '%s'\n", - med->med_off, med->med_mcd->mcd_uuid); + off = MDS_LR_CLIENT + (med->med_off * MDS_LR_SIZE); + + CDEBUG(D_INFO, "freeing client at offset %u (%lld)with UUID '%s'\n", + med->med_off, off, med->med_mcd->mcd_uuid); if (!test_and_clear_bit(med->med_off, last_rcvd_slots)) { - CERROR("MDS client %d: bit already clear in bitmap!!\n", + CERROR("MDS client %u: bit already clear in bitmap!!\n", med->med_off); LBUG(); } - off = med->med_off; - memset(&zero_mcd, 0, sizeof zero_mcd); push_ctxt(&saved, &mds->mds_ctxt, NULL); written = lustre_fwrite(mds->mds_rcvd_filp, (const char *)&zero_mcd, - sizeof zero_mcd, &off); + sizeof(zero_mcd), &off); pop_ctxt(&saved, &mds->mds_ctxt, NULL); - if (written != sizeof zero_mcd) { + if (written != sizeof(zero_mcd)) { CERROR("error zeroing out client %s off %d in %s: %d\n", med->med_mcd->mcd_uuid, med->med_off, LAST_RCVD, written); - LBUG(); } else { CDEBUG(D_INFO, "zeroed out disconnecting client %s at off %d\n", med->med_mcd->mcd_uuid, med->med_off); @@ 
-151,7 +153,7 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f) struct mds_client_data *mcd = NULL; loff_t off = 0; int cl_off; - int max_off = f->f_dentry->d_inode->i_size / sizeof(*mcd); + unsigned long last_rcvd_size = f->f_dentry->d_inode->i_size; __u64 last_rcvd = 0; __u64 last_mount; int rc = 0; @@ -169,12 +171,14 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f) if (rc != sizeof(*msd)) { CERROR("error reading MDS %s: rc = %d\n", LAST_RCVD, rc); - if (rc > 0) { + if (rc > 0) rc = -EIO; - } GOTO(err_msd, rc); } + CDEBUG(D_INODE, "last_rcvd has size %lu (msd + %lu clients)\n", + last_rcvd_size, (last_rcvd_size - sizeof *msd) / sizeof *mcd); + /* * When we do a clean MDS shutdown, we save the last_rcvd into * the header. If we find clients with higher last_rcvd values @@ -182,17 +186,14 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f) */ last_rcvd = le64_to_cpu(msd->msd_last_rcvd); mds->mds_last_rcvd = last_rcvd; - CDEBUG(D_INODE, "got %Lu for server last_rcvd value\n", - (unsigned long long)last_rcvd); + CDEBUG(D_INODE, "got "LPU64" for server last_rcvd value\n", last_rcvd); last_mount = le64_to_cpu(msd->msd_mount_count); mds->mds_mount_count = last_mount; - CDEBUG(D_INODE, "got %Lu for server last_mount value\n", - (unsigned long long)last_mount); + CDEBUG(D_INODE, "got "LPU64" for server last_mount value\n",last_mount); - for (off = MDS_LR_CLIENT, cl_off = 0; - off < max_off; - off += MDS_LR_SIZE, cl_off++) { + /* off is adjusted by lustre_fread, so we don't adjust it in the loop */ + for (off = MDS_LR_CLIENT, cl_off = 0; off < last_rcvd_size; cl_off++) { int mount_age; if (!mcd) { @@ -205,7 +206,7 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f) if (rc != sizeof(*mcd)) { CERROR("error reading MDS %s offset %d: rc = %d\n", LAST_RCVD, cl_off, rc); - if (rc > 0) + if (rc > 0) /* XXX fatal error or just abort reading? 
*/ rc = -EIO; break; } @@ -218,11 +219,11 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f) last_rcvd = le64_to_cpu(mcd->mcd_last_rcvd); - /* The exports are cleaned up by mds_disconnect, so they - * need to be set up like real exports also. + /* These exports are cleaned up by mds_disconnect(), so they + * need to be set up like real exports as mds_connect() does. */ mount_age = last_mount - le64_to_cpu(mcd->mcd_mount_count); - if (last_rcvd && mount_age < MDS_MOUNT_RECOV) { + if (mount_age < MDS_MOUNT_RECOV) { struct obd_export *exp = class_new_export(obddev); struct mds_export_data *med; @@ -234,13 +235,12 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f) med = &exp->exp_mds_data; med->med_mcd = mcd; mds_client_add(mds, med, cl_off); - /* XXX put this in a helper if it gets more complex */ + /* create helper if export init gets more complex */ INIT_LIST_HEAD(&med->med_open_head); spin_lock_init(&med->med_open_lock); mcd = NULL; mds->mds_recoverable_clients++; - MOD_INC_USE_COUNT; } else { CDEBUG(D_INFO, "discarded client %d, UUID '%s', count %Ld\n", @@ -248,18 +248,18 @@ static int mds_read_last_rcvd(struct obd_device *obddev, struct file *f) (long long)le64_to_cpu(mcd->mcd_mount_count)); } - if (last_rcvd > mds->mds_last_rcvd) { - CDEBUG(D_OTHER, - "client at offset %d has last_rcvd = %Lu\n", - cl_off, (unsigned long long)last_rcvd); + CDEBUG(D_OTHER, "client at offset %d has last_rcvd = %Lu\n", + cl_off, (unsigned long long)last_rcvd); + + if (last_rcvd > mds->mds_last_rcvd) mds->mds_last_rcvd = last_rcvd; - } } mds->mds_last_committed = mds->mds_last_rcvd; if (mds->mds_recoverable_clients) { - CERROR("need recovery: %d recoverable clients, last_rcvd %Lu\n", + CERROR("RECOVERY: %d recoverable clients, last_rcvd "LPU64"\n", mds->mds_recoverable_clients, mds->mds_last_rcvd); + mds->mds_next_recovery_transno = mds->mds_last_committed + 1; } if (mcd) @@ -312,7 +312,7 @@ static int mds_fs_prep(struct obd_device 
*obddev) if (!S_ISREG(f->f_dentry->d_inode->i_mode)) { CERROR("%s is not a regular file!: mode = %o\n", LAST_RCVD, f->f_dentry->d_inode->i_mode); - GOTO(err_pop, rc = -ENOENT); + GOTO(err_filp, rc = -ENOENT); } rc = fsfilt_journal_data(obddev, f); @@ -355,10 +355,24 @@ int mds_fs_setup(struct obd_device *obddev, struct vfsmount *mnt) RETURN(mds_fs_prep(obddev)); } -void mds_fs_cleanup(struct obd_device *obddev) +int mds_fs_cleanup(struct obd_device *obddev) { struct mds_obd *mds = &obddev->u.mds; + struct obd_run_ctxt saved; + int rc = 0; class_disconnect_all(obddev); /* this cleans up client info too */ mds_server_free_data(mds); + + push_ctxt(&saved, &mds->mds_ctxt, NULL); + if (mds->mds_rcvd_filp) { + rc = filp_close(mds->mds_rcvd_filp, 0); + mds->mds_rcvd_filp = NULL; + + if (rc) + CERROR("last_rcvd file won't close, rc=%d\n", rc); + } + pop_ctxt(&saved, &mds->mds_ctxt, NULL); + + return rc; } diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index ba9a750..b548792 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -46,6 +46,32 @@ int mds_set_lovdesc(struct obd_device *obd, struct lov_desc *desc, ENTRY; tgt_count = desc->ld_tgt_count; + if (desc->ld_default_stripe_count > desc->ld_tgt_count) { + CERROR("default stripe count %u > OST count %u\n", + desc->ld_default_stripe_count, desc->ld_tgt_count); + RETURN(-EINVAL); + } + if (desc->ld_default_stripe_size & (PAGE_SIZE - 1)) { + CERROR("default stripe size "LPU64" not a multiple of %lu\n", + desc->ld_default_stripe_size, PAGE_SIZE); + RETURN(-EINVAL); + } + if (desc->ld_default_stripe_offset > desc->ld_tgt_count) { + CERROR("default stripe offset "LPU64" > max OST index %u\n", + desc->ld_default_stripe_offset, desc->ld_tgt_count); + RETURN(-EINVAL); + } + if (desc->ld_pattern != 0) { + CERROR("stripe pattern %u unknown\n", + desc->ld_pattern); + RETURN(-EINVAL); + } + + memcpy(&mds->mds_lov_desc, desc, sizeof *desc); + mds->mds_has_lov_desc = 1; + /* XXX the MDS should not really know about 
this */ + mds->mds_max_mdsize = lov_mds_md_size(desc->ld_tgt_count); + lov_packdesc(desc); push_ctxt(&saved, &mds->mds_ctxt, NULL); @@ -55,6 +81,7 @@ int mds_set_lovdesc(struct obd_device *obd, struct lov_desc *desc, GOTO(out, rc = PTR_ERR(f)); } +#warning FIXME: if there is an existing LOVDESC, verify new tgt_count > old rc = lustre_fwrite(f, (char *)desc, sizeof(*desc), &f->f_pos); if (filp_close(f, 0)) CERROR("Error closing LOVDESC file\n"); @@ -69,6 +96,7 @@ int mds_set_lovdesc(struct obd_device *obd, struct lov_desc *desc, GOTO(out, rc = PTR_ERR(f)); } +#warning FIXME: if there is an existing LOVTGTS, verify existing UUIDs same rc = 0; for (i = 0; i < tgt_count ; i++) { rc = lustre_fwrite(f, uuidarray[i], diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index 9151326..3d340f7 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -86,7 +86,7 @@ int mds_finish_transno(struct mds_obd *mds, void *handle, written = lustre_fwrite(mds->mds_rcvd_filp, (char *)mcd, sizeof(*mcd), &off); CDEBUG(D_INODE, "wrote trans #"LPD64" for client %s at #%d: written = " - "%d\n", last_rcvd, mcd->mcd_uuid, med->med_off, written); + LPSZ"\n", last_rcvd, mcd->mcd_uuid, med->med_off, written); if (written == sizeof(*mcd)) GOTO(out, rc = 0); @@ -220,7 +220,7 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, CDEBUG(D_INODE, "parent ino %lu name %s mode %o\n", dir->i_ino, rec->ur_name, rec->ur_mode); - ldlm_lock_dump_handle(&lockh); + ldlm_lock_dump_handle(D_OTHER, &lockh); down(&dir->i_sem); dchild = lookup_one_len(rec->ur_name, de, rec->ur_namelen - 1); @@ -264,6 +264,11 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, rec->ur_mode |= S_ISGID; } + if (rec->ur_fid2->id) + dchild->d_fsdata = (void *)(unsigned long)rec->ur_fid2->id; + else + LASSERT(!(rec->ur_opcode & REINT_REPLAYING)); + /* From here on, we must exit via a path that calls mds_finish_transno, * so that we release the mds_transno_sem (and, in the case 
of success, * update the transno correctly). out_create_commit and @@ -314,6 +319,11 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, GOTO(out_transno_dchild, rc = -EINVAL); } + /* In case we stored the desired inum in here, we want to clean up. + * We also do this in the out_transno_dchild block, for the error cases. + */ + dchild->d_fsdata = NULL; + if (rc) { CDEBUG(D_INODE, "error during create: %d\n", rc); GOTO(out_create_commit, rc); @@ -331,13 +341,14 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, ATTR_MTIME | ATTR_CTIME; if (rec->ur_fid2->id) { - LASSERT(rec->ur_opcode & REINT_REPLAYING); + LASSERT(rec->ur_fid2->id == inode->i_ino); inode->i_generation = rec->ur_fid2->generation; /* Dirtied and committed by the upcoming setattr. */ - CDEBUG(D_INODE, "recreated ino %lu with gen %lu\n", + CDEBUG(D_INODE, "recreated ino %lu with gen %x\n", inode->i_ino, inode->i_generation); } else { - CDEBUG(D_INODE, "created ino %lu\n", inode->i_ino); + CDEBUG(D_INODE, "created ino %lu with gen %x\n", + inode->i_ino, inode->i_generation); } rc = fsfilt_setattr(obd, dchild, handle, &iattr); @@ -376,6 +387,7 @@ out_create: return 0; out_transno_dchild: + dchild->d_fsdata = NULL; /* Need to release the transno lock, and then put the dchild. */ LASSERT(rc); mds_finish_transno(mds, handle, req, rc); @@ -442,7 +454,8 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset, dir = de->d_inode; inode = dchild->d_inode; - CDEBUG(D_INODE, "parent ino %lu\n", dir->i_ino); + DEBUG_REQ(D_INODE, req, "parent ino %lu, child ino %lu\n", dir->i_ino, + inode ? 
inode->i_ino : 0); if (!inode) { if (rec->ur_opcode & REINT_REPLAYING) { @@ -572,7 +585,7 @@ static int mds_reint_link(struct mds_update_record *rec, int offset, GOTO(out_link_src_put, rc = -EIO); } } else { - ldlm_lock_dump_handle(&srclockh); + ldlm_lock_dump_handle(D_OTHER, &srclockh); } de_tgt_dir = mds_fid2dentry(mds, rec->ur_fid2, NULL); @@ -597,7 +610,7 @@ static int mds_reint_link(struct mds_update_record *rec, int offset, GOTO(out_link_tgt_dir_put, rc = -EIO); } } else { - ldlm_lock_dump_handle(&tgtlockh); + ldlm_lock_dump_handle(D_OTHER, &tgtlockh); } down(&de_tgt_dir->d_inode->i_sem); @@ -709,7 +722,7 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset, GOTO(out_rename_srcput, rc = -EIO); } } else { - ldlm_lock_dump_handle(&srclockh); + ldlm_lock_dump_handle(D_OTHER, &srclockh); } de_tgtdir = mds_fid2dentry(mds, rec->ur_fid2, NULL); @@ -734,7 +747,7 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset, GOTO(out_rename_tgtput, rc = -EIO); } } else { - ldlm_lock_dump_handle(&tgtlockh); + ldlm_lock_dump_handle(D_OTHER, &tgtlockh); } #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) diff --git a/lustre/obdclass/Makefile.am b/lustre/obdclass/Makefile.am index ed2b321..6f7f8fc 100644 --- a/lustre/obdclass/Makefile.am +++ b/lustre/obdclass/Makefile.am @@ -9,16 +9,18 @@ else FSMOD = fsfilt_extN endif -modulefs_DATA = lustre_build_version obdclass.o $(FSMOD).o -EXTRA_PROGRAMS = obdclass $(FSMOD) +modulefs_DATA = lustre_build_version obdclass.o $(FSMOD).o fsfilt_reiserfs.o +EXTRA_PROGRAMS = obdclass $(FSMOD) fsfilt_reiserfs obdclass_SOURCES = debug.c genops.c class_obd.c sysctl.c uuid.c lprocfs_status.c -obdclass_SOURCES += fsfilt.c +obdclass_SOURCES += fsfilt.c statfs_pack.c include $(top_srcdir)/Rules + +# XXX I'm sure there's some automake mv-if-different helper for this. 
lustre_build_version: perl $(top_srcdir)/scripts/version_tag.pl $(top_srcdir) > tmpver - diff -u $(top_builddir)/include/linux/lustre_build_version.h tmpver \ - 2> /dev/null &&\ - $(RM) tmpver || \ + cmp -z $(top_builddir)/include/linux/lustre_build_version.h tmpver \ + 2> /dev/null && \ + $(RM) tmpver || \ mv tmpver $(top_builddir)/include/linux/lustre_build_version.h diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 4769d61..61e9114 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -52,22 +52,19 @@ struct semaphore obd_conf_sem; /* serialize configuration commands */ struct obd_device obd_dev[MAX_OBD_DEVICES]; struct list_head obd_types; atomic_t obd_memory; +int obd_memmax; /* The following are visible and mutable through /proc/sys/lustre/. */ unsigned long obd_fail_loc; unsigned long obd_timeout = 100; char obd_recovery_upcall[128] = "/usr/lib/lustre/ha_assist"; -extern struct obd_type *class_nm_to_type(char *nm); - /* opening /dev/obd */ static int obd_class_open(struct inode * inode, struct file * file) { ENTRY; file->private_data = NULL; - CDEBUG(D_IOCTL, "MOD_INC_USE for open: count = %d\n", - atomic_read(&(THIS_MODULE)->uc.usecount)); MOD_INC_USE_COUNT; RETURN(0); } @@ -80,9 +77,6 @@ static int obd_class_release(struct inode * inode, struct file * file) // XXX drop lsm, connections here if (file->private_data) file->private_data = NULL; - - CDEBUG(D_IOCTL, "MOD_DEC_USE for close: count = %d\n", - atomic_read(&(THIS_MODULE)->uc.usecount) - 1); MOD_DEC_USE_COUNT; RETURN(0); } @@ -329,16 +323,16 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, if (obd->obd_flags & OBD_ATTACHED || obd->obd_type) { CERROR("OBD: Device %d already typed as %s.\n", obd->obd_minor, MKSTR(obd->obd_type->typ_name)); - GOTO(out, err=-EBUSY); + GOTO(out, err = -EBUSY); } if (!data->ioc_inllen1 || !data->ioc_inlbuf1) { CERROR("No type passed!\n"); - GOTO(out, err=-EINVAL); + GOTO(out, err = -EINVAL); } if 
(data->ioc_inlbuf1[data->ioc_inllen1-1] !=0) { CERROR("Type not nul terminated!\n"); - GOTO(out, err=-EINVAL); + GOTO(out, err = -EINVAL); } CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n", @@ -346,10 +340,10 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, MKSTR(data->ioc_inlbuf2), MKSTR(data->ioc_inlbuf3)); /* find the type */ - type = class_nm_to_type(data->ioc_inlbuf1); + type = class_get_type(data->ioc_inlbuf1); if (!type) { CERROR("OBD: unknown type dev %d\n", obd->obd_minor); - GOTO(out, err=-EINVAL); + GOTO(out, err = -EINVAL); } minor = obd->obd_minor; @@ -364,8 +358,8 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, int len = strlen(data->ioc_inlbuf2) + 1; OBD_ALLOC(obd->obd_name, len); if (!obd->obd_name) { - CERROR("no memory\n"); - LBUG(); + class_put_type(obd->obd_type); + GOTO(out, err = -ENOMEM); } memcpy(obd->obd_name, data->ioc_inlbuf2, len); } else { @@ -374,11 +368,12 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, if (data->ioc_inlbuf3) { int len = strlen(data->ioc_inlbuf3); if (len >= sizeof(obd->obd_uuid)) { - CERROR("uuid must be < %d bytes long\n", + CERROR("uuid must be < "LPSZ" bytes long\n", sizeof(obd->obd_uuid)); if (obd->obd_name) OBD_FREE(obd->obd_name, strlen(obd->obd_name) + 1); + class_put_type(obd->obd_type); GOTO(out, err=-EINVAL); } memcpy(obd->obd_uuid, data->ioc_inlbuf3, len); @@ -389,6 +384,7 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, if (err) { if(data->ioc_inlbuf2) OBD_FREE(obd->obd_name, strlen(obd->obd_name)+1); + class_put_type(obd->obd_type); obd->obd_type = NULL; } else { obd->obd_flags |= OBD_ATTACHED; @@ -396,10 +392,6 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, type->typ_refcnt++; CDEBUG(D_IOCTL, "OBD: dev %d attached type %s\n", obd->obd_minor, data->ioc_inlbuf1); - - CDEBUG(D_IOCTL, "MOD_INC_USE for attach: count = %d\n", - atomic_read(&(THIS_MODULE)->uc.usecount)); - 
MOD_INC_USE_COUNT; } GOTO(out, err); @@ -423,8 +415,8 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, } forcibly_detach_exports(obd); } - if (OBP(obd, detach)) - err=OBP(obd,detach)(obd); + if (OBP(obd, detach)) + err = OBP(obd,detach)(obd); if (obd->obd_name) { OBD_FREE(obd->obd_name, strlen(obd->obd_name)+1); @@ -433,10 +425,8 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, obd->obd_flags &= ~OBD_ATTACHED; obd->obd_type->typ_refcnt--; + class_put_type(obd->obd_type); obd->obd_type = NULL; - CDEBUG(D_IOCTL, "MOD_DEC_USE for detach: count = %d\n", - atomic_read(&(THIS_MODULE)->uc.usecount) - 1); - MOD_DEC_USE_COUNT; GOTO(out, err = 0); } @@ -505,13 +495,6 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp, GOTO(out, err); } - case OBD_IOC_DEC_USE_COUNT: { - CDEBUG(D_IOCTL, "MOD_DEC_USE for force dec: count = %d\n", - atomic_read(&(THIS_MODULE)->uc.usecount) - 1); - MOD_DEC_USE_COUNT; - GOTO(out, err=0); - } - default: obd_data2conn(&conn, data); @@ -620,6 +603,7 @@ EXPORT_SYMBOL(obd_kmap_put); EXPORT_SYMBOL(obd_dev); EXPORT_SYMBOL(obdo_cachep); EXPORT_SYMBOL(obd_memory); +EXPORT_SYMBOL(obd_memmax); EXPORT_SYMBOL(obd_fail_loc); EXPORT_SYMBOL(obd_timeout); EXPORT_SYMBOL(obd_recovery_upcall); @@ -627,6 +611,8 @@ EXPORT_SYMBOL(ptlrpc_put_connection_superhack); EXPORT_SYMBOL(class_register_type); EXPORT_SYMBOL(class_unregister_type); +EXPORT_SYMBOL(class_get_type); +EXPORT_SYMBOL(class_put_type); EXPORT_SYMBOL(class_name2dev); EXPORT_SYMBOL(class_uuid2dev); EXPORT_SYMBOL(class_uuid2obd); @@ -642,7 +628,6 @@ EXPORT_SYMBOL(class_disconnect_all); EXPORT_SYMBOL(class_uuid_unparse); EXPORT_SYMBOL(class_signal_connection_failure); -EXPORT_SYMBOL(class_nm_to_type); static int __init init_obdclass(void) { @@ -666,9 +651,9 @@ static int __init init_obdclass(void) obd->obd_minor = i; err = obd_init_caches(); - if (err) return err; + obd_sysctl_init(); err = lprocfs_reg_main(); @@ -696,7 +681,8 @@ static void 
__exit cleanup_obdclass(void) err = lprocfs_dereg_main(); - CERROR("obd memory leaked: %ld bytes\n", obd_memory); + CERROR("obd mem max: %d leaked: %d\n", obd_memmax, + atomic_read(&obd_memory)); EXIT; } diff --git a/lustre/obdclass/fsfilt.c b/lustre/obdclass/fsfilt.c index 97a84df..07ce0b3 100644 --- a/lustre/obdclass/fsfilt.c +++ b/lustre/obdclass/fsfilt.c @@ -6,9 +6,6 @@ #include #include #include -#include -#include -#include #include #include diff --git a/lustre/obdclass/fsfilt_extN.c b/lustre/obdclass/fsfilt_extN.c index 9b5a1f9..4302392 100644 --- a/lustre/obdclass/fsfilt_extN.c +++ b/lustre/obdclass/fsfilt_extN.c @@ -36,6 +36,7 @@ #include #include #include +#include #include static kmem_cache_t *fcb_cache; @@ -216,12 +217,18 @@ static void *fsfilt_extN_brw_start(int objcount, struct fsfilt_objinfo *fso, RETURN(handle); } -static int fsfilt_extN_commit(struct inode *inode, void *handle) +static int fsfilt_extN_commit(struct inode *inode, void *h /*, force_sync */) { int rc; + handle_t *handle = h; + +#if 0 + if (force_sync) + handle->h_sync = 1; /* recovery likes this */ +#endif lock_kernel(); - rc = journal_stop((handle_t *)handle); + rc = journal_stop(handle); unlock_kernel(); return rc; @@ -234,6 +241,31 @@ static int fsfilt_extN_setattr(struct dentry *dentry, void *handle, int rc; lock_kernel(); + + /* A _really_ horrible hack to avoid removing the data stored + * in the block pointers; this is really the "small" stripe MD data. + * We can avoid further hackery by virtue of the MDS file size being + * zero all the time (which doesn't invoke block truncate at unlink + * time), so we assert we never change the MDS file size from zero. 
+ */ + if (iattr->ia_valid & ATTR_SIZE) { + CERROR("hmm, setting %*s file size to %lld\n", + dentry->d_name.len, dentry->d_name.name, iattr->ia_size); + LASSERT(iattr->ia_size == 0); +#if 0 + /* ATTR_SIZE would invoke truncate: clear it */ + iattr->ia_valid &= ~ATTR_SIZE; + inode->i_size = iattr->ia_size; + + /* make sure _something_ gets set - so new inode + * goes to disk (probably won't work over XFS + */ + if (!iattr->ia_valid & ATTR_MODE) { + iattr->ia_valid |= ATTR_MODE; + iattr->ia_mode = inode->i_mode; + } +#endif + } if (inode->i_op->setattr) rc = inode->i_op->setattr(dentry, iattr); else @@ -249,29 +281,58 @@ static int fsfilt_extN_set_md(struct inode *inode, void *handle, { int rc; - down(&inode->i_sem); - lock_kernel(); - rc = extN_xattr_set(handle, inode, EXTN_XATTR_INDEX_LUSTRE, - XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0); - unlock_kernel(); - up(&inode->i_sem); + /* Nasty hack city - store stripe MD data in the block pointers if + * it will fit, because putting it in an EA currently kills the MDS + * performance. We'll fix this with "fast EAs" in the future. + */ + if (lmm_size <= sizeof(EXTN_I(inode)->i_data) - + sizeof(EXTN_I(inode)->i_data[0])) { + /* XXX old_size is debugging only */ + int old_size = EXTN_I(inode)->i_data[0]; + if (old_size != 0) { + LASSERT(old_size < sizeof(EXTN_I(inode)->i_data)); + CERROR("setting EA on %lu again... 
interesting\n", + inode->i_ino); + } + + EXTN_I(inode)->i_data[0] = cpu_to_le32(lmm_size); + memcpy(&EXTN_I(inode)->i_data[1], lmm, lmm_size); + mark_inode_dirty(inode); + return 0; + } else { + down(&inode->i_sem); + lock_kernel(); + rc = extN_xattr_set(handle, inode, EXTN_XATTR_INDEX_LUSTRE, + XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size, 0); + unlock_kernel(); + up(&inode->i_sem); + } - if (rc) { + if (rc) CERROR("error adding MD data to inode %lu: rc = %d\n", inode->i_ino, rc); - if (rc != -ENOSPC) LBUG(); - } return rc; } -static int fsfilt_extN_get_md(struct inode *inode, void *lmm, int size) +static int fsfilt_extN_get_md(struct inode *inode, void *lmm, int lmm_size) { int rc; + if (EXTN_I(inode)->i_data[0]) { + int size = le32_to_cpu(EXTN_I(inode)->i_data[0]); + LASSERT(size < sizeof(EXTN_I(inode)->i_data)); + if (lmm) { + if (size > lmm_size) + return -ERANGE; + memcpy(lmm, &EXTN_I(inode)->i_data[1], size); + } + return size; + } + down(&inode->i_sem); lock_kernel(); rc = extN_xattr_get(inode, EXTN_XATTR_INDEX_LUSTRE, - XATTR_LUSTRE_MDS_OBJID, lmm, size); + XATTR_LUSTRE_MDS_OBJID, lmm, lmm_size); unlock_kernel(); up(&inode->i_sem); @@ -282,7 +343,7 @@ static int fsfilt_extN_get_md(struct inode *inode, void *lmm, int size) if (rc < 0) { CDEBUG(D_INFO, "error getting EA %s from inode %lu: " "rc = %d\n", XATTR_LUSTRE_MDS_OBJID, inode->i_ino, rc); - memset(lmm, 0, size); + memset(lmm, 0, lmm_size); return (rc == -ENODATA) ? 0 : rc; } diff --git a/lustre/obdclass/fsfilt_reiserfs.c b/lustre/obdclass/fsfilt_reiserfs.c new file mode 100644 index 0000000..1ec5916 --- /dev/null +++ b/lustre/obdclass/fsfilt_reiserfs.c @@ -0,0 +1,193 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * lustre/lib/fsfilt_reiserfs.c + * Lustre filesystem abstraction routines + * + * Copyright (C) 2002 Cluster File Systems, Inc. + * Author: Andreas Dilger + * + * This file is part of Lustre, http://www.lustre.org. 
+ * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * NOTE - According to Hans Reiser, this could actually be implemented more + * efficiently than creating a directory and putting ASCII objids in it. + * Instead, we should return the reiserfs object ID as the lustre objid + * (although I'm not sure what impact that would have on backup/restore). + */ + +#define DEBUG_SUBSYSTEM S_FILTER + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void *fsfilt_reiserfs_start(struct inode *inode, int op) +{ + return (void *)0xf00f00be; +} + +static void *fsfilt_reiserfs_brw_start(int objcount, struct fsfilt_objinfo *fso, + int niocount, struct niobuf_remote *nb) +{ + return (void *)0xf00f00be; +} + +static int fsfilt_reiserfs_commit(struct inode *inode, void *handle) +{ + if (handle != (void *)0xf00f00be) { + CERROR("bad handle %p", handle); + return -EINVAL; + } + + return 0; +} + +static int fsfilt_reiserfs_setattr(struct dentry *dentry, void *handle, + struct iattr *iattr) +{ + struct inode *inode = dentry->d_inode; + int rc; + + lock_kernel(); + + /* A _really_ horrible hack to avoid removing the data stored + * in the block pointers; this is really the "small" stripe MD data. 
+ * We can avoid further hackery by virtue of the MDS file size being + * zero all the time (which doesn't invoke block truncate at unlink + * time), so we assert we never change the MDS file size from zero. + */ + if (iattr->ia_valid & ATTR_SIZE) { + CERROR("hmm, setting %*s file size to %llu\n", + dentry->d_name.len, dentry->d_name.name, iattr->ia_size); + LASSERT(iattr->ia_size == 0); +#if 0 + /* ATTR_SIZE would invoke truncate: clear it */ + iattr->ia_valid &= ~ATTR_SIZE; + inode->i_size = iattr->ia_size; + + /* make sure _something_ gets set - so new inode + * goes to disk (probably won't work over XFS + */ + if (!iattr->ia_valid & ATTR_MODE) { + iattr->ia_valid |= ATTR_MODE; + iattr->ia_mode = inode->i_mode; + } +#endif + } + if (inode->i_op->setattr) + rc = inode->i_op->setattr(dentry, iattr); + else + rc = inode_setattr(inode, iattr); + + unlock_kernel(); + + return rc; +} + +static int fsfilt_reiserfs_set_md(struct inode *inode, void *handle, + void *lmm, int lmm_size) +{ + /* XXX write stripe data into MDS file itself */ + CERROR("not implemented yet\n"); + + return -ENOSYS; +} + +static int fsfilt_reiserfs_get_md(struct inode *inode, void *lmm, int lmm_size) +{ + if (lmm == NULL) + return inode->i_size; + + CERROR("not implemented yet\n"); + return -ENOSYS; +} + +static ssize_t fsfilt_reiserfs_readpage(struct file *file, char *buf, size_t count, + loff_t *offset) +{ + return file->f_op->read(file, buf, count, offset); +} + +static int fsfilt_reiserfs_set_last_rcvd(struct obd_device *obd, __u64 last_rcvd, + void *handle, fsfilt_cb_t cb_func) +{ + static long next = 0; + + if (time_after(jiffies, next)) { + CERROR("no journal callback kernel patch, faking it...\n"); + next = jiffies + 300 * HZ; + } + + cb_func(obd, last_rcvd, 0); + + return 0; +} + +static int fsfilt_reiserfs_journal_data(struct file *filp) +{ + CERROR("not implemented yet\n"); + return 0; +} + +static int fsfilt_reiserfs_statfs(struct super_block *sb, struct obd_statfs *osfs) +{ + struct 
statfs sfs; + int rc = vfs_statfs(sb, &sfs); + + statfs_pack(osfs, &sfs); + return rc; +} + +static struct fsfilt_operations fsfilt_reiserfs_ops = { + fs_type: "reiserfs", + fs_owner: THIS_MODULE, + fs_start: fsfilt_reiserfs_start, + fs_brw_start: fsfilt_reiserfs_brw_start, + fs_commit: fsfilt_reiserfs_commit, + fs_setattr: fsfilt_reiserfs_setattr, + fs_set_md: fsfilt_reiserfs_set_md, + fs_get_md: fsfilt_reiserfs_get_md, + fs_readpage: fsfilt_reiserfs_readpage, + fs_journal_data: fsfilt_reiserfs_journal_data, + fs_set_last_rcvd: fsfilt_reiserfs_set_last_rcvd, + fs_statfs: fsfilt_reiserfs_statfs, +}; + +static int __init fsfilt_reiserfs_init(void) +{ + return fsfilt_register_ops(&fsfilt_reiserfs_ops); +} + +static void __exit fsfilt_reiserfs_exit(void) +{ + fsfilt_unregister_ops(&fsfilt_reiserfs_ops); +} + +MODULE_AUTHOR("Cluster File Systems, Inc. "); +MODULE_DESCRIPTION("Lustre reiserfs Filesystem Helper v0.1"); +MODULE_LICENSE("GPL"); + +module_init(fsfilt_reiserfs_init); +module_exit(fsfilt_reiserfs_exit); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 8a0ed36..994949e 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -41,53 +41,59 @@ int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); * support functions: we could use inter-module communication, but this * is more portable to other OS's */ -static struct obd_type *class_search_type(char *nm) +static struct obd_type *class_search_type(char *name) { struct list_head *tmp; struct obd_type *type; - CDEBUG(D_INFO, "SEARCH %s\n", nm); + CDEBUG(D_INFO, "SEARCH %s\n", name); tmp = &obd_types; list_for_each(tmp, &obd_types) { type = list_entry(tmp, struct obd_type, typ_chain); CDEBUG(D_INFO, "TYP %s\n", type->typ_name); - if (strlen(type->typ_name) == strlen(nm) && - strcmp(type->typ_name, nm) == 0 ) { + if (strlen(type->typ_name) == strlen(name) && + strcmp(type->typ_name, name) == 0) { return type; } } return NULL; } -struct obd_type *class_nm_to_type(char 
*nm) +struct obd_type *class_get_type(char *name) { - struct obd_type *type = class_search_type(nm); + struct obd_type *type = class_search_type(name); #ifdef CONFIG_KMOD - if ( !type ) { - if ( !request_module(nm) ) { - CDEBUG(D_INFO, "Loaded module '%s'\n", nm); - type = class_search_type(nm); - } else { - CDEBUG(D_INFO, "Can't load module '%s'\n", nm); - } + if (!type) { + if (!request_module(name)) { + CDEBUG(D_INFO, "Loaded module '%s'\n", name); + type = class_search_type(name); + } else + CDEBUG(D_INFO, "Can't load module '%s'\n", name); } #endif + if (type) + __MOD_INC_USE_COUNT(type->typ_ops->o_owner); return type; } +void class_put_type(struct obd_type *type) +{ + LASSERT(type); + __MOD_DEC_USE_COUNT(type->typ_ops->o_owner); +} + int class_register_type(struct obd_ops *ops, struct lprocfs_vars *vars, - char *nm) + char *name) { struct obd_type *type; int rc; - ENTRY; - LASSERT (strnlen (nm, 1024) < 1024); /* sanity check */ - - if (class_search_type(nm)) { - CDEBUG(D_IOCTL, "Type %s already registered\n", nm); + LASSERT(strnlen(name, 1024) < 1024); /* sanity check */ + + if (class_search_type(name)) { + CDEBUG(D_IOCTL, "Type %s already registered\n", name); RETURN(-EEXIST); } @@ -97,38 +103,33 @@ int class_register_type(struct obd_ops *ops, struct lprocfs_vars *vars, RETURN(rc); OBD_ALLOC(type->typ_ops, sizeof(*type->typ_ops)); - OBD_ALLOC(type->typ_name, strlen(nm) + 1); - if (type->typ_ops == NULL || - type->typ_name == NULL) + OBD_ALLOC(type->typ_name, strlen(name) + 1); + if (type->typ_ops == NULL || type->typ_name == NULL) GOTO (failed, rc); - + *(type->typ_ops) = *ops; - strcpy(type->typ_name, nm); + strcpy(type->typ_name, name); list_add(&type->typ_chain, &obd_types); rc = lprocfs_reg_class(type, vars, type); if (rc != 0) { - list_del (&type->typ_chain); - GOTO (failed, rc); + list_del(&type->typ_chain); + GOTO(failed, rc); } - - CDEBUG(D_INFO, "MOD_INC_USE for register_type: count = %d\n", - atomic_read(&(THIS_MODULE)->uc.usecount)); - 
MOD_INC_USE_COUNT; + RETURN (0); failed: if (type->typ_ops != NULL) - OBD_FREE (type->typ_name, strlen (nm) + 1); + OBD_FREE(type->typ_name, strlen(name) + 1); if (type->typ_ops != NULL) OBD_FREE (type->typ_ops, sizeof (*type->typ_ops)); RETURN(rc); } -int class_unregister_type(char *nm) +int class_unregister_type(char *name) { - struct obd_type *type = class_nm_to_type(nm); - + struct obd_type *type = class_search_type(name); ENTRY; if (!type) { @@ -137,7 +138,7 @@ int class_unregister_type(char *nm) } if (type->typ_refcnt) { - CERROR("type %s has refcount (%d)\n", nm, type->typ_refcnt); + CERROR("type %s has refcount (%d)\n", name, type->typ_refcnt); /* This is a bad situation, let's make the best of it */ /* Remove ops, but leave the name for debugging */ OBD_FREE(type->typ_ops, sizeof(*type->typ_ops)); @@ -147,13 +148,10 @@ int class_unregister_type(char *nm) lprocfs_dereg_class(type); list_del(&type->typ_chain); - OBD_FREE(type->typ_name, strlen(nm) + 1); + OBD_FREE(type->typ_name, strlen(name) + 1); if (type->typ_ops != NULL) OBD_FREE(type->typ_ops, sizeof(*type->typ_ops)); OBD_FREE(type, sizeof(*type)); - CDEBUG(D_INFO, "MOD_DEC_USE for register_type: count = %d\n", - atomic_read(&(THIS_MODULE)->uc.usecount) - 1); - MOD_DEC_USE_COUNT; RETURN(0); } /* class_unregister_type */ @@ -165,7 +163,7 @@ int class_name2dev(char *name) if (!name) return -1; - for (i=0; i < MAX_OBD_DEVICES; i++) { + for (i = 0; i < MAX_OBD_DEVICES; i++) { struct obd_device *obd = &obd_dev[i]; if (obd->obd_name && strcmp(name, obd->obd_name) == 0) { res = i; @@ -181,7 +179,7 @@ int class_uuid2dev(char *uuid) int res = -1; int i; - for (i=0; i < MAX_OBD_DEVICES; i++) { + for (i = 0; i < MAX_OBD_DEVICES; i++) { struct obd_device *obd = &obd_dev[i]; if (strncmp(uuid, obd->obd_uuid, sizeof(obd->obd_uuid)) == 0) { res = i; @@ -197,7 +195,7 @@ struct obd_device *class_uuid2obd(char *uuid) { int i; - for (i=0; i < MAX_OBD_DEVICES; i++) { + for (i = 0; i < MAX_OBD_DEVICES; i++) { struct 
obd_device *obd = &obd_dev[i]; if (strncmp(uuid, obd->obd_uuid, sizeof(obd->obd_uuid)) == 0) return obd; @@ -428,7 +426,9 @@ void class_disconnect_all(struct obd_device *obddev) spin_unlock(&obddev->obd_dev_lock); CERROR("force disconnecting %s:%s export %p\n", export->exp_obd->obd_type->typ_name, - export->exp_connection->c_remote_uuid, export); + export->exp_connection ? + (char *)export->exp_connection->c_remote_uuid : + "", export); rc = obd_disconnect(&conn); if (rc < 0) { /* AED: not so sure about this... We can't diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 62a806e..f096772 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -91,7 +91,8 @@ void lprocfs_remove_all(struct proc_dir_entry* root) rm_entry = temp; temp = temp->parent; remove_proc_entry(rm_entry->name, rm_entry->parent); - if (temp == parent) break; + if (temp == parent) + break; } } @@ -111,7 +112,7 @@ struct proc_dir_entry* lprocfs_new_dir(struct proc_dir_entry* root, new_root = root; mover_str = temp_string; while ((my_str = strsep(&mover_str, tok))) { - if(!*my_str) + if (!*my_str) continue; CDEBUG(D_OTHER, "SEARCH= %s\t, ROOT=%s\n", my_str, new_root->name); @@ -120,7 +121,7 @@ struct proc_dir_entry* lprocfs_new_dir(struct proc_dir_entry* root, CDEBUG(D_OTHER, "Adding: %s\n", my_str); temp_entry = lprocfs_mkdir(my_str, new_root); if (temp_entry == NULL) { - CDEBUG(D_OTHER, + CDEBUG(D_OTHER, "! 
Did not create new dir %s !!\n", my_str); return temp_entry; @@ -131,8 +132,7 @@ struct proc_dir_entry* lprocfs_new_dir(struct proc_dir_entry* root, return new_root; } -int lprocfs_new_vars(struct proc_dir_entry* root, - struct lprocfs_vars* list, +int lprocfs_new_vars(struct proc_dir_entry* root, struct lprocfs_vars* list, const char* tok, void* data) { struct proc_dir_entry *temp_root; @@ -188,9 +188,9 @@ int lprocfs_reg_obd(struct obd_device *device, struct lprocfs_vars *list, { struct proc_dir_entry* this_dev_root; int retval; - - if(lprocfs_srch(device->obd_type->typ_procroot, device->obd_name)){ - CDEBUG(D_OTHER, "Device with name [%s] exists!", + + if (lprocfs_srch(device->obd_type->typ_procroot, device->obd_name)) { + CDEBUG(D_OTHER, "Device with name [%s] exists!", device->obd_name); return 0; } @@ -227,7 +227,7 @@ int lprocfs_dereg_obd(struct obd_device* device) struct proc_dir_entry* lprocfs_reg_mnt(char* mnt_name) { - if(lprocfs_srch(proc_lustre_fs_root, mnt_name)){ + if (lprocfs_srch(proc_lustre_fs_root, mnt_name)) { CDEBUG(D_OTHER, "Mount with same name exists!"); return 0; } @@ -236,7 +236,7 @@ struct proc_dir_entry* lprocfs_reg_mnt(char* mnt_name) int lprocfs_dereg_mnt(struct proc_dir_entry* root) { - if(root == NULL){ + if (root == NULL) { CDEBUG(D_OTHER, "Non-existent root!"); return 0; } @@ -247,7 +247,6 @@ int lprocfs_dereg_mnt(struct proc_dir_entry* root) int lprocfs_reg_class(struct obd_type* type, struct lprocfs_vars* list, void* data) { - struct proc_dir_entry* root; int retval; root = lprocfs_mkdir(type->typ_name, proc_lustre_dev_root); @@ -259,9 +258,8 @@ int lprocfs_reg_class(struct obd_type* type, struct lprocfs_vars* list, int lprocfs_dereg_class(struct obd_type* class) { - if(class == NULL){ - CDEBUG(D_OTHER, "Non-existent class", - class->typ_name); + if (class == NULL) { + CDEBUG(D_OTHER, "Non-existent class"); return 0; } lprocfs_remove_all(class->typ_procroot); @@ -270,6 +268,7 @@ int lprocfs_dereg_class(struct obd_type* class) 
return 0; } + int lprocfs_reg_main() { proc_lustre_root = lprocfs_mkdir("lustre", &proc_root); diff --git a/lustre/lib/ll_pack.c b/lustre/obdclass/statfs_pack.c similarity index 88% rename from lustre/lib/ll_pack.c rename to lustre/obdclass/statfs_pack.c index 184c2c1..876d41c 100644 --- a/lustre/lib/ll_pack.c +++ b/lustre/obdclass/statfs_pack.c @@ -22,8 +22,9 @@ * */ -#define DEBUG_SUBSYSTEM S_LLITE +#define DEBUG_SUBSYSTEM S_CLASS +#define EXPORT_SYMTAB #include #include @@ -39,7 +40,10 @@ void obd_statfs_pack(struct obd_statfs *tgt, struct obd_statfs *src) tgt->os_namelen = HTON__u32(src->os_namelen); } -#define obd_statfs_unpack(tgt, src) obd_statfs_pack(tgt, src) +void obd_statfs_unpack(struct obd_statfs *tgt, struct obd_statfs *src) +{ + obd_statfs_pack(tgt, src); +} void statfs_pack(struct obd_statfs *osfs, struct statfs *sfs) { @@ -65,3 +69,7 @@ void statfs_unpack(struct statfs *sfs, struct obd_statfs *osfs) sfs->f_namelen = osfs->os_namelen; } +EXPORT_SYMBOL(obd_statfs_pack); +EXPORT_SYMBOL(obd_statfs_unpack); +EXPORT_SYMBOL(statfs_pack); +EXPORT_SYMBOL(statfs_unpack); diff --git a/lustre/obdecho/echo.c b/lustre/obdecho/echo.c index 76fddd8..8339327 100644 --- a/lustre/obdecho/echo.c +++ b/lustre/obdecho/echo.c @@ -109,26 +109,7 @@ static int echo_connect(struct lustre_handle *conn, struct obd_device *obd, obd_uuid_t cluuid, struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover) { - int rc; - - MOD_INC_USE_COUNT; - rc = class_connect(conn, obd, cluuid); - - if (rc) - MOD_DEC_USE_COUNT; - - return rc; -} - -static int echo_disconnect(struct lustre_handle *conn) -{ - int rc; - - rc = class_disconnect(conn); - if (!rc) - MOD_DEC_USE_COUNT; - - return rc; + return class_connect(conn, obd, cluuid); } static __u64 echo_next_id(struct obd_device *obddev) @@ -148,7 +129,7 @@ int echo_create(struct lustre_handle *conn, struct obdo *oa, struct obd_device *obd = class_conn2obd(conn); if (!obd) { - CERROR("invalid client %Lx\n", conn->addr); + CERROR("invalid client 
"LPX64"\n", conn->addr); return -EINVAL; } @@ -453,20 +434,21 @@ int echo_detach(struct obd_device *dev) } static struct obd_ops echo_obd_ops = { - o_attach: echo_attach, - o_detach: echo_detach, - o_connect: echo_connect, - o_disconnect: echo_disconnect, - o_create: echo_create, - o_destroy: echo_destroy, - o_open: echo_open, - o_close: echo_close, - o_getattr: echo_getattr, - o_setattr: echo_setattr, - o_preprw: echo_preprw, - o_commitrw: echo_commitrw, - o_setup: echo_setup, - o_cleanup: echo_cleanup + o_owner: THIS_MODULE, + o_attach: echo_attach, + o_detach: echo_detach, + o_connect: echo_connect, + o_disconnect: class_disconnect, + o_create: echo_create, + o_destroy: echo_destroy, + o_open: echo_open, + o_close: echo_close, + o_getattr: echo_getattr, + o_setattr: echo_setattr, + o_preprw: echo_preprw, + o_commitrw: echo_commitrw, + o_setup: echo_setup, + o_cleanup: echo_cleanup }; extern int echo_client_init(void); diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c index 3d2f222..e9c0e90 100644 --- a/lustre/obdecho/echo_client.c +++ b/lustre/obdecho/echo_client.c @@ -173,7 +173,7 @@ static int echo_iocontrol(unsigned int cmd, struct lustre_handle *obdconn, int l void *addr = kmap(pgp->pg); rc = page_debug_check("test_brw", addr, - PAGE_SIZE, pgp->off, id); + pgp->count, pgp->off, id); kunmap(pgp->pg); } __free_pages(pgp->pg, 0); @@ -184,7 +184,7 @@ static int echo_iocontrol(unsigned int cmd, struct lustre_handle *obdconn, int l GOTO(out, rc); } default: - CERROR ("echo_ioctl(): unrecognised ioctl %#lx\n", cmd); + CERROR ("echo_ioctl(): unrecognised ioctl %#x\n", cmd); GOTO (out, rc = -ENOTTY); } @@ -209,23 +209,17 @@ static int echo_setup(struct obd_device *obddev, obd_count len, void *buf) RETURN(-EINVAL); } - MOD_INC_USE_COUNT; tgt = class_uuid2obd(data->ioc_inlbuf1); if (!tgt || !(tgt->obd_flags & OBD_ATTACHED) || !(tgt->obd_flags & OBD_SET_UP)) { CERROR("device not attached or not set up (%d)\n", data->ioc_dev); - GOTO(error_dec, rc 
= -EINVAL); + RETURN(rc = -EINVAL); } rc = obd_connect(&ec->conn, tgt, NULL, NULL, NULL); - if (rc) { + if (rc) CERROR("fail to connect to device %d\n", data->ioc_dev); - GOTO(error_dec, rc = -EINVAL); - } - RETURN(rc); -error_dec: - MOD_DEC_USE_COUNT; RETURN(rc); } @@ -246,7 +240,6 @@ static int echo_cleanup(struct obd_device * obddev) RETURN(-EINVAL); } - MOD_DEC_USE_COUNT; RETURN(0); } @@ -258,6 +251,7 @@ static int echo_connect(struct lustre_handle *conn, struct obd_device *src, } static struct obd_ops echo_obd_ops = { + o_owner: THIS_MODULE, o_setup: echo_setup, o_cleanup: echo_cleanup, o_iocontrol: echo_iocontrol, diff --git a/lustre/obdfilter/Makefile.am b/lustre/obdfilter/Makefile.am index a237004..c6658d6 100644 --- a/lustre/obdfilter/Makefile.am +++ b/lustre/obdfilter/Makefile.am @@ -8,10 +8,7 @@ MODULE = obdfilter modulefs_DATA = obdfilter.o EXTRA_PROGRAMS = obdfilter -LINX=simple.c ll_pack.c -ll_pack.c: - test -e ll_pack.c || ln -sf $(top_srcdir)/lib/ll_pack.c - +LINX=simple.c simple.c: test -e simple.c || ln -sf $(top_srcdir)/lib/simple.c diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index a370e56..2d495b2 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -275,11 +275,11 @@ static struct dentry *filter_fid2dentry(struct obd_device *obd, len = sprintf(name, LPU64, id); CDEBUG(D_INODE, "opening object O/%*s/%s\n", dparent->d_name.len, dparent->d_name.name, name); - if (!locked) - down(&dparent->d_inode->i_sem); + //if (!locked) + //down(&dparent->d_inode->i_sem); dchild = lookup_one_len(name, dparent, len); - if (!locked) - up(&dparent->d_inode->i_sem); + //if (!locked) + //up(&dparent->d_inode->i_sem); if (IS_ERR(dchild)) { CERROR("child lookup error %ld\n", PTR_ERR(dchild)); RETURN(dchild); @@ -333,7 +333,7 @@ static struct file *filter_obj_open(struct obd_export *export, RETURN(ERR_PTR(-EINVAL)); } - ffd = kmem_cache_alloc(filter_open_cache, SLAB_KERNEL); + PORTAL_SLAB_ALLOC(ffd, filter_open_cache, 
sizeof(*ffd)); if (!ffd) { CERROR("obdfilter: out of memory\n"); RETURN(ERR_PTR(-ENOMEM)); @@ -352,7 +352,7 @@ static struct file *filter_obj_open(struct obd_export *export, pop_ctxt(&saved, &filter->fo_ctxt, NULL); if (IS_ERR(file)) { - CERROR("error opening %s: rc %d\n", name, PTR_ERR(file)); + CERROR("error opening %s: rc %ld\n", name, PTR_ERR(file)); GOTO(out_fdd, file); } @@ -397,7 +397,7 @@ out_fdd: kmem_cache_free(filter_dentry_cache, fdd); out_ffd: ffd->ffd_servercookie = DEAD_HANDLE_MAGIC; - kmem_cache_free(filter_open_cache, ffd); + PORTAL_SLAB_FREE(ffd, filter_open_cache, sizeof(*ffd)); goto out; } @@ -459,7 +459,7 @@ static int filter_close_internal(struct obd_device *obd, } f_dput(object_dentry); - kmem_cache_free(filter_open_cache, ffd); + PORTAL_SLAB_FREE(ffd, filter_open_cache, sizeof(*ffd)); RETURN(rc); } @@ -1423,8 +1423,8 @@ out_ctxt: } static int filter_brw(int cmd, struct lustre_handle *conn, - struct lov_stripe_md *lsm, obd_count oa_bufs, - struct brw_page *pga, struct obd_brw_set *set) + struct lov_stripe_md *lsm, obd_count oa_bufs, + struct brw_page *pga, struct obd_brw_set *set) { struct obd_ioobj ioo; struct niobuf_local *lnb; @@ -1437,10 +1437,10 @@ static int filter_brw(int cmd, struct lustre_handle *conn, OBD_ALLOC(lnb, oa_bufs * sizeof(struct niobuf_local)); OBD_ALLOC(rnb, oa_bufs * sizeof(struct niobuf_remote)); - if ( lnb == NULL || rnb == NULL ) + if (lnb == NULL || rnb == NULL) GOTO(out, ret = -ENOMEM); - for ( i = 0 ; i < oa_bufs ; i++ ) { + for (i = 0; i < oa_bufs; i++) { rnb[i].offset = pga[i].off; rnb[i].len = pga[i].count; } @@ -1450,16 +1450,16 @@ static int filter_brw(int cmd, struct lustre_handle *conn, ioo.ioo_type = S_IFREG; ioo.ioo_bufcnt = oa_bufs; - ret = filter_preprw(cmd, conn, 1, &ioo, oa_bufs, rnb, lnb, - &desc_private); - if ( ret != 0 ) + ret = filter_preprw(cmd, conn, 1, &ioo, oa_bufs, rnb, lnb, + &desc_private); + if (ret != 0) GOTO(out, ret); - for ( i = 0; i < oa_bufs ; i++ ) { + for (i = 0; i < oa_bufs; i++) 
{ void *virt = kmap(pga[i].pg); obd_off off = pga[i].off & ~PAGE_MASK; - if ( cmd & OBD_BRW_WRITE ) + if (cmd & OBD_BRW_WRITE) memcpy(lnb[i].addr + off, virt + off, pga[i].count); else memcpy(virt + off, lnb[i].addr + off, pga[i].count); @@ -1470,9 +1470,9 @@ static int filter_brw(int cmd, struct lustre_handle *conn, ret = filter_commitrw(cmd, conn, 1, &ioo, oa_bufs, lnb, desc_private); out: - if ( lnb ) + if (lnb) OBD_FREE(lnb, oa_bufs * sizeof(struct niobuf_local)); - if ( rnb ) + if (rnb) OBD_FREE(rnb, oa_bufs * sizeof(struct niobuf_remote)); RETURN(ret); } @@ -1608,29 +1608,30 @@ int filter_copy_data(struct lustre_handle *dst_conn, struct obdo *dst, } static struct obd_ops filter_obd_ops = { - o_attach: filter_attach, - o_detach: filter_detach, - o_get_info: filter_get_info, - o_setup: filter_setup, - o_cleanup: filter_cleanup, - o_connect: filter_connect, - o_disconnect: filter_disconnect, - o_statfs: filter_statfs, - o_getattr: filter_getattr, - o_create: filter_create, - o_setattr: filter_setattr, - o_destroy: filter_destroy, - o_open: filter_open, - o_close: filter_close, - o_brw: filter_brw, - o_punch: filter_truncate, - o_preprw: filter_preprw, - o_commitrw: filter_commitrw + o_owner: THIS_MODULE, + o_attach: filter_attach, + o_detach: filter_detach, + o_get_info: filter_get_info, + o_setup: filter_setup, + o_cleanup: filter_cleanup, + o_connect: filter_connect, + o_disconnect: filter_disconnect, + o_statfs: filter_statfs, + o_getattr: filter_getattr, + o_create: filter_create, + o_setattr: filter_setattr, + o_destroy: filter_destroy, + o_open: filter_open, + o_close: filter_close, + o_brw: filter_brw, + o_punch: filter_truncate, + o_preprw: filter_preprw, + o_commitrw: filter_commitrw #if 0 - o_preallocate: filter_preallocate_inodes, - o_migrate: filter_migrate, - o_copy: filter_copy_data, - o_iterate: filter_iterate + o_preallocate: filter_preallocate_inodes, + o_migrate: filter_migrate, + o_copy: filter_copy_data, + o_iterate: filter_iterate #endif }; 
diff --git a/lustre/osc/Makefile.am b/lustre/osc/Makefile.am index 284c2d6..2348a5b 100644 --- a/lustre/osc/Makefile.am +++ b/lustre/osc/Makefile.am @@ -9,13 +9,11 @@ MODULE = osc modulefs_DATA = osc.o EXTRA_PROGRAMS = osc -LINX= obd_pack.c ll_pack.c client.c +LINX= obd_pack.c client.c osc_SOURCES = osc_request.c lproc_osc.c $(LINX) obd_pack.c: test -e obd_pack.c || ln -sf $(top_srcdir)/lib/obd_pack.c -ll_pack.c: - test -e ll_pack.c || ln -sf $(top_srcdir)/lib/ll_pack.c client.c: test -e client.c || ln -sf $(top_srcdir)/lib/client.c diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 1e2f72e..85b1694 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -399,7 +399,7 @@ static void unmap_and_decref_bulk_desc(void *data) } /* this is the callback function which is invoked by the Portals - * event handler associated with the bulk_sink queue and bulk_source queue. + * event handler associated with the bulk_sink queue and bulk_source queue. */ static void osc_ptl_ev_hdlr(struct ptlrpc_bulk_desc *desc) { @@ -428,7 +428,9 @@ static int osc_brw_read(struct lustre_handle *conn, struct lov_stripe_md *lsm, struct ptlrpc_bulk_desc *desc = NULL; struct ost_body *body; int rc, size[3] = {sizeof(*body)}, mapped = 0; - void *iooptr, *nioptr; + unsigned long flags; + struct obd_ioobj *iooptr; + void *nioptr; __u32 xid; ENTRY; @@ -453,9 +455,9 @@ static int osc_brw_read(struct lustre_handle *conn, struct lov_stripe_md *lsm, ost_pack_ioo(&iooptr, lsm, page_count); /* end almost identical to brw_write case */ - spin_lock(&imp->imp_lock); + spin_lock_irqsave(&imp->imp_lock, flags); xid = ++imp->imp_last_xid; /* single xid for all pages */ - spin_unlock(&imp->imp_lock); + spin_unlock_irqrestore(&imp->imp_lock, flags); obd_kmap_get(page_count, 0); @@ -521,26 +523,27 @@ out_unmap: goto out_req; } -static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *md, +static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md 
*lsm, obd_count page_count, struct brw_page *pga, struct obd_brw_set *set) { - struct ptlrpc_connection *connection = - client_conn2cli(conn)->cl_import.imp_connection; + struct obd_import *imp = class_conn2cliimp(conn); + struct ptlrpc_connection *connection = imp->imp_connection; struct ptlrpc_request *request = NULL; struct ptlrpc_bulk_desc *desc = NULL; struct ost_body *body; struct niobuf_local *local = NULL; struct niobuf_remote *remote; - int rc, j, size[3] = {sizeof(*body)}, mapped = 0; - void *iooptr, *nioptr; + int rc, size[3] = {sizeof(*body)}, mapped = 0; + int j; + struct obd_ioobj *iooptr; + void *nioptr; ENTRY; size[1] = sizeof(struct obd_ioobj); - size[2] = page_count * sizeof(*remote); + size[2] = page_count * sizeof(struct niobuf_remote); - request = ptlrpc_prep_req(class_conn2cliimp(conn), OST_WRITE, 3, size, - NULL); + request = ptlrpc_prep_req(imp, OST_WRITE, 3, size, NULL); if (!request) RETURN(-ENOMEM); @@ -548,14 +551,14 @@ static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *md, desc = ptlrpc_prep_bulk(connection); if (!desc) - GOTO(out_req, rc = -ENOMEM); + GOTO(out_req, rc = -ENOMEM); desc->bd_portal = OSC_BULK_PORTAL; desc->bd_ptl_ev_hdlr = osc_ptl_ev_hdlr; CDEBUG(D_PAGE, "desc = %p\n", desc); iooptr = lustre_msg_buf(request->rq_reqmsg, 1); nioptr = lustre_msg_buf(request->rq_reqmsg, 2); - ost_pack_ioo(&iooptr, md, page_count); + ost_pack_ioo(&iooptr, lsm, page_count); /* end almost identical to brw_read case */ OBD_ALLOC(local, page_count * sizeof(*local)); @@ -567,7 +570,7 @@ static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *md, for (mapped = 0; mapped < page_count; mapped++) { local[mapped].addr = kmap(pga[mapped].pg); - CDEBUG(D_INFO, "kmap(pg) = %p ; pg->flags = %lx ; pg->count = " + CDEBUG(D_INFO, "kmap(pg) = %p ; pg->flags = %lx ; pg->refcount = " "%d ; page %d of %d\n", local[mapped].addr, pga[mapped].pg->flags, page_count(pga[mapped].pg), @@ -604,7 +607,7 @@ static int 
osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *md, if (!bulk) GOTO(out_unmap, rc = -ENOMEM); - bulk->bp_buf = (void *)(unsigned long)local[j].addr; + bulk->bp_buf = local[j].addr; bulk->bp_buflen = local[j].len; bulk->bp_xid = remote->xid; bulk->bp_page = pga[j].pg; @@ -776,6 +779,50 @@ static int osc_statfs(struct lustre_handle *conn, struct obd_statfs *osfs) return rc; } +/* Retrieve object striping information. + * + * @lmmu is a pointer to an in-core struct with lmm_ost_count indicating + * the maximum number of OST indices which will fit in the user buffer. + * lmm_magic must be LOV_MAGIC (we only use 1 slot here). + */ +static int osc_getstripe(struct lustre_handle *conn, struct lov_stripe_md *lsm, + struct lov_mds_md *lmmu) +{ + struct lov_mds_md lmm, *lmmk; + int rc, lmm_size; + ENTRY; + + if (!lsm) + RETURN(-ENODATA); + + rc = copy_from_user(&lmm, lmmu, sizeof(lmm)); + if (rc) + RETURN(-EFAULT); + + if (lmm.lmm_magic != LOV_MAGIC) + RETURN(-EINVAL); + + if (lmm.lmm_ost_count < 1) + RETURN(-EOVERFLOW); + + lmm_size = sizeof(lmm) + sizeof(lmm.lmm_objects[0]); + OBD_ALLOC(lmmk, lmm_size); + if (rc < 0) + RETURN(rc); + + lmmk->lmm_stripe_count = 1; + lmmk->lmm_ost_count = 1; + lmmk->lmm_object_id = lsm->lsm_object_id; + lmmk->lmm_objects[0].l_object_id = lsm->lsm_object_id; + + if (copy_to_user(lmmu, lmmk, lmm_size)) + rc = -EFAULT; + + OBD_FREE(lmmk, lmm_size); + + RETURN(rc); +} + static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, void *karg, void *uarg) { @@ -878,8 +925,16 @@ static int osc_iocontrol(unsigned int cmd, struct lustre_handle *conn, int len, OBD_FREE(buf, len); GOTO(out, err); } + case LL_IOC_LOV_SETSTRIPE: + err = obd_alloc_memmd(conn, karg); + if (err > 0) + err = 0; + GOTO(out, err); + case LL_IOC_LOV_GETSTRIPE: + err = osc_getstripe(conn, karg, uarg); + GOTO(out, err); default: - CERROR ("osc_ioctl(): unrecognised ioctl %#lx\n", cmd); + CERROR ("osc_ioctl(): unrecognised ioctl %#x\n", cmd); 
GOTO(out, err = -ENOTTY); } out: @@ -904,7 +959,7 @@ static void set_osc_active(struct obd_import *imp, int active) fakeconn.addr = (__u64)(unsigned long)exp; fakeconn.cookie = exp->exp_cookie; - ioc_data.ioc_inlbuf1 = imp->imp_obd->obd_uuid; + ioc_data.ioc_inlbuf1 = imp->imp_obd->u.cli.cl_target_uuid; ioc_data.ioc_offset = active; rc = obd_iocontrol(IOC_LOV_SET_OSC_ACTIVE, &fakeconn, sizeof ioc_data, &ioc_data, NULL); @@ -919,42 +974,11 @@ static void set_osc_active(struct obd_import *imp, int active) } } - -/* XXX looks a lot like super.c:invalidate_request_list, don't it? */ -static void abort_inflight_for_import(struct obd_import *imp) -{ - struct list_head *tmp, *n; - - /* Make sure that no new requests get processed for this import. - * ptlrpc_queue_wait must (and does) hold imp_lock while testing this - * flag and then putting requests on sending_list or delayed_list. - */ - spin_lock(&imp->imp_lock); - imp->imp_flags |= IMP_INVALID; - spin_unlock(&imp->imp_lock); - - list_for_each_safe(tmp, n, &imp->imp_sending_list) { - struct ptlrpc_request *req = - list_entry(tmp, struct ptlrpc_request, rq_list); - - DEBUG_REQ(D_HA, req, "inflight"); - req->rq_flags |= PTL_RPC_FL_ERR; - wake_up(&req->rq_wait_for_rep); - } - - list_for_each_safe(tmp, n, &imp->imp_delayed_list) { - struct ptlrpc_request *req = - list_entry(tmp, struct ptlrpc_request, rq_list); - - DEBUG_REQ(D_HA, req, "aborting waiting req"); - req->rq_flags |= PTL_RPC_FL_ERR; - wake_up(&req->rq_wait_for_rep); - } -} - static int osc_recover(struct obd_import *imp, int phase) { int rc; + unsigned long flags; + struct ptlrpc_request *req; ENTRY; switch(phase) { @@ -969,15 +993,21 @@ static int osc_recover(struct obd_import *imp, int phase) case PTLRPC_RECOVD_PHASE_RECOVER: imp->imp_flags &= ~IMP_INVALID; - rc = ptlrpc_reconnect_import(imp, OST_CONNECT); + rc = ptlrpc_reconnect_import(imp, OST_CONNECT, &req); + ptlrpc_req_finished(req); if (rc) { imp->imp_flags |= IMP_INVALID; RETURN(rc); } - 
spin_lock(&imp->imp_lock); + spin_lock_irqsave(&imp->imp_lock, flags); imp->imp_level = LUSTRE_CONN_FULL; - spin_unlock(&imp->imp_lock); + spin_unlock_irqrestore(&imp->imp_lock, flags); + + /* Is this the right place? Should we do this in _PREPARE + * as well? What about raising the level right away? + */ + ptlrpc_wake_delayed(imp); set_osc_active(imp, 1 /* active */); RETURN(0); @@ -1001,6 +1031,7 @@ static int osc_connect(struct lustre_handle *conn, struct obd_device *obd, } struct obd_ops osc_obd_ops = { + o_owner: THIS_MODULE, o_attach: osc_attach, o_detach: osc_detach, o_setup: client_obd_setup, diff --git a/lustre/ost/Makefile.am b/lustre/ost/Makefile.am index 3ad390a..c158a0f 100644 --- a/lustre/ost/Makefile.am +++ b/lustre/ost/Makefile.am @@ -8,10 +8,8 @@ MODULE = ost modulefs_DATA = ost.o EXTRA_PROGRAMS = ost -LINX=obd_pack.c ll_pack.c target.c +LINX=obd_pack.c target.c -ll_pack.c: - test -e ll_pack.c || ln -sf $(top_srcdir)/lib/ll_pack.c obd_pack.c: test -e obd_pack.c || ln -sf $(top_srcdir)/lib/obd_pack.c target.c: diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 6ccb240..db7857c 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -212,10 +213,10 @@ static int ost_setattr(struct ptlrpc_request *req) static int ost_bulk_timeout(void *data) { - struct ptlrpc_bulk_desc *desc = data; - ENTRY; - recovd_conn_fail(desc->bd_connection); + /* We don't fail the connection here, because having the export + * killed makes the (vital) call to commitrw very sad. 
+ */ RETURN(1); } @@ -223,7 +224,8 @@ static int ost_brw_read(struct ptlrpc_request *req) { struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; struct ptlrpc_bulk_desc *desc; - void *tmp1, *tmp2, *end2; + struct obd_ioobj *tmp1; + void *tmp2, *end2; struct niobuf_remote *remote_nb; struct niobuf_local *local_nb = NULL; struct obd_ioobj *ioo; @@ -316,16 +318,19 @@ static int ost_brw_write(struct ptlrpc_request *req) { struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; struct ptlrpc_bulk_desc *desc; + struct obd_ioobj *tmp1; + void *tmp2, *end2; struct niobuf_remote *remote_nb; - struct niobuf_local *local_nb, *lnb; + struct niobuf_local *local_nb = NULL; + struct niobuf_local *lnb; struct obd_ioobj *ioo; struct ost_body *body; - int cmd, rc, i, j, objcount, niocount, size[2] = {sizeof(*body)}; - void *tmp1, *tmp2, *end2; + struct l_wait_info lwi; + int rc, cmd, i, j, objcount, niocount; + int size[2] = {sizeof(*body)}; void *desc_priv = NULL; int reply_sent = 0; struct ptlrpc_service *srv; - struct l_wait_info lwi; __u32 xid; ENTRY; @@ -415,11 +420,15 @@ static int ost_brw_write(struct ptlrpc_request *req) if (rc) { if (rc != -ETIMEDOUT) LBUG(); - GOTO(fail_bulk, rc); + ptlrpc_abort_bulk(desc); + recovd_conn_fail(desc->bd_connection); + obd_commitrw(cmd, conn, objcount, tmp1, niocount, local_nb, + desc->bd_desc_private); + } else { + rc = obd_commitrw(cmd, conn, objcount, tmp1, niocount, local_nb, + desc->bd_desc_private); } - rc = obd_commitrw(cmd, conn, objcount, tmp1, niocount, local_nb, - desc->bd_desc_private); ptlrpc_bulk_decref(desc); EXIT; out_free: @@ -438,7 +447,7 @@ out: fail_bulk: ptlrpc_free_bulk(desc); fail_preprw: - /* FIXME: how do we undo the preprw? */ + /* FIXME: how do we undo the preprw? 
- answer = call commitrw */ goto out_free; } @@ -457,6 +466,7 @@ static int ost_handle(struct ptlrpc_request *req) req->rq_export == NULL) { CERROR("lustre_ost: operation %d on unconnected OST\n", req->rq_reqmsg->opc); + req->rq_status = -ENOTCONN; GOTO(out, rc = -ENOTCONN); } @@ -592,19 +602,18 @@ static int ost_setup(struct obd_device *obddev, obd_count len, void *buf) RETURN(-EINVAL); } - MOD_INC_USE_COUNT; tgt = class_uuid2obd(data->ioc_inlbuf1); if (!tgt || !(tgt->obd_flags & OBD_ATTACHED) || !(tgt->obd_flags & OBD_SET_UP)) { CERROR("device not attached or not set up (%d)\n", data->ioc_dev); - GOTO(error_dec, err = -EINVAL); + RETURN(err = -EINVAL); } err = obd_connect(&ost->ost_conn, tgt, NULL, NULL, NULL); if (err) { CERROR("fail to connect to device %d\n", data->ioc_dev); - GOTO(error_dec, err = -EINVAL); + RETURN(err); } ost->ost_service = ptlrpc_init_svc(OST_NEVENTS, OST_NBUFS, @@ -630,8 +639,6 @@ static int ost_setup(struct obd_device *obddev, obd_count len, void *buf) error_disc: obd_disconnect(&ost->ost_conn); -error_dec: - MOD_DEC_USE_COUNT; RETURN(err); } @@ -651,14 +658,12 @@ static int ost_cleanup(struct obd_device * obddev) ptlrpc_unregister_service(ost->ost_service); err = obd_disconnect(&ost->ost_conn); - if (err) { + if (err) CERROR("lustre ost: fail to disconnect device\n"); - RETURN(-EINVAL); - } - MOD_DEC_USE_COUNT; - RETURN(0); + RETURN(err); } + int ost_attach(struct obd_device *dev, obd_count len, void *data) { return lprocfs_reg_obd(dev, status_var_nm_1, dev); @@ -667,24 +672,71 @@ int ost_attach(struct obd_device *dev, obd_count len, void *data) int ost_detach(struct obd_device *dev) { return lprocfs_dereg_obd(dev); - } +/* This is so similar to mds_connect that it makes my heart weep: we should + * shuffle the UUID into obd_export proper and make this all happen in + * target_handle_connect. 
+ */ +static int ost_connect(struct lustre_handle *conn, + struct obd_device *obd, obd_uuid_t cluuid, + struct recovd_obd *recovd, + ptlrpc_recovery_cb_t recover) +{ + struct obd_export *exp; + struct ost_export_data *oed; + struct list_head *p; + int rc; + ENTRY; + + if (!conn || !obd || !cluuid) + RETURN(-EINVAL); + + /* lctl gets a backstage, all-access pass. */ + if (!strcmp(cluuid, "OBD_CLASS_UUID")) + goto dont_check_exports; + + spin_lock(&obd->obd_dev_lock); + list_for_each(p, &obd->obd_exports) { + exp = list_entry(p, struct obd_export, exp_obd_chain); + oed = &exp->exp_ost_data; + if (!memcmp(cluuid, oed->oed_uuid, sizeof oed->oed_uuid)) { + spin_unlock(&obd->obd_dev_lock); + LASSERT(exp->exp_obd == obd); + + RETURN(target_handle_reconnect(conn, exp, cluuid)); + } + } + + dont_check_exports: + rc = class_connect(conn, obd, cluuid); + if (rc) + RETURN(rc); + exp = class_conn2export(conn); + LASSERT(exp); + + oed = &exp->exp_ost_data; + memcpy(oed->oed_uuid, cluuid, sizeof oed->oed_uuid); + + RETURN(0); +} /* use obd ops to offer management infrastructure */ static struct obd_ops ost_obd_ops = { - o_attach: ost_attach, - o_detach: ost_detach, - o_setup: ost_setup, - o_cleanup: ost_cleanup, + o_owner: THIS_MODULE, + o_attach: ost_attach, + o_detach: ost_detach, + o_setup: ost_setup, + o_cleanup: ost_cleanup, + o_connect: ost_connect, }; static int __init ost_init(void) { int rc; - rc = class_register_type(&ost_obd_ops, status_class_var, + rc = class_register_type(&ost_obd_ops, status_class_var, LUSTRE_OST_NAME); RETURN(rc); @@ -692,7 +744,6 @@ static int __init ost_init(void) static void __exit ost_exit(void) { - class_unregister_type(LUSTRE_OST_NAME); } diff --git a/lustre/patches/.cvsignore b/lustre/patches/.cvsignore deleted file mode 100644 index e530020..0000000 --- a/lustre/patches/.cvsignore +++ /dev/null @@ -1,8 +0,0 @@ -.Xrefs -config.log -config.status -configure -Makefile -Makefile.in -.deps -TAGS diff --git a/lustre/ptlbd/.cvsignore 
b/lustre/ptlbd/.cvsignore new file mode 100644 index 0000000..e995588 --- /dev/null +++ b/lustre/ptlbd/.cvsignore @@ -0,0 +1,3 @@ +.deps +Makefile +Makefile.in diff --git a/lustre/ptlbd/Makefile.am b/lustre/ptlbd/Makefile.am new file mode 100644 index 0000000..bfaeb25 --- /dev/null +++ b/lustre/ptlbd/Makefile.am @@ -0,0 +1,14 @@ +# Copyright (C) 2002 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +DEFS= + +MODULE = ptlbd +modulefs_DATA = ptlbd.o +EXTRA_PROGRAMS = ptlbd + +ptlbd_SOURCES = blk.c client.c main.c rpc.c server.c + +include $(top_srcdir)/Rules diff --git a/lustre/ptlbd/blk.c b/lustre/ptlbd/blk.c new file mode 100644 index 0000000..4a793436 --- /dev/null +++ b/lustre/ptlbd/blk.c @@ -0,0 +1,247 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_PTLBD + +#include +#include +#include +#include +#include + +/* + * todo: + * assign proper major number + * allow more minors + * discover actual block sizes? 
+ * allow more than one sector per io + * think about vary-io + * restrict single ops to sequential block io + * ddn target addresses need to be 32 bit + * cant get to addresses after 0xFFFF0000 + */ + +#define PTLBD_MAJOR 253 +#define PTLBD_MAX_MINOR 1 + +#define MAJOR_NR PTLBD_MAJOR +#define LOCAL_END_REQUEST +#include +#include +#include + +static int ptlbd_size_size[PTLBD_MAX_MINOR]; +static int ptlbd_size[PTLBD_MAX_MINOR]; +static int ptlbd_hardsect_size[PTLBD_MAX_MINOR]; +static int ptlbd_max_sectors[PTLBD_MAX_MINOR]; +//RHism static char ptlbd_dev_varyio[PTLBD_MAX_MINOR]; + +/* + * per minor state, indexed by minor. + */ + +static struct ptlbd_obd *one_for_now; + +void ptlbd_blk_register(struct ptlbd_obd *ptlbd) +{ + ENTRY; + one_for_now = ptlbd; + EXIT; +} + +static struct ptlbd_obd * ptlbd_get_minor(int minor) +{ + ENTRY; + if ( minor >= PTLBD_MAX_MINOR ) + RETURN( ERR_PTR(-ENODEV) ); + RETURN(one_for_now); +} + +static struct ptlbd_obd * ptlbd_get_inode(struct inode *inode) +{ + ENTRY; + + if ( inode == NULL ) /* can this really happen? */ + RETURN( ERR_PTR(-EINVAL) ); + + return ptlbd_get_minor(MINOR(inode->i_rdev)); +} + +static int ptlbd_open(struct inode *inode, struct file *file) +{ + struct ptlbd_obd *ptlbd = ptlbd_get_inode(inode); + ENTRY; + + if ( IS_ERR(ptlbd) ) + RETURN(PTR_ERR(ptlbd)); + if ( ptlbd->bd_import.imp_connection == NULL ) + RETURN(-ENODEV); + + ptlbd->refcount++; + RETURN(0); +} + +static int ptlbd_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct ptlbd_obd *ptlbd; + + if ( ! 
capable(CAP_SYS_ADMIN) ) + RETURN(-EPERM); + + ptlbd = ptlbd_get_inode(inode); + if ( IS_ERR(ptlbd) ) + RETURN( PTR_ERR(ptlbd) ); + + /* XXX getattr{,64} */ + + RETURN(-EINVAL); +} + +static int ptlbd_release(struct inode *inode, struct file *file) +{ + struct ptlbd_obd *ptlbd = ptlbd_get_inode(inode); + ENTRY; + + if ( IS_ERR(ptlbd) ) + RETURN( PTR_ERR(ptlbd) ); + + ptlbd->refcount--; + RETURN(0); +} + +static void ptlbd_end_request_havelock(struct request *req) +{ + struct buffer_head *bh; + int uptodate = 1; + + if ( req->errors ) + uptodate = 0; + + while( (bh = req->bh) != NULL ) { + blk_finished_io(bh->b_size >> 9); + req->bh = bh->b_reqnext; + bh->b_reqnext = NULL; + bh->b_end_io(bh, uptodate); + } + blkdev_release_request(req); +} + +#if 0 +static void ptlbd_end_request_getlock(struct request *req) +{ + unsigned long flags; + + spin_lock_irqsave(&io_request_lock, flags); + ptlbd_end_request_havelock(req); + spin_unlock_irqrestore(&io_request_lock, flags); +} +#endif + +static void ptlbd_request(request_queue_t *q) +{ + struct ptlbd_obd *ptlbd; + struct request *req; + ptlbd_cmd_t cmd; + ENTRY; + + while ( !QUEUE_EMPTY ) { + req = CURRENT; + ptlbd = ptlbd_get_minor(MINOR(req->rq_dev)); + + blkdev_dequeue_request(req); + + if ( ptlbd->refcount <= 0 ) { + req->errors++; + ptlbd_end_request_havelock(req); + return; + } + + spin_unlock_irq(&io_request_lock); + + /* XXX dunno if we're supposed to get this or not.. 
*/ + LASSERT(req->cmd != READA); + + if ( req->cmd == READ ) + cmd = PTLBD_READ; + else + cmd = PTLBD_WRITE; + + ptlbd_send_req(ptlbd, cmd, req->bh); + + spin_lock_irq(&io_request_lock); + + ptlbd_end_request_havelock(req); + } +} + +static struct block_device_operations ptlbd_ops = { + .owner = THIS_MODULE, + .open = ptlbd_open, + .release = ptlbd_release, + .ioctl = ptlbd_ioctl, +}; + +int ptlbd_blk_init(void) +{ + int ret; + int i; + ENTRY; + + ret = register_blkdev(PTLBD_MAJOR, "ptlbd", &ptlbd_ops); + if ( ret < 0 ) + RETURN(ret); + + blk_size[PTLBD_MAJOR] = ptlbd_size; + blksize_size[PTLBD_MAJOR] = ptlbd_size_size; + hardsect_size[PTLBD_MAJOR] = ptlbd_hardsect_size; + max_sectors[PTLBD_MAJOR] = ptlbd_max_sectors; + //RHism blkdev_varyio[PTLBD_MAJOR] = ptlbd_dev_varyio; + + blk_init_queue(BLK_DEFAULT_QUEUE(PTLBD_MAJOR), ptlbd_request); + blk_queue_headactive(BLK_DEFAULT_QUEUE(MAJOR_NR), 0); + + for ( i = 0 ; i < PTLBD_MAX_MINOR ; i++) { + ptlbd_size_size[i] = 4096; + ptlbd_size[i] = (4096*2048) >> BLOCK_SIZE_BITS; + ptlbd_hardsect_size[i] = 4096; + ptlbd_max_sectors[i] = 2; + //RHism ptlbd_dev_varyio[i] = 0; + /* XXX register_disk? */ + } + + return 0; +} + +void ptlbd_blk_exit(void) +{ + int ret; + ENTRY; + blk_cleanup_queue(BLK_DEFAULT_QUEUE(PTLBD_MAJOR)); + ret = unregister_blkdev(PTLBD_MAJOR, "ptlbd"); + if ( ret ) /* XXX */ + printk("unregister_blkdev() failed: %d\n", ret); +} + +#undef MAJOR_NR diff --git a/lustre/ptlbd/client.c b/lustre/ptlbd/client.c new file mode 100644 index 0000000..d57e001 --- /dev/null +++ b/lustre/ptlbd/client.c @@ -0,0 +1,142 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. 
+ * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_PTLBD + +#include +#include +#include +#include +#include + +static int ptlbd_cl_setup(struct obd_device *obddev, obd_count len, void *buf) +{ + struct ptlbd_obd *ptlbd = &obddev->u.ptlbd; + struct obd_import *imp = &ptlbd->bd_import; + struct obd_ioctl_data* data = buf; + obd_uuid_t server_uuid; + ENTRY; + + if ( ptlbd->bd_import.imp_connection != NULL ) + RETURN(-EALREADY); + + if (data->ioc_inllen1 < 1) { + CERROR("requires a PTLBD server UUID\n"); + RETURN(-EINVAL); + } + + if (data->ioc_inllen1 > 37) { + CERROR("PTLBD server UUID must be less than 38 characters\n"); + RETURN(-EINVAL); + } + + memcpy(server_uuid, data->ioc_inlbuf1, MIN(data->ioc_inllen1, + sizeof(server_uuid))); + + imp->imp_connection = ptlrpc_uuid_to_connection(server_uuid); + if (!imp->imp_connection) + RETURN(-ENOENT); + + INIT_LIST_HEAD(&imp->imp_replay_list); + INIT_LIST_HEAD(&imp->imp_sending_list); + INIT_LIST_HEAD(&imp->imp_delayed_list); + spin_lock_init(&imp->imp_lock); + /* + * from client_obd_connect.. 
*shrug* + */ + INIT_LIST_HEAD(&imp->imp_chain); + imp->imp_last_xid = 0; + imp->imp_max_transno = 0; + imp->imp_peer_last_xid = 0; + imp->imp_peer_committed_transno = 0; + imp->imp_level = LUSTRE_CONN_FULL; + + ptlrpc_init_client(PTLBD_REQUEST_PORTAL, PTLBD_REPLY_PORTAL, + "ptlbd", &ptlbd->bd_client); + imp->imp_client = &ptlbd->bd_client; + imp->imp_obd = obddev; + + ptlbd_blk_register(ptlbd); + + RETURN(0); +} + +static int ptlbd_cl_cleanup(struct obd_device *obddev) +{ +// struct ptlbd_obd *ptlbd = &obddev->u.ptlbd; + ENTRY; + + CERROR("I should be cleaning things up\n"); + + RETURN(0); +} + +#if 0 +static int ptlbd_cl_connect(struct lustre_handle *conn, struct obd_device *obd, + obd_uuid_t cluuid, struct recovd_obd *recovd, + ptlrpc_recovery_cb_t recover) +{ + struct ptlbd_obd *ptlbd = &obd->u.ptlbd; + struct obd_import *imp = &ptlbd->bd_import; + int rc; + ENTRY; + + rc = class_connect(conn, obd, cluuid); + if (rc) + RETURN(rc); + + INIT_LIST_HEAD(&imp->imp_chain); + imp->imp_last_xid = 0; + imp->imp_max_transno = 0; + imp->imp_peer_last_xid = 0; + imp->imp_peer_committed_transno = 0; + imp->imp_level = LUSTRE_CONN_FULL; + + RETURN(0); +} +#endif + +static struct obd_ops ptlbd_cl_obd_ops = { + o_owner: THIS_MODULE, + o_setup: ptlbd_cl_setup, + o_cleanup: ptlbd_cl_cleanup, +#if 0 + o_connect: ptlbd_cl_connect, + o_disconnect: class_disconnect +#endif +}; + +int ptlbd_cl_init(void) +{ + extern struct lprocfs_vars status_class_var[]; + + return class_register_type(&ptlbd_cl_obd_ops, status_class_var, + OBD_PTLBD_CL_DEVICENAME); +} + +void ptlbd_cl_exit(void) +{ + class_unregister_type(OBD_PTLBD_CL_DEVICENAME); +} diff --git a/lustre/ptlbd/main.c b/lustre/ptlbd/main.c new file mode 100644 index 0000000..a95cc3f --- /dev/null +++ b/lustre/ptlbd/main.c @@ -0,0 +1,70 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cluster File Systems, Inc. 
+ * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_PTLBD + +#include +#include +#include + +#include + +static int __init ptlbd_init(void) +{ + int ret; + ENTRY; + + ret = ptlbd_cl_init(); + if ( ret < 0 ) + RETURN(ret); + + ret = ptlbd_sv_init(); + if ( ret < 0 ) + GOTO(out_cl, ret); + + ret = ptlbd_blk_init(); + if ( ret < 0 ) + GOTO(out_sv, ret); + + RETURN(0); + +out_sv: + ptlbd_sv_exit(); +out_cl: + ptlbd_cl_exit(); + RETURN(ret); +} + +static void __exit ptlbd_exit(void) +{ + ENTRY; + ptlbd_cl_exit(); + ptlbd_sv_exit(); + EXIT; +} + +module_init(ptlbd_init); +module_exit(ptlbd_exit); +MODULE_LICENSE("GPL"); diff --git a/lustre/ptlbd/rpc.c b/lustre/ptlbd/rpc.c new file mode 100644 index 0000000..5ff5177 --- /dev/null +++ b/lustre/ptlbd/rpc.c @@ -0,0 +1,550 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_PTLBD + +#include +#include +#include +#include +#include + +static __u32 get_next_xid(struct obd_import *imp) +{ + unsigned long flags; + __u32 xid; + spin_lock_irqsave(&imp->imp_lock, flags); + xid = ++imp->imp_last_xid; + spin_unlock_irqrestore(&imp->imp_lock, flags); + return xid; +} + +static int ptlbd_brw_callback(struct obd_brw_set *set, int phase) +{ + ENTRY; + RETURN(0); +} + +static void decref_bulk_desc(void *data) +{ + struct ptlrpc_bulk_desc *desc = data; + ENTRY; + + ptlrpc_bulk_decref(desc); + EXIT; +} + +/* this is the callback function which is invoked by the Portals + * event handler associated with the bulk_sink queue and bulk_source queue. 
+ */ +static void ptlbd_ptl_ev_hdlr(struct ptlrpc_bulk_desc *desc) +{ + ENTRY; + + LASSERT(desc->bd_brw_set != NULL); + LASSERT(desc->bd_brw_set->brw_callback != NULL); + + desc->bd_brw_set->brw_callback(desc->bd_brw_set, CB_PHASE_FINISH); + + prepare_work(&desc->bd_queue, decref_bulk_desc, desc); + schedule_work(&desc->bd_queue); + + EXIT; +} + + +int ptlbd_write_put_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, + struct buffer_head *first_bh, unsigned int page_count) +{ + struct obd_import *imp = &ptlbd->bd_import; + struct ptlbd_op *op; + struct ptlbd_niob *niob, *niobs; + struct ptlbd_rsp *rsp; + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + struct buffer_head *bh; + int rc, size[2]; + struct obd_brw_set *set; + ENTRY; + + size[0] = sizeof(struct ptlbd_op); + size[1] = page_count * sizeof(struct ptlbd_niob); + + req = ptlrpc_prep_req(imp, cmd, 2, size, NULL); + if (!req) + GOTO(out, rc = -ENOMEM); + /* XXX might not need these */ + req->rq_request_portal = PTLBD_REQUEST_PORTAL; + req->rq_reply_portal = PTLBD_REPLY_PORTAL; + + op = lustre_msg_buf(req->rq_reqmsg, 0); + niobs = lustre_msg_buf(req->rq_reqmsg, 1); + + /* XXX pack */ + op->op_cmd = cmd; + op->op_lun = 0; + op->op_niob_cnt = page_count; + op->op__padding = 0; + op->op_block_cnt = page_count; + + desc = ptlrpc_prep_bulk(imp->imp_connection); + if ( desc == NULL ) + GOTO(out_req, rc = -ENOMEM); + desc->bd_portal = PTLBD_BULK_PORTAL; + desc->bd_ptl_ev_hdlr = ptlbd_ptl_ev_hdlr; + + /* XXX someone needs to free this */ + set = obd_brw_set_new(); + if (set == NULL) + GOTO(out_desc, rc = -ENOMEM); + + set->brw_callback = ptlbd_brw_callback; + +#if 0 + xid = get_next_xid(imp); +#endif + + for ( niob = niobs, bh = first_bh ; bh ; bh = bh->b_next, niob++ ) { +#if 0 + struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); + if (bulk == NULL) + GOTO(out_set, rc = -ENOMEM); +#endif + +#if 0 + niob->n_xid = xid; +#endif + niob->n_block_nr = bh->b_blocknr; + niob->n_offset = bh_offset(bh); + 
niob->n_length = bh->b_size; + + +#if 0 + bulk->bp_xid = xid; + bulk->bp_buf = bh->b_data; + bulk->bp_page = bh->b_page; + bulk->bp_buflen = bh->b_size; +#endif + } + + + size[0] = sizeof(struct ptlbd_rsp); + size[1] = sizeof(struct ptlbd_niob) * page_count; + req->rq_replen = lustre_msg_size(2, size); + + /* XXX find out how we're really supposed to manage levels */ + req->rq_level = imp->imp_level; + rc = ptlrpc_queue_wait(req); + + rsp = lustre_msg_buf(req->rq_repmsg, 0); + + niob = lustre_msg_buf(req->rq_repmsg, 1); + /* XXX check that op->num matches ours */ + for ( bh = first_bh ; bh ; bh = bh->b_next, niob++ ) { + struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); + if (bulk == NULL) + GOTO(out_set, rc = -ENOMEM); + + bulk->bp_xid = niob->n_xid; + bulk->bp_page = bh->b_page; + bulk->bp_buf = bh->b_data; + bulk->bp_buflen = bh->b_size; + } + + obd_brw_set_add(set, desc); + rc = ptlrpc_send_bulk(desc); + + /* if there's an error, no brw_finish called, just like + * osc_brw_read */ + + GOTO(out_req, rc); + +out_set: + obd_brw_set_free(set); +out_desc: + ptlrpc_bulk_decref(desc); +out_req: + ptlrpc_req_finished(req); +out: + RETURN(rc); +} + +int ptlbd_read_put_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, + struct buffer_head *first_bh, unsigned int page_count) +{ + struct obd_import *imp = &ptlbd->bd_import; + struct ptlbd_op *op; + struct ptlbd_niob *niob, *niobs; + struct ptlbd_rsp *rsp; + struct ptlrpc_request *req; + struct ptlrpc_bulk_desc *desc; + struct buffer_head *bh; + int rc, rep_size, size[2]; + struct obd_brw_set *set; + __u32 xid; + ENTRY; + + size[0] = sizeof(struct ptlbd_op); + size[1] = page_count * sizeof(struct ptlbd_niob); + + req = ptlrpc_prep_req(imp, cmd, 2, size, NULL); + if (!req) + GOTO(out, rc = -ENOMEM); + /* XXX might not need these? 
*/ + req->rq_request_portal = PTLBD_REQUEST_PORTAL; + req->rq_reply_portal = PTLBD_REPLY_PORTAL; + + op = lustre_msg_buf(req->rq_reqmsg, 0); + niobs = lustre_msg_buf(req->rq_reqmsg, 1); + + /* XXX pack */ + op->op_cmd = cmd; + op->op_lun = 0; + op->op_niob_cnt = page_count; + op->op__padding = 0; + op->op_block_cnt = page_count; + + desc = ptlrpc_prep_bulk(imp->imp_connection); + if ( desc == NULL ) + GOTO(out_req, rc = -ENOMEM); + desc->bd_portal = PTLBD_BULK_PORTAL; + desc->bd_ptl_ev_hdlr = ptlbd_ptl_ev_hdlr; + + /* XXX someone needs to free this */ + set = obd_brw_set_new(); + if (set == NULL) + GOTO(out_desc, rc = -ENOMEM); + + set->brw_callback = ptlbd_brw_callback; + + xid = get_next_xid(imp); + + for ( niob = niobs, bh = first_bh ; bh ; bh = bh->b_next, niob++ ) { + struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); + if (bulk == NULL) + GOTO(out_set, rc = -ENOMEM); + + niob->n_xid = xid; + niob->n_block_nr = bh->b_blocknr; + niob->n_offset = bh_offset(bh); + niob->n_length = bh->b_size; + + bulk->bp_xid = xid; + bulk->bp_buf = bh->b_data; + bulk->bp_page = bh->b_page; + bulk->bp_buflen = bh->b_size; + } + + /* XXX put in OBD_FAIL_CHECK for ptlbd? 
*/ + rc = ptlrpc_register_bulk(desc); + if (rc) + GOTO(out_set, rc); + + obd_brw_set_add(set, desc); + + rep_size = sizeof(struct ptlbd_rsp); + req->rq_replen = lustre_msg_size(1, &rep_size); + + /* XXX find out how we're really supposed to manage levels */ + req->rq_level = imp->imp_level; + rc = ptlrpc_queue_wait(req); + + rsp = lustre_msg_buf(req->rq_repmsg, 0); + + /* if there's an error, no brw_finish called, just like + * osc_brw_read */ + + GOTO(out_req, rc); + +out_set: + obd_brw_set_free(set); +out_desc: + ptlrpc_bulk_decref(desc); +out_req: + ptlrpc_req_finished(req); +out: + RETURN(rc); +} + +int ptlbd_send_req(struct ptlbd_obd *ptlbd, ptlbd_cmd_t cmd, + struct buffer_head *first_bh) +{ + unsigned int page_count = 0; + struct buffer_head *bh; + int rc; + ENTRY; + + for ( page_count = 0, bh = first_bh ; bh ; bh = bh->b_next ) + page_count++; + + switch (cmd) { + case PTLBD_READ: + rc = ptlbd_read_put_req(ptlbd, cmd, + first_bh, page_count); + break; + case PTLBD_WRITE: + rc = ptlbd_write_put_req(ptlbd, cmd, + first_bh, page_count); + break; + default: + rc = -EINVAL; + break; + }; + + RETURN(rc); +} + +static int ptlbd_bulk_timeout(void *data) +{ +/* struct ptlrpc_bulk_desc *desc = data;*/ + ENTRY; + + CERROR("ugh, timed out\n"); + + RETURN(1); +} + +#define SILLY_MAX 2048 +static struct page *pages[SILLY_MAX] = {NULL,}; + +static struct page * fake_page(int block_nr) +{ + if ( block_nr >= SILLY_MAX ) + return NULL; + + if (pages[block_nr] == NULL) { + void *vaddr = (void *)get_free_page(GFP_KERNEL); + pages[block_nr] = virt_to_page(vaddr); + } + return pages[block_nr]; +} + +static int ptlbd_put_write(struct ptlrpc_request *req) +{ + struct lustre_handle *conn = (struct lustre_handle *)req->rq_reqmsg; + struct ptlbd_op *op; + struct ptlbd_niob *reply_niob, *request_niob; + struct ptlbd_rsp *rsp; + struct ptlrpc_bulk_desc *desc; + struct ptlrpc_service *srv; + struct l_wait_info lwi; + int size[2]; + int i, page_count, rc; + __u32 xid; + + op = 
lustre_msg_buf(req->rq_reqmsg, 0); + request_niob = lustre_msg_buf(req->rq_reqmsg, 1); + page_count = req->rq_reqmsg->buflens[1] / sizeof(struct ptlbd_niob); + + size[0] = sizeof(struct ptlbd_rsp); + size[1] = sizeof(struct ptlbd_niob) * page_count; + rc = lustre_pack_msg(2, size, NULL, &req->rq_replen, &req->rq_repmsg); + if (rc) + GOTO(out, rc); + reply_niob = lustre_msg_buf(req->rq_repmsg, 1); + + desc = ptlrpc_prep_bulk(req->rq_connection); + if (desc == NULL) + GOTO(out, rc = -ENOMEM); + desc->bd_ptl_ev_hdlr = NULL; + desc->bd_portal = PTLBD_BULK_PORTAL; + memcpy(&(desc->bd_conn), &conn, sizeof(conn)); /* XXX what? */ + + srv = req->rq_obd->u.ptlbd.ptlbd_service; + spin_lock(&srv->srv_lock); + xid = srv->srv_xid++; /* single xid for all pages */ + spin_unlock(&srv->srv_lock); + + for ( i = 0; i < page_count; i++) { + struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); + if (bulk == NULL) + GOTO(out_desc, rc = -ENOMEM); + + reply_niob[i] = request_niob[i]; + reply_niob[i].n_xid = xid; + + bulk->bp_xid = xid; + bulk->bp_page = fake_page(request_niob[i].n_block_nr); + bulk->bp_buf = page_address(bulk->bp_page); + bulk->bp_buflen = request_niob[i].n_length; + } + + rc = ptlrpc_register_bulk(desc); + if ( rc ) + GOTO(out_desc, rc); + + rsp = lustre_msg_buf(req->rq_reqmsg, 0); + rsp->r_status = 42; + rsp->r_error_cnt = 13; + ptlrpc_reply(req->rq_svc, req); + + /* this synchronization probably isn't good enough */ + lwi = LWI_TIMEOUT(obd_timeout * HZ, ptlbd_bulk_timeout, desc); + rc = l_wait_event(desc->bd_waitq, desc->bd_flags &PTL_BULK_FL_RCVD, + &lwi); + +out_desc: + ptlrpc_free_bulk(desc); +out: + RETURN(rc); +} + +static int ptlbd_put_read(struct ptlrpc_request *req) +{ + struct ptlbd_op *op; + struct ptlbd_niob *niob, *niobs; + struct ptlbd_rsp *rsp; + struct ptlrpc_bulk_desc *desc; + struct l_wait_info lwi; + int size[1]; + int i, page_count, rc; + + op = lustre_msg_buf(req->rq_reqmsg, 0); + niobs = lustre_msg_buf(req->rq_reqmsg, 1); + page_count = 
req->rq_reqmsg->buflens[1] / sizeof(struct ptlbd_niob); + + desc = ptlrpc_prep_bulk(req->rq_connection); + if (desc == NULL) + GOTO(out, rc = -ENOMEM); + desc->bd_portal = PTLBD_BULK_PORTAL; + + for ( i = 0, niob = niobs ; i < page_count; niob++, i++) { + struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc); + if (bulk == NULL) + GOTO(out_bulk, rc = -ENOMEM); + + /* + * XXX what about the block number? + */ + bulk->bp_xid = niob->n_xid; + bulk->bp_page = fake_page(niob->n_block_nr); + bulk->bp_buf = page_address(bulk->bp_page); + bulk->bp_buflen = niob->n_length; + } + + rc = ptlrpc_send_bulk(desc); + if ( rc ) + GOTO(out_bulk, rc); + + /* this synchronization probably isn't good enough */ + lwi = LWI_TIMEOUT(obd_timeout * HZ, ptlbd_bulk_timeout, desc); + rc = l_wait_event(desc->bd_waitq, desc->bd_flags &PTL_BULK_FL_SENT, + &lwi); + + size[0] = sizeof(struct ptlbd_rsp); + rc = lustre_pack_msg(1, size, NULL, &req->rq_replen, &req->rq_repmsg); + if ( rc ) + GOTO(out, rc); + + rsp = lustre_msg_buf(req->rq_repmsg, 0); + if ( rsp == NULL ) + GOTO(out, rc = -EINVAL); + + rsp->r_error_cnt = 42; + rsp->r_status = 69; + + req->rq_status = 0; /* XXX */ + ptlrpc_reply(req->rq_svc, req); + +out_bulk: + ptlrpc_free_bulk(desc); +out: + RETURN(rc); +} + + +int ptlbd_parse_req(struct ptlrpc_request *req) +{ + struct ptlbd_op *op; + int rc; + ENTRY; + + rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen); + if ( rc ) + RETURN(rc); + + op = lustre_msg_buf(req->rq_reqmsg, 0); + + switch(op->op_cmd) { + case PTLBD_READ: + ptlbd_put_read(req); + break; + case PTLBD_WRITE: + ptlbd_put_write(req); + break; + default: + CERROR("fix this %d\n", op->op_cmd); + break; + } + + RETURN(0); +} + + +#if 0 +int ptlbd_bh_req(int cmd, struct ptlbd_state *st, struct buffer_head *first_bh) +{ + struct obd_brw_set *set = NULL; + struct brw_page *pg = NULL; + struct buffer_head *bh; + int rc, i, pg_bytes = 0; + ENTRY; + + for ( bh = first_bh ; bh ; bh = bh->b_reqnext ) + pg_bytes += 
sizeof(struct brw_page); + + OBD_ALLOC(pg, pg_bytes); + if ( pg == NULL ) + GOTO(out, rc = -ENOMEM); + + set = obd_brw_set_new(); + if (set == NULL) + GOTO(out, rc = -ENOMEM); + + for ( i = 0, bh = first_bh ; bh ; bh = bh->b_reqnext, i++) { + pg[i].pg = bh->b_page; + pg[i].off = bh_offset(bh); + pg[i].count = bh->b_size; + pg[i].flag = 0; + } + + set->brw_callback = ll_brw_sync_wait; + rc = obd_brw(cmd, /* lsm */NULL, num_pages, pg, set); + if ( rc ) + GOTO(out, rc); + + rc = ll_brw_sync_wait(set, CB_PHASE_START); + if (rc) + CERROR("error from callback: rc = %d\n", rc); + +out: + if ( pg != NULL ) + OBD_FREE(pg, pg_bytes); + if ( set != NULL ) + obd_brw_set_free(set); + + RETURN(rc); +} +#endif diff --git a/lustre/ptlbd/server.c b/lustre/ptlbd/server.c new file mode 100644 index 0000000..422f0e1 --- /dev/null +++ b/lustre/ptlbd/server.c @@ -0,0 +1,154 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2002 Cluster File Systems, Inc. + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include + +#define DEBUG_SUBSYSTEM S_PTLBD + +#include +#include +#include +#include +#include + +#if 0 +static int ptlbd_sv_callback(struct ptlrpc_request *req) +{ + int rc; + ENTRY; + + rc = ptlbd_parse_request(req); + + rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen); + if ( rc ) + GOTO(out, rc); + + printk("callback got a friggin opc %d\n", req->rq_reqmsg->opc); + +out: + RETURN(rc); +} +#endif + +static int ptlbd_sv_already_setup = 1; + +static int ptlbd_sv_setup(struct obd_device *obddev, obd_count len, void *buf) +{ +#if 0 + struct obd_ioctl_data* data = buf; + obd_uuid_t server_uuid; +#endif + struct ptlbd_obd *ptlbd = &obddev->u.ptlbd; + int rc; + ENTRY; + +#if 0 + if (data->ioc_inllen1 < 1) { + CERROR("requires a PTLBD server UUID\n"); + RETURN(rc = -EINVAL); + } + + if (data->ioc_inllen1 > 37) { + CERROR("PTLBD server UUID must be less than 38 characters\n"); + RETURN(rc = -EINVAL); + } + + memcpy(server_uuid, data->ioc_inlbuf1, MIN(data->ioc_inllen1, + sizeof(server_uuid))); + +#endif + ptlbd->ptlbd_service = + ptlrpc_init_svc(PTLBD_NEVENTS, PTLBD_NBUFS, PTLBD_BUFSIZE, + PTLBD_MAXREQSIZE, PTLBD_REQUEST_PORTAL, + PTLBD_REPLY_PORTAL, "self", + ptlbd_parse_req, "ptlbd_sv"); + + if (!ptlbd->ptlbd_service) { + CERROR("failed to start service\n"); + RETURN(rc = -ENOMEM); + } + + rc = ptlrpc_start_thread(obddev, ptlbd->ptlbd_service, "ptldb"); + if (rc) { + CERROR("cannot start PTLBD thread: rc %d\n", rc); + LBUG(); + GOTO(out_thread, rc); + } + + ptlbd_sv_already_setup = 1; + + RETURN(0); + + out_thread: + ptlrpc_stop_all_threads(ptlbd->ptlbd_service); + ptlrpc_unregister_service(ptlbd->ptlbd_service); + + return rc; +} + +static int ptlbd_sv_cleanup(struct obd_device *obddev) +{ + struct ptlbd_obd *ptlbd = &obddev->u.ptlbd; + ENTRY; + + /* XXX check for state */ + + ptlrpc_stop_all_threads(ptlbd->ptlbd_service); + ptlrpc_unregister_service(ptlbd->ptlbd_service); + + ptlbd_sv_already_setup = 0; + RETURN(0); +} + +#if 0 
+static int ptlbd_sv_connect(struct lustre_handle *conn, struct obd_device *src, + obd_uuid_t cluuid, struct recovd_obd *recovd, + ptlrpc_recovery_cb_t recover) +{ + return class_connect(conn, src, cluuid); +} +#endif + +static struct obd_ops ptlbd_sv_obd_ops = { + o_owner: THIS_MODULE, +/* o_iocontrol: ptlbd_iocontrol,*/ + o_setup: ptlbd_sv_setup, + o_cleanup: ptlbd_sv_cleanup, +#if 0 + o_connect: ptlbd_sv_connect, + o_disconnect: class_disconnect +#endif +}; + +int ptlbd_sv_init(void) +{ + extern struct lprocfs_vars status_class_var[]; + + return class_register_type(&ptlbd_sv_obd_ops, status_class_var, + OBD_PTLBD_SV_DEVICENAME); +} + +void ptlbd_sv_exit(void) +{ + class_unregister_type(OBD_PTLBD_SV_DEVICENAME); +} diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index ccaa108..1d6c719 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -259,6 +259,7 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode, { struct ptlrpc_connection *conn; struct ptlrpc_request *request; + unsigned long flags; int rc; ENTRY; @@ -292,9 +293,9 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode, INIT_LIST_HEAD(&request->rq_list); atomic_set(&request->rq_refcount, 1); - spin_lock(&imp->imp_lock); + spin_lock_irqsave(&imp->imp_lock, flags); request->rq_xid = HTON__u32(++imp->imp_last_xid); - spin_unlock(&imp->imp_lock); + spin_unlock_irqrestore(&imp->imp_lock, flags); request->rq_reqmsg->magic = PTLRPC_MSG_MAGIC; request->rq_reqmsg->version = PTLRPC_MSG_VERSION; @@ -318,7 +319,7 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) request, request->rq_reqmsg->opc, request->rq_connection->c_remote_uuid, request->rq_import->imp_client->cli_request_portal, - request->rq_refcount); + atomic_read (&request->rq_refcount)); /* LBUG(); */ } @@ -333,11 +334,13 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) } if (request->rq_import) { + unsigned long flags = 0; if 
(!locked) - spin_lock(&request->rq_import->imp_lock); + spin_lock_irqsave(&request->rq_import->imp_lock, flags); list_del_init(&request->rq_list); if (!locked) - spin_unlock(&request->rq_import->imp_lock); + spin_unlock_irqrestore(&request->rq_import->imp_lock, + flags); } ptlrpc_put_connection(request->rq_connection); @@ -356,6 +359,12 @@ static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked) if (request == NULL) RETURN(1); + if (request == (void *)(long)(0x5a5a5a5a5a5a5a5a)) { + CERROR("dereferencing freed request (bug 575)\n"); + LBUG(); + RETURN(1); + } + DEBUG_REQ(D_INFO, request, "refcount now %u", atomic_read(&request->rq_refcount) - 1); @@ -379,6 +388,8 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req) ENTRY; if (req->rq_repmsg != NULL) { req->rq_transno = NTOH__u64(req->rq_repmsg->transno); + /* Store transno in reqmsg for replay. */ + req->rq_reqmsg->transno = req->rq_repmsg->transno; req->rq_flags |= PTL_RPC_FL_REPLIED; GOTO(out, rc = 1); } @@ -412,7 +423,7 @@ static int ptlrpc_check_status(struct ptlrpc_request *req) err = req->rq_repmsg->status; if (req->rq_repmsg->type == NTOH__u32(PTL_RPC_MSG_ERR)) { - DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR (%d)\n", err); + DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR (%d)", err); RETURN(err ? err : -EINVAL); } @@ -438,10 +449,13 @@ static int ptlrpc_abort(struct ptlrpc_request *request) { /* First remove the ME for the reply; in theory, this means * that we can tear down the buffer safely. 
*/ - PtlMEUnlink(request->rq_reply_me_h); + if (PtlMEUnlink(request->rq_reply_me_h) != PTL_OK) + RETURN(0); OBD_FREE(request->rq_reply_md.start, request->rq_replen); + + memset(&request->rq_reply_me_h, 0, sizeof(request->rq_reply_me_h)); + request->rq_reply_md.start = NULL; request->rq_repmsg = NULL; - request->rq_replen = 0; return 0; } @@ -487,11 +501,12 @@ void ptlrpc_cleanup_client(struct obd_import *imp) struct list_head *tmp, *saved; struct ptlrpc_request *req; struct ptlrpc_connection *conn = imp->imp_connection; + unsigned long flags; ENTRY; LASSERT(conn); - spin_lock(&imp->imp_lock); + spin_lock_irqsave(&imp->imp_lock, flags); list_for_each_safe(tmp, saved, &imp->imp_replay_list) { req = list_entry(tmp, struct ptlrpc_request, rq_list); @@ -501,7 +516,7 @@ void ptlrpc_cleanup_client(struct obd_import *imp) req->rq_import = NULL; __ptlrpc_req_finished(req, 0); } - spin_unlock(&imp->imp_lock); + spin_unlock_irqrestore(&imp->imp_lock, flags); EXIT; return; @@ -554,6 +569,7 @@ static int expired_request(void *data) } DEBUG_REQ(D_ERROR, req, "timeout"); + ptlrpc_abort(req); req->rq_flags |= PTL_RPC_FL_TIMEOUT; if (!req->rq_import) { @@ -571,16 +587,13 @@ static int expired_request(void *data) if (!req->rq_import->imp_connection->c_recovd_data.rd_recovd) RETURN(1); - req->rq_timeout = 0; recovd_conn_fail(req->rq_import->imp_connection); -#if 0 /* If this request is for recovery or other primordial tasks, * don't go back to sleep. */ if (req->rq_level < LUSTRE_CONN_FULL) RETURN(1); -#endif RETURN(0); } @@ -592,24 +605,13 @@ static int interrupted_request(void *data) RETURN(1); /* ignored, as of this writing */ } -/* If the import has been invalidated (such as by an OST failure), the - * request must fail with -EIO. - * - * Must be called with imp_lock held, will drop it if it returns -EIO. 
- */ -#define EIO_IF_INVALID(req) \ -if (req->rq_import->imp_flags & IMP_INVALID) { \ - DEBUG_REQ(D_ERROR, req, "IMP_INVALID:"); \ - spin_unlock(&imp->imp_lock); \ - RETURN(-EIO); \ -} - int ptlrpc_queue_wait(struct ptlrpc_request *req) { int rc = 0; struct l_wait_info lwi; struct obd_import *imp = req->rq_import; struct ptlrpc_connection *conn = imp->imp_connection; + unsigned int flags; ENTRY; init_waitqueue_head(&req->rq_wait_for_rep); @@ -620,12 +622,22 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) NTOH__u32(req->rq_reqmsg->status), req->rq_xid, conn->c_peer.peer_nid, NTOH__u32(req->rq_reqmsg->opc)); - spin_lock(&imp->imp_lock); - EIO_IF_INVALID(req); + spin_lock_irqsave(&imp->imp_lock, flags); + + /* + * If the import has been invalidated (such as by an OST failure), the + * request must fail with -EIO. + */ + if (req->rq_import->imp_flags & IMP_INVALID) { + DEBUG_REQ(D_ERROR, req, "IMP_INVALID:"); + spin_unlock_irqrestore(&imp->imp_lock, flags); + RETURN(-EIO); + } + if (req->rq_level > imp->imp_level) { list_del(&req->rq_list); list_add_tail(&req->rq_list, &imp->imp_delayed_list); - spin_unlock(&imp->imp_lock); + spin_unlock_irqrestore(&imp->imp_lock, flags); DEBUG_REQ(D_HA, req, "\"%s\" waiting for recovery: (%d < %d)", current->comm, req->rq_level, imp->imp_level); @@ -634,15 +646,16 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) (req->rq_level <= imp->imp_level) || (req->rq_flags & PTL_RPC_FL_ERR), &lwi); - spin_lock(&imp->imp_lock); + spin_lock_irqsave(&imp->imp_lock, flags); list_del_init(&req->rq_list); - spin_unlock(&imp->imp_lock); if (req->rq_flags & PTL_RPC_FL_ERR) - RETURN(-EIO); + rc = -EIO; - if (rc) + if (rc) { + spin_unlock_irqrestore(&imp->imp_lock, flags); RETURN(rc); + } CERROR("process %d resumed\n", current->pid); } @@ -650,7 +663,7 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) LASSERT(list_empty(&req->rq_list)); list_add_tail(&req->rq_list, &imp->imp_sending_list); - spin_unlock(&imp->imp_lock); + 
spin_unlock_irqrestore(&imp->imp_lock, flags); rc = ptl_send_rpc(req); if (rc) { CDEBUG(D_HA, "error %d, opcode %d, need recovery\n", rc, @@ -660,15 +673,15 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) interrupted_request, req); } else { DEBUG_REQ(D_NET, req, "-- sleeping"); - lwi = LWI_TIMEOUT_INTR(req->rq_timeout * HZ, expired_request, + lwi = LWI_TIMEOUT_INTR(obd_timeout * HZ, expired_request, interrupted_request, req); } l_wait_event(req->rq_wait_for_rep, ptlrpc_check_reply(req), &lwi); DEBUG_REQ(D_NET, req, "-- done sleeping"); - spin_lock(&imp->imp_lock); + spin_lock_irqsave(&imp->imp_lock, flags); list_del_init(&req->rq_list); - spin_unlock(&imp->imp_lock); + spin_unlock_irqrestore(&imp->imp_lock, flags); if (req->rq_flags & PTL_RPC_FL_ERR) { ptlrpc_abort(req); @@ -681,6 +694,7 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) req->rq_flags &= ~PTL_RPC_FL_RESEND; lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); DEBUG_REQ(D_HA, req, "resending: "); + spin_lock_irqsave(&imp->imp_lock, flags); goto resend; } @@ -713,11 +727,11 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) GOTO(out, rc = -EINVAL); } #endif - DEBUG_REQ(D_NET, req, "status %d\n", req->rq_repmsg->status); + DEBUG_REQ(D_NET, req, "status %d", req->rq_repmsg->status); /* We're a rejected connection, need to invalidate and rebuild. */ if (req->rq_repmsg->status == -ENOTCONN) { - spin_lock(&imp->imp_lock); + spin_lock_irqsave(&imp->imp_lock, flags); /* If someone else is reconnecting us (CONN_RECOVD) or has * already completed it (handle mismatch), then we just need * to get out. 
@@ -725,20 +739,23 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) if (imp->imp_level == LUSTRE_CONN_RECOVD || imp->imp_handle.addr != req->rq_reqmsg->addr || imp->imp_handle.cookie != req->rq_reqmsg->cookie) { - spin_unlock(&imp->imp_lock); + spin_unlock_irqrestore(&imp->imp_lock, flags); GOTO(out, rc = -EIO); } imp->imp_level = LUSTRE_CONN_RECOVD; - spin_unlock(&imp->imp_lock); + spin_unlock_irqrestore(&imp->imp_lock, flags); rc = imp->imp_recover(imp, PTLRPC_RECOVD_PHASE_NOTCONN); if (rc) LBUG(); GOTO(out, rc = -EIO); } + rc = ptlrpc_check_status(req); + if (req->rq_import->imp_flags & IMP_REPLAYABLE) { - spin_lock(&imp->imp_lock); - if (req->rq_flags & PTL_RPC_FL_REPLAY || req->rq_transno != 0) { + spin_lock_irqsave(&imp->imp_lock, flags); + if ((req->rq_flags & PTL_RPC_FL_REPLAY || req->rq_transno != 0) + && rc >= 0) { /* Balanced in ptlrpc_free_committed, usually. */ atomic_inc(&req->rq_refcount); list_add_tail(&req->rq_list, &imp->imp_replay_list); @@ -758,18 +775,14 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) imp->imp_peer_committed_transno = req->rq_repmsg->last_committed; ptlrpc_free_committed(imp); - spin_unlock(&imp->imp_lock); + spin_unlock_irqrestore(&imp->imp_lock, flags); } - rc = ptlrpc_check_status(req); - EXIT; out: return rc; } -#undef EIO_IF_INVALID - int ptlrpc_replay_req(struct ptlrpc_request *req) { int rc = 0, old_level, old_status = 0; @@ -780,7 +793,6 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) init_waitqueue_head(&req->rq_wait_for_rep); DEBUG_REQ(D_NET, req, ""); - req->rq_timeout = obd_timeout; req->rq_reqmsg->addr = req->rq_import->imp_handle.addr; req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie; @@ -837,15 +849,16 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) /* XXX looks a lot like super.c:invalidate_request_list, don't it? */ void ptlrpc_abort_inflight(struct obd_import *imp) { + unsigned long flags; struct list_head *tmp, *n; /* Make sure that no new requests get processed for this import. 
* ptlrpc_queue_wait must (and does) hold imp_lock while testing this * flag and then putting requests on sending_list or delayed_list. */ - spin_lock(&imp->imp_lock); + spin_lock_irqsave(&imp->imp_lock, flags); imp->imp_flags |= IMP_INVALID; - spin_unlock(&imp->imp_lock); + spin_unlock_irqrestore(&imp->imp_lock, flags); list_for_each_safe(tmp, n, &imp->imp_sending_list) { struct ptlrpc_request *req = diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 5cbdbc5..1d6284e 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -148,6 +148,14 @@ int ptlrpc_send_bulk(struct ptlrpc_bulk_desc *desc) iov[desc->bd_md.niov].iov_base = bulk->bp_buf; iov[desc->bd_md.niov].iov_len = bulk->bp_buflen; + if (iov[desc->bd_md.niov].iov_len <= 0) { + CERROR("bad bp_buflen[%d] @ %p: %d\n", desc->bd_md.niov, + bulk->bp_buf, bulk->bp_buflen); + CERROR("desc: xid %u, pages %d, ptl %d, ref %d\n", + xid, desc->bd_page_count, desc->bd_portal, + atomic_read(&desc->bd_refcount)); + LBUG(); + } desc->bd_md.niov++; desc->bd_md.length += bulk->bp_buflen; } @@ -384,22 +392,20 @@ int ptl_send_rpc(struct ptlrpc_request *request) /* add a ref, which will be balanced in request_out_callback */ atomic_inc(&request->rq_refcount); if (request->rq_replen != 0) { - /* request->rq_repmsg is set only when the reply comes in, in - * client_packet_callback() */ - if (request->rq_reply_md.start) { + if (request->rq_reply_md.start != NULL) { rc = PtlMEUnlink(request->rq_reply_me_h); - LASSERT (rc == PTL_OK); - OBD_FREE(request->rq_reply_md.start, - request->rq_replen); - /* If we're resending, rq_repmsg needs to be NULLed out - * again so that ptlrpc_check_reply doesn't trip early. 
- */ + if (rc != PTL_OK && rc != PTL_INV_ME) { + CERROR("rc %d\n", rc); + LBUG(); + } + repbuf = (char *)request->rq_reply_md.start; request->rq_repmsg = NULL; - } - OBD_ALLOC(repbuf, request->rq_replen); - if (!repbuf) { - LBUG(); - RETURN(ENOMEM); + } else { + OBD_ALLOC(repbuf, request->rq_replen); + if (!repbuf) { + LBUG(); + RETURN(ENOMEM); + } } rc = PtlMEAttach(request->rq_connection->c_peer.peer_ni, diff --git a/lustre/ptlrpc/recovd.c b/lustre/ptlrpc/recovd.c index 0bbc4b0..d544a19 100644 --- a/lustre/ptlrpc/recovd.c +++ b/lustre/ptlrpc/recovd.c @@ -23,7 +23,6 @@ /* dump_connection_list, but shorter for nicer debugging logs */ static void d_c_l(struct list_head *head) { - int sanity = 0; struct list_head *tmp; list_for_each(tmp, head) { @@ -33,8 +32,6 @@ static void d_c_l(struct list_head *head) CDEBUG(D_HA, " %p = %s (%d/%d)\n", conn, conn->c_remote_uuid, conn->c_recovd_data.rd_phase, conn->c_recovd_data.rd_next_phase); - if (sanity++ > 1000) - LBUG(); } } @@ -277,7 +274,7 @@ static int recovd_handle_event(struct recovd_obd *recovd) static int recovd_main(void *arg) { struct recovd_obd *recovd = (struct recovd_obd *)arg; - + unsigned long flags; ENTRY; lock_kernel(); @@ -287,10 +284,10 @@ static int recovd_main(void *arg) sigfillset(¤t->blocked); recalc_sigpending(); #else - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irqsave(¤t->sigmask_lock, flags); sigfillset(¤t->blocked); recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + spin_unlock_irqrestore(¤t->sigmask_lock, flags); #endif sprintf(current->comm, "lustre_recovd"); diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c index acdecf8..b4f3c85 100644 --- a/lustre/ptlrpc/recover.c +++ b/lustre/ptlrpc/recover.c @@ -24,7 +24,8 @@ #include #include -int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc) +int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc, + struct ptlrpc_request **reqptr) { struct obd_device *obd = imp->imp_obd; struct client_obd *cli = 
&obd->u.cli; @@ -37,6 +38,8 @@ int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc) int rc; request = ptlrpc_prep_req(imp, rq_opc, 2, size, tmp); + if (!request) + RETURN(-ENOMEM); request->rq_level = LUSTRE_CONN_NEW; request->rq_replen = lustre_msg_size(0, NULL); /* @@ -60,7 +63,7 @@ int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc) sizeof (old_hdl.addr)) && !memcmp(&old_hdl.cookie, &request->rq_repmsg->cookie, sizeof (old_hdl.cookie))) { - CERROR("%s@%s didn't like our handle %Lx/%Lx, failed\n", + CERROR("%s@%s didn't like our handle "LPX64"/"LPX64", failed\n", cli->cl_target_uuid, conn->c_remote_uuid, (__u64)(unsigned long)ldlmexp, ldlmexp->exp_cookie); @@ -70,7 +73,7 @@ int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc) old_hdl.addr = request->rq_repmsg->addr; old_hdl.cookie = request->rq_repmsg->cookie; if (memcmp(&imp->imp_handle, &old_hdl, sizeof(old_hdl))) { - CERROR("%s@%s changed handle from %Lx/%Lx to %Lx/%Lx; " + CERROR("%s@%s changed handle from "LPX64"/"LPX64" to "LPX64"/"LPX64"; " "copying, but this may foreshadow disaster\n", cli->cl_target_uuid, conn->c_remote_uuid, old_hdl.addr, old_hdl.cookie, @@ -87,7 +90,7 @@ int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc) old_hdl = imp->imp_handle; imp->imp_handle.addr = request->rq_repmsg->addr; imp->imp_handle.cookie = request->rq_repmsg->cookie; - CERROR("now connected to %s@%s (%Lx/%Lx, was %Lx/%Lx)!\n", + CERROR("now connected to %s@%s ("LPX64"/"LPX64", was "LPX64"/"LPX64")!\n", cli->cl_target_uuid, conn->c_remote_uuid, imp->imp_handle.addr, imp->imp_handle.cookie, old_hdl.addr, old_hdl.cookie); @@ -99,7 +102,7 @@ int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc) } out_disc: - ptlrpc_req_finished(request); + *reqptr = request; return rc; } @@ -136,18 +139,19 @@ int ptlrpc_run_recovery_upcall(struct ptlrpc_connection *conn) RETURN(0); } -int ptlrpc_replay(struct obd_import *imp, int send_last_flag) +int ptlrpc_replay(struct obd_import 
*imp) { int rc = 0; struct list_head *tmp, *pos; struct ptlrpc_request *req; + unsigned long flags; __u64 committed = imp->imp_peer_committed_transno; ENTRY; /* It might have committed some after we last spoke, so make sure we * get rid of them now. */ - spin_lock(&imp->imp_lock); + spin_lock_irqsave(&imp->imp_lock, flags); ptlrpc_free_committed(imp); @@ -162,26 +166,20 @@ int ptlrpc_replay(struct obd_import *imp, int send_last_flag) list_for_each_safe(tmp, pos, &imp->imp_replay_list) { req = list_entry(tmp, struct ptlrpc_request, rq_list); - if (req->rq_transno == imp->imp_max_transno && - send_last_flag) { - req->rq_reqmsg->flags |= MSG_LAST_REPLAY; - DEBUG_REQ(D_HA, req, "LAST_REPLAY:"); - } else { - DEBUG_REQ(D_HA, req, "REPLAY:"); - } + DEBUG_REQ(D_HA, req, "REPLAY:"); + /* XXX locking WRT failure during replay? */ rc = ptlrpc_replay_req(req); - req->rq_reqmsg->flags &= ~MSG_LAST_REPLAY; if (rc) { - CERROR("recovery replay error %d for req %Ld\n", + CERROR("recovery replay error %d for req "LPD64"\n", rc, req->rq_xid); GOTO(out, rc); } } out: - spin_unlock(&imp->imp_lock); + spin_unlock_irqrestore(&imp->imp_lock, flags); return rc; } @@ -192,7 +190,7 @@ int ptlrpc_replay(struct obd_import *imp, int send_last_flag) static int resend_type(struct ptlrpc_request *req, __u64 committed) { - if (req->rq_transno < committed) { + if (req->rq_transno && req->rq_transno < committed) { if (req->rq_flags & PTL_RPC_FL_REPLIED) { /* Saw the reply and it was committed, no biggie. 
*/ DEBUG_REQ(D_HA, req, "NO_RESEND"); @@ -217,11 +215,12 @@ int ptlrpc_resend(struct obd_import *imp) int rc = 0; struct list_head *tmp, *pos; struct ptlrpc_request *req; + unsigned long flags; __u64 committed = imp->imp_peer_committed_transno; ENTRY; - spin_lock(&imp->imp_lock); + spin_lock_irqsave(&imp->imp_lock, flags); list_for_each(tmp, &imp->imp_sending_list) { req = list_entry(tmp, struct ptlrpc_request, rq_list); DEBUG_REQ(D_HA, req, "SENDING: "); @@ -259,19 +258,21 @@ int ptlrpc_resend(struct obd_import *imp) } } + spin_unlock_irqrestore(&imp->imp_lock, flags); RETURN(rc); } void ptlrpc_wake_delayed(struct obd_import *imp) { + unsigned long flags; struct list_head *tmp, *pos; struct ptlrpc_request *req; - spin_lock(&imp->imp_lock); + spin_lock_irqsave(&imp->imp_lock, flags); list_for_each_safe(tmp, pos, &imp->imp_delayed_list) { req = list_entry(tmp, struct ptlrpc_request, rq_list); DEBUG_REQ(D_HA, req, "waking:"); wake_up(&req->rq_wait_for_rep); } - spin_unlock(&imp->imp_lock); + spin_unlock_irqrestore(&imp->imp_lock, flags); } diff --git a/lustre/ptlrpc/rpc.c b/lustre/ptlrpc/rpc.c index 1384b5d..200c029 100644 --- a/lustre/ptlrpc/rpc.c +++ b/lustre/ptlrpc/rpc.c @@ -46,16 +46,10 @@ int connmgr_setup(struct obd_device *obddev, obd_count len, void *buf) int err; ENTRY; - MOD_INC_USE_COUNT; memset(recovd, 0, sizeof(*recovd)); err = recovd_setup(recovd); - if (err) { - MOD_DEC_USE_COUNT; - RETURN(err); - } - - RETURN(0); + RETURN(err); } int connmgr_cleanup(struct obd_device *dev) @@ -64,15 +58,11 @@ int connmgr_cleanup(struct obd_device *dev) int err; err = recovd_cleanup(recovd); - if (err) - LBUG(); - - MOD_DEC_USE_COUNT; - RETURN(0); + RETURN(err); } -int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len, void *karg, - void *uarg) +int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len, + void *karg, void *uarg) { struct ptlrpc_connection *conn = NULL; struct obd_device *obd = class_conn2obd(hdl); @@ -85,7 +75,7 @@ 
int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len, void if (cmd != OBD_IOC_RECOVD_NEWCONN && cmd != OBD_IOC_RECOVD_FAILCONN) RETURN(-EINVAL); /* XXX ENOSYS? */ - + /* Find the connection that's been rebuilt or has failed. */ spin_lock(&recovd->recovd_lock); list_for_each(tmp, &recovd->recovd_troubled_items) { @@ -106,9 +96,9 @@ int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len, void list_for_each(tmp, &recovd->recovd_managed_items) { conn = list_entry(tmp, struct ptlrpc_connection, c_recovd_data.rd_managed_chain); - + LASSERT(conn->c_recovd_data.rd_recovd == recovd); - + if (!strcmp(conn->c_remote_uuid, data->ioc_inlbuf1)) break; conn = NULL; @@ -152,7 +142,7 @@ int connmgr_iocontrol(unsigned int cmd, struct lustre_handle *hdl, int len, void } ptlrpc_readdress_connection(conn, conn->c_remote_uuid); spin_unlock(&conn->c_lock); - + conn->c_recovd_data.rd_phase = RD_PREPARED; wake_up(&recovd->recovd_waitq); out: @@ -176,27 +166,29 @@ int conmgr_detach(struct obd_device *dev) { return lprocfs_dereg_obd(dev); } + /* use obd ops to offer management infrastructure */ static struct obd_ops recovd_obd_ops = { - o_attach: connmgr_attach, - o_detach: conmgr_detach, - o_setup: connmgr_setup, - o_cleanup: connmgr_cleanup, - o_iocontrol: connmgr_iocontrol, - o_connect: connmgr_connect, - o_disconnect: class_disconnect + o_owner: THIS_MODULE, + o_attach: connmgr_attach, + o_detach: conmgr_detach, + o_setup: connmgr_setup, + o_cleanup: connmgr_cleanup, + o_iocontrol: connmgr_iocontrol, + o_connect: connmgr_connect, + o_disconnect: class_disconnect }; static int __init ptlrpc_init(void) { - int rc; + int rc; rc = ptlrpc_init_portals(); - if (rc) + if (rc) RETURN(rc); ptlrpc_init_connection(); rc = class_register_type(&recovd_obd_ops, status_class_var, LUSTRE_HA_NAME); - if (rc) + if (rc) RETURN(rc); ptlrpc_put_connection_superhack = ptlrpc_put_connection; return 0; diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 
d497668..c20fc48 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -128,7 +128,7 @@ ptlrpc_init_svc(__u32 nevents, __u32 nbufs, ptlrpc_link_svc_me(rqbd); } - CDEBUG(D_NET, "Starting service listening on portal %d (eq: %p)\n", + CDEBUG(D_NET, "Starting service listening on portal %d (eq: %lu)\n", service->srv_req_portal, service->srv_eq_h.handle_idx); RETURN(service); @@ -171,8 +171,7 @@ static int handle_incoming_request(struct obd_device *obddev, goto out; } - CDEBUG(D_RPCTRACE, "Handling RPC pid:xid:nid:opc %d:" - LPX64":%x:%d\n", + CDEBUG(D_RPCTRACE, "Handling RPC pid:xid:nid:opc %d:"LPX64":"LPX64":%d\n", NTOH__u32(request->rq_reqmsg->status), request->rq_xid, event->initiator.nid, @@ -254,7 +253,7 @@ static int ptlrpc_main(void *arg) struct ptlrpc_request *request; ptl_event_t *event; int rc = 0; - + unsigned long flags; ENTRY; lock_kernel(); @@ -264,10 +263,10 @@ static int ptlrpc_main(void *arg) sigfillset(¤t->blocked); recalc_sigpending(); #else - spin_lock_irq(¤t->sigmask_lock); + spin_lock_irqsave(¤t->sigmask_lock, flags); sigfillset(¤t->blocked); recalc_sigpending(current); - spin_unlock_irq(¤t->sigmask_lock); + spin_unlock_irqrestore(¤t->sigmask_lock, flags); #endif #ifdef __arch_um__ @@ -383,6 +382,9 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc, list_add(&thread->t_link, &svc->srv_threads); spin_unlock(&svc->srv_lock); + /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we + * just drop the VM and FILES in ptlrpc_daemonize() right away. 
+ */ rc = kernel_thread(ptlrpc_main, (void *) &d, CLONE_VM | CLONE_FILES); if (rc < 0) { CERROR("cannot start thread\n"); diff --git a/lustre/tests/.cvsignore b/lustre/tests/.cvsignore index a6d15c2d..b9e1962 100644 --- a/lustre/tests/.cvsignore +++ b/lustre/tests/.cvsignore @@ -22,6 +22,7 @@ newfile openclose createdestroy createmany +statmany mkdirmany lovstripe *.xml @@ -29,3 +30,4 @@ stat setuid multifstat checkstat +wantedi diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index 12b7d52..aa00642 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -12,7 +12,7 @@ EXTRA_DIST = $(pkgexample_SCRIPTS) $(noinst_SCRIPTS) $(noinst_DATA) \ rundbench \ elan-client.cfg mds.cfg trivial.sh pkgexampledir = '${exec_prefix}/usr/lib/$(PACKAGE)/examples' -pkgexample_SCRIPTS = llmount.sh llmountcleanup.sh llecho.sh local.sh uml.sh lov.sh +pkgexample_SCRIPTS = llmount.sh llmountcleanup.sh llecho.sh llechocleanup.sh local.sh echo.sh uml.sh lov.sh noinst_SCRIPTS = llsetup.sh llrsetup.sh llcleanup.sh noinst_DATA = lustre.cfg noinst_SCRIPTS += fs.sh intent-test.sh intent-test2.sh leak_finder.pl \ @@ -24,9 +24,9 @@ noinst_SCRIPTS += fs.sh intent-test.sh intent-test2.sh leak_finder.pl \ runtests runvmstat snaprun.sh tbox.sh common.sh noinst_PROGRAMS = openunlink testreq truncate directio openme writeme mcreate noinst_PROGRAMS += munlink tchmod toexcl fsx test_brw openclose createdestroy -noinst_PROGRAMS += lovstripe stat createmany mkdirmany multifstat +noinst_PROGRAMS += stat createmany statmany mkdirmany multifstat # noinst_PROGRAMS += ldaptest -noinst_PROGRAMS += checkstat +noinst_PROGRAMS += checkstat wantedi # ldaptest_SOURCES = ldaptest.c tchmod_SOURCES = tchmod.c @@ -43,11 +43,12 @@ fsx_SOURCES = fsx.c test_brw_SOURCES = test_brw.c openclose_SOURCES = openclose.c createdestroy_SOURCES = createdestroy.c -lovstripe_SOURCES = lovstripe.c stat_SOURCES = stat.c createmany_SOURCES = createmany.c +statmany_SOURCES = statmany.c mkdirmany_SOURCES = 
mkdirmany.c multifstat_SOURCES = multifstat.c checkstat_SOURCES = checkstat.c +wantedi_SOURCES = wantedi.c include $(top_srcdir)/Rules diff --git a/lustre/tests/createmany.c b/lustre/tests/createmany.c index 77015a6..c56eda8 100644 --- a/lustre/tests/createmany.c +++ b/lustre/tests/createmany.c @@ -10,31 +10,71 @@ int main(int argc, char ** argv) { - int i, rc, count; + int i, rc = 0, do_open; char filename[4096]; + long int start, last, end, count; - if (argc < 3) { - printf("Usage %s filenamebase count\n", argv[0]); + if (argc != 4) { + printf("Usage %s <-o|-m> filenamebase \n", + argv[0]); return 1; } - if (strlen(argv[1]) > 4080) { + if (strcmp(argv[1], "-o") == 0) { + do_open = 1; + } else if (strcmp(argv[1], "-m") == 0) { + do_open = 0; + } else { + printf("Usage %s {-o|-m} filenamebase \n", + argv[0]); + return 1; + } + + if (strlen(argv[2]) > 4080) { printf("name too long\n"); return 1; } - count = strtoul(argv[2], NULL, 0); + start = last = time(0); + + end = strtol(argv[3], NULL, 0); - for (i=0 ; i < count ; i++) { - sprintf(filename, "%s-%d", argv[1], i); - rc = mknod(filename, S_IFREG| 0444, 0); - if (rc) { - printf("mknod(%s) error: %s\n", - filename, strerror(errno)); - break; + if (end > 0) { + count = end; + end = -1UL >> 1; + } else { + end = start - end; + count = -1UL >> 1; + } + + for (i = 0; i < count && time(0) < end; i++) { + sprintf(filename, "%s%d", argv[2], i); + if (do_open) { + int fd = open(filename, O_CREAT|O_RDWR, 0644); + if (fd < 0) { + printf("open(%s) error: %s\n", filename, + strerror(errno)); + rc = errno; + break; + } + close(fd); + } else { + rc = mknod(filename, S_IFREG| 0444, 0); + if (rc) { + printf("mknod(%s) error: %s\n", + filename, strerror(errno)); + rc = errno; + break; + } + } + if ((i % 10000) == 0) { + printf(" - created %d (time %ld ; total %ld ; last %ld)\n", + i, time(0), time(0) - start, time(0) - last); + last = time(0); } - if ((i % 10000) == 0) - printf(" - created %d (time %ld)\n", i, time(0)); } + 
printf("total: %d creates in %ld seconds: %f creates/second\n", i, + time(0) - start, ((float)i / (time(0) - start))); + return rc; } diff --git a/lustre/tests/echo.sh b/lustre/tests/echo.sh new file mode 100755 index 0000000..f30f056 --- /dev/null +++ b/lustre/tests/echo.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +config=${1:-$(basename $0 .sh).xml} +LMC=${LMC:-../utils/lmc -m $config} + +SERVER=localhost +CLIENT=localhost + +# FIXME: make LMC not require MDS for obdecho LOV +MDSDEV=$TMP/mds1 +MDSSIZE=10000 + +STRIPE_BYTES=65536 +STRIPES_PER_OBJ=2 # 0 means stripe over all OSTs + +LOV=0 +while [ "$1" ]; do + case $1 in + --lov) LOV="1" ;; + *) OPTS="$OPTS $1" ;; + esac + shift +done + +rm -f $config +# create nodes +$LMC --add node --node $SERVER || exit 1 +$LMC --add net --node $SERVER --nid $SERVER --nettype tcp || exit 2 + +if (($LOV)); then + $LMC --add mds --node $SERVER --mds mds1 --dev $MDSDEV --size $MDSSIZE || exit 10 + $LMC --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 || exit 11 + $LMC --add ost --node $SERVER --lov lov1 --obdtype=obdecho || exit 12 + $LMC --add ost --node $SERVER --lov lov1 --obdtype=obdecho || exit 13 + OBD_NAME=lov1 +else + $LMC --add ost --obd obd1 --node $SERVER --obdtype=obdecho || exit 2 + OBD_NAME=obd1 +fi + +if [ "$SERVER" != "$CLIENT" ]; then + $LMC --add node --node $CLIENT || exit 1 + $LMC --add net --node $CLIENT --nid $CLIENT --nettype tcp || exit 2 +fi + +$LMC --add echo_client --node $CLIENT --obd ${OBD_NAME} || exit 3 + diff --git a/lustre/tests/llecho.sh b/lustre/tests/llecho.sh index e99289c..c077223 100644 --- a/lustre/tests/llecho.sh +++ b/lustre/tests/llecho.sh @@ -1,50 +1,12 @@ #!/bin/sh -config=echo.xml LCONF=${LCONF:-../utils/lconf} -LMC=${LMC:-../utils/lmc} +NAME=${NAME:-echo} -SERVER=localhost -CLIENT=cfs4 +config=$NAME.xml +mkconfig=./$NAME.sh -# FIXME: make LMC not require MDS for obdecho LOV -MDSDEV=$TMP/mds1 -MDSSIZE=10000 - -STRIPE_BYTES=65536 
-STRIPES_PER_OBJ=2 # 0 means stripe over all OSTs - -LOV=0 -while [ "$1" ]; do - case $1 in - --lov) LOV="1" ;; - *) OPTS="$OPTS $1" ;; - esac - shift -done - -rm -f $config -# create nodes -$LMC -o $config --add node --node $SERVER || exit 1 -$LMC -m $config --add net --node $SERVER --nid $SERVER --nettype tcp || exit 2 - -if (($LOV)); then - $LMC -m $config --add mds --node $SERVER --mds mds1 --dev $MDSDEV --size $MDSSIZE || exit 10 - $LMC -m $config --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 || exit 11 - $LMC -m $config --add ost --node $SERVER --lov lov1 --obdtype=obdecho || exit 12 - $LMC -m $config --add ost --node $SERVER --lov lov1 --obdtype=obdecho || exit 13 - OBD_NAME=lov1 -else - $LMC -m $config --add ost --obd obd1 --node $SERVER --obdtype=obdecho || exit 2 - OBD_NAME=obd1 -fi - -if [ "$SERVER" != "$CLIENT" ]; then - $LMC -m $config --add node --node $CLIENT || exit 1 - $LMC -m $config --add net --node $CLIENT --nid $CLIENT --nettype tcp || exit 2 -fi - -$LMC -m $config --add echo_client --node $CLIENT --obd ${OBD_NAME} || exit 3 +sh $mkconfig $config || exit 1 $LCONF --reformat --gdb $OPTS $config || exit 4 diff --git a/lustre/tests/llechocleanup.sh b/lustre/tests/llechocleanup.sh index de4b35d..2d63fa9 100755 --- a/lustre/tests/llechocleanup.sh +++ b/lustre/tests/llechocleanup.sh @@ -1,10 +1,15 @@ #!/bin/sh LCONF=../utils/lconf +NAME=${NAME:-echo} +TMP=${TMP:-/tmp} -if [ -f echo.xml ]; then - ${LCONF} --cleanup echo.xml -else - echo "no echo.xml found" +config=$NAME.xml +mkconfig=./$NAME.sh + +if [ ! -f $config ]; then + sh $mkconfig $config || exit 1 fi +${LCONF} --cleanup echo.xml + diff --git a/lustre/tests/llmount.sh b/lustre/tests/llmount.sh index eb4618b..efc7c0c 100755 --- a/lustre/tests/llmount.sh +++ b/lustre/tests/llmount.sh @@ -7,8 +7,6 @@ NAME=${NAME:-local} config=$NAME.xml mkconfig=./$NAME.sh -if [ ! 
-f $config -o $mkconfig -nt $config ]; then - sh $mkconfig $config || exit 1 -fi +sh $mkconfig $config || exit 1 ${LCONF} --reformat --gdb $config || exit 2 diff --git a/lustre/tests/llmountcleanup.sh b/lustre/tests/llmountcleanup.sh index 82f2a17..b8b99d9 100755 --- a/lustre/tests/llmountcleanup.sh +++ b/lustre/tests/llmountcleanup.sh @@ -13,14 +13,17 @@ fi sync; sleep 2; sync ${LCONF} --cleanup --dump $TMP/debug $config -LEAK=`dmesg | grep -v " 0 bytes" | grep leaked` -if [ "$LEAK" ]; then - echo "$LEAK" 1>&2 - mv $TMP/debug $TMP/debug.`date +%s` - #exit -1 -fi BUSY=`dmesg | grep -i destruct` if [ "$BUSY" ]; then echo "$BUSY" 1>&2 - #exit -2 + mv $TMP/debug $TMP/debug-busy.`date +%s` + exit -1 +fi +LEAK_LUSTRE=`dmesg | tail -20 | grep -v "leaked: 0" | grep leaked` +LEAK_PORTALS=`dmesg | tail -20 | grep "Portals memory leaked"` +if [ "$LEAK_LUSTRE" -o "$LEAK_PORTALS" ]; then + echo "$LEAK_LUSTRE" 1>&2 + echo "$LEAK_PORTALS" 1>&2 + mv $TMP/debug $TMP/debug-leak.`date +%s` + exit -2 fi diff --git a/lustre/tests/local.sh b/lustre/tests/local.sh index f680f4b..d892b58 100755 --- a/lustre/tests/local.sh +++ b/lustre/tests/local.sh @@ -5,11 +5,11 @@ config=${1:-local.xml} LMC="${LMC:-../utils/lmc} -m $config" TMP=${TMP:-/tmp} -MDSDEV=$TMP/mds1 -MDSSIZE=50000 +MDSDEV=${MDSDEV:-$TMP/mds1} +MDSSIZE=${MDSSIZE:-50000} -OSTDEV=$TMP/ost1 -OSTSIZE=200000 +OSTDEV=${OSTDEV:-$TMP/ost1} +OSTSIZE=${OSTSIZE:-200000} kver=`uname -r | cut -d "." 
-f 1,2` diff --git a/lustre/tests/lov.sh b/lustre/tests/lov.sh index 54d4c66..c0b2839 100755 --- a/lustre/tests/lov.sh +++ b/lustre/tests/lov.sh @@ -5,13 +5,13 @@ config=${1:-lov.xml} LMC=${LMC:-../utils/lmc} TMP=${TMP:-/tmp} -MDSDEV=$TMP/mds1 -MDSSIZE=50000 +MDSDEV=${MDSDEV:-$TMP/mds1} +MDSSIZE=${MDSSIZE:-50000} -OSTDEV1=$TMP/ost1 -OSTDEV2=$TMP/ost2 -OSTDEV3=$TMP/ost3 -OSTSIZE=100000 +OSTDEV1=${OSTDEV1:-$TMP/ost1} +OSTDEV2=${OSTDEV2:-$TMP/ost2} +OSTDEV3=${OSTDEV3:-$TMP/ost3} +OSTSIZE=${OSTSIZE:-100000} STRIPE_BYTES=65536 STRIPES_PER_OBJ=2 # 0 means stripe over all OSTs diff --git a/lustre/tests/lovstripe.c b/lustre/tests/lovstripe.c deleted file mode 100644 index 29769f1..0000000 --- a/lustre/tests/lovstripe.c +++ /dev/null @@ -1,164 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include - - -/****************** Custom includes ********************/ -#include -#include - - -/****************** Functions ******************/ -int write_file(char *name, struct lov_mds_md *striping, int bufsize, - char *buf1, char *buf2); - - -/************************ Main **********************/ - -#define STRIPE_SIZE 128 * 1024 - -int main(int argc, char *argv[]) -{ - struct lov_mds_md a_striping; - long bufsize = sizeof(long) * STRIPE_SIZE; - char *rbuf, *wbuf; - int data, *dp; - int result; - - rbuf = malloc(bufsize); - wbuf = malloc(bufsize); - if (!rbuf || !wbuf) { - fprintf(stderr, "%s: unable to allocate buffers\n", argv[0]); - return 1; - } - - /* Initialize to an easily-verified pattern */ - for (data = 0, dp = (int *)wbuf; data < STRIPE_SIZE; data++, dp++) - *dp = data; - - /* Init defaults on striping info */ - a_striping.lmm_magic = LOV_MAGIC; - a_striping.lmm_stripe_size = STRIPE_SIZE; - a_striping.lmm_stripe_pattern = 0; - - /* Write file for OST1 only */ - /* Start at OST 0, and use only 1 OST */ - a_striping.lmm_stripe_offset = 0; - a_striping.lmm_stripe_count = 1; - - result = write_file("/mnt/lustre/ost1", &a_striping, bufsize, - wbuf, rbuf); 
- - if (result < 0) - goto out; - - /* Write file for OST2 only */ - /* Start at OST 1, and use only 1 OST */ - a_striping.lmm_stripe_offset = 1; - a_striping.lmm_stripe_count = 1; - - result = write_file("/mnt/lustre/ost2", &a_striping, bufsize, - wbuf, rbuf); - - if (result < 0) - goto out; - - /* Write file across both OST1 and OST2 */ - /* Start at OST 0, and use only 2 OSTs */ - a_striping.lmm_stripe_offset = 0; - a_striping.lmm_stripe_count = 2; - - result = write_file("/mnt/lustre/ost1and2", &a_striping, bufsize, - wbuf, rbuf); - - if (result < 0) - goto out; - -out: - free(rbuf); - free(wbuf); - return result; -} - - -int write_file(char *name, struct lov_mds_md *striping, int bufsize, - char *wbuf, char *rbuf) -{ - int fd, result; - - printf("opening %s\n", name); - fd = open(name, O_CREAT | O_RDWR | O_LOV_DELAY_CREATE, 0644); - if (fd < 0) { - fprintf(stderr, "\nUnable to open '%s': %s\n", - name, strerror(errno)); - return -errno; - } - - printf("setting stripe data on %s\n", name); - result = ioctl(fd, LL_IOC_LOV_SETSTRIPE, striping); - if (result < 0) { - fprintf(stderr, "\nError on ioctl for '%s' (%d): %s\n", - name, fd, strerror(errno)); - close(fd); - return -errno; - } - - /* Write bogus data */ - printf("writing data to %s\n", name); - result = write(fd, wbuf, bufsize); - if (result < 0) { - fprintf(stderr, "\nerror: writing data to '%s' (%d): %s\n", - name, fd, strerror(errno)); - close(fd); - return -errno; - } - - if (result != bufsize) { - fprintf(stderr, "\nerror: short write to '%s' (%d): %d != %d\n", - name, fd, result, bufsize); - close(fd); - return -1; - } - - /* Seek to beginning again */ - printf("seeking in %s\n", name); - result = lseek(fd, 0, SEEK_SET); - if (result < 0) { - fprintf(stderr, "\nerror: seeking to beginning '%s' (%d): %s\n", - name, fd, strerror(errno)); - close(fd); - return -errno; - } - - /* Read bogus data back */ - printf("reading data from %s\n", name); - result = read(fd, rbuf, bufsize); - if (result < 0) { - 
fprintf(stderr, "\nerror: reading data from '%s' (%d): %s\n", - name, fd, strerror(errno)); - close(fd); - return -errno; - } - - if (result != bufsize) { - fprintf(stderr,"\nerror: short read from '%s' (%d): %d != %d\n", - name, fd, result, bufsize); - close(fd); - return -1; - } - - if (memcmp(wbuf, rbuf, bufsize)) { - fprintf(stderr, "\nerror: comparing data in '%s' (%d): %s\n", - name, fd, strerror(errno)); - close(fd); - return -1; - } - - close(fd); - - return 0; -} diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 324b161..258598b 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -441,7 +441,40 @@ pass $CLEAN $START -echo '== cleanup =========================================' +echo '== stripe sanity ================================= test27' +echo "--test 26.1 create one stripe" +mkdir $MOUNT/d27 +../utils/lstripe $MOUNT/d27/f0 4096 0 1 +$CHECKSTAT -t file $MOUNT/d27/f0 +echo "--test 26.2 write to one stripe file" +cp /etc/hosts $MOUNT/d27/f0 +pass +$CLEAN +$START + +echo "--test 26.3 create two stripes" +../utils/lstripe $MOUNT/d27/f01 4096 0 2 +echo "--test 26.4 write to two stripe file" +cp /etc/hosts $MOUNT/d27/f01 +pass +$CLEAN +$START + +echo "--test 26.5 lstripe existing file (should return error)" +../utils/lstripe $MOUNT/d27/f12 4096 1 2 +! 
../utils/lstripe $MOUNT/d27/f12 4096 1 2 +pass +$CLEAN +$START + +echo "--test 26.6 lfind " +../utils/lfind $MOUNT/d27 +pass +$CLEAN +$START + + +echo '== cleanup =============================================' rm -r $MOUNT/[Rdfs][1-9]* echo '======================= finished =======================' diff --git a/lustre/tests/statmany.c b/lustre/tests/statmany.c new file mode 100644 index 0000000..f6370e3 --- /dev/null +++ b/lustre/tests/statmany.c @@ -0,0 +1,214 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#include +#endif +#include +#include + +struct option longopts[] = { + {"ea", 0, 0, 'e'}, + {"lookup", 0, 0, 'l'}, + {"random", 0, 0, 'r'}, + {"stat", 0, 0, 's'}, + {NULL, 0, 0, 0}, +}; +char *shortopts = "ehlr:s0123456789"; + +static int usage(char *prog, FILE *out) +{ + fprintf(out, + "Usage: %s [-r rand_seed] {-s|-e|-l} filenamebase total_files iterations\n" + "-r : random seed\n" + "-s : regular stat() calls\n" + "-e : open then GET_EA ioctl\n" + "-l : lookup ioctl only\n", prog); + exit(out == stderr); +} + +#ifndef LONG_MAX +#define LONG_MAX (1 << ((8 * sizeof(long)) - 1)) +#endif + +int main(int argc, char ** argv) +{ + long i, count, iter = LONG_MAX, mode = 0, offset; /* mode stays 0 until -e, -l or -s is seen */ + long int start, length = LONG_MAX, last, rc = 0; + char parent[4096], *t; + int c; char *prog = argv[0], *base; /* c is int: getopt_long() returns int, and a plain (possibly unsigned) char can never equal -1 */ + int seed = 0; + int fd = -1; + + while ((c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1) { + char *e; + switch (c) { + case 'r': + seed = strtoul(optarg, &e, 0); + if (*e) { + fprintf(stderr, "bad -r option %s\n", optarg); + usage(prog, stderr); + } + break; + case 'e': + case 'l': + case 's': + mode = c; + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (length == LONG_MAX) + length = c - '0'; + else + length = length * 10 + (c - '0'); + break; + case 'h': + usage(prog, stdout); + case '?': +
usage(prog, stderr); + } + } + + if (mode == 0 || optind + 2 + (length == LONG_MAX) != argc) { + fprintf(stderr, "missing -s|-e|-l, filenamebase, total_files, or iterations\n"); + usage(prog, stderr); + } + + base = argv[optind]; + if (strlen(base) > 4080) { + fprintf(stderr, "filenamebase too long\n"); + exit(1); + } + + if (seed == 0) { + int f = open("/dev/urandom", O_RDONLY); + + if (f < 0 || read(f, &seed, sizeof(seed)) != (ssize_t)sizeof(seed)) + seed = time(0); + if (f >= 0) + close(f); + } + + printf("using seed %u\n", seed); + srandom(seed); /* the loop below draws from random(), which srand() does not seed */ + + count = strtoul(argv[optind + 1], NULL, 0); + if (length == LONG_MAX) { + iter = strtoul(argv[optind + 2], NULL, 0); + printf("running for %lu iterations\n", iter); + } else + printf("running for %lu seconds\n", length); + + start = last = time(0); + + t = strrchr(base, '/'); + if (t == NULL) { + strcpy(parent, "."); + offset = -1; + } else { + snprintf(parent, sizeof(parent), "%.*s", (int)(t - base), base); /* unlike strncpy, always NUL-terminates */ + offset = t - base + 1; + } + + if (mode == 'l') { + fd = open(parent, O_RDONLY); + if (fd < 0) { + printf("open(%s) error: %s\n", parent, + strerror(errno)); + exit(errno); + } + } + + for (i = 0; i < iter && time(0) - start < length; i++) { + char filename[4096]; + int tmp; + + tmp = random() % count; + sprintf(filename, "%s%d", base, tmp); + + if (mode == 'e') { +#if 0 + fd = open(filename, O_RDWR|O_LARGEFILE); + if (fd < 0) { + printf("open(%s) error: %s\n", filename, + strerror(errno)); + break; + } + rc = ioctl(fd, EXTN_IOC_GETEA, NULL); + if (rc < 0) { + printf("ioctl(%s) error: %s\n", filename, + strerror(errno)); + break; + } + close(fd); + break; +#endif + } else if (mode == 's') { + struct stat buf; + + rc = stat(filename, &buf); + if (rc < 0) { + printf("stat(%s) error: %s\n", filename, + strerror(errno)); + break; + } + } else if (mode == 'l') { + struct obd_ioctl_data data; + char rawbuf[8192]; + char *buf = rawbuf; + int max = sizeof(rawbuf); + + memset(&data, 0, sizeof(data)); + data.ioc_version = OBD_IOCTL_VERSION; + data.ioc_len = sizeof(data); + if (offset
>= 0) + data.ioc_inlbuf1 = filename + offset; + else + data.ioc_inlbuf1 = filename; + data.ioc_inllen1 = strlen(data.ioc_inlbuf1) + 1; + + if (obd_ioctl_pack(&data, &buf, max)) { + printf("ioctl_pack failed.\n"); + break; + } + + rc = ioctl(fd, IOC_MDC_LOOKUP, buf); + if (rc < 0) { + printf("ioctl(%s) error: %s\n", filename, + strerror(errno)); + break; + } + } + if ((i % 10000) == 0) { + printf(" - stat %lu (time %ld ; total %ld ; last %ld)\n", + i, time(0), time(0) - start, time(0) - last); + last = time(0); + } + } + + if (mode == 'l') + close(fd); + + printf("total: %lu stats in %ld seconds: %f stats/second\n", i, + time(0) - start, ((float)i / (time(0) - start))); + + exit(rc); +} diff --git a/lustre/tests/uml.sh b/lustre/tests/uml.sh index a8a381b..112a796 100644 --- a/lustre/tests/uml.sh +++ b/lustre/tests/uml.sh @@ -21,28 +21,28 @@ OSTSIZE=100000 # Three separate systems MDSNODE=uml1 -OSTNODE=uml2 +OSTNODES="uml2 uml2" CLIENTS="uml3" # Single system with additional clients #MDSNODE=uml1 -#OSTNODE=uml1 +#OSTNODES="uml1 uml1" #CLIENTS="$MDSNODE client" # Two systems with client on MDS, and additional clients (set up OST first) #MDSNODE=uml1 -#OSTNODE=uml2 +#OSTNODES="uml2 uml2" #CLIENTS="$MDSNODE client" # Two systems with client on OST, and additional clients (set up MDS first) #MDSNODE=uml1 -#OSTNODE=uml2 -#CLIENTS="$OSTNODE client" +#OSTNODES="uml2 uml2" +#CLIENTS="$OSTNODES client" rm -f $config # create nodes -for NODE in $MDSNODE $OSTNODE $CLIENTS; do +for NODE in $MDSNODE $OSTNODES $CLIENTS; do eval [ \$$NODE ] && continue ${LMC} -m $config --add net --node $NODE --nid $NODE --nettype tcp || exit 1 eval "$NODE=done" @@ -53,11 +53,14 @@ ${LMC} -m $config --add mds --format --node $MDSNODE --mds mds1 --dev $MDSDEV -- # configure ost ${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz 65536 --stripe_cnt 0 --stripe_pattern 0 || exit 20 -${LMC} -m $config --add ost --node $OSTNODE --lov lov1 --dev $OSTDEV1 --size $OSTSIZE || exit 21 -${LMC} -m 
$config --add ost --node $OSTNODE --lov lov1 --dev $OSTDEV2 --size $OSTSIZE || exit 22 +COUNT=1 +for NODE in $OSTNODES; do + eval OSTDEV=\$OSTDEV$COUNT + ${LMC} -m $config --add ost --node $NODE --lov lov1 --dev $OSTDEV --size $OSTSIZE || exit 21 + COUNT=`expr $COUNT + 1` +done # create client config(s) for NODE in $CLIENTS; do ${LMC} -m $config --add mtpt --node $NODE --path /mnt/lustre --mds mds1 --lov lov1 || exit 30 done - diff --git a/lustre/tests/wantedi.c b/lustre/tests/wantedi.c new file mode 100644 index 0000000..426602f --- /dev/null +++ b/lustre/tests/wantedi.c @@ -0,0 +1,48 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int usage(char *prog, FILE *out) +{ + fprintf(out, + "Usage: %s \n", prog); + exit(out == stderr); +} + +#define EXTN_IOC_CREATE_INUM _IOW('f', 5, long) + +int main(int argc, char ** argv) +{ + int dirfd, wantedi, rc; + + if (argc != 3) /* both the directory and the inode number are required: argv[2] is read below */ + usage(argv[0], stderr); + + dirfd = open(argv[1], O_RDONLY); + if (dirfd < 0) { + perror("open"); + exit(1); + } + + wantedi = atoi(argv[2]); + printf("Creating %s/%d with ino %d\n", argv[1], wantedi, wantedi); + + rc = ioctl(dirfd, EXTN_IOC_CREATE_INUM, wantedi); + if (rc < 0) { + perror("ioctl(EXTN_IOC_CREATE_INUM)"); + exit(2); + } + + return 0; +} diff --git a/lustre/utils/automatic-reconnect-sample b/lustre/utils/automatic-reconnect-sample new file mode 100755 index 0000000..bf9ecc4 --- /dev/null +++ b/lustre/utils/automatic-reconnect-sample @@ -0,0 +1,34 @@ +#!/bin/sh + +if [ -z "$1" ]; then + echo "No UUID given to Lustre upcall!" | wall + exit 1 +fi + +# FIXME: OSTHOST can't be hard-coded! +OST=$1 +OSTHOST=dev7 +LUSTRE=/home/pschwan/lustre/lustre + +while ( ! ping -c 1 -w 3 $OSTHOST ) ; do + sleep 2 +done; + +echo -n "OST $OSTHOST UUID $OST responding to pings : " +date + +$LUSTRE/utils/lctl < URL to fetch a config file +--ldapurl LDAP server URL, eg. 
ldap://localhost +--config Cluster config name used for LDAP query --node Load config for +--select service=nodeA,service2=nodeB U -d | --cleanup Cleans up config. (Shutdown) -f | --force Forced unmounting and/or obd detach during cleanup -v | --verbose Print system commands as they are run @@ -73,7 +75,7 @@ config.xml Lustre configuration in xml format. 30 - obd, mdd 40 - mds, ost 50 - mdc, osc - 60 - lov, lovconfig + 60 - lov 70 - mountpoint, echo_client --lustre=src_dir Base directory of lustre sources. This parameter will cause lconf to load modules from a source tree. @@ -112,8 +114,11 @@ class Config: self._portals_dir = '' self._minlevel = 0 self._maxlevel = 100 - self._timeout = -1 + self._timeout = 0 self._recovery_upcall = '' + self._ldapurl = '' + self._config_name = '' + self._select = {} def verbose(self, flag = None): if flag: self._verbose = flag @@ -151,10 +156,6 @@ class Config: if val: self._node = val return self._node - def url(self, val = None): - if val: self._url = val - return self._url - def gdb_script(self): if os.path.isdir('/r'): return '/r' + self._gdb_script @@ -170,7 +171,6 @@ class Config: def dump_file(self, val = None): if val: self._dump_file = val return self._dump_file - def minlevel(self, val = None): if val: self._minlevel = int(val) return self._minlevel @@ -195,6 +195,27 @@ class Config: if val: self._recovery_upcall = val return self._recovery_upcall + def ldapurl(self, val = None): + if val: self._ldapurl = val + return self._ldapurl + + def config_name(self, val = None): + if val: self._config_name = val + return self._config_name + + def init_select(self, arg): + # arg = "service=nodeA,service2=nodeB" + list = string.split(arg, ',') + for entry in list: + srv, node = string.split(entry, '=') + self._select[srv] = node + + def select(self, srv): + if self._select.has_key(srv): + return self._select[srv] + return None + + config = Config() # ============================================================ @@ -272,6 +293,10 @@ 
class LCTLInterface: else: raise CommandError('lctl', "unable to find lctl binary.") + def set_nonblock(self, fd): + fl = fcntl.fcntl(fd, FCNTL.F_GETFL) + fcntl.fcntl(fd, FCNTL.F_SETFL, fl | os.O_NDELAY) + def run(self, cmds): """ run lctl @@ -283,19 +308,42 @@ class LCTLInterface: """ debug("+", self.lctl, cmds) if config.noexec(): return (0, []) - p = popen2.Popen3(self.lctl, 1) - p.tochild.write(cmds + "\n") - p.tochild.close() - out = p.fromchild.readlines() - err = p.childerr.readlines() - ret = p.wait() + + child = popen2.Popen3(self.lctl, 1) # Capture stdout and stderr from command + child.tochild.write(cmds + "\n") + child.tochild.close() + + # From "Python Cookbook" from O'Reilly + outfile = child.fromchild + outfd = outfile.fileno() + self.set_nonblock(outfd) + errfile = child.childerr + errfd = errfile.fileno() + self.set_nonblock(errfd) + + outdata = errdata = '' + outeof = erreof = 0 + while 1: + ready = select.select([outfd,errfd],[],[]) # Wait for input + if outfd in ready[0]: + outchunk = outfile.read() + if outchunk == '': outeof = 1 + outdata = outdata + outchunk + if errfd in ready[0]: + errchunk = errfile.read() + if errchunk == '': erreof = 1 + errdata = errdata + errchunk + if outeof and erreof: break + # end of "borrowed" code + + ret = child.wait() if os.WIFEXITED(ret): rc = os.WEXITSTATUS(ret) else: rc = 0 - if rc or len(err): - raise CommandError(self.lctl, err, rc) - return rc, out + if rc or len(errdata): + raise CommandError(self.lctl, errdata, rc) + return rc, outdata def runcmd(self, *args): """ @@ -587,8 +635,12 @@ def init_loop(file, size, fstype): return dev if config.reformat() or not os.access(file, os.R_OK | os.W_OK): if size < 8000: - error(file, "size must be larger than 8MB") - run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file)) + panic(file, "size must be larger than 8MB, currently set to:", size) + (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, + file)) + if ret: + panic("Unable to 
create backing store:", file) + loop = loop_base() # find next free loop for n in xrange(0, MAX_LOOP_DEVICES): @@ -707,11 +759,11 @@ class Module: """ Base class for the rest of the modules. The default cleanup method is defined here, as well as some utilitiy funcs. """ - def __init__(self, module_name, dom_node): - self.dom_node = dom_node + def __init__(self, module_name, db): + self.db = db self.module_name = module_name - self.name = get_attr(dom_node, 'name') - self.uuid = get_attr(dom_node, 'uuid') + self.name = self.db.getName() + self.uuid = self.db.getUUID() self.kmodule_list = [] self._server = None self._connected = 0 @@ -720,10 +772,9 @@ class Module: msg = string.join(map(str,args)) print self.module_name + ":", self.name, self.uuid, msg - def lookup_server(self, srv_uuid): """ Lookup a server's network information """ - net = get_ost_net(self.dom_node.parentNode, srv_uuid) + net = self.db.get_ost_net(srv_uuid) if not net: panic ("Unable to find a server for:", srv_uuid) self._server = Network(net) @@ -806,13 +857,13 @@ class Module: class Network(Module): - def __init__(self,dom_node): - Module.__init__(self, 'NETWORK', dom_node) - self.net_type = get_attr(dom_node,'type') - self.nid = get_text(dom_node, 'server', '*') - self.port = get_text_int(dom_node, 'port', 0) - self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF) - self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF) + def __init__(self,db): + Module.__init__(self, 'NETWORK', db) + self.net_type = self.db.get_val('nettype') + self.nid = self.db.get_val('nid', '*') + self.port = self.db.get_val_int('port', 0) + self.send_mem = self.db.get_val_int('send_mem', DEFAULT_TCPBUF) + self.recv_mem = self.db.get_val_int('recv_mem', DEFAULT_TCPBUF) if '*' in self.nid: self.nid = get_local_address(self.net_type, self.nid) if not self.nid: @@ -842,20 +893,15 @@ class Network(Module): ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, nal_id, self.port) if ret: 
raise CommandError(TCP_ACCEPTOR, out, ret) - ret = self.dom_node.getElementsByTagName('route_tbl') - for a in ret: - for r in a.getElementsByTagName('route'): - net_type = get_attr(r, 'type') - gw = get_attr(r, 'gw') - lo = get_attr(r, 'lo') - hi = get_attr(r,'hi', '') - lctl.add_route(net_type, gw, lo, hi) - if net_type in ('tcp', 'toe') and net_type == self.net_type and hi == '': - srv = nid2server(self.dom_node.parentNode.parentNode, lo) - if not srv: - panic("no server for nid", lo) - else: - lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem) + for net_type, gw, lo, hi in self.db.get_route_tbl(): + lctl.add_route(net_type, gw, lo, hi) + if net_type in ('tcp', 'toe') and net_type == self.net_type and hi == '': + srvdb = self.db.nid2server(lo) + if not srv: + panic("no server for nid", lo) + else: + srv = Network(srvdb) + lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem) lctl.network(self.net_type, self.nid) @@ -863,28 +909,25 @@ class Network(Module): def cleanup(self): self.info(self.net_type, self.nid, self.port) - ret = self.dom_node.getElementsByTagName('route_tbl') - for a in ret: - for r in a.getElementsByTagName('route'): - lo = get_attr(r, 'lo') - hi = get_attr(r,'hi', '') - if self.net_type in ('tcp', 'toe') and hi == '': - srv = nid2server(self.dom_node.parentNode.parentNode, lo) - if not srv: - panic("no server for nid", lo) - else: - try: - lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid) - except CommandError, e: - print "disconnect failed: ", self.name - e.dump() - cleanup_error(e.rc) - try: - lctl.del_route(self.net_type, self.nid, lo, hi) - except CommandError, e: - print "del_route failed: ", self.name - e.dump() - cleanup_error(e.rc) + for net_type, gw, lo, hi in self.db.get_route_tbl(): + if self.net_type in ('tcp', 'toe') and hi == '': + srvdb = self.db.nid2server(lo) + if not srv: + panic("no server for nid", lo) + else: + srv = Network(srvdb) + try: + 
lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid) + except CommandError, e: + print "disconnect failed: ", self.name + e.dump() + cleanup_error(e.rc) + try: + lctl.del_route(self.net_type, self.nid, lo, hi) + except CommandError, e: + print "del_route failed: ", self.name + e.dump() + cleanup_error(e.rc) try: lctl.cleanup("RPCDEV", "RPCDEV_UUID") @@ -903,8 +946,8 @@ class Network(Module): run("killall acceptor") class LDLM(Module): - def __init__(self,dom_node): - Module.__init__(self, 'LDLM', dom_node) + def __init__(self,db): + Module.__init__(self, 'LDLM', db) self.add_lustre_module('ldlm', 'ldlm') def prepare(self): if is_prepared(self.uuid): @@ -914,19 +957,16 @@ class LDLM(Module): setup ="") class LOV(Module): - def __init__(self,dom_node): - Module.__init__(self, 'LOV', dom_node) - self.mds_uuid = get_first_ref(dom_node, 'mds') - mds= lookup(dom_node.parentNode, self.mds_uuid) - self.mds_name = getName(mds) - devs = dom_node.getElementsByTagName('devices') - if len(devs) > 0: - dev_node = devs[0] - self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536) - self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0) - self.pattern = get_attr_int(dev_node, 'pattern', 0) - self.devlist = get_all_refs(dev_node, 'obd') - self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist)) + def __init__(self,db): + Module.__init__(self, 'LOV', db) + self.mds_uuid = self.db.get_first_ref('mds') + mds= self.db.lookup(self.mds_uuid) + self.mds_name = mds.getName() + self.stripe_sz = self.db.get_val_int('stripesize', 65536) + self.stripe_off = self.db.get_val_int('stripeoffset', 0) + self.pattern = self.db.get_val_int('stripepattern', 0) + self.devlist = self.db.get_refs('obd') + self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist)) self.add_lustre_module('mdc', 'mdc') self.add_lustre_module('lov', 'lov') @@ -934,7 +974,7 @@ class LOV(Module): if is_prepared(self.uuid): return for obd_uuid in self.devlist: - obd = 
lookup(self.dom_node.parentNode, obd_uuid) + obd = self.db.lookup(obd_uuid) osc = get_osc(obd) if osc: try: @@ -945,7 +985,7 @@ class LOV(Module): print "Error preparing OSC %s (inactive)\n" % osc_uuid else: panic('osc not found:', osc_uuid) - mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid) + mdc_uuid = prepare_mdc(self.db, self.mds_uuid) self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz, self.stripe_off, self.pattern, self.devlist, self.mds_name) lctl.newdev(attach="lov %s %s" % (self.name, self.uuid), @@ -955,19 +995,19 @@ class LOV(Module): if not is_prepared(self.uuid): return for obd_uuid in self.devlist: - obd = lookup(self.dom_node.parentNode, obd_uuid) + obd = self.db.lookup(obd_uuid) osc = get_osc(obd) if osc: osc.cleanup() else: panic('osc not found:', osc_uuid) Module.cleanup(self) - cleanup_mdc(self.dom_node.parentNode, self.mds_uuid) + cleanup_mdc(self.db, self.mds_uuid) def load_module(self): for obd_uuid in self.devlist: - obd = lookup(self.dom_node.parentNode, obd_uuid) + obd = self.db.lookup(obd_uuid) osc = get_osc(obd) if osc: osc.load_module() @@ -980,7 +1020,7 @@ class LOV(Module): def cleanup_module(self): Module.cleanup_module(self) for obd_uuid in self.devlist: - obd = lookup(self.dom_node.parentNode, obd_uuid) + obd = self.db.lookup(obd_uuid) osc = get_osc(obd) if osc: osc.cleanup_module() @@ -989,10 +1029,11 @@ class LOV(Module): panic('osc not found:', osc_uuid) class LOVConfig(Module): - def __init__(self,dom_node): - Module.__init__(self, 'LOVConfig', dom_node) - self.lov_uuid = get_first_ref(dom_node, 'lov') - l = lookup(dom_node.parentNode, self.lov_uuid) + def __init__(self,db): + Module.__init__(self, 'LOVConfig', db) + + self.lov_uuid = self.db.get_first_ref('lov') + l = self.db.lookup(self.lov_uuid) self.lov = LOV(l) def prepare(self): @@ -1007,18 +1048,24 @@ class LOVConfig(Module): #nothing to do here pass - -class MDS(Module): - def __init__(self,dom_node): - Module.__init__(self, 'MDS', dom_node) - 
self.devname, self.size = get_device(dom_node) - self.fstype = get_text(dom_node, 'fstype') +class MDSDEV(Module): + def __init__(self,db): + Module.__init__(self, 'MDSDEV', db) + self.devname = self.db.get_val('devpath','') + self.size = self.db.get_val_int('devsize', 0) + self.fstype = self.db.get_val('fstype', '') + # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid + self.uuid = self.db.get_first_ref('mds') + mds = self.db.lookup(self.uuid) + self.name = mds.getName() + self.lovconfig_uuids = mds.get_refs('lovconfig') # FIXME: if fstype not set, then determine based on kernel version - self.format = get_text(dom_node, 'autoformat', "no") + self.format = self.db.get_val('autoformat', "no") if self.fstype == 'extN': self.add_lustre_module('extN', 'extN') self.add_lustre_module('mds', 'mds') - self.add_lustre_module('obdclass', 'fsfilt_%s'%(self.fstype)) + if self.fstype: + self.add_lustre_module('obdclass', 'fsfilt_%s' % (self.fstype)) def prepare(self): if is_prepared(self.uuid): @@ -1030,6 +1077,11 @@ class MDS(Module): setup ="") lctl.newdev(attach="mds %s %s" % (self.name, self.uuid), setup ="%s %s" %(blkdev, self.fstype)) + for uuid in self.lovconfig_uuids: + db = self.db.lookup(uuid) + lovconfig = LOVConfig(db) + lovconfig.prepare() + def cleanup(self): if is_prepared('MDT_UUID'): try: @@ -1046,40 +1098,49 @@ class MDS(Module): # Very unusual case, as there is no MDC element in the XML anymore # Builds itself from an MDS node class MDC(Module): - def __init__(self,dom_node): - self.mds = MDS(dom_node) - self.dom_node = dom_node + def __init__(self,db): + self.mds_uuid = db.getUUID() + self.mds_name = db.getName() + self.db = db + node_name = config.select(self.mds_name) + if node_name: + self.mdd_uuid = self.db.get_mdd(node_name, self.mds_uuid) + else: + self.mdd_uuid = db.get_first_ref('active') + if not self.mdd_uuid: + panic("No MDSDEV found for MDS service:", self.mds_name) self.module_name = 'MDC' self.kmodule_list = [] 
self._server = None self._connected = 0 host = socket.gethostname() - self.name = 'MDC_%s' % (self.mds.name) + self.name = 'MDC_%s' % (self.mds_name) self.uuid = '%s_%05x_%05x' % (self.name, int(random.random() * 1048576), int(random.random() * 1048576)) - self.lookup_server(self.mds.uuid) + self.lookup_server(self.mdd_uuid) self.add_lustre_module('mdc', 'mdc') def prepare(self): if is_prepared(self.uuid): return - self.info(self.mds.uuid) + self.info(self.mds_uuid) srv = self.get_server() lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem) lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid), - setup ="%s %s" %(self.mds.uuid, srv.uuid)) + setup ="%s %s" %(self.mds_uuid, srv.uuid)) class OBD(Module): - def __init__(self, dom_node): - Module.__init__(self, 'OBD', dom_node) - self.obdtype = get_attr(dom_node, 'type') - self.devname, self.size = get_device(dom_node) - self.fstype = get_text(dom_node, 'fstype') - self.active_target = get_text(dom_node, 'active_target') + def __init__(self, db): + Module.__init__(self, 'OBD', db) + self.obdtype = self.db.get_val('obdtype') + self.devname = self.db.get_val('devpath', '') + self.size = self.db.get_val_int('devsize', 0) + self.fstype = self.db.get_val('fstype', '') + self.active_target = self.db.get_first_ref('active') # FIXME: if fstype not set, then determine based on kernel version - self.format = get_text(dom_node, 'autoformat', 'yes') + self.format = self.db.get_val('autoformat', 'yes') if self.fstype == 'extN': self.add_lustre_module('extN', 'extN') self.add_lustre_module(self.obdtype, self.obdtype) @@ -1107,10 +1168,10 @@ class OBD(Module): clean_loop(self.devname) class COBD(Module): - def __init__(self, dom_node): - Module.__init__(self, 'COBD', dom_node) - self.real_uuid = get_first_ref(dom_node, 'real_obd') - self.cache_uuid = get_first_ref(dom_node, 'cache_obd') + def __init__(self, db): + Module.__init__(self, 'COBD', db) + self.real_uuid = self.db.get_first_ref('realobd') + 
self.cache_uuid = self.db.get_first_ref('cacheobd') self.add_lustre_module('cobd' , 'cobd') # need to check /proc/mounts and /etc/mtab before @@ -1124,9 +1185,9 @@ class COBD(Module): setup ="%s %s" %(self.real_uuid, self.cache_uuid)) class OST(Module): - def __init__(self,dom_node): - Module.__init__(self, 'OST', dom_node) - self.obd_uuid = get_first_ref(dom_node, 'obd') + def __init__(self,db): + Module.__init__(self, 'OST', db) + self.obd_uuid = self.db.get_first_ref('obd') self.add_lustre_module('ost', 'ost') def prepare(self): @@ -1139,12 +1200,12 @@ class OST(Module): # virtual interface for OSC and LOV class VOSC(Module): - def __init__(self,dom_node): - Module.__init__(self, 'VOSC', dom_node) - if dom_node.nodeName == 'lov': - self.osc = LOV(dom_node) + def __init__(self,db): + Module.__init__(self, 'VOSC', db) + if db.get_class() == 'lov': + self.osc = LOV(db) else: - self.osc = get_osc(dom_node) + self.osc = get_osc(db) def get_uuid(self): return self.osc.uuid def prepare(self): @@ -1158,8 +1219,8 @@ class VOSC(Module): class OSC(Module): - def __init__(self, dom_node, obd_name, obd_uuid, ost_uuid): - self.dom_node = dom_node + def __init__(self, db, obd_name, obd_uuid, ost_uuid): + self.db = db self.module_name = 'OSC' self.name = 'OSC_%s' % (obd_name) self.uuid = '%s_%05x' % (self.name, int(random.random() * 1048576)) @@ -1169,6 +1230,7 @@ class OSC(Module): self.obd_uuid = obd_uuid self.ost_uuid = ost_uuid + debug("OSC:", obd_uuid, ost_uuid) self.lookup_server(self.ost_uuid) self.add_lustre_module('osc', 'osc') @@ -1211,11 +1273,11 @@ class OSC(Module): class ECHO_CLIENT(Module): - def __init__(self,dom_node): - Module.__init__(self, 'ECHO_CLIENT', dom_node) + def __init__(self,db): + Module.__init__(self, 'ECHO_CLIENT', db) self.add_lustre_module('obdecho', 'obdecho') - self.obd_uuid = get_first_ref(dom_node, 'obd') - obd = lookup(self.dom_node.parentNode, self.obd_uuid) + self.obd_uuid = self.db.get_first_ref('obd') + obd = 
self.db.lookup(self.obd_uuid) self.osc = VOSC(obd) def prepare(self): @@ -1223,9 +1285,9 @@ class ECHO_CLIENT(Module): return self.osc.prepare() # XXX This is so cheating. -p self.info(self.obd_uuid) - + lctl.newdev(attach="echo_client %s %s" % (self.name, self.uuid), - setup = self.obd_uuid) + setup = self.osc.get_uuid()) def cleanup(self): if not is_prepared(self.uuid): @@ -1241,20 +1303,20 @@ class ECHO_CLIENT(Module): class Mountpoint(Module): - def __init__(self,dom_node): - Module.__init__(self, 'MTPT', dom_node) - self.path = get_text(dom_node, 'path') - self.mds_uuid = get_first_ref(dom_node, 'mds') - self.obd_uuid = get_first_ref(dom_node, 'obd') + def __init__(self,db): + Module.__init__(self, 'MTPT', db) + self.path = self.db.get_val('path') + self.mds_uuid = self.db.get_first_ref('mds') + self.obd_uuid = self.db.get_first_ref('obd') self.add_lustre_module('mdc', 'mdc') self.add_lustre_module('llite', 'llite') - obd = lookup(self.dom_node.parentNode, self.obd_uuid) + obd = self.db.lookup(self.obd_uuid) self.osc = VOSC(obd) def prepare(self): self.osc.prepare() - mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid) + mdc_uuid = prepare_mdc(self.db, self.mds_uuid) self.info(self.path, self.mds_uuid, self.obd_uuid) cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \ (self.osc.get_uuid(), mdc_uuid, self.path) @@ -1277,7 +1339,7 @@ class Mountpoint(Module): panic("fs is still mounted:", self.path) self.osc.cleanup() - cleanup_mdc(self.dom_node.parentNode, self.mds_uuid) + cleanup_mdc(self.db, self.mds_uuid) def load_module(self): self.osc.load_module() @@ -1297,195 +1359,416 @@ def get_osc(obd_dom): osc = OSC(obd_dom, obd.name, obd.uuid, obd.active_target) return osc +class LustreDB: + def lookup(self, uuid): + """ lookup returns a new LustreDB instance""" + return self._lookup_by_uuid(uuid) + + def lookup_name(self, name, class_name = ""): + """ lookup returns a new LustreDB instance""" + return self._lookup_by_name(name, class_name) + + def 
lookup_class(self, class_name): + """ lookup returns a new LustreDB instance""" + return self._lookup_by_class(class_name) + + def get_val(self, tag, default=None): + v = self._get_val(tag) + if v: + return v + if default != None: + return default + debug("LustreDB", self.getName(), " no value for:", tag) + return None -def get_device(obd): - list = obd.getElementsByTagName('device') - if len(list) > 0: - dev = list[0] - dev.normalize(); - size = get_attr_int(dev, 'size', 0) - return dev.firstChild.data, size - return '', 0 - -# Get the text content from the first matching child -# If there is no content (or it is all whitespace), return -# the default -def get_text(dom_node, tag, default=""): - list = dom_node.getElementsByTagName(tag) - if len(list) > 0: - dom_node = list[0] - dom_node.normalize() - if dom_node.firstChild: - txt = string.strip(dom_node.firstChild.data) - if txt: - return txt - return default - -def get_text_int(dom_node, tag, default=0): - list = dom_node.getElementsByTagName(tag) - n = default - if len(list) > 0: - dom_node = list[0] - dom_node.normalize() - if dom_node.firstChild: - txt = string.strip(dom_node.firstChild.data) - if txt: - try: - n = int(txt) - except ValueError: - panic("text value is not integer:", txt) - return n - -def get_attr(dom_node, attr, default=""): - v = dom_node.getAttribute(attr) - if v: - return v - return default - -def get_attr_int(dom_node, attr, default=0): - n = default - v = dom_node.getAttribute(attr) - if v: + def get_class(self): + return self._get_class() + + def get_val_int(self, tag, default=0): + str = self._get_val(tag) try: - n = int(v) + if str: + return int(str) + return default except ValueError: - panic("attr value is not integer", v) - return n - -def get_first_ref(dom_node, tag): - """ Get the first uuidref of the type TAG. Used one only - one is expected. 
Returns the uuid.""" - uuid = None - refname = '%s_ref' % tag - list = dom_node.getElementsByTagName(refname) - if len(list) > 0: - uuid = getRef(list[0]) - return uuid + panic("text value is not integer:", str) + + def get_first_ref(self, tag): + """ Get the first uuidref of the type TAG. Only + one is expected. Returns the uuid.""" + uuids = self._get_refs(tag) + if len(uuids) > 0: + return uuids[0] + return None -def get_all_refs(dom_node, tag): - """ Get all the refs of type TAG. Returns list of uuids. """ - uuids = [] - refname = '%s_ref' % tag - list = dom_node.getElementsByTagName(refname) - if len(list) > 0: - for i in list: - uuids.append(getRef(i)) - return uuids - -def get_ost_net(dom_node, uuid): - ost = lookup(dom_node, uuid) - uuid = get_first_ref(ost, 'network') - if not uuid: + def get_refs(self, tag): + """ Get all the refs of type TAG. Returns list of uuids. """ + uuids = self._get_refs(tag) + return uuids + + def get_all_refs(self): + """ Get all the refs. Returns list of uuids. """ + uuids = self._get_all_refs() + return uuids + + def get_ost_net(self, uuid): + ost = self.lookup(uuid) + uuid = ost.get_first_ref('network') + if not uuid: + return None + return ost.lookup(uuid) + + def nid2server(self, nid): + netlist = self.parent.parent.attrs['network'] + for net_db in netlist: + if net_db.get_val('nid') == nid: + return net return None - return lookup(dom_node, uuid) - -def nid2server(dom_node, nid): - netlist = dom_node.getElementsByTagName('network') - for net_node in netlist: - if get_text(net_node, 'server') == nid: - return Network(net_node) - return None -def lookup(dom_node, uuid): - for n in dom_node.childNodes: - if n.nodeType == n.ELEMENT_NODE: - if getUUID(n) == uuid: - return n + # the tag name is the service type + # fixme: this should do some checks to make sure the dom_node is a service + # + # determine what "level" a particular node is at. + + # the order of iniitailization is based on level. 
+ def getServiceLevel(self): + type = self.get_class() + ret=0; + if type in ('network',): + ret = 10 + elif type in ('device', 'ldlm'): + ret = 20 + elif type in ('obd', 'mdd', 'cobd'): + ret = 30 + elif type in ('mdsdev','ost'): + ret = 40 + elif type in ('mdc','osc'): + ret = 50 + elif type in ('lov',): + ret = 60 + elif type in ('mountpoint', 'echoclient'): + ret = 70 + + if ret < config.minlevel() or ret > config.maxlevel(): + ret = 0 + return ret + + # + # return list of services in a profile. list is a list of tuples + # [(level, db_object),] + def getServices(self): + list = [] + for ref_class, ref_uuid in self.get_all_refs(): + servdb = self.lookup(ref_uuid) + if servdb: + level = servdb.getServiceLevel() + if level > 0: + list.append((level, servdb)) + else: + panic('service not found: ' + ref_uuid) + + list.sort() + return list + + # Find the mdsdev attached to node_name that points to + # mds_uuid + # node->profiles->mdsdev_refs->mds + def get_mdd(self, node_name, mds_uuid): + node_db = self.lookup_name(node_name) + if not node_db: + return None + prof_list = node_db.get_refs('profile') + for prof_uuid in prof_list: + prof_db = node_db.lookup(prof_uuid) + mdd_list = prof_db.get_refs('mdsdev') + for mdd_uuid in mdd_list: + mdd = self.lookup(mdd_uuid) + if mdd.get_first_ref('mds') == mds_uuid: + return mdd_uuid + return None + + +class LustreDB_XML(LustreDB): + def __init__(self, dom, root_node): + # init xmlfile + self.dom_node = dom + self.root_node = root_node + + def xmltext(self, dom_node, tag): + list = dom_node.getElementsByTagName(tag) + if len(list) > 0: + dom_node = list[0] + dom_node.normalize() + if dom_node.firstChild: + txt = string.strip(dom_node.firstChild.data) + if txt: + return txt + + def xmlattr(self, dom_node, attr): + return dom_node.getAttribute(attr) + + def _get_val(self, tag): + """a value could be an attribute of the current node + or the text value in a child node""" + ret = self.xmlattr(self.dom_node, tag) + if not ret: + ret 
= self.xmltext(self.dom_node, tag) + return ret + + def _get_class(self): + return self.dom_node.nodeName + + # + # [(ref_class, ref_uuid),] + def _get_all_refs(self): + list = [] + for n in self.dom_node.childNodes: + if n.nodeType == n.ELEMENT_NODE: + ref_uuid = self.xml_get_ref(n) + ref_class = n.nodeName + list.append((ref_class, ref_uuid)) + + list.sort() + return list + + def _get_refs(self, tag): + """ Get all the refs of type TAG. Returns list of uuids. """ + uuids = [] + refname = '%s_ref' % tag + reflist = self.dom_node.getElementsByTagName(refname) + for r in reflist: + uuids.append(self.xml_get_ref(r)) + return uuids + + def xmllookup_by_uuid(self, dom_node, uuid): + for n in dom_node.childNodes: + if n.nodeType == n.ELEMENT_NODE: + if self.xml_get_uuid(n) == uuid: + return n + else: + n = self.xmllookup_by_uuid(n, uuid) + if n: return n + return None + + def _lookup_by_uuid(self, uuid): + dom = self. xmllookup_by_uuid(self.root_node, uuid) + if dom: + return LustreDB_XML(dom, self.root_node) + + def xmllookup_by_name(self, dom_node, name): + for n in dom_node.childNodes: + if n.nodeType == n.ELEMENT_NODE: + if self.xml_get_name(n) == name: + return n + else: + n = self.xmllookup_by_name(n, name) + if n: return n + return None + + def _lookup_by_name(self, name, class_name): + dom = self.xmllookup_by_name(self.root_node, name) + if dom: + return LustreDB_XML(dom, self.root_node) + + def xmllookup_by_class(self, dom_node, class_name): + return dom_node.getElementsByTagName(class_name) + + def _lookup_by_class(self, class_name): + ret = [] + domlist = self.xmllookup_by_class(self.root_node, class_name) + for node in domlist: + ret.append(LustreDB_XML(node, self.root_node)) + return ret + + def xml_get_name(self, n): + return n.getAttribute('name') + + def getName(self): + return self.xml_get_name(self.dom_node) + + def xml_get_ref(self, n): + return n.getAttribute('uuidref') + + def xml_get_uuid(self, dom_node): + return dom_node.getAttribute('uuid') + + 
def getUUID(self): + return self.xml_get_uuid(self.dom_node) + + def get_routes(self, type, gw): + """ Return the routes as a list of tuples of the form: + [(type, gw, lo, hi),]""" + res = [] + tbl = self.dom_node.getElementsByTagName('route_tbl') + for t in tbl: + routes = t.getElementsByTagName('route') + for r in routes: + lo = self.xmlattr(r, 'lo') + hi = self.xmlattr(r, 'hi', '') + res.append((type, gw, lo, hi)) + return res + + def get_route_tbl(self): + ret = [] + tbls = self.dom_node.getElementsByTagName('route_tbl') + for tbl in tbls: + for r in tbl.getElementsByTagName('route'): + net_type = self.xmlattr(r, 'type') + gw = self.xmlattr(r, 'gw') + lo = self.xmlattr(r, 'lo') + hi = self.xmlattr(r,'hi', '') + ret.append((net_type, gw, lo, hi)) + return ret + + +# ================================================================ +# LDAP Support +class LustreDB_LDAP(LustreDB): + def __init__(self, name, attrs, + base = "fs=lustre", + parent = None, + url = "ldap://localhost", + user = "cn=Manager, fs=lustre", + pw = "secret" + ): + self._name = name + self._attrs = attrs + self._base = base + self._parent = parent + self._url = url + self._user = user + self._pw = pw + if parent: + self.l = parent.l + self._base = parent._base + else: + self.open() + + def open(self): + import ldap + try: + self.l = ldap.initialize(self._url) + # Set LDAP protocol version used + self.l.protocol_version=ldap.VERSION3 + # user and pw only needed if modifying db + self.l.bind_s("", "", ldap.AUTH_SIMPLE); + except ldap.LDAPerror, e: + panic(e) + # FIXME, do something useful here + + def close(self): + self.l.unbind_s() + + def ldap_search(self, filter): + """Return list of uuids matching the filter.""" + import ldap + dn = self._base + ret = [] + uuids = [] + try: + for name, attrs in self.l.search_s(dn, ldap.SCOPE_ONELEVEL, + filter, ["uuid"]): + for v in attrs['uuid']: + uuids.append(v) + except ldap.NO_SUCH_OBJECT, e: + pass + except ldap.LDAPError, e: + print e # FIXME: die 
here? + if len(uuids) > 0: + for uuid in uuids: + ret.append(self._lookup_by_uuid(uuid)) + return ret + + def _lookup_by_name(self, name, class_name): + list = self.ldap_search("lustreName=%s" %(name)) + if len(list) == 1: + return list[0] + return [] + + def _lookup_by_class(self, class_name): + return self.ldap_search("objectclass=%s" %(string.upper(class_name))) + + def _lookup_by_uuid(self, uuid): + import ldap + dn = "uuid=%s,%s" % (uuid, self._base) + ret = None + try: + for name, attrs in self.l.search_s(dn, ldap.SCOPE_BASE, + "objectclass=*"): + ret = LustreDB_LDAP(name, attrs, parent = self) + + except ldap.NO_SUCH_OBJECT, e: + debug("NO_SUCH_OBJECT:", uuid) + pass # just return empty list + except ldap.LDAPError, e: + print e # FIXME: die here? + return ret + + + def _get_val(self, k): + ret = None + if self._attrs.has_key(k): + v = self._attrs[k] + if type(v) == types.ListType: + ret = str(v[0]) else: - n = lookup(n, uuid) - if n: return n - return None - -# Get name attribute of dom_node -def getName(dom_node): - return dom_node.getAttribute('name') + ret = str(v) + return ret -def getRef(dom_node): - return dom_node.getAttribute('uuidref') + def _get_class(self): + return string.lower(self._attrs['objectClass'][0]) -# Get name attribute of dom_node -def getUUID(dom_node): - return dom_node.getAttribute('uuid') + # + # [(ref_class, ref_uuid),] + def _get_all_refs(self): + list = [] + for k in self._attrs.keys(): + if re.search('.*Ref', k): + for uuid in self._attrs[k]: + list.append((k, uuid)) + return list -# the tag name is the service type -# fixme: this should do some checks to make sure the dom_node is a service -def getServiceType(dom_node): - return dom_node.nodeName + def _get_refs(self, tag): + """ Get all the refs of type TAG. Returns list of uuids. """ + uuids = [] + refname = '%sRef' % tag + if self._attrs.has_key(refname): + return self._attrs[refname] + return [] -# -# determine what "level" a particular node is at. 
-# the order of iniitailization is based on level. -def getServiceLevel(dom_node): - type = getServiceType(dom_node) - ret=0; - if type in ('network',): - ret = 10 - elif type in ('device', 'ldlm'): - ret = 20 - elif type in ('obd', 'mdd', 'cobd'): - ret = 30 - elif type in ('mds','ost'): - ret = 40 - elif type in ('mdc','osc'): - ret = 50 - elif type in ('lov', 'lovconfig'): - ret = 60 - elif type in ('mountpoint', 'echo_client'): - ret = 70 - - if ret < config.minlevel() or ret > config.maxlevel(): - ret = 0 - return ret + def getName(self): + return self._get_val('lustreName') -# -# return list of services in a profile. list is a list of tuples -# [(level, dom_node),] -def getServices(lustreNode, profileNode): - list = [] - for n in profileNode.childNodes: - if n.nodeType == n.ELEMENT_NODE: - servNode = lookup(lustreNode, getRef(n)) - if not servNode: - print n - panic('service not found: ' + getRef(n)) - level = getServiceLevel(servNode) - if level > 0: - list.append((level, servNode)) - list.sort() - return list - -def getByName(lustreNode, name, tag): - ndList = lustreNode.getElementsByTagName(tag) - for nd in ndList: - if getName(nd) == name: - return nd - return None - + def getUUID(self): + return self._get_val('uuid') + + def get_route_tbl(self): + return [] ############################################################ # MDC UUID hack - # FIXME: clean this mess up! 
# saved_mdc = {} -def prepare_mdc(dom_node, mds_uuid): +def prepare_mdc(db, mds_uuid): global saved_mdc - mds_node = lookup(dom_node, mds_uuid); - if not mds_node: + mds_db = db.lookup(mds_uuid); + if not mds_db: panic("no mds:", mds_uuid) if saved_mdc.has_key(mds_uuid): return saved_mdc[mds_uuid] - mdc = MDC(mds_node) + mdc = MDC(mds_db) mdc.prepare() saved_mdc[mds_uuid] = mdc.uuid return mdc.uuid -def cleanup_mdc(dom_node, mds_uuid): +def cleanup_mdc(db, mds_uuid): global saved_mdc - mds_node = lookup(dom_node, mds_uuid); - if not mds_node: + mds_db = db.lookup(mds_uuid); + if not mds_db: panic("no mds:", mds_uuid) if not saved_mdc.has_key(mds_uuid): - mdc = MDC(mds_node) + mdc = MDC(mds_db) mdc.cleanup() saved_mdc[mds_uuid] = mdc.uuid @@ -1497,58 +1780,45 @@ routes = [] local_node = [] router_flag = 0 -def init_node(dom_node): +def init_node(node_db): global local_node, router_flag - netlist = dom_node.getElementsByTagName('network') - for dom_net in netlist: - type = get_attr(dom_net, 'type') - gw = get_text(dom_net, 'server') + netlist = node_db.lookup_class('network') + for db in netlist: + type = db.get_val('nettype') + gw = db.get_val('nid') local_node.append((type, gw)) def node_needs_router(): return router_flag -def get_routes(type, gw, dom_net): - """ Return the routes as a list of tuples of the form: - [(type, gw, lo, hi),]""" - res = [] - tbl = dom_net.getElementsByTagName('route_tbl') - for t in tbl: - routes = t.getElementsByTagName('route') - for r in routes: - lo = get_attr(r, 'lo') - hi = get_attr(r, 'hi', '') - res.append((type, gw, lo, hi)) - return res - - def init_route_config(lustre): """ Scan the lustre config looking for routers. Build list of routes. 
""" global routes, router_flag routes = [] - list = lustre.getElementsByTagName('node') - for node in list: - if get_attr(node, 'router'): + list = lustre.lookup_class('node') + for node_db in list: + if node_db.get_val_int('router', 0): router_flag = 1 for (local_type, local_nid) in local_node: gw = None - netlist = node.getElementsByTagName('network') - for dom_net in netlist: - if local_type == get_attr(dom_net, 'type'): - gw = get_text(dom_net, 'server') + netlist = node_db.lookup_class('network') + for db in netlist: + if local_type == db.get_val('type'): + gw = db.get_val('server') break if not gw: continue - for dom_net in netlist: - if local_type != get_attr(dom_net, 'type'): - for route in get_routes(local_type, gw, dom_net): + for db in netlist: + if local_type != db.get_val('type'): + for route in db.get_routes(local_type, gw): routes.append(route) def local_net(net): global local_node for iface in local_node: + #debug("local_net a:", net.net_type, "b:", iface[0]) if net.net_type == iface[0]: return 1 return 0 @@ -1565,40 +1835,37 @@ def find_route(net): return None - ############################################################ # lconf level logic # Start a service. -def startService(dom_node, module_flag): - type = getServiceType(dom_node) - debug('Service:', type, getName(dom_node), getUUID(dom_node)) +def startService(db, module_flag): + type = db.get_class() + debug('Service:', type, db.getName(), db.getUUID()) # there must be a more dynamic way of doing this... 
n = None if type == 'ldlm': - n = LDLM(dom_node) + n = LDLM(db) elif type == 'lov': - n = LOV(dom_node) - elif type == 'lovconfig': - n = LOVConfig(dom_node) + n = LOV(db) elif type == 'network': - n = Network(dom_node) + n = Network(db) elif type == 'obd': - n = OBD(dom_node) + n = OBD(db) elif type == 'cobd': - n = COBD(dom_node) + n = COBD(db) elif type == 'ost': - n = OST(dom_node) - elif type == 'mds': - n = MDS(dom_node) + n = OST(db) + elif type == 'mdsdev': + n = MDSDEV(db) elif type == 'osc': - n = VOSC(dom_node) + n = VOSC(db) elif type == 'mdc': - n = MDC(dom_node) + n = MDC(db) elif type == 'mountpoint': - n = Mountpoint(dom_node) - elif type == 'echo_client': - n = ECHO_CLIENT(dom_node) + n = Mountpoint(db) + elif type == 'echoclient': + n = ECHO_CLIENT(db) else: panic ("unknown service type:", type) @@ -1625,10 +1892,10 @@ def startService(dom_node, module_flag): # * make sure partitions are in place and prepared # * initialize devices with lctl # Levels is important, and needs to be enforced. -def startProfile(lustreNode, profileNode, module_flag): - if not profileNode: +def startProfile(prof_db, module_flag): + if not prof_db: panic("profile:", profile, "not found.") - services = getServices(lustreNode, profileNode) + services = prof_db.getServices() if config.cleanup(): services.reverse() for s in services: @@ -1637,35 +1904,33 @@ def startProfile(lustreNode, profileNode, module_flag): # # Load profile for -def doHost(lustreNode, hosts): +def doHost(lustreDB, hosts): global routes global router_flag - dom_node = None + node_db = None for h in hosts: - dom_node = getByName(lustreNode, h, 'node') - if dom_node: + node_db = lustreDB.lookup_name(h, 'node') + if node_db: break - if not dom_node: + if not node_db: print 'No host entry found.' 
return - if get_attr(dom_node, 'router'): - router_flag = 1 - else: - router_flag = 0 - recovery_upcall = get_attr(dom_node, 'recovery_upcall') - timeout = get_attr_int(dom_node, 'timeout') + router_flag = node_db.get_val_int('router', 0) + recovery_upcall = node_db.get_val('recovery_upcall', '') + timeout = node_db.get_val_int('timeout', 0) if not router_flag: - init_node(dom_node) - init_route_config(lustreNode) + init_node(node_db) + init_route_config(lustreDB) # Two step process: (1) load modules, (2) setup lustre # if not cleaning, load modules first. module_flag = not config.cleanup() - reflist = dom_node.getElementsByTagName('profile') - for profile in reflist: - startProfile(lustreNode, profile, module_flag) + prof_list = node_db.get_refs('profile') + for prof_uuid in prof_list: + prof_db = node_db.lookup(prof_uuid) + startProfile(prof_db, module_flag) if not config.cleanup(): sys_set_debug_path() @@ -1678,10 +1943,10 @@ def doHost(lustreNode, hosts): sys_set_timeout(timeout) sys_set_recovery_upcall(recovery_upcall) - module_flag = not module_flag - for profile in reflist: - startProfile(lustreNode, profile, module_flag) + for prof_uuid in prof_list: + prof_db = node_db.lookup(prof_uuid) + startProfile(prof_db, module_flag) ############################################################ # Command line processing @@ -1692,7 +1957,8 @@ def parse_cmdline(argv): "portals=", "makeldiff", "cleanup", "noexec", "help", "node=", "nomod", "nosetup", "dump=", "force", "minlevel=", "maxlevel=", - "timeout=", "recovery_upcall="] + "timeout=", "recovery_upcall=", + "ldapurl=", "config=", "select="] opts = [] args = [] @@ -1730,14 +1996,21 @@ def parse_cmdline(argv): config.dump_file(a) if o in ("-f", "--force"): config.force(1) - if o in ("--minlevel",): + if o == "--minlevel": config.minlevel(a) - if o in ("--maxlevel",): + if o == "--maxlevel": config.maxlevel(a) - if o in ("--timeout",): + if o == "--timeout": config.timeout(a) - if o in ("--recovery_upcall",): + if o == 
"--recovery_upcall": config.recovery_upcall(a) + if o == "--ldapurl": + config.ldapurl(a) + if o == "--config": + config.config_name(a) + if o == "--select": + config.init_select(a) + return args def fetch(url): @@ -1793,9 +2066,9 @@ def sys_set_recovery_upcall(upcall): def sys_set_timeout(timeout): # the command overrides the value in the node config - if config.timeout() >= 0: + if config.timeout() > 0: timeout = config.timeout() - if timeout >= 0: + if timeout > 0: debug("setting timeout:", timeout) sysctl('lustre/timeout', timeout) @@ -1867,10 +2140,17 @@ def main(): if not os.access(args[0], os.R_OK): print 'File not found or readable:', args[0] sys.exit(1) - dom = xml.dom.minidom.parse(args[0]) - elif config.url(): - xmldata = fetch(config.url()) - dom = xml.dom.minidom.parseString(xmldata) + try: + dom = xml.dom.minidom.parse(args[0]) + except Exception: + panic("%s does not appear to be a config file." % (args[0])) + sys.exit(1) # make sure to die here, even in debug mode. + db = LustreDB_XML(dom.documentElement, dom.documentElement) + elif config.ldapurl(): + if not config.config_name(): + panic("--ldapurl requires --config name") + dn = "config=%s,fs=lustre" % (config.config_name()) + db = LustreDB_LDAP('', {}, base=dn, url = config.ldapurl()) else: usage() @@ -1902,7 +2182,8 @@ def main(): sys_make_devices() sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF) sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF) - doHost(dom.documentElement, node_list) + + doHost(db, node_list) if __name__ == "__main__": try: diff --git a/lustre/utils/lctl.c b/lustre/utils/lctl.c index 2e6324c..2217058 100644 --- a/lustre/utils/lctl.c +++ b/lustre/utils/lctl.c @@ -179,6 +179,7 @@ command_t cmdlist[] = { "usage: lov_set_osc_active <1|0 (active|inactive)>"}, {"newconn", jt_obd_newconn, 0, "newconn [newuuid]"}, {"failconn", jt_obd_failconn, 0, "failconn "}, + {"lookup", jt_obd_mdc_lookup, 0, "usage: lookup "}, /* Debug commands */ {"======== debug 
=========", jt_noop, 0, "debug"}, diff --git a/lustre/utils/lfind.c b/lustre/utils/lfind.c index 26f6a3f..93777d6 100644 --- a/lustre/utils/lfind.c +++ b/lustre/utils/lfind.c @@ -16,7 +16,7 @@ #include #include -#warning Max obds per lov currently hardcoded to 1000 in lov/lov_obd.c +/* XXX Max obds per lov currently hardcoded to 1000 in lov/lov_obd.c */ #define MAX_LOV_UUID_COUNT 1000 #define OBD_NOT_FOUND ((__u32)-1) @@ -128,7 +128,7 @@ init() else buflen = lmmlen; -#warning max ioctl buffer size currently hardcoded to 8192 + /* XXX max ioctl buffer size currently hardcoded to 8192 */ if (buflen > 8192) { int nuuids, remaining, nluoinfos; @@ -194,12 +194,12 @@ processFile(const char *path, const struct stat *sp, int flag, struct FTW *ftwp) if (flag != FTW_F) return 0; - if ((obdcount == 0) && (getobdindex(path) == OBD_NOT_FOUND)) { + if (getobdindex(path) == OBD_NOT_FOUND && obdcount == 0) { /* terminate nftw walking this tree */ return(1); } - if ((fd = open(path, O_RDONLY)) < 0) { + if ((fd = open(path, O_RDONLY | O_LOV_DELAY_CREATE)) < 0) { errMsg("open \"%.20s\" failed.", path); perror("open"); exit(1); @@ -212,21 +212,24 @@ processFile(const char *path, const struct stat *sp, int flag, struct FTW *ftwp) if ((rc = ioctl(fd, LL_IOC_LOV_GETSTRIPE, (void *)lmm)) < 0) { errMsg("LL_IOC_LOV_GETSTRIPE ioctl failed."); perror("ioctl"); - exit(1); + return 0; } close(fd); - if (query || verbose || lmm->lmm_objects[obdindex].l_object_id) + if (query || verbose || + (obdindex != OBD_NOT_FOUND && + lmm->lmm_objects[obdindex].l_object_id)) printf("%s\n", path); if (verbose) { printf("lmm_magic: 0x%x\n", lmm->lmm_magic); printf("lmm_object_id: "LPX64"\n", lmm->lmm_object_id); - printf("lmm_stripe_offset: %d\n", lmm->lmm_stripe_offset); - printf("lmm_stripe_count: %d\n", lmm->lmm_stripe_count); - printf("lmm_ost_count: %d\n", lmm->lmm_ost_count); - printf("lmm_stripe_pattern: %d\n", lmm->lmm_stripe_pattern); + printf("lmm_stripe_offset: %u\n", (int)lmm->lmm_stripe_offset); + 
printf("lmm_stripe_count: %u\n", (int)lmm->lmm_stripe_count); + printf("lmm_stripe_size: %u\n", (int)lmm->lmm_stripe_size); + printf("lmm_ost_count: %u\n", lmm->lmm_ost_count); + printf("lmm_stripe_pattern: %d\n", lmm->lmm_magic & 0xf); } count = lmm->lmm_ost_count; diff --git a/lustre/utils/lmc b/lustre/utils/lmc index 3ea5265..4d40a5b 100755 --- a/lustre/utils/lmc +++ b/lustre/utils/lmc @@ -53,6 +53,7 @@ Object creation command summary: --node node_name --mds mds_name --dev path + --fstype extN|ext3 --size size --add lov @@ -68,6 +69,7 @@ Object creation command summary: --lov lov_name --dev path --size size + --fstype extN|ext3 --obduuid uuid --add mtpt - Mountpoint @@ -179,13 +181,13 @@ class GenConfig: def network(self, name, uuid, hostname, net, port=0, tcpbuf=0): """create node""" network = self.newService("network", name, uuid) - network.setAttribute("type", net); - self.addElement(network, "server", hostname) + network.setAttribute("nettype", net); + self.addElement(network, "nid", hostname) if port: self.addElement(network, "port", "%d" %(port)) if tcpbuf: - self.addElement(network, "send_mem", "%d" %(tcpbuf)) - self.addElement(network, "recv_mem", "%d" %(tcpbuf)) + self.addElement(network, "sendmem", "%d" %(tcpbuf)) + self.addElement(network, "recvmem", "%d" %(tcpbuf)) return network @@ -199,10 +201,15 @@ class GenConfig: ref.setAttribute("hi", hi) return ref - def node(self, name, uuid): + def profile(self, name, uuid): + """ create a host """ + profile = self.newService("profile", name, uuid) + return profile + + def node(self, name, uuid, prof_uuid): """ create a host """ node = self.newService("node", name, uuid) - self.addElement(node, 'profile') + node.appendChild(self.ref("profile", prof_uuid)) return node def ldlm(self, name, uuid): @@ -212,27 +219,21 @@ class GenConfig: def obd(self, name, uuid, fs, obdtype, devname, format, ost_uuid, dev_size=0): obd = self.newService("obd", name, uuid) - obd.setAttribute('type', obdtype) - self.addElement(obd, 
'active_target', ost_uuid) + obd.setAttribute('obdtype', obdtype) + obd.appendChild(self.ref("active", ost_uuid)) if fs: self.addElement(obd, "fstype", fs) if devname: - dev = self.addElement(obd, "device", devname) - if (dev_size): - dev.setAttribute("size", "%s" % (dev_size)) + dev = self.addElement(obd, "devpath", devname) self.addElement(obd, "autoformat", format) + if dev_size: + self.addElement(obd, "devsize", "%s" % (dev_size)) return obd -# def osc(self, name, uuid, obd_uuid, net_uuid): -# osc = self.newService("osc", name, uuid) -# osc.appendChild(self.ref("ost", net_uuid)) -# osc.appendChild(self.ref("obd", obd_uuid)) -# return osc - def cobd(self, name, uuid, real_uuid, cache_uuid): cobd = self.newService("cobd", name, uuid) - cobd.appendChild(self.ref("real_obd",real_uuid)) - cobd.appendChild(self.ref("cache_obd",cache_uuid)) + cobd.appendChild(self.ref("realobd",real_uuid)) + cobd.appendChild(self.ref("cacheobd",cache_uuid)) return cobd def ost(self, name, uuid, obd_uuid, net_uuid): @@ -244,10 +245,9 @@ class GenConfig: def lov(self, name, uuid, mds_uuid, stripe_sz, stripe_cnt, pattern): lov = self.newService("lov", name, uuid) lov.appendChild(self.ref("mds", mds_uuid)) - devs = self.addElement(lov, "devices" ) - devs.setAttribute("stripesize", stripe_sz) - devs.setAttribute("stripecount", stripe_cnt) - devs.setAttribute("pattern", pattern) + lov.setAttribute("stripesize", stripe_sz) + lov.setAttribute("stripecount", stripe_cnt) + lov.setAttribute("stripepattern", pattern) return lov def lovconfig(self, name, uuid, lov_uuid): @@ -255,20 +255,23 @@ class GenConfig: lovconfig.appendChild(self.ref("lov", lov_uuid)) return lovconfig - def mds(self, name, uuid, fs, devname, format, net_uuid, node_uuid, - failover_uuid = "", dev_size=0 ): + def mds(self, name, uuid, mdd_uuid): mds = self.newService("mds", name, uuid) - self.addElement(mds, "fstype", fs) - dev = self.addElement(mds, "device", devname) - if dev_size: - dev.setAttribute("size", "%s" % 
(dev_size)) - self.addElement(mds, "autoformat", format) - mds.appendChild(self.ref("network", net_uuid)) - mds.appendChild(self.ref("node", node_uuid)) - if failover_uuid: - mds.appendChild(self.ref("failover", failover_uuid)) + mds.appendChild(self.ref("active",mdd_uuid)) return mds + def mdsdev(self, name, uuid, fs, devname, format, net_uuid, node_uuid, + mds_uuid, dev_size=0 ): + mdd = self.newService("mdsdev", name, uuid) + self.addElement(mdd, "fstype", fs) + dev = self.addElement(mdd, "devpath", devname) + self.addElement(mdd, "autoformat", format) + if dev_size: + self.addElement(mdd, "devsize", "%s" % (dev_size)) + mdd.appendChild(self.ref("network", net_uuid)) + mdd.appendChild(self.ref("mds", mds_uuid)) + return mdd + def mountpoint(self, name, uuid, mds_uuid, osc_uuid, path): mtpt = self.newService("mountpoint", name, uuid) mtpt.appendChild(self.ref("mds", mds_uuid)) @@ -277,7 +280,7 @@ class GenConfig: return mtpt def echo_client(self, name, uuid, osc_uuid): - ec = self.newService("echo_client", name, uuid) + ec = self.newService("echoclient", name, uuid) ec.appendChild(self.ref("obd", osc_uuid)) return ec @@ -314,19 +317,6 @@ def lookup(node, uuid): n = lookup(n, uuid) if n: return n return None - - -def mds2node(lustre, mds_name): - """ Find the node a MDS is configured on """ - mds = findByName(lustre, mds_name, 'mds') - ref = mds.getElementsByTagName('node_ref') - if not ref: - error("mds2node:", "no node_ref found for", '"'+mds_name+'"') - node_uuid = ref[0].getAttribute('uuidref') - node = lookup(lustre, node_uuid) - if not node: - error('mds2node:', "no node found for :", '"'+mds_name+'"') - return node def name2uuid(lustre, name, tag="", fatal=1): @@ -353,18 +343,16 @@ def get_net_uuid(lustre, node_name): def lov_add_obd(gen, lov, osc_uuid): - devs = lov.getElementsByTagName('devices') - if len(devs) == 1: - devs[0].appendChild(gen.ref("obd", osc_uuid)) - else: - error("No devices element found for LOV:", lov) - + lov.appendChild(gen.ref("obd", 
osc_uuid)) def node_add_profile(gen, node, ref, uuid): - ret = node.getElementsByTagName('profile') + refname = "%s_ref" % "profile" + ret = node.getElementsByTagName(refname) if not ret: - error('node has no profile:', node) - ret[0].appendChild(gen.ref(ref, uuid)) + error('node has no profile ref:', node) + prof_uuid = ret[0].getAttribute('uuidref') + profile = lookup(node.parentNode, prof_uuid) + profile.appendChild(gen.ref(ref, uuid)) def get_attr(dom_node, attr, default=""): v = dom_node.getAttribute(attr) @@ -377,7 +365,13 @@ def get_attr(dom_node, attr, default=""): # def do_add_node(gen, lustre, options, node_name): uuid = new_uuid(node_name) - node = gen.node(node_name, uuid) + prof_name = new_name("PROFILE_" + node_name) + prof_uuid = new_uuid(prof_name) + profile = gen.profile(prof_name, prof_uuid) + node = gen.node(node_name, uuid, prof_uuid) + lustre.appendChild(node) + lustre.appendChild(profile) + node_add_profile(gen, node, 'ldlm', ldlm_uuid) if has_option(options, 'router'): node.setAttribute('router', '1') @@ -385,7 +379,6 @@ def do_add_node(gen, lustre, options, node_name): node.setAttribute('timeout', get_option(options, 'timeout')) if has_option(options, 'recovery_upcall'): node.setAttribute('recovery_upcall', get_option(options, 'recovery_upcall')) - lustre.appendChild(node) return node @@ -393,7 +386,6 @@ def add_node(gen, lustre, options): """ create a node with a network config """ node_name = get_option(options, 'node') - ret = findByName(lustre, node_name, "node") if ret: print "Node:", node_name, "exists." 
@@ -444,37 +436,41 @@ def add_route(gen, lustre, options): netlist = node.getElementsByTagName('network') net = netlist[0] - rlist = net.getElementsByTagName('route_tbl') + rlist = net.getElementsByTagName('routetbl') if len(rlist) > 0: rtbl = rlist[0] else: - rtbl = gen.addElement(net, 'route_tbl') + rtbl = gen.addElement(net, 'routetbl') rtbl.appendChild(gen.route(net_type, gw, lo, hi)) def add_mds(gen, lustre, options): node_name = get_option(options, 'node') - mds_orig = get_option(options, 'mds') - mds_name = new_name(mds_orig) - if mds_name != mds_orig: - warning("name:", mds_orig, "already used. using:", mds_name) + mds_name = get_option(options, 'mds') + mdd_name = new_name("MDD_" + mds_name +"_" + node_name) + mdd_uuid = new_uuid(mdd_name) + + mds_uuid = name2uuid(lustre, mds_name, fatal=0) + if not mds_uuid: + mds_uuid = new_uuid(mds_name) + mds = gen.mds(mds_name, mds_uuid, mdd_uuid) + lustre.appendChild(mds) + devname = get_option(options, 'dev') size = get_option(options, 'size', 0) fstype = get_option(options, 'fstype', 'extN') - mds_uuid = new_uuid(mds_name) - node_uuid = name2uuid(lustre, node_name, 'node') node = findByName(lustre, node_name, "node") - node_add_profile(gen, node, "mds", mds_uuid) + node_add_profile(gen, node, "mdsdev", mdd_uuid) net_uuid = get_net_uuid(lustre, node_name) if not net_uuid: error("NODE: ", node_name, "not found") - mds = gen.mds(mds_name, mds_uuid, fstype, devname, get_format_flag(options), - net_uuid, node_uuid, dev_size=size) - lustre.appendChild(mds) + mdd = gen.mdsdev(mdd_name, mdd_uuid, fstype, devname, get_format_flag(options), + net_uuid, node_uuid, mds_uuid, dev_size=size) + lustre.appendChild(mdd) def add_ost(gen, lustre, options): @@ -552,7 +548,7 @@ def add_echo_client(gen, lustre, options): echoname = new_name('ECHO_'+ node_name) echo_uuid = new_uuid(echoname) - node_add_profile(gen, node, 'echo_client', echo_uuid) + node_add_profile(gen, node, 'echoclient', echo_uuid) lov_uuid = name2uuid(lustre, 
lov_name, tag='lov', fatal=0) if not lov_uuid: @@ -584,11 +580,11 @@ def add_lov(gen, lustre, options): lov = gen.lov(name, uuid, mds_uuid, stripe_sz, stripe_cnt, pattern) lustre.appendChild(lov) - # add an lovconfig entry to the mds profile + # add an lovconfig entry to the active mdsdev profile lovconfig_name = new_name('LVCFG_' + name) lovconfig_uuid = new_uuid(lovconfig_name) - node = mds2node(lustre, mds_name) - node_add_profile(gen, node, "lovconfig", lovconfig_uuid) + mds = findByName(lustre, mds_name) + mds.appendChild(gen.ref("lovconfig", lovconfig_uuid)) lovconfig = gen.lovconfig(lovconfig_name, lovconfig_uuid, uuid) lustre.appendChild(lovconfig) @@ -882,5 +878,3 @@ def main(): if __name__ == "__main__": main() - - diff --git a/lustre/utils/lstripe.c b/lustre/utils/lstripe.c index 65055a5..1aa9d91 100644 --- a/lustre/utils/lstripe.c +++ b/lustre/utils/lstripe.c @@ -17,28 +17,11 @@ void usage(char *pgm) { - fprintf(stderr, "\nIncorrect parameters! Correct usage:\n\n" ); - fprintf(stderr, "%s \n", pgm); + fprintf(stderr, "usage: %s \n", pgm); - fprintf(stderr, "\n\nArgument explanations:\n---------------------\n\n"); - fprintf(stderr, " = the full name and path of the output file to create\n"); - fprintf(stderr, " = the number of bytes to have in each stripe.\n"); - fprintf(stderr, " = the OST number to start the striping on.\n"); - fprintf(stderr, " = the number of stripes to use.\n"); - - fprintf(stderr, "\n\nExamples:\n---------\n\n"); - - fprintf(stderr, "%s /mnt/lustre/ost1 131072 0 1\n", pgm); - fprintf(stderr, "\t\tcreates a file only on ost1.\n\n"); - - fprintf(stderr, "%s /mnt/lustre/ost2 131072 1 1\n", pgm); - fprintf(stderr, "\t\tcreates a file only on ost2.\n\n"); - - fprintf(stderr, "%s /mnt/lustre/ost1and2 131072 0 2\n", pgm); - fprintf(stderr, "\t\tcreates a 128k file with 2 stripes, on ost1 and ost2.\n"); - - fprintf(stderr, "%s /mnt/lustre/ost1and2 131072 1 2\n", pgm); - fprintf(stderr, "\t\tcreates a 128k file with 2 stripes, on ost2 and 
ost1.\n"); + fprintf(stderr, "\tstripe size: number of bytes in each stripe\n"); + fprintf(stderr, "\tstripe start: OST index which holds first stripe\n"); + fprintf(stderr, "\tstripe count: number of OSTs to stripe over\n"); } int create_file(char *name, long stripe_size, int stripe_offset, @@ -49,7 +32,6 @@ int create_file(char *name, long stripe_size, int stripe_offset, /* Initialize IOCTL striping pattern structure */ a_striping.lmm_magic = LOV_MAGIC; - a_striping.lmm_stripe_pattern = 0; a_striping.lmm_stripe_size = stripe_size; a_striping.lmm_stripe_offset = stripe_offset; a_striping.lmm_stripe_count = stripe_count; diff --git a/lustre/utils/lustre.dtd b/lustre/utils/lustre.dtd deleted file mode 100644 index 2df183a..0000000 --- a/lustre/utils/lustre.dtd +++ /dev/null @@ -1,110 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/lustre/utils/obd.c b/lustre/utils/obd.c index ba22a9e..8c329ff 100644 --- a/lustre/utils/obd.c +++ b/lustre/utils/obd.c @@ -1273,7 +1273,7 @@ int jt_obd_lov_setconfig(int argc, char **argv) if (strlen(argv[1]) > sizeof(desc.ld_uuid) - 1) { fprintf(stderr, - "error: %s: LOV uuid '%s' longer than %zd characters\n", + "error: %s: LOV uuid '%s' longer than "LPSZ" characters\n", cmdname(argv[0]), argv[1], sizeof(desc.ld_uuid) - 1); return -EINVAL; } @@ -1375,18 +1375,24 @@ int jt_obd_lov_getconfig(int argc, char **argv) struct obd_ioctl_data data; struct lov_desc desc; obd_uuid_t *uuidarray; - int rc; + char *path; + int rc, tmpfd; + /* FIXME: ug. IOCINIT checks fd. 
*/ + tmpfd = fd; + fd = 1; IOCINIT(data); + fd = tmpfd; if (argc != 2) return CMD_HELP; - if (strlen(argv[1]) > sizeof(desc.ld_uuid) - 1) { - fprintf(stderr, - "error: %s: LOV uuid '%s' longer than %zd characters\n", - cmdname(argv[0]), argv[1], sizeof(desc.ld_uuid) - 1); - return -EINVAL; + path = argv[1]; + tmpfd = open(path, O_RDONLY); + if (tmpfd < 0) { + fprintf(stderr, "open \"%s\" failed: %s\n", path, + strerror(errno)); + return -1; } memset(&desc, 0, sizeof(desc)); @@ -1397,7 +1403,8 @@ repeat: if (!uuidarray) { fprintf(stderr, "error: %s: no memory for %d uuid's\n", cmdname(argv[0]), desc.ld_tgt_count); - return -ENOMEM; + rc = -ENOMEM; + goto out; } data.ioc_inllen1 = sizeof(desc); @@ -1410,7 +1417,7 @@ repeat: rc = -EINVAL; goto out; } - rc = ioctl(fd, OBD_IOC_LOV_GET_CONFIG, buf); + rc = ioctl(tmpfd, OBD_IOC_LOV_GET_CONFIG, buf); if (rc == -ENOSPC) { free(uuidarray); goto repeat; @@ -1440,6 +1447,7 @@ repeat: } out: free(uuidarray); + close(tmpfd); return rc; } @@ -1596,6 +1604,55 @@ int jt_obd_failconn(int argc, char **argv) return rc; } +int jt_obd_mdc_lookup(int argc, char **argv) +{ + struct obd_ioctl_data data; + char *parent, *child; + int rc, tmpfd, verbose = 1; + + if (argc < 3 || argc > 4) + return CMD_HELP; + + parent = argv[1]; + child = argv[2]; + if (argc == 4) + verbose = get_verbose(argv[0], argv[3]); + + /* FIXME: ug. IOCINIT checks fd. 
*/ + tmpfd = fd; + fd = 1; + IOCINIT(data); + fd = tmpfd; + + data.ioc_inllen1 = strlen(child) + 1; + data.ioc_inlbuf1 = child; + + IOC_PACK(argv[0], data); + + tmpfd = open(parent, O_RDONLY); + if (tmpfd < 0) { + fprintf(stderr, "open \"%s\" failed: %s\n", parent, + strerror(errno)); + return -1; + } + + rc = ioctl(tmpfd, IOC_MDC_LOOKUP, buf); + if (rc < 0) { + fprintf(stderr, "error: %s: ioctl error: %s\n", + cmdname(argv[0]), strerror(rc = errno)); + } + close(tmpfd); + + if (verbose) { + IOC_UNPACK(argv[0], data); + printf("%s: mode %o uid %d gid %d\n", child, + data.ioc_obdo1.o_mode, data.ioc_obdo1.o_uid, + data.ioc_obdo1.o_gid); + } + + return rc; +} + static void signal_server(int sig) { if (sig == SIGINT) { diff --git a/lustre/utils/obdctl.h b/lustre/utils/obdctl.h index 01ece92..acc5c5f 100644 --- a/lustre/utils/obdctl.h +++ b/lustre/utils/obdctl.h @@ -58,6 +58,7 @@ int jt_obd_dump_ldlm(int argc, char **argv); int jt_obd_lov_set_osc_active(int argc, char **argv); int jt_obd_newconn(int argc, char **argv); int jt_obd_failconn(int argc, char **argv); +int jt_obd_mdc_lookup(int argc, char **argv); int jt_get_version(int argc, char **argv); #endif -- 1.8.3.1