Whamcloud - gitweb
b=22458 fix concurrent mgs lock revocation.
[fs/lustre-release.git] / lustre / mgs / mgs_handler.c
index b0ecdcd..fbbea81 100644 (file)
@@ -1,33 +1,48 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- *  lustre/mgs/mgs_handler.c
- *  Lustre Management Server (mgs) request handler
+ * GPL HEADER START
  *
- *  Copyright (C) 2006 Cluster File Systems, Inc.
- *   Author: Nathan Rutman <nathan@clusterfs.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
  *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
  *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
  *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/mgs/mgs_handler.c
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
  */
 
 #ifndef EXPORT_SYMTAB
 # define EXPORT_SYMTAB
 #endif
 #define DEBUG_SUBSYSTEM S_MGS
-#define D_MGS D_CONFIG/*|D_WARNING*/
+#define D_MGS D_CONFIG
 
 #ifdef __KERNEL__
 # include <linux/module.h>
 #include <lustre_dlm.h>
 #include <lprocfs_status.h>
 #include <lustre_fsfilt.h>
-#include <lustre_commit_confd.h>
 #include <lustre_disk.h>
 #include "mgs_internal.h"
 
 
 /* Establish a connection to the MGS.
  *
  * (This note describes the post-patch, '+' side of the hunk.)
  *
  * A NULL exp, obd or cluuid is rejected with -EINVAL.  Otherwise
  * class_connect() fills a local handle and class_conn2export() turns that
  * handle into the export.  The LPROC_MGS_CONNECT counter is bumped and,
  * when connect data is supplied, the requested flags are masked with
  * MGS_CONNECT_SUPPORTED, stored on the export, and LUSTRE_VERSION_CODE is
  * reported back in ocd_version.
  *
  * mgs_client_add() then runs with the caller's localdata.  If it fails the
  * export is disconnected again and the error is returned; otherwise the
  * export is handed back to the caller through *exp.
  */
 static int mgs_connect(const struct lu_env *env,
-                       struct lustre_handle *conn, struct obd_device *obd,
+                       struct obd_export **exp, struct obd_device *obd,
                        struct obd_uuid *cluuid, struct obd_connect_data *data,
                        void *localdata)
 {
-        struct obd_export *exp;
+        struct obd_export *lexp;
+        struct lustre_handle conn = { 0 };
         int rc;
         ENTRY;
 
-        if (!conn || !obd || !cluuid)
+        if (!exp || !obd || !cluuid)
                 RETURN(-EINVAL);
 
-        rc = class_connect(conn, obd, cluuid);
+        rc = class_connect(&conn, obd, cluuid);
         if (rc)
                 RETURN(rc);
-        exp = class_conn2export(conn);
-        LASSERT(exp);
 
-        exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_NULL;
+        lexp = class_conn2export(&conn);
+        LASSERT(lexp);
 
-        mgs_counter_incr(exp, LPROC_MGS_CONNECT);
+        mgs_counter_incr(lexp, LPROC_MGS_CONNECT);
 
         if (data != NULL) {
                 data->ocd_connect_flags &= MGS_CONNECT_SUPPORTED;
-                exp->exp_connect_flags = data->ocd_connect_flags;
+                lexp->exp_connect_flags = data->ocd_connect_flags;
                 data->ocd_version = LUSTRE_VERSION_CODE;
         }
 
+        rc = mgs_client_add(obd, lexp, localdata);
+
         if (rc) {
-                class_disconnect(exp);
+                class_disconnect(lexp);
         } else {
-                class_export_put(exp);
+                *exp = lexp;
         }
 
         RETURN(rc);
 }
 
+/*
+ * Reconnect handler (installed as .o_reconnect in mgs_obd_ops).
+ *
+ * No new export is created here: the existing one is passed in.  The
+ * connect counter is bumped and, if the caller supplied connect data, the
+ * flags are renegotiated exactly as on a first connect.
+ *
+ * Returns -EINVAL when exp, obd or cluuid is NULL, 0 otherwise.
+ */
+static int mgs_reconnect(const struct lu_env *env,
+                         struct obd_export *exp, struct obd_device *obd,
+                         struct obd_uuid *cluuid, struct obd_connect_data *data,
+                         void *localdata)
+{
+        ENTRY;
+
+        if (!exp || !obd || !cluuid)
+                RETURN(-EINVAL);
+
+        mgs_counter_incr(exp, LPROC_MGS_CONNECT);
+
+        /* Without connect data there is nothing left to negotiate. */
+        if (!data)
+                RETURN(0);
+
+        /* Keep only the flags this MGS supports and remember them. */
+        data->ocd_connect_flags &= MGS_CONNECT_SUPPORTED;
+        exp->exp_connect_flags = data->ocd_connect_flags;
+        data->ocd_version = LUSTRE_VERSION_CODE;
+
+        RETURN(0);
+}
+
 static int mgs_disconnect(struct obd_export *exp)
 {
         int rc;
@@ -95,24 +132,7 @@ static int mgs_disconnect(struct obd_export *exp)
         class_export_get(exp);
         mgs_counter_incr(exp, LPROC_MGS_DISCONNECT);
 
-        /* Disconnect early so that clients can't keep using export */
-        rc = class_disconnect(exp);
-        ldlm_cancel_locks_for_export(exp);
-
-        /* complete all outstanding replies */
-        spin_lock(&exp->exp_lock);
-        while (!list_empty(&exp->exp_outstanding_replies)) {
-                struct ptlrpc_reply_state *rs =
-                        list_entry(exp->exp_outstanding_replies.next,
-                                   struct ptlrpc_reply_state, rs_exp_list);
-                struct ptlrpc_service *svc = rs->rs_service;
-
-                spin_lock(&svc->srv_lock);
-                list_del_init(&rs->rs_exp_list);
-                ptlrpc_schedule_difficult_reply(rs);
-                spin_unlock(&svc->srv_lock);
-        }
-        spin_unlock(&exp->exp_lock);
+        rc = server_disconnect_export(exp);
 
         class_export_put(exp);
         RETURN(rc);
@@ -122,8 +142,7 @@ static int mgs_cleanup(struct obd_device *obd);
 static int mgs_handle(struct ptlrpc_request *req);
 
 static int mgs_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
-                         struct obd_device *tgt, int count,
-                         struct llog_catid *logid, struct obd_uuid *uuid)
+                         struct obd_device *tgt, int *index)
 {
         int rc;
         ENTRY;
@@ -171,6 +190,12 @@ static int mgs_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
         if (IS_ERR(obd->obd_fsops))
                 GOTO(err_put, rc = PTR_ERR(obd->obd_fsops));
 
+        if (lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb))) {
+                CERROR("%s: Underlying device is marked as read-only. "
+                       "Setup failed\n", obd->obd_name);
+                GOTO(err_ops, rc = -EROFS);
+        }
+
         /* namespace for mgs llog */
         obd->obd_namespace = ldlm_namespace_new(obd ,"MGS", LDLM_NAMESPACE_SERVER,
                                                 LDLM_NAMESPACE_MODEST);
@@ -181,8 +206,6 @@ static int mgs_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
         ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
                            "mgs_ldlm_client", &obd->obd_ldlm_client);
 
-        LASSERT(!lvfs_check_rdonly(lvfs_sbdev(mnt->mnt_sb)));
-
         rc = mgs_fs_setup(obd, mnt);
         if (rc) {
                 CERROR("%s: MGS filesystem method init failed: rc = %d\n",
@@ -190,7 +213,7 @@ static int mgs_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
                 GOTO(err_ns, rc);
         }
 
-        rc = obd_llog_init(obd, &obd->obd_olg, obd, 0, NULL, NULL);
+        rc = obd_llog_init(obd, &obd->obd_olg, obd, NULL);
         if (rc)
                 GOTO(err_fs, rc);
 
@@ -199,17 +222,26 @@ static int mgs_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
 
         /* Internal mgs setup */
         mgs_init_fsdb_list(obd);
-        sema_init(&mgs->mgs_sem, 1);
+        cfs_sema_init(&mgs->mgs_sem, 1);
+
+        /* Setup proc */
+        lprocfs_mgs_init_vars(&lvars);
+        if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0) {
+                lproc_mgs_setup(obd);
+                rc = lprocfs_alloc_md_stats(obd, LPROC_MGS_LAST);
+                if (rc)
+                        GOTO(err_llog, rc);
+        }
 
         /* Start the service threads */
         mgs->mgs_service =
                 ptlrpc_init_svc(MGS_NBUFS, MGS_BUFSIZE, MGS_MAXREQSIZE,
                                 MGS_MAXREPSIZE, MGS_REQUEST_PORTAL,
-                                MGC_REPLY_PORTAL, 2000,
+                                MGC_REPLY_PORTAL, 2,
                                 mgs_handle, LUSTRE_MGS_NAME,
                                 obd->obd_proc_entry, target_print_req,
                                 MGS_THREADS_AUTO_MIN, MGS_THREADS_AUTO_MAX,
-                                "ll_mgs", LCT_MD_THREAD);
+                                "ll_mgs", LCT_MD_THREAD, NULL);
 
         if (!mgs->mgs_service) {
                 CERROR("failed to start service\n");
@@ -220,12 +252,6 @@ static int mgs_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
         if (rc)
                 GOTO(err_thread, rc);
 
-        /* Setup proc */
-        lprocfs_mgs_init_vars(&lvars);
-        if (lprocfs_obd_setup(obd, lvars.obd_vars) == 0) {
-                lproc_mgs_setup(obd);
-        }
-
         ping_evictor_start();
 
         LCONSOLE_INFO("MGS %s started\n", obd->obd_name);
@@ -235,6 +261,7 @@ static int mgs_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
 err_thread:
         ptlrpc_unregister_service(mgs->mgs_service);
 err_llog:
+        lproc_mgs_cleanup(obd);
         obd_llog_finish(obd, 0);
 err_fs:
         /* No extra cleanup needed for llog_init_commit_thread() */
@@ -323,6 +350,26 @@ static int mgs_put_cfg_lock(struct lustre_handle *lockh)
         RETURN(0);
 }
 
+/*
+ * Revoke the config lock of the filesystem described by @fsdb by taking
+ * it via mgs_get_cfg_lock() and dropping it again with mgs_put_cfg_lock().
+ *
+ * Bit 1 of fsdb->fsdb_revoking_lock serves as an "already in progress"
+ * flag: if it was set on entry the function returns without doing
+ * anything.  A failure to get the lock is only logged.
+ */
+void mgs_revoke_lock(struct obd_device *obd, struct fs_db *fsdb)
+{
+        struct lustre_handle cfg_lockh;
+        int                  rc;
+
+        LASSERT(fsdb->fsdb_name[0] != '\0');
+
+        /* a revocation for this fsdb is already under way */
+        if (cfs_test_and_set_bit(1, &fsdb->fsdb_revoking_lock) != 0)
+                return;
+
+        rc = mgs_get_cfg_lock(obd, fsdb->fsdb_name, &cfg_lockh);
+
+        /* clear the bit before lock put */
+        cfs_clear_bit(1, &fsdb->fsdb_revoking_lock);
+
+        if (rc == ELDLM_OK) {
+                mgs_put_cfg_lock(&cfg_lockh);
+                return;
+        }
+
+        CERROR("lock error %d for fs %s\n",
+               rc, fsdb->fsdb_name);
+}
+
 /* rc=0 means ok
       1 means update
      <0 means error */
@@ -334,10 +381,9 @@ static int mgs_check_target(struct obd_device *obd, struct mgs_target_info *mti)
         rc = mgs_check_index(obd, mti);
         if (rc == 0) {
                 LCONSOLE_ERROR_MSG(0x13b, "%s claims to have registered, but "
-                                   "this MGS does not know about it. Assuming"
-                                   " writeconf.\n", mti->mti_svname);
-                mti->mti_flags |= LDD_F_WRITECONF;
-                rc = 1;
+                                   "this MGS does not know about it, preventing "
+                                   "registration.\n", mti->mti_svname);
+                rc = -ENOENT;
         } else if (rc == -1) {
                 LCONSOLE_ERROR_MSG(0x13c, "Client log %s-client has "
                                    "disappeared! Regenerating all logs.\n",
@@ -359,9 +405,9 @@ static int mgs_check_target(struct obd_device *obd, struct mgs_target_info *mti)
 static int mgs_handle_target_reg(struct ptlrpc_request *req)
 {
         struct obd_device *obd = req->rq_export->exp_obd;
-        struct lustre_handle lockh;
         struct mgs_target_info *mti, *rep_mti;
-        int rc = 0, lockrc;
+        struct fs_db *fsdb;
+        int rc = 0;
         ENTRY;
 
         mgs_counter_incr(req->rq_export, LPROC_MGS_TARGET_REG);
@@ -379,24 +425,8 @@ static int mgs_handle_target_reg(struct ptlrpc_request *req)
                         GOTO(out_nolock, rc);
         }
 
-        /* Revoke the config lock to make sure nobody is reading. */
-        /* Although actually I think it should be alright if
-           someone was reading while we were updating the logs - if we
-           revoke at the end they will just update from where they left off. */
-        lockrc = mgs_get_cfg_lock(obd, mti->mti_fsname, &lockh);
-        if (lockrc != ELDLM_OK) {
-                LCONSOLE_ERROR_MSG(0x13d, "%s: Can't signal other nodes to "
-                                   "update their configuration (%d). Updating "
-                                   "local logs anyhow; you might have to "
-                                   "manually restart other nodes to get the "
-                                   "latest configuration.\n",
-                                   obd->obd_name, lockrc);
-        }
-
         OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_PAUSE_TARGET_REG, 10);
 
-        /* Log writing contention is handled by the fsdb_sem */
-
         if (mti->mti_flags & LDD_F_WRITECONF) {
                 if (mti->mti_flags & LDD_F_SV_TYPE_MDT &&
                     mti->mti_stripe_index == 0) {
@@ -414,17 +444,31 @@ static int mgs_handle_target_reg(struct ptlrpc_request *req)
                 }
                 mti->mti_flags |= LDD_F_UPDATE;
                 /* Erased logs means start from scratch. */
-                mti->mti_flags &= ~LDD_F_UPGRADE14; 
+                mti->mti_flags &= ~LDD_F_UPGRADE14;
+        }
+
+        rc = mgs_find_or_make_fsdb(obd, mti->mti_fsname, &fsdb);
+        if (rc) {
+                CERROR("Can't get db for %s: %d\n", mti->mti_fsname, rc);
+                GOTO(out_nolock, rc);
         }
 
+        /*
+         * Log writing contention is handled by the fsdb_sem.
+         *
+         * It should be alright if someone was reading while we were
+         * updating the logs - if we revoke at the end they will just update
+         * from where they left off.
+         */
+
         /* COMPAT_146 */
         if (mti->mti_flags & LDD_F_UPGRADE14) {
-                rc = mgs_upgrade_sv_14(obd, mti);
+                rc = mgs_upgrade_sv_14(obd, mti, fsdb);
                 if (rc) {
                         CERROR("Can't upgrade from 1.4 (%d)\n", rc);
                         GOTO(out, rc);
                 }
-                
+
                 /* We're good to go */
                 mti->mti_flags |= LDD_F_UPDATE;
         }
@@ -436,7 +480,7 @@ static int mgs_handle_target_reg(struct ptlrpc_request *req)
 
                 /* create or update the target log
                    and update the client/mdt logs */
-                rc = mgs_write_log_target(obd, mti);
+                rc = mgs_write_log_target(obd, mti, fsdb);
                 if (rc) {
                         CERROR("Failed to write %s log (%d)\n",
                                mti->mti_svname, rc);
@@ -450,9 +494,8 @@ static int mgs_handle_target_reg(struct ptlrpc_request *req)
         }
 
 out:
-        /* done with log update */
-        if (lockrc == ELDLM_OK)
-                mgs_put_cfg_lock(&lockh);
+        mgs_revoke_lock(obd, fsdb);
+
 out_nolock:
         CDEBUG(D_MGS, "replying with %s, index=%d, rc=%d\n", mti->mti_svname,
                mti->mti_stripe_index, rc);
@@ -473,8 +516,7 @@ static int mgs_set_info_rpc(struct ptlrpc_request *req)
 {
         struct obd_device *obd = req->rq_export->exp_obd;
         struct mgs_send_param *msp, *rep_msp;
-        struct lustre_handle lockh;
-        int lockrc, rc;
+        int rc;
         struct lustre_cfg_bufs bufs;
         struct lustre_cfg *lcfg;
         char fsname[MTI_NAME_MAXLEN];
@@ -494,19 +536,6 @@ static int mgs_set_info_rpc(struct ptlrpc_request *req)
                 RETURN(rc);
         }
 
-        /* Revoke lock so everyone updates.  Should be alright if
-         * someone was already reading while we were updating the logs,
-         * so we don't really need to hold the lock while we're
-         * writing.
-         */
-        if (fsname[0]) {
-                lockrc = mgs_get_cfg_lock(obd, fsname, &lockh);
-                if (lockrc != ELDLM_OK)
-                        CERROR("lock error %d for fs %s\n", lockrc,
-                               fsname);
-                else
-                        mgs_put_cfg_lock(&lockh);
-        }
         lustre_cfg_free(lcfg);
 
         rc = req_capsule_server_pack(&req->rq_pill);
@@ -517,6 +546,60 @@ static int mgs_set_info_rpc(struct ptlrpc_request *req)
         RETURN(rc);
 }
 
+/*
+ * Check the security flavor of a connect request against its export.
+ * Similar to ost_connect_check_sptlrpc().
+ *
+ * First connect (export flavor still SPTLRPC_FLVR_INVALID): look up the
+ * flavor for the peer NID in the MGSSELF_NAME fsdb rule set, falling back
+ * to SPTLRPC_FLVR_ANY when no rule matches, record it together with the
+ * peer's security part on the export, and reject the request with -EACCES
+ * if its flavor does not match a specific expected one.
+ *
+ * Later connects: the peer's security part must match what was recorded,
+ * otherwise -EACCES; the remaining check is sptlrpc_target_export_check().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int mgs_connect_check_sptlrpc(struct ptlrpc_request *req)
+{
+        struct obd_export     *exp = req->rq_export;
+        struct obd_device     *obd = exp->exp_obd;
+        struct fs_db          *fsdb;
+        /*
+         * Zero-initialised: when no rule matches only sf_rpc is assigned
+         * below, yet the whole structure is copied into the export.
+         */
+        struct sptlrpc_flavor  flvr = { 0 };
+        int                    rc = 0;
+
+        if (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) {
+                rc = mgs_find_or_make_fsdb(obd, MGSSELF_NAME, &fsdb);
+                if (rc)
+                        return rc;
+
+                cfs_down(&fsdb->fsdb_sem);
+                if (sptlrpc_rule_set_choose(&fsdb->fsdb_srpc_gen,
+                                            LUSTRE_SP_MGC, LUSTRE_SP_MGS,
+                                            req->rq_peer.nid,
+                                            &flvr) == 0) {
+                        /* by default allow any flavor */
+                        flvr.sf_rpc = SPTLRPC_FLVR_ANY;
+                }
+                cfs_up(&fsdb->fsdb_sem);
+
+                cfs_spin_lock(&exp->exp_lock);
+
+                exp->exp_sp_peer = req->rq_sp_from;
+                exp->exp_flvr = flvr;
+
+                if (exp->exp_flvr.sf_rpc != SPTLRPC_FLVR_ANY &&
+                    exp->exp_flvr.sf_rpc != req->rq_flvr.sf_rpc) {
+                        CERROR("invalid rpc flavor %x, expect %x, from %s\n",
+                               req->rq_flvr.sf_rpc, exp->exp_flvr.sf_rpc,
+                               libcfs_nid2str(req->rq_peer.nid));
+                        rc = -EACCES;
+                }
+
+                cfs_spin_unlock(&exp->exp_lock);
+        } else {
+                if (exp->exp_sp_peer != req->rq_sp_from) {
+                        CERROR("RPC source %s doesn't match %s\n",
+                               sptlrpc_part2name(req->rq_sp_from),
+                               sptlrpc_part2name(exp->exp_sp_peer));
+                        rc = -EACCES;
+                } else {
+                        rc = sptlrpc_target_export_check(exp, req);
+                }
+        }
+
+        return rc;
+}
+
 /* Called whenever a target cleans up. */
 /* XXX - Currently unused */
 static int mgs_handle_target_del(struct ptlrpc_request *req)
@@ -548,8 +631,14 @@ int mgs_handle(struct ptlrpc_request *req)
 
         LASSERT(current->journal_info == NULL);
         opc = lustre_msg_get_opc(req->rq_reqmsg);
+
+        if (opc == SEC_CTX_INIT ||
+            opc == SEC_CTX_INIT_CONT ||
+            opc == SEC_CTX_FINI)
+                GOTO(out, rc = 0);
+
         if (opc != MGS_CONNECT) {
-                if (req->rq_export == NULL) {
+                if (!class_connected_export(req->rq_export)) {
                         CERROR("lustre_mgs: operation %d on unconnected MGS\n",
                                opc);
                         req->rq_status = -ENOTCONN;
@@ -563,6 +652,9 @@ int mgs_handle(struct ptlrpc_request *req)
                 /* MGS and MDS have same request format for connect */
                 req_capsule_set(&req->rq_pill, &RQF_MDS_CONNECT);
                 rc = target_handle_connect(req);
+                if (rc == 0)
+                        rc = mgs_connect_check_sptlrpc(req);
+
                 if (!rc && (lustre_msg_get_conn_cnt(req->rq_reqmsg) > 1))
                         /* Make clients trying to reconnect after a MGS restart
                            happy; also requires obd_replayable */
@@ -660,15 +752,147 @@ out:
         RETURN(0);
 }
 
+/*
+ * Export constructor hook (installed as .o_init_export in mgs_obd_ops).
+ * Flags the export as connecting while holding exp_lock, then hands it to
+ * ldlm_init_export() and returns that call's result.
+ */
+static inline int mgs_init_export(struct obd_export *exp)
+{
+        int rc;
+
+        cfs_spin_lock(&exp->exp_lock);
+        exp->exp_connecting = 1;
+        cfs_spin_unlock(&exp->exp_lock);
+
+        rc = ldlm_init_export(exp);
+        return rc;
+}
+
 /*
  * Export destructor hook (installed as .o_destroy_export in mgs_obd_ops).
  * Calls target_destroy_export(), mgs_client_free() and
  * ldlm_destroy_export() on the export, in that order, and always
  * returns 0.
  */
 static inline int mgs_destroy_export(struct obd_export *exp)
 {
         ENTRY;
 
         target_destroy_export(exp);
+        mgs_client_free(exp);
+        ldlm_destroy_export(exp);
+
+        RETURN(0);
+}
+
+/*
+ * Split "<fsname>.<poolname>" at the first '.' into its two parts.
+ *
+ * @arg       NUL-terminated input string; may be NULL
+ * @fsname    output buffer for the part before the first '.'
+ * @poolname  output buffer for everything after the first '.'
+ *
+ * The output buffers are assumed to be MTI_NAME_MAXLEN and
+ * LOV_MAXPOOLNAME + 1 bytes long, which is what mgs_iocontrol_pool()
+ * allocates; longer components are refused instead of overrunning them.
+ * Nothing is written to either buffer unless the whole input is valid.
+ *
+ * Returns 0 on success, -EINVAL if @arg is NULL or contains no '.', and
+ * -ENAMETOOLONG if either component does not fit.
+ */
+static int mgs_extract_fs_pool(char * arg, char *fsname, char *poolname)
+{
+        char   *dot;
+        size_t  fslen;
+
+        ENTRY;
+        if (arg == NULL)
+                RETURN(-EINVAL);
+
+        dot = strchr(arg, '.');
+        if (dot == NULL)
+                RETURN(-EINVAL);
+
+        /* validate both lengths before touching the output buffers */
+        fslen = dot - arg;
+        if (fslen >= MTI_NAME_MAXLEN || strlen(dot + 1) > LOV_MAXPOOLNAME)
+                RETURN(-ENAMETOOLONG);
+
+        memcpy(fsname, arg, fslen);
+        fsname[fslen] = '\0';
+        strcpy(poolname, dot + 1);
 
         RETURN(0);
 }
 
+/*
+ * Handle OBD_IOC_POOL: run a pool command described by a lustre_cfg that
+ * is copied in from user space.
+ *
+ * The payload (data->ioc_pbuf1, data->ioc_plen1 bytes) must be of type
+ * LUSTRE_CFG_TYPE, no larger than a page and at least as large as the
+ * lustre_cfg header.  Buffer 1 is always "<fsname>.<poolname>".
+ * LCFG_POOL_NEW and LCFG_POOL_DEL take exactly two buffers; LCFG_POOL_ADD
+ * and LCFG_POOL_REM take three, the third string being passed through to
+ * mgs_pool_cmd().
+ *
+ * Every exit after the first allocation goes through out_pool so that the
+ * three temporary buffers are released on error paths too.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int mgs_iocontrol_pool(struct obd_device *obd,
+                              struct obd_ioctl_data *data)
+{
+        int rc;
+        struct lustre_cfg *lcfg = NULL;
+        char *fsname = NULL;
+        char *poolname = NULL;
+        char *fullname;
+        ENTRY;
+
+        OBD_ALLOC(fsname, MTI_NAME_MAXLEN);
+        if (fsname == NULL)
+                RETURN(-ENOMEM);
+
+        OBD_ALLOC(poolname, LOV_MAXPOOLNAME + 1);
+        if (poolname == NULL)
+                GOTO(out_pool, rc = -ENOMEM);
+
+        if (data->ioc_type != LUSTRE_CFG_TYPE) {
+                CERROR("unknown cfg record type:%d \n", data->ioc_type);
+                GOTO(out_pool, rc = -EINVAL);
+        }
+
+        if (data->ioc_plen1 > CFS_PAGE_SIZE)
+                GOTO(out_pool, rc = -E2BIG);
+
+        /* the header fields read below must lie inside the copied buffer */
+        if (data->ioc_plen1 < sizeof(*lcfg))
+                GOTO(out_pool, rc = -EINVAL);
+
+        OBD_ALLOC(lcfg, data->ioc_plen1);
+        if (lcfg == NULL)
+                GOTO(out_pool, rc = -ENOMEM);
+
+        if (cfs_copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1))
+                GOTO(out_pool, rc = -EFAULT);
+
+        if (lcfg->lcfg_bufcount < 2)
+                GOTO(out_pool, rc = -EFAULT);
+
+        /* first arg is always <fsname>.<poolname> */
+        fullname = lustre_cfg_string(lcfg, 1);
+        if (fullname == NULL)
+                GOTO(out_pool, rc = -EINVAL);
+
+        rc = mgs_extract_fs_pool(fullname, fsname, poolname);
+        if (rc)
+                GOTO(out_pool, rc);
+
+        switch (lcfg->lcfg_command) {
+        case LCFG_POOL_NEW:
+        case LCFG_POOL_DEL:
+                /* whole-pool commands carry nothing besides the pool name */
+                if (lcfg->lcfg_bufcount != 2)
+                        GOTO(out_pool, rc = -EINVAL);
+                rc = mgs_pool_cmd(obd, lcfg->lcfg_command, fsname,
+                                  poolname, NULL);
+                break;
+        case LCFG_POOL_ADD:
+        case LCFG_POOL_REM:
+                /* member commands carry one extra string in buffer 2 */
+                if (lcfg->lcfg_bufcount != 3)
+                        GOTO(out_pool, rc = -EINVAL);
+                rc = mgs_pool_cmd(obd, lcfg->lcfg_command, fsname,
+                                  poolname, lustre_cfg_string(lcfg, 2));
+                break;
+        default:
+                GOTO(out_pool, rc = -EINVAL);
+        }
+
+        if (rc)
+                CERROR("OBD_IOC_POOL err %d, cmd %X for pool %s.%s\n",
+                       rc, lcfg->lcfg_command, fsname, poolname);
+
+out_pool:
+        if (lcfg != NULL)
+                OBD_FREE(lcfg, data->ioc_plen1);
+
+        if (fsname != NULL)
+                OBD_FREE(fsname, MTI_NAME_MAXLEN);
+
+        if (poolname != NULL)
+                OBD_FREE(poolname, LOV_MAXPOOLNAME + 1);
+
+        RETURN(rc);
+}
+
 /* from mdt_iocontrol */
 int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                   void *karg, void *uarg)
@@ -684,11 +908,9 @@ int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
         switch (cmd) {
 
         case OBD_IOC_PARAM: {
-                struct lustre_handle lockh;
                 struct lustre_cfg *lcfg;
                 struct llog_rec_hdr rec;
                 char fsname[MTI_NAME_MAXLEN];
-                int lockrc;
 
                 rec.lrh_len = llog_data_len(data->ioc_plen1);
 
@@ -702,9 +924,8 @@ int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                 OBD_ALLOC(lcfg, data->ioc_plen1);
                 if (lcfg == NULL)
                         RETURN(-ENOMEM);
-                rc = copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1);
-                if (rc)
-                        GOTO(out_free, rc);
+                if (cfs_copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1))
+                        GOTO(out_free, rc = -EFAULT);
 
                 if (lcfg->lcfg_bufcount < 1)
                         GOTO(out_free, rc = -EINVAL);
@@ -714,25 +935,15 @@ int mgs_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
                         CERROR("setparam err %d\n", rc);
                         GOTO(out_free, rc);
                 }
-
-                /* Revoke lock so everyone updates.  Should be alright if
-                   someone was already reading while we were updating the logs,
-                   so we don't really need to hold the lock while we're
-                   writing (above). */
-                if (fsname[0]) {
-                        lockrc = mgs_get_cfg_lock(obd, fsname, &lockh);
-                        if (lockrc != ELDLM_OK)
-                                CERROR("lock error %d for fs %s\n", lockrc,
-                                       fsname);
-                        else
-                                mgs_put_cfg_lock(&lockh);
-                }
-
 out_free:
                 OBD_FREE(lcfg, data->ioc_plen1);
                 RETURN(rc);
         }
 
+        case OBD_IOC_POOL: {
+                RETURN(mgs_iocontrol_pool(obd, data));
+        }
+
         case OBD_IOC_DUMP_LOG: {
                 struct llog_ctxt *ctxt;
                 ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
@@ -769,10 +980,12 @@ out_free:
 static struct obd_ops mgs_obd_ops = {
         .o_owner           = THIS_MODULE,
         .o_connect         = mgs_connect,
+        .o_reconnect       = mgs_reconnect,
         .o_disconnect      = mgs_disconnect,
         .o_setup           = mgs_setup,
         .o_precleanup      = mgs_precleanup,
         .o_cleanup         = mgs_cleanup,
+        .o_init_export     = mgs_init_export,
         .o_destroy_export  = mgs_destroy_export,
         .o_iocontrol       = mgs_iocontrol,
         .o_llog_init       = mgs_llog_init,
@@ -795,7 +1008,7 @@ static void /*__exit*/ mgs_exit(void)
         class_unregister_type(LUSTRE_MGS_NAME);
 }
 
-MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
 MODULE_DESCRIPTION("Lustre  Management Server (MGS)");
 MODULE_LICENSE("GPL");