From 5b7f7c3da1266765a5de5a3e26feb8441a762d2c Mon Sep 17 00:00:00 2001 From: rread Date: Fri, 20 Dec 2002 08:22:42 +0000 Subject: [PATCH] * merge b_recovery into HEAD (the midnight merge troll rides again) --- lustre/include/linux/lustre_ha.h | 1 + lustre/include/linux/lustre_lib.h | 1 + lustre/include/linux/lustre_net.h | 1 + lustre/lib/client.c | 18 +++++ lustre/lov/lov_obd.c | 2 +- lustre/mdc/mdc_request.c | 7 ++ lustre/osc/osc_request.c | 7 +- lustre/ptlrpc/client.c | 64 ++++++++++++--- lustre/ptlrpc/rpc.c | 1 + lustre/tests/local.sh | 14 ++-- lustre/tests/lov.xml | 70 ++++++++++++++++ lustre/utils/lconf.in | 162 ++++++++++++++++++++++++++------------ lustre/utils/lmc | 117 +++++++++++++++++++-------- 13 files changed, 363 insertions(+), 102 deletions(-) create mode 100644 lustre/tests/lov.xml diff --git a/lustre/include/linux/lustre_ha.h b/lustre/include/linux/lustre_ha.h index 1e6596b..bfac4c3 100644 --- a/lustre/include/linux/lustre_ha.h +++ b/lustre/include/linux/lustre_ha.h @@ -29,6 +29,7 @@ struct ptlrpc_connection; #define PTLRPC_RECOVD_PHASE_PREPARE 1 #define PTLRPC_RECOVD_PHASE_RECOVER 2 #define PTLRPC_RECOVD_PHASE_FAILURE 3 +#define PTLRPC_RECOVD_PHASE_NOTCONN 4 typedef int (*ptlrpc_recovery_cb_t)(struct recovd_data *, int); diff --git a/lustre/include/linux/lustre_lib.h b/lustre/include/linux/lustre_lib.h index 0372504..aa58c49 100644 --- a/lustre/include/linux/lustre_lib.h +++ b/lustre/include/linux/lustre_lib.h @@ -64,6 +64,7 @@ int client_obd_disconnect(struct lustre_handle *conn); int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf); int client_obd_cleanup(struct obd_device * obddev); struct client_obd *client_conn2cli(struct lustre_handle *conn); +struct obd_device *client_tgtuuid2obd(char *tgtuuid); int target_revoke_connection(struct recovd_data *rd, int phase); diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h index fb060d0..142db3b 100644 --- a/lustre/include/linux/lustre_net.h +++ b/lustre/include/linux/lustre_net.h @@ -308,6 +308,7 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req); void ptlrpc_continue_req(struct ptlrpc_request *req); int ptlrpc_replay_req(struct ptlrpc_request *req); void ptlrpc_restart_req(struct ptlrpc_request *req); +void ptlrpc_abort_inflight(struct obd_import *imp); struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode, int count, int *lengths, char **bufs); diff --git a/lustre/lib/client.c b/lustre/lib/client.c index c75b399..03fa4e2 100644 --- a/lustre/lib/client.c +++ b/lustre/lib/client.c @@ -40,6 +40,24 @@ struct client_obd *client_conn2cli(struct lustre_handle *conn) return &export->exp_obd->u.cli; } +struct obd_device *client_tgtuuid2obd(char *tgtuuid) +{ + int i; + + for (i=0; i < MAX_OBD_DEVICES; i++) { + struct obd_device *obd = &obd_dev[i]; + if ((strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) == 0) || + (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0)) { + struct client_obd *cli = &obd->u.cli; + if (strncmp(tgtuuid, cli->cl_target_uuid, + sizeof(cli->cl_target_uuid)) == 0) + return obd; + } + } + + return NULL; +} + int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) { struct obd_ioctl_data* data = buf; diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index d2dc23c..fe5aad4 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -158,7 +158,7 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd, memcpy(lov->tgts[i].uuid, uuidarray[i], sizeof(*uuidarray)); for (i = 0; i < desc->ld_tgt_count; i++) { - struct obd_device *tgt = class_uuid2obd(uuidarray[i]); + struct obd_device *tgt = client_tgtuuid2obd(uuidarray[i]); int rc2; if (!tgt) { diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index daeccf1..c856d10 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -646,6 +646,7 @@ static int mdc_recover(struct obd_import *imp, int phase) NULL, LDLM_FL_LOCAL_ONLY); RETURN(0); case PTLRPC_RECOVD_PHASE_RECOVER: + reconnect: rc = ptlrpc_reconnect_import(imp, MDS_CONNECT); if (rc == EALREADY) RETURN(ptlrpc_replay(imp, 0)); @@ -671,6 +672,12 @@ static int mdc_recover(struct obd_import *imp, int phase) RETURN(rc); RETURN(0); + + case PTLRPC_RECOVD_PHASE_NOTCONN: + ldlm_namespace_cleanup(imp->imp_obd->obd_namespace, 1); + ptlrpc_abort_inflight(imp); + goto reconnect; + default: RETURN(-EINVAL); } diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 30aa36d..1e2f72e 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -958,10 +958,11 @@ static int osc_recover(struct obd_import *imp, int phase) ENTRY; switch(phase) { + case PTLRPC_RECOVD_PHASE_PREPARE: { struct ldlm_namespace *ns = imp->imp_obd->obd_namespace; ldlm_namespace_cleanup(ns, 1 /* no network ops */); - abort_inflight_for_import(imp); + ptlrpc_abort_inflight(imp); set_osc_active(imp, 0 /* inactive */); RETURN(0); } @@ -981,6 +982,10 @@ static int osc_recover(struct obd_import *imp, int phase) set_osc_active(imp, 1 /* active */); RETURN(0); + case PTLRPC_RECOVD_PHASE_NOTCONN: + osc_recover(imp, PTLRPC_RECOVD_PHASE_PREPARE); + RETURN(osc_recover(imp, PTLRPC_RECOVD_PHASE_RECOVER)); + default: RETURN(-EINVAL); } diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index b909b75..ccaa108 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -620,9 +620,9 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) NTOH__u32(req->rq_reqmsg->status), req->rq_xid, conn->c_peer.peer_nid, NTOH__u32(req->rq_reqmsg->opc)); + spin_lock(&imp->imp_lock); + EIO_IF_INVALID(req); if (req->rq_level > imp->imp_level) { - spin_lock(&imp->imp_lock); - EIO_IF_INVALID(req); list_del(&req->rq_list); list_add_tail(&req->rq_list, &imp->imp_delayed_list); spin_unlock(&imp->imp_lock); @@ -647,9 +647,6 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) CERROR("process %d resumed\n", current->pid); } resend: - req->rq_timeout = obd_timeout; - spin_lock(&imp->imp_lock); - EIO_IF_INVALID(req); LASSERT(list_empty(&req->rq_list)); list_add_tail(&req->rq_list, &imp->imp_sending_list); @@ -716,11 +713,28 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) GOTO(out, rc = -EINVAL); } #endif - CDEBUG(D_NET, "got rep "LPU64"\n", req->rq_xid); - if (req->rq_repmsg->status == 0) - CDEBUG(D_NET, "--> buf %p len %d status %d\n", req->rq_repmsg, - req->rq_replen, req->rq_repmsg->status); + DEBUG_REQ(D_NET, req, "status %d\n", req->rq_repmsg->status); + /* We're a rejected connection, need to invalidate and rebuild. */ + if (req->rq_repmsg->status == -ENOTCONN) { + spin_lock(&imp->imp_lock); + /* If someone else is reconnecting us (CONN_RECOVD) or has + * already completed it (handle mismatch), then we just need + * to get out. + */ + if (imp->imp_level == LUSTRE_CONN_RECOVD || + imp->imp_handle.addr != req->rq_reqmsg->addr || + imp->imp_handle.cookie != req->rq_reqmsg->cookie) { + spin_unlock(&imp->imp_lock); + GOTO(out, rc = -EIO); + } + imp->imp_level = LUSTRE_CONN_RECOVD; + spin_unlock(&imp->imp_lock); + rc = imp->imp_recover(imp, PTLRPC_RECOVD_PHASE_NOTCONN); + if (rc) + LBUG(); + GOTO(out, rc = -EIO); + } if (req->rq_import->imp_flags & IMP_REPLAYABLE) { spin_lock(&imp->imp_lock); @@ -819,3 +833,35 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) req->rq_level = old_level; RETURN(rc); } + +/* XXX looks a lot like super.c:invalidate_request_list, don't it? */ +void ptlrpc_abort_inflight(struct obd_import *imp) +{ + struct list_head *tmp, *n; + + /* Make sure that no new requests get processed for this import. + * ptlrpc_queue_wait must (and does) hold imp_lock while testing this + * flag and then putting requests on sending_list or delayed_list. + */ + spin_lock(&imp->imp_lock); + imp->imp_flags |= IMP_INVALID; + spin_unlock(&imp->imp_lock); + + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_list); + + DEBUG_REQ(D_HA, req, "inflight"); + req->rq_flags |= PTL_RPC_FL_ERR; + wake_up(&req->rq_wait_for_rep); + } + + list_for_each_safe(tmp, n, &imp->imp_delayed_list) { + struct ptlrpc_request *req = + list_entry(tmp, struct ptlrpc_request, rq_list); + + DEBUG_REQ(D_HA, req, "aborting waiting req"); + req->rq_flags |= PTL_RPC_FL_ERR; + wake_up(&req->rq_wait_for_rep); + } +} diff --git a/lustre/ptlrpc/rpc.c b/lustre/ptlrpc/rpc.c index eb6acb1..1384b5d 100644 --- a/lustre/ptlrpc/rpc.c +++ b/lustre/ptlrpc/rpc.c @@ -255,6 +255,7 @@ EXPORT_SYMBOL(ptlrpc_free_bulk); EXPORT_SYMBOL(ptlrpc_prep_bulk_page); EXPORT_SYMBOL(ptlrpc_free_bulk_page); EXPORT_SYMBOL(ll_brw_sync_wait); +EXPORT_SYMBOL(ptlrpc_abort_inflight); /* service.c */ EXPORT_SYMBOL(ptlrpc_init_svc); diff --git a/lustre/tests/local.sh b/lustre/tests/local.sh index 7d369f4..f680f4b 100755 --- a/lustre/tests/local.sh +++ b/lustre/tests/local.sh @@ -2,7 +2,7 @@ config=${1:-local.xml} -LMC=${LMC:-../utils/lmc} +LMC="${LMC:-../utils/lmc} -m $config" TMP=${TMP:-/tmp} MDSDEV=$TMP/mds1 @@ -22,15 +22,17 @@ case $kver in esac +rm -f $config + # create nodes -${LMC} -o $config --add node --node localhost || exit 10 -${LMC} -o $config --add net --node localhost --nid localhost --nettype tcp || exit 11 +${LMC} --add node --node localhost || exit 10 +${LMC} --add net --node localhost --nid localhost --nettype tcp || exit 11 # configure mds server -${LMC} -m $config --add mds --node localhost --mds mds1 --dev $MDSDEV --size $MDSSIZE || exit 20 +${LMC} --add mds --node localhost --mds mds1 --dev $MDSDEV --size $MDSSIZE || exit 20 # configure ost -${LMC} -m $config --add ost --node localhost --obd obd1 --dev $OSTDEV --size $OSTSIZE || exit 30 +${LMC} --add ost --node localhost --obd obd1 --dev $OSTDEV --size $OSTSIZE || exit 30 # create client config -${LMC} -m $config --add mtpt --node localhost --path /mnt/lustre --mds mds1 --obd obd1 || exit 40 +${LMC} --add mtpt --node localhost --path /mnt/lustre --mds mds1 --obd obd1 || exit 40 diff --git a/lustre/tests/lov.xml b/lustre/tests/lov.xml new file mode 100644 index 0000000..532c1ec2 --- /dev/null +++ b/lustre/tests/lov.xml @@ -0,0 +1,70 @@ + + + + + + + + + + + + + + + + + + localhost + 988 + + + + extN + /tmp/mds1 + yes + + + + + + + + + + + + + + + extN + /tmp/ost1 + no + + + + + + + + + + + extN + /tmp/ost2 + no + + + + + + + + + + + + + /mnt/lustre + + diff --git a/lustre/utils/lconf.in b/lustre/utils/lconf.in index 0f39037..170c5d0 100755 --- a/lustre/utils/lconf.in +++ b/lustre/utils/lconf.in @@ -112,6 +112,8 @@ class Config: self._portals_dir = '' self._minlevel = 0 self._maxlevel = 100 + self._timeout = -1 + self._recovery_upcall = '' def verbose(self, flag = None): if flag: self._verbose = flag @@ -185,6 +187,13 @@ class Config: if val: self._lustre_dir = val return self._lustre_dir + def timeout(self, val = None): + if val: self._timeout = val + return self._timeout + + def recovery_upcall(self, val = None): + if val: self._recovery_upcall = val + return self._recovery_upcall config = Config() @@ -481,7 +490,6 @@ def find_prog(cmd): syspath.insert(0, os.path.join(cmdpath, config.portals_dir()+'/linux/utils/')) for d in syspath: prog = os.path.join(d,cmd) - debug(prog) if os.access(prog, os.X_OK): return prog return '' @@ -917,7 +925,7 @@ class LOV(Module): self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536) self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0) self.pattern = get_attr_int(dev_node, 'pattern', 0) - self.devlist = get_all_refs(dev_node, 'osc') + self.devlist = get_all_refs(dev_node, 'obd') self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist)) self.add_lustre_module('mdc', 'mdc') self.add_lustre_module('lov', 'lov') @@ -925,14 +933,14 @@ class LOV(Module): def prepare(self): if is_prepared(self.uuid): return - for osc_uuid in self.devlist: - osc = lookup(self.dom_node.parentNode, osc_uuid) + for obd_uuid in self.devlist: + obd = lookup(self.dom_node.parentNode, obd_uuid) + osc = get_osc(obd) if osc: - n = OSC(osc) try: # Ignore connection failures, because the LOV will DTRT with # an unconnected OSC. - n.prepare(ignore_connect_failure=1) + osc.prepare(ignore_connect_failure=1) except CommandError: print "Error preparing OSC %s (inactive)\n" % osc_uuid else: @@ -946,11 +954,11 @@ class LOV(Module): def cleanup(self): if not is_prepared(self.uuid): return - for osc_uuid in self.devlist: - osc = lookup(self.dom_node.parentNode, osc_uuid) + for obd_uuid in self.devlist: + obd = lookup(self.dom_node.parentNode, obd_uuid) + osc = get_osc(obd) if osc: - n = OSC(osc) - n.cleanup() + osc.cleanup() else: panic('osc not found:', osc_uuid) Module.cleanup(self) @@ -958,11 +966,11 @@ class LOV(Module): def load_module(self): - for osc_uuid in self.devlist: - osc = lookup(self.dom_node.parentNode, osc_uuid) + for obd_uuid in self.devlist: + obd = lookup(self.dom_node.parentNode, obd_uuid) + osc = get_osc(obd) if osc: - n = OSC(osc) - n.load_module() + osc.load_module() break else: panic('osc not found:', osc_uuid) @@ -971,11 +979,11 @@ class LOV(Module): def cleanup_module(self): Module.cleanup_module(self) - for osc_uuid in self.devlist: - osc = lookup(self.dom_node.parentNode, osc_uuid) + for obd_uuid in self.devlist: + obd = lookup(self.dom_node.parentNode, obd_uuid) + osc = get_osc(obd) if osc: - n = OSC(osc) - n.cleanup_module() + osc.cleanup_module() break else: panic('osc not found:', osc_uuid) @@ -1069,6 +1077,7 @@ class OBD(Module): self.obdtype = get_attr(dom_node, 'type') self.devname, self.size = get_device(dom_node) self.fstype = get_text(dom_node, 'fstype') + self.active_target = get_text(dom_node, 'active_target') # FIXME: if fstype not set, then determine based on kernel version self.format = get_text(dom_node, 'autoformat', 'yes') if self.fstype == 'extN': @@ -1135,7 +1144,9 @@ class VOSC(Module): if dom_node.nodeName == 'lov': self.osc = LOV(dom_node) else: - self.osc = OSC(dom_node) + self.osc = get_osc(dom_node) + def get_uuid(self): + return self.osc.uuid def prepare(self): self.osc.prepare() def cleanup(self): @@ -1147,10 +1158,17 @@ class VOSC(Module): class OSC(Module): - def __init__(self,dom_node): - Module.__init__(self, 'OSC', dom_node) - self.obd_uuid = get_first_ref(dom_node, 'obd') - self.ost_uuid = get_first_ref(dom_node, 'ost') + def __init__(self, dom_node, obd_name, obd_uuid, ost_uuid): + self.dom_node = dom_node + self.module_name = 'OSC' + self.name = 'OSC_%s' % (obd_name) + self.uuid = '%s_%05x' % (self.name, int(random.random() * 1048576)) + self.kmodule_list = [] + self._server = None + self._connected = 0 + + self.obd_uuid = obd_uuid + self.ost_uuid = ost_uuid self.lookup_server(self.ost_uuid) self.add_lustre_module('osc', 'osc') @@ -1176,8 +1194,6 @@ class OSC(Module): setup ="%s %s" %(self.obd_uuid, srv.uuid)) def cleanup(self): - if not is_prepared(self.uuid): - return srv = self.get_server() if local_net(srv): Module.cleanup(self) @@ -1198,18 +1214,18 @@ class ECHO_CLIENT(Module): def __init__(self,dom_node): Module.__init__(self, 'ECHO_CLIENT', dom_node) self.add_lustre_module('obdecho', 'obdecho') - self.lov_uuid = get_first_ref(dom_node, 'osc') - l = lookup(self.dom_node.parentNode, self.lov_uuid) - self.osc = VOSC(l) + self.obd_uuid = get_first_ref(dom_node, 'obd') + obd = lookup(self.dom_node.parentNode, self.obd_uuid) + self.osc = VOSC(obd) def prepare(self): if is_prepared(self.uuid): return self.osc.prepare() # XXX This is so cheating. -p - self.info(self.lov_uuid) + self.info(self.obd_uuid) lctl.newdev(attach="echo_client %s %s" % (self.name, self.uuid), - setup = self.lov_uuid) + setup = self.obd_uuid) def cleanup(self): if not is_prepared(self.uuid): @@ -1229,25 +1245,26 @@ class Mountpoint(Module): Module.__init__(self, 'MTPT', dom_node) self.path = get_text(dom_node, 'path') self.mds_uuid = get_first_ref(dom_node, 'mds') - self.lov_uuid = get_first_ref(dom_node, 'osc') + self.obd_uuid = get_first_ref(dom_node, 'obd') self.add_lustre_module('mdc', 'mdc') self.add_lustre_module('llite', 'llite') - l = lookup(self.dom_node.parentNode, self.lov_uuid) - self.osc = VOSC(l) + obd = lookup(self.dom_node.parentNode, self.obd_uuid) + self.osc = VOSC(obd) + def prepare(self): self.osc.prepare() mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid) - self.info(self.path, self.mds_uuid, self.lov_uuid) + self.info(self.path, self.mds_uuid, self.obd_uuid) cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \ - (self.lov_uuid, mdc_uuid, self.path) + (self.osc.get_uuid(), mdc_uuid, self.path) run("mkdir", self.path) ret, val = run(cmd) if ret: panic("mount failed:", self.path) def cleanup(self): - self.info(self.path, self.mds_uuid,self.lov_uuid) + self.info(self.path, self.mds_uuid,self.obd_uuid) if fs_is_mounted(self.path): if config.force(): (rc, out) = run("umount", "-f", self.path) @@ -1259,7 +1276,6 @@ class Mountpoint(Module): if fs_is_mounted(self.path): panic("fs is still mounted:", self.path) - l = lookup(self.dom_node.parentNode, self.lov_uuid) self.osc.cleanup() cleanup_mdc(self.dom_node.parentNode, self.mds_uuid) @@ -1273,9 +1289,14 @@ class Mountpoint(Module): # ============================================================ # XML processing and query -# TODO: Change query funcs to use XPath, which is muc cleaner -# Or not. Originally both lconf and lmc used XPath, but it was many -# orders of magnitute slower, and lmc was unusable. - robert + +# OSC is no longer in the xml, so we have to fake it. +# this is getting ugly and begging for another refactoring +def get_osc(obd_dom): + obd = OBD(obd_dom) + osc = OSC(obd_dom, obd.name, obd.uuid, obd.active_target) + return osc + def get_device(obd): list = obd.getElementsByTagName('device') @@ -1618,6 +1639,7 @@ def startProfile(lustreNode, profileNode, module_flag): # Load profile for def doHost(lustreNode, hosts): global routes + global router_flag dom_node = None for h in hosts: dom_node = getByName(lustreNode, h, 'node') @@ -1627,12 +1649,16 @@ def doHost(lustreNode, hosts): print 'No host entry found.' return - if not get_attr(dom_node, 'router'): + if get_attr(dom_node, 'router'): + router_flag = 1 + else: + router_flag = 0 + recovery_upcall = get_attr(dom_node, 'recovery_upcall') + timeout = get_attr_int(dom_node, 'timeout') + + if not router_flag: init_node(dom_node) init_route_config(lustreNode) - else: - global router_flag - router_flag = 1 # Two step process: (1) load modules, (2) setup lustre # if not cleaning, load modules first. @@ -1649,6 +1675,9 @@ def doHost(lustreNode, hosts): # dump /tmp/ogdb and sleep/pause here log ("The GDB module script is in", script) time.sleep(5) + sys_set_timeout(timeout) + sys_set_recovery_upcall(recovery_upcall) + module_flag = not module_flag for profile in reflist: @@ -1662,7 +1691,8 @@ def parse_cmdline(argv): long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb", "portals=", "makeldiff", "cleanup", "noexec", "help", "node=", "nomod", "nosetup", - "dump=", "force", "minlevel=", "maxlevel="] + "dump=", "force", "minlevel=", "maxlevel=", + "timeout=", "recovery_upcall="] opts = [] args = [] @@ -1704,6 +1734,10 @@ def parse_cmdline(argv): config.minlevel(a) if o in ("--maxlevel",): config.maxlevel(a) + if o in ("--timeout",): + config.timeout(a) + if o in ("--recovery_upcall",): + config.recovery_upcall(a) return args def fetch(url): @@ -1734,19 +1768,43 @@ def setupModulePath(cmd, portals_dir = PORTALS_DIR): dir = os.path.join(config.lustre_dir(), dir) config.portals_dir(dir) -def sys_set_debug_path(): - debug("debug path: ", config.debug_path()) +def sysctl(path, val): if config.noexec(): return try: - fp = open('/proc/sys/portals/debug_path', 'w') - fp.write(config.debug_path()) + fp = open(os.path.join('/proc/sys', path), 'w') + fp.write(str(val)) fp.close() except IOError, e: print e - -#/proc/sys/net/core/rmem_max -#/proc/sys/net/core/wmem_max + + +def sys_set_debug_path(): + debug("debug path: ", config.debug_path()) + sysctl('portals/debug_path', config.debug_path()) + +def sys_set_recovery_upcall(upcall): + # the command overrides the value in the node config + if config.recovery_upcall(): + upcall = config.recovery_upcall() + if upcall: + debug("setting recovery_upcall:", upcall) + sysctl('lustre/recovery_upcall', upcall) + +def sys_set_timeout(timeout): + # the command overrides the value in the node config + if config.timeout() >= 0: + timeout = config.timeout() + if timeout >= 0: + debug("setting timeout:", timeout) + sysctl('lustre/timeout', timeout) + +def sys_set_ptldebug(ptldebug): + # the command overrides the value in the node config + if config.ptldebug(): + ptldebug = config.ptldebug() + sysctl('portals/debug', ptldebug) + def sys_set_netmem_max(path, max): debug("setting", path, "to at least", max) if config.noexec(): diff --git a/lustre/utils/lmc b/lustre/utils/lmc index b4f92ea..3ea5265 100755 --- a/lustre/utils/lmc +++ b/lustre/utils/lmc @@ -32,7 +32,50 @@ from xml.dom.ext import PrettyPrint DEFAULT_PORT = 988 def usage(): - print """usage: lmc --add object [object parameters]""" + print """usage: lmc --add object [object parameters] + +Object creation command summary: + +--add node + --node node_name + --timeout num + --recovery_upcall path + +--add net + --node node_name + --nid addr + --nettype tcp|elan|toe|gm + --port port + --tcpbuf size + --router + +--add mds + --node node_name + --mds mds_name + --dev path + --size size + +--add lov + --lov lov_name + --mds mds_name + --stripe_sz num + --stripe_cnt num + --stripe_pattern num + +-add ost + --node node_name + --obd obd_name + --lov lov_name + --dev path + --size size + --obduuid uuid + +--add mtpt - Mountpoint + --node node_name + --path /mnt/point + --mds mds_name + --obd obd_name OR --lov lovname +""" sys.exit(1) def error(*args): @@ -167,9 +210,10 @@ class GenConfig: ldlm = self.newService("ldlm", name, uuid) return ldlm - def obd(self, name, uuid, fs, obdtype, devname, format, dev_size=0): + def obd(self, name, uuid, fs, obdtype, devname, format, ost_uuid, dev_size=0): obd = self.newService("obd", name, uuid) obd.setAttribute('type', obdtype) + self.addElement(obd, 'active_target', ost_uuid) if fs: self.addElement(obd, "fstype", fs) if devname: @@ -179,18 +223,18 @@ class GenConfig: self.addElement(obd, "autoformat", format) return obd +# def osc(self, name, uuid, obd_uuid, net_uuid): +# osc = self.newService("osc", name, uuid) +# osc.appendChild(self.ref("ost", net_uuid)) +# osc.appendChild(self.ref("obd", obd_uuid)) +# return osc + def cobd(self, name, uuid, real_uuid, cache_uuid): cobd = self.newService("cobd", name, uuid) cobd.appendChild(self.ref("real_obd",real_uuid)) cobd.appendChild(self.ref("cache_obd",cache_uuid)) return cobd - def osc(self, name, uuid, obd_uuid, net_uuid): - osc = self.newService("osc", name, uuid) - osc.appendChild(self.ref("ost", net_uuid)) - osc.appendChild(self.ref("obd", obd_uuid)) - return osc - def ost(self, name, uuid, obd_uuid, net_uuid): ost = self.newService("ost", name, uuid) ost.appendChild(self.ref("network", net_uuid)) @@ -228,13 +272,13 @@ class GenConfig: def mountpoint(self, name, uuid, mds_uuid, osc_uuid, path): mtpt = self.newService("mountpoint", name, uuid) mtpt.appendChild(self.ref("mds", mds_uuid)) - mtpt.appendChild(self.ref("osc", osc_uuid)) + mtpt.appendChild(self.ref("obd", osc_uuid)) self.addElement(mtpt, "path", path) return mtpt def echo_client(self, name, uuid, osc_uuid): ec = self.newService("echo_client", name, uuid) - ec.appendChild(self.ref("osc", osc_uuid)) + ec.appendChild(self.ref("obd", osc_uuid)) return ec ############################################################ @@ -308,10 +352,10 @@ def get_net_uuid(lustre, node_name): return None -def lov_add_osc(gen, lov, osc_uuid): +def lov_add_obd(gen, lov, osc_uuid): devs = lov.getElementsByTagName('devices') if len(devs) == 1: - devs[0].appendChild(gen.ref("osc", osc_uuid)) + devs[0].appendChild(gen.ref("obd", osc_uuid)) else: error("No devices element found for LOV:", lov) @@ -335,8 +379,12 @@ def do_add_node(gen, lustre, options, node_name): uuid = new_uuid(node_name) node = gen.node(node_name, uuid) node_add_profile(gen, node, 'ldlm', ldlm_uuid) - if options.has_key('router'): + if has_option(options, 'router'): node.setAttribute('router', '1') + if has_option(options, 'timeout'): + node.setAttribute('timeout', get_option(options, 'timeout')) + if has_option(options, 'recovery_upcall'): + node.setAttribute('recovery_upcall', get_option(options, 'recovery_upcall')) lustre.appendChild(node) return node @@ -446,7 +494,6 @@ def add_ost(gen, lustre, options): obdname = get_option(options, 'obd', 'OBD_'+ node_name) obdname = new_name(obdname) - oscname = new_name('OSC_'+ obdname) ostname = new_name('OST_'+ obdname) if options.has_key('obduuid'): obd_uuid = options['obduuid'] @@ -456,28 +503,26 @@ def add_ost(gen, lustre, options): else: obd_uuid = new_uuid(obdname) ost_uuid = new_uuid(ostname) - osc_uuid = new_uuid(oscname) net_uuid = get_net_uuid(lustre, node_name) if not net_uuid: error("NODE: ", node_name, "not found") - obd = gen.obd(obdname, obd_uuid, fstype, obdtype, devname, get_format_flag(options), size) + obd = gen.obd(obdname, obd_uuid, fstype, obdtype, devname, get_format_flag(options), ost_uuid, + size) ost = gen.ost(ostname, ost_uuid, obd_uuid, net_uuid) - osc = gen.osc(oscname, osc_uuid, obd_uuid, ost_uuid) if lovname: lov = findByName(lustre, lovname, "lov") if not lov: error('add_ost:', '"'+lovname+'"', "lov element not found.") - lov_add_osc(gen, lov, osc_uuid) + lov_add_obd(gen, lov, obd_uuid) node = findByName(lustre, node_name, "node") node_add_profile(gen, node, 'obd', obd_uuid) node_add_profile(gen, node, 'ost', ost_uuid) lustre.appendChild(obd) - lustre.appendChild(osc) lustre.appendChild(ost) @@ -488,12 +533,9 @@ def add_cobd(gen, lustre, options): real_name = get_option(options, 'real_obd') cache_name = get_option(options, 'cache_obd') - # temp hack until merged with b_recover and OSC is removed - real_name = 'OSC_' + real_name - cache_name = 'OSC_' + cache_name - real_uuid = name2uuid(lustre, real_name, tag='osc') - cache_uuid = name2uuid(lustre, cache_name, tag='osc') + real_uuid = name2uuid(lustre, real_name, tag='obd') + cache_uuid = name2uuid(lustre, cache_name, tag='obd') node = findByName(lustre, node_name, "node") node_add_profile(gen, node, "cobd", uuid) @@ -514,9 +556,7 @@ def add_echo_client(gen, lustre, options): lov_uuid = name2uuid(lustre, lov_name, tag='lov', fatal=0) if not lov_uuid: - # remove this hack when the osc uuids are removed - lov_name = 'OSC_' + lov_name - lov_uuid = name2uuid(lustre, lov_name, tag='osc', fatal=1) + lov_uuid = name2uuid(lustre, lov_name, tag='obd', fatal=1) echo = gen.echo_client(echoname, echo_uuid, lov_uuid) lustre.appendChild(echo) @@ -574,9 +614,7 @@ def add_mtpt(gen, lustre, options): mds_uuid = name2uuid(lustre, mds_name, tag='mds') lov_uuid = name2uuid(lustre, lov_name, tag='lov', fatal=0) if not lov_uuid: - # remove this hack when OSC is removed - lov_name = 'OSC_' + lov_name - lov_uuid = name2uuid(lustre, lov_name, tag='osc', fatal=1) + lov_uuid = name2uuid(lustre, lov_name, tag='obd', fatal=1) uuid = new_uuid(name) mtpt = gen.mountpoint(name, uuid, mds_uuid, lov_uuid, path) @@ -604,6 +642,12 @@ class OptionError (exceptions.Exception): def __init__(self, args): self.args = args +def has_option(options, tag): + """Look for tag in options hash and return the true if set""" + if options.has_key(tag): + return 1 + return 0 + def get_option(options, tag, default = None): """Look for tag in options hash and return the value if set. If not set, then if return default it is set, otherwise exception.""" @@ -627,7 +671,8 @@ def parse_cmdline(argv): "mds=", "route", "router", "merge=", "format", "reformat", "output=", "dev=", "size=", "obd=", "obdtype=", "obduuid=", "in=", "path=", "help", "batch=", "lov=", "gw=", "lo=", "hi=", - "oscref", "osc=", "real_obd=", "cache_obd=", "fstype="] + "oscref", "osc=", "real_obd=", "cache_obd=", "fstype=", + "timeout=", "recovery_upcall="] opts = [] args = [] options = {} @@ -652,6 +697,14 @@ def parse_cmdline(argv): if o == "--obd": options['obd'] = a + # node options + if o == "--timeout": + options['timeout'] = a + if o == "--recovery_upcall": + options['recovery_upcall'] = a + if o == "--router": + options['router'] = 1 + # network options if o == "--nid": options['nid'] = a @@ -667,8 +720,6 @@ def parse_cmdline(argv): options['mtpt'] = 1 if o == "--route": options['route'] = 1 - if o == "--router": - options['router'] = 1 # ost options if o == "--dev": -- 1.8.3.1