3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
29 import string, os, stat, popen2, socket, time
31 import xml.dom.minidom
36 DEFAULT_TCPBUF = 1048576
38 # Maximum number of devices to search for.
39 # (the /dev/loop* nodes need to be created beforehand)
40 MAX_LOOP_DEVICES = 256
44 print """usage: lconf config.xml
46 config.xml Lustre configuration in xml format.
47 --get <url> URL to fetch a config file
48 --node <nodename> Load config for <nodename>
49 -d | --cleanup Cleans up config. (Shutdown)
50 -v | --verbose Print system commands as they are run
51 -h | --help Print this help
52 --gdb Prints message after creating gdb module script
53 and sleeps for 5 seconds.
54 -n | --noexec Prints the commands and steps that will be run for a
55 config without executing them. This can used to check if a
56 config file is doing what it should be doing. (Implies -v)
57 --nomod Skip load/unload module step.
58 --nosetup Skip device setup/cleanup step.
59 --reformat Reformat all devices (without question)
60 --dump Dump the kernel debug log before portals is unloaded
63 --ldap server LDAP server with lustre config database
64 --makeldiff Translate xml source to LDIFF
65 This are perhaps not needed:
66 --lustre="src dir" Base directory of lustre sources. Used to search
68 --portals=src Portals source
72 # ============================================================
73 # Config parameters, encapsulated in a class
88 self._gdb_script = '/tmp/ogdb'
89 self._debug_path = '/tmp/lustre-log'
90 self._dump_file = None
93 def verbose(self, flag = None):
94 if flag: self._verbose = flag
97 def noexec(self, flag = None):
98 if flag: self._noexec = flag
def reformat(self, flag = None):
    """Get the reformat option; a true flag value stores it before returning."""
    if flag:
        self._reformat = flag
    return self._reformat
105 def cleanup(self, flag = None):
106 if flag: self._cleanup = flag
109 def gdb(self, flag = None):
110 if flag: self._gdb = flag
113 def nomod(self, flag = None):
114 if flag: self._nomod = flag
117 def nosetup(self, flag = None):
118 if flag: self._nosetup = flag
121 def node(self, val = None):
122 if val: self._node = val
125 def url(self, val = None):
126 if val: self._url = val
129 def gdb_script(self):
130 if os.path.isdir('/r'):
131 return '/r' + self._gdb_script
133 return self._gdb_script
135 def debug_path(self):
136 if os.path.isdir('/r'):
137 return '/r' + self._debug_path
139 return self._debug_path
141 def src_dir(self, val = None):
142 if val: self._src_dir = val
def dump_file(self, val = None):
    """Get the kernel-debug dump file path; a true val stores it first."""
    if val:
        self._dump_file = val
    return self._dump_file
151 # ============================================================
152 # debugging and error funcs
def fixme(msg = "this feature"):
    """Raise an LconfError for functionality that has not been written yet.

    msg names the missing feature and is embedded in the error text.
    """
    # Use the parenthesized raise form (valid in Python 2 and 3) instead of
    # the Python-2-only 'raise Cls, arg' comma syntax, and fix the
    # misspelling 'implmemented' in the error message.
    raise LconfError(msg + ' not implemented yet.')
158 msg = string.join(map(str,args))
159 if not config.noexec():
160 raise LconfError(msg)
165 msg = string.join(map(str,args))
170 print string.strip(s)
174 msg = string.join(map(str,args))
177 # ============================================================
178 # locally defined exceptions
179 class CommandError (exceptions.Exception):
180 def __init__(self, cmd_name, cmd_err, rc=None):
181 self.cmd_name = cmd_name
182 self.cmd_err = cmd_err
187 if type(self.cmd_err) == types.StringType:
189 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
191 print "! %s: %s" % (self.cmd_name, self.cmd_err)
192 elif type(self.cmd_err) == types.ListType:
194 print "! %s (error %d):" % (self.cmd_name, self.rc)
196 print "! %s:" % (self.cmd_name)
197 for s in self.cmd_err:
198 print "> %s" %(string.strip(s))
202 class LconfError (exceptions.Exception):
203 def __init__(self, args):
207 # ============================================================
208 # handle lctl interface
211 Manage communication with lctl
214 def __init__(self, cmd):
216 Initialize close by finding the lctl binary.
218 self.lctl = find_prog(cmd)
221 debug('! lctl not found')
224 raise CommandError('lctl', "unable to find lctl binary.")
229 the cmds are written to stdin of lctl
230 lctl doesn't return errors when run in script mode, so
232 should modify command line to accept multiple commands, or
233 create complex command line options
235 debug("+", self.lctl, cmds)
236 if config.noexec(): return (0, [])
237 p = popen2.Popen3(self.lctl, 1)
238 p.tochild.write(cmds + "\n")
240 out = p.fromchild.readlines()
241 err = p.childerr.readlines()
244 raise CommandError(self.lctl, err, ret)
248 def network(self, net, nid):
249 """ initialized network and add "self" """
250 # Idea: "mynid" could be used for all network types to add "self," and then
251 # this special case would be gone and the "self" hack would be hidden.
257 quit""" % (net, nid, nid)
266 # create a new connection
267 def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
275 quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
281 quit""" % (net, servuuid, nid, nid, port, )
285 # add a route to a range
286 def add_route(self, net, gw, lo, hi):
290 quit """ % (net, gw, lo, hi)
294 # add a route to a range
295 def del_route(self, net, gw, lo, hi):
302 # add a route to a host
303 def add_route_host(self, net, uuid, gw, tgt):
308 quit """ % (net, uuid, tgt, gw, tgt)
311 # add a route to a range
312 def del_route_host(self, net, uuid, gw, tgt):
317 quit """ % (net, uuid, tgt)
320 # disconnect one connection
321 def disconnect(self, net, nid, port, servuuid):
326 quit""" % (net, nid, servuuid)
330 def disconnectAll(self, net):
338 # create a new device with lctl
339 def newdev(self, attach, setup = ""):
344 quit""" % (attach, setup)
348 def cleanup(self, name, uuid):
357 def lovconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
361 lovconfig %s %d %d %d %s %s
362 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
366 def dump(self, dump_file):
369 quit""" % (dump_file)
372 # ============================================================
373 # Various system-level functions
374 # (ideally moved to their own module)
376 # Run a command and return the output and status.
377 # stderr is sent to /dev/null, could use popen3 to
378 # save it if necessary
380 cmd = string.join(map(str,args))
382 if config.noexec(): return (0, [])
383 f = os.popen(cmd + ' 2>&1')
392 # Run a command in the background.
393 def run_daemon(*args):
394 cmd = string.join(map(str,args))
396 if config.noexec(): return 0
397 f = os.popen(cmd + ' 2>&1')
405 # Determine full path to use for an external command
406 # searches dirname(argv[0]) first, then PATH
408 syspath = string.split(os.environ['PATH'], ':')
409 cmdpath = os.path.dirname(sys.argv[0])
410 syspath.insert(0, cmdpath);
411 syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
413 prog = os.path.join(d,cmd)
414 if os.access(prog, os.X_OK):
418 # Recursively look for file starting at base dir
419 def do_find_file(base, mod):
420 fullname = os.path.join(base, mod)
421 if os.access(fullname, os.R_OK):
423 for d in os.listdir(base):
424 dir = os.path.join(base,d)
425 if os.path.isdir(dir):
426 module = do_find_file(dir, mod)
430 def find_module(src_dir, dev_dir, modname):
431 mod = '%s.o' % (modname)
432 module = src_dir +'/'+ dev_dir +'/'+ mod
434 if os.access(module, os.R_OK):
440 # is the path a block device?
447 return stat.S_ISBLK(s[stat.ST_MODE])
449 # build fs according to type
451 def mkfs(fstype, dev):
452 if(fstype in ('ext3', 'extN')):
453 mkfs = 'mkfs.ext2 -j -b 4096'
455 print 'unsupported fs type: ', fstype
456 if not is_block(dev):
460 (ret, out) = run (mkfs, force, dev)
462 panic("Unable to build fs:", dev)
463 # enable hash tree indexing on fs
465 htree = 'echo "feature FEATURE_C5" | debugfs -w'
466 (ret, out) = run (htree, dev)
468 panic("Unable to enable htree:", dev)
470 # some systems use /dev/loopN, some /dev/loop/N
474 if not os.access(loop + str(0), os.R_OK):
476 if not os.access(loop + str(0), os.R_OK):
477 panic ("can't access loop devices")
480 # find loop device assigned to the file
483 for n in xrange(0, MAX_LOOP_DEVICES):
485 if os.access(dev, os.R_OK):
486 (stat, out) = run('losetup', dev)
487 if (out and stat == 0):
488 m = re.search(r'\((.*)\)', out[0])
489 if m and file == m.group(1):
495 # create file if necessary and assign the first free loop device
496 def init_loop(file, size, fstype):
497 dev = find_loop(file)
499 print 'WARNING file:', file, 'already mapped to', dev
501 if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
502 run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
504 # find next free loop
505 for n in xrange(0, MAX_LOOP_DEVICES):
507 if os.access(dev, os.R_OK):
508 (stat, out) = run('losetup', dev)
510 run('losetup', dev, file)
513 print "out of loop devices"
515 print "out of loop devices"
518 # undo loop assignment
519 def clean_loop(file):
520 dev = find_loop(file)
522 ret, out = run('losetup -d', dev)
524 log('unable to clean loop device:', dev, 'for file:', file)
527 # determine if dev is formatted as a <fstype> filesystem
528 def need_format(fstype, dev):
529 # FIXME don't know how to implement this
532 # initialize a block device if needed
533 def block_dev(dev, size, fstype, format):
534 if config.noexec(): return dev
535 if not is_block(dev):
536 dev = init_loop(dev, size, fstype)
537 if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
541 # panic("device:", dev,
542 # "not prepared, and autoformat is not set.\n",
543 # "Rerun with --reformat option to format ALL filesystems")
548 """lookup IP address for an interface"""
549 rc, out = run("/sbin/ifconfig", iface)
552 addr = string.split(out[1])[1]
553 ip = string.split(addr, ':')[1]
556 def get_local_address(net_type, wildcard):
557 """Return the local address for the network type."""
559 if net_type == 'tcp':
561 iface, star = string.split(wildcard, ':')
562 local = if2addr(iface)
564 panic ("unable to determine ip for:", wildcard)
566 host = socket.gethostname()
567 local = socket.gethostbyname(host)
568 elif net_type == 'elan':
569 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
571 fp = open('/proc/elan/device0/position', 'r')
572 lines = fp.readlines()
581 elif net_type == 'gm':
582 fixme("automatic local address for GM")
587 # ============================================================
588 # Classes to prepare and cleanup the various objects
591 """ Base class for the rest of the modules. The default cleanup method is
592 defined here, as well as some utilitiy funcs.
594 def __init__(self, module_name, dom_node):
595 self.dom_node = dom_node
596 self.module_name = module_name
597 self.name = get_attr(dom_node, 'name')
598 self.uuid = get_attr(dom_node, 'uuid')
599 self.kmodule_list = []
603 def info(self, *args):
604 msg = string.join(map(str,args))
605 print self.module_name + ":", self.name, self.uuid, msg
608 def lookup_server(self, srv_uuid):
609 """ Lookup a server's network information """
610 net = get_ost_net(self.dom_node.parentNode, srv_uuid)
612 panic ("Unable to find a server for:", srv_uuid)
613 self._server = Network(net)
615 def get_server(self):
619 """ default cleanup, used for most modules """
621 srv = self.get_server()
622 if srv and local_net(srv):
624 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
625 except CommandError, e:
626 log(self.module_name, "disconnect failed: ", self.name)
629 lctl.cleanup(self.name, self.uuid)
630 except CommandError, e:
631 log(self.module_name, "cleanup failed: ", self.name)
def add_module(self, dev_dir, modname):
    """Record (dev_dir, modname) so load_module()/cleanup_module() can act on it."""
    entry = (dev_dir, modname)
    self.kmodule_list.append(entry)
638 def mod_loaded(self, modname):
639 """Check if a module is already loaded. Look in /proc/modules for it."""
640 fp = open('/proc/modules')
641 lines = fp.readlines()
643 # please forgive my tired fingers for this one
644 ret = filter(lambda word, mod=modname: word == mod,
645 map(lambda line: string.split(line)[0], lines))
648 def load_module(self):
649 """Load all the modules in the list in the order they appear."""
650 for dev_dir, mod in self.kmodule_list:
651 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
652 if self.mod_loaded(mod) and not config.noexec():
654 log ('loading module:', mod)
656 module = find_module(config.src_dir(),dev_dir, mod)
658 panic('module not found:', mod)
659 (rc, out) = run('/sbin/insmod', module)
661 raise CommandError('insmod', out, rc)
663 (rc, out) = run('/sbin/modprobe', mod)
665 raise CommandError('modprobe', out, rc)
667 def cleanup_module(self):
668 """Unload the modules in the list in reverse order."""
669 rev = self.kmodule_list
671 for dev_dir, mod in rev:
672 if not self.mod_loaded(mod):
675 if mod == 'portals' and config.dump_file():
676 lctl.dump(config.dump_file())
677 log('unloading module:', mod)
680 (rc, out) = run('/sbin/rmmod', mod)
682 log('! unable to unload module:', mod)
686 class Network(Module):
687 def __init__(self,dom_node):
688 Module.__init__(self, 'NETWORK', dom_node)
689 self.net_type = get_attr(dom_node,'type')
690 self.nid = get_text(dom_node, 'server', '*')
691 self.port = get_text_int(dom_node, 'port', 0)
692 self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
693 self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
695 self.nid = get_local_address(self.net_type, self.nid)
697 panic("unable to set nid for", self.net_type, self.nid)
698 debug("nid:", self.nid)
700 self.add_module('portals/linux/oslib/', 'portals')
701 if node_needs_router():
702 self.add_module('portals/linux/router', 'kptlrouter')
703 if self.net_type == 'tcp':
704 self.add_module('portals/linux/socknal', 'ksocknal')
705 if self.net_type == 'elan':
706 self.add_module('portals/linux/rqswnal', 'kqswnal')
707 if self.net_type == 'gm':
708 self.add_module('portals/linux/gmnal', 'kgmnal')
709 self.add_module('lustre/obdclass', 'obdclass')
710 self.add_module('lustre/ptlrpc', 'ptlrpc')
713 self.info(self.net_type, self.nid, self.port)
714 if self.net_type == 'tcp':
715 ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, self.port)
717 raise CommandError(TCP_ACCEPTOR, out, ret)
718 ret = self.dom_node.getElementsByTagName('route_tbl')
720 for r in a.getElementsByTagName('route'):
721 net_type = get_attr(r, 'type')
722 gw = get_attr(r, 'gw')
723 lo = get_attr(r, 'lo')
724 hi = get_attr(r,'hi', '')
725 lctl.add_route(net_type, gw, lo, hi)
726 if self.net_type == 'tcp' and hi == '':
727 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
729 panic("no server for nid", lo)
731 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
734 lctl.network(self.net_type, self.nid)
735 lctl.newdev(attach = "ptlrpc RPCDEV")
738 self.info(self.net_type, self.nid, self.port)
739 ret = self.dom_node.getElementsByTagName('route_tbl')
741 for r in a.getElementsByTagName('route'):
742 lo = get_attr(r, 'lo')
743 hi = get_attr(r,'hi', '')
744 if self.net_type == 'tcp' and hi == '':
745 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
747 panic("no server for nid", lo)
750 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
751 except CommandError, e:
752 print "disconnect failed: ", self.name
755 lctl.del_route(self.net_type, self.nid, lo, hi)
756 except CommandError, e:
757 print "del_route failed: ", self.name
761 lctl.cleanup("RPCDEV", "")
762 except CommandError, e:
763 print "cleanup failed: ", self.name
766 lctl.disconnectAll(self.net_type)
767 except CommandError, e:
768 print "disconnectAll failed: ", self.name
770 if self.net_type == 'tcp':
771 # yikes, this ugly! need to save pid in /var/something
772 run("killall acceptor")
775 def __init__(self,dom_node):
776 Module.__init__(self, 'LDLM', dom_node)
777 self.add_module('lustre/ldlm', 'ldlm')
780 lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
784 def __init__(self,dom_node):
785 Module.__init__(self, 'LOV', dom_node)
786 self.mds_uuid = get_first_ref(dom_node, 'mds')
787 mds= lookup(dom_node.parentNode, self.mds_uuid)
788 self.mds_name = getName(mds)
789 devs = dom_node.getElementsByTagName('devices')
792 self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
793 self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
794 self.pattern = get_attr_int(dev_node, 'pattern', 0)
795 self.devlist = get_all_refs(dev_node, 'osc')
796 self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
797 self.add_module('lustre/mdc', 'mdc')
798 self.add_module('lustre/lov', 'lov')
801 for osc_uuid in self.devlist:
802 osc = lookup(self.dom_node.parentNode, osc_uuid)
807 panic('osc not found:', osc_uuid)
808 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
809 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
810 self.stripe_off, self.pattern, self.devlist, self.mds_name)
811 lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
812 setup ="%s" % (mdc_uuid))
815 for osc_uuid in self.devlist:
816 osc = lookup(self.dom_node.parentNode, osc_uuid)
821 panic('osc not found:', osc_uuid)
823 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
824 def load_module(self):
825 for osc_uuid in self.devlist:
826 osc = lookup(self.dom_node.parentNode, osc_uuid)
832 panic('osc not found:', osc_uuid)
833 Module.load_module(self)
834 def cleanup_module(self):
835 Module.cleanup_module(self)
836 for osc_uuid in self.devlist:
837 osc = lookup(self.dom_node.parentNode, osc_uuid)
843 panic('osc not found:', osc_uuid)
845 class LOVConfig(Module):
846 def __init__(self,dom_node):
847 Module.__init__(self, 'LOVConfig', dom_node)
848 self.lov_uuid = get_first_ref(dom_node, 'lov')
849 l = lookup(dom_node.parentNode, self.lov_uuid)
854 self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
855 lov.pattern, lov.devlist, lov.mds_name)
856 lctl.lovconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
857 lov.stripe_sz, lov.stripe_off, lov.pattern,
858 string.join(lov.devlist))
def __init__(self, dom_node):
    """Configure an MDS service from its XML node and queue its kernel modules."""
    Module.__init__(self, 'MDS', dom_node)
    self.devname, self.size = get_device(dom_node)
    self.fstype = get_text(dom_node, 'fstype')
    self.format = get_text(dom_node, 'autoformat', "no")
    # extN needs its own kernel module loaded ahead of the MDS modules.
    if self.fstype == 'extN':
        self.add_module('lustre/extN', 'extN')
    # Load order matters: generic mds first, then the fstype-specific driver.
    for mod in ('mds', 'mds_%s' % self.fstype):
        self.add_module('lustre/mds', mod)
877 self.info(self.devname, self.fstype, self.format)
878 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
879 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
880 setup ="%s %s" %(blkdev, self.fstype))
883 clean_loop(self.devname)
885 # Very unusual case, as there is no MDC element in the XML anymore
886 # Builds itself from an MDS node
888 def __init__(self,dom_node):
889 self.mds = MDS(dom_node)
890 self.dom_node = dom_node
891 self.module_name = 'MDC'
892 self.kmodule_list = []
896 host = socket.gethostname()
897 self.name = 'MDC_'+host
898 self.uuid = self.name+'_UUID'
900 self.lookup_server(self.mds.uuid)
901 self.add_module('lustre/mdc', 'mdc')
904 self.info(self.mds.uuid)
905 srv = self.get_server()
906 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
907 lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
908 setup ="%s %s" %(self.mds.uuid, srv.uuid))
def __init__(self, dom_node):
    """Configure an OBD driver instance from its XML node and queue its modules."""
    Module.__init__(self, 'OBD', dom_node)
    self.obdtype = get_attr(dom_node, 'type')
    self.devname, self.size = get_device(dom_node)
    self.fstype = get_text(dom_node, 'fstype')
    self.format = get_text(dom_node, 'autoformat', 'yes')
    # extN requires an extra kernel module before the obd driver itself.
    if self.fstype == 'extN':
        self.add_module('lustre/extN', 'extN')
    self.add_module('lustre/%s' % self.obdtype, self.obdtype)
921 # need to check /proc/mounts and /etc/mtab before
922 # formatting anything.
923 # FIXME: check if device is already formatted.
925 self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
926 if self.obdtype == 'obdecho':
929 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
930 lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
931 setup ="%s %s" %(blkdev, self.fstype))
934 if not self.obdtype == 'obdecho':
935 clean_loop(self.devname)
def __init__(self, dom_node):
    """Configure an OST service: remember its OBD ref and queue the ost module."""
    Module.__init__(self, 'OST', dom_node)
    self.obd_uuid = get_first_ref(dom_node, 'obd')
    self.add_module('lustre/ost', 'ost')
944 self.info(self.obd_uuid)
945 lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
946 setup ="%s" % (self.obd_uuid))
949 # virtual interface for OSC and LOV
951 def __init__(self,dom_node):
952 Module.__init__(self, 'VOSC', dom_node)
953 if dom_node.nodeName == 'lov':
954 self.osc = LOV(dom_node)
956 self.osc = OSC(dom_node)
def load_module(self):
    """Delegate module loading to the wrapped OSC/LOV object."""
    self.osc.load_module()
def cleanup_module(self):
    """Delegate module unloading to the wrapped OSC/LOV object."""
    self.osc.cleanup_module()
def __init__(self, dom_node):
    """Configure an OSC client: resolve its OBD/OST refs, find the OST's
    server network info, and queue the osc kernel module."""
    Module.__init__(self, 'OSC', dom_node)
    self.obd_uuid = get_first_ref(dom_node, 'obd')
    self.ost_uuid = get_first_ref(dom_node, 'ost')
    self.lookup_server(self.ost_uuid)
    self.add_module('lustre/osc', 'osc')
976 self.info(self.obd_uuid, self.ost_uuid)
977 srv = self.get_server()
979 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
983 lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
985 panic ("no route to", srv.nid)
987 lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
988 setup ="%s %s" %(self.obd_uuid, srv.uuid))
991 srv = self.get_server()
995 self.info(self.obd_uuid, self.ost_uuid)
998 lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
1002 class Mountpoint(Module):
1003 def __init__(self,dom_node):
1004 Module.__init__(self, 'MTPT', dom_node)
1005 self.path = get_text(dom_node, 'path')
1006 self.mds_uuid = get_first_ref(dom_node, 'mds')
1007 self.lov_uuid = get_first_ref(dom_node, 'osc')
1008 self.add_module('lustre/mdc', 'mdc')
1009 self.add_module('lustre/llite', 'llite')
1010 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1015 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
1017 self.info(self.path, self.mds_uuid,self.lov_uuid)
1018 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
1019 (self.lov_uuid, mdc_uuid, self.path)
1020 run("mkdir", self.path)
1023 panic("mount failed:", self.path)
1026 self.info(self.path, self.mds_uuid,self.lov_uuid)
1027 (rc, out) = run("umount", self.path)
1029 log("umount failed, cleanup will most likely not work.")
1030 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1032 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
def load_module(self):
    """Load the underlying osc/lov modules first, then this module's own list."""
    self.osc.load_module()
    Module.load_module(self)
def cleanup_module(self):
    """Unload this module's own list first, then the underlying osc/lov modules."""
    Module.cleanup_module(self)
    self.osc.cleanup_module()
1042 # ============================================================
1043 # XML processing and query
1044 # TODO: Change query funcs to use XPath, which is much cleaner
1046 def get_device(obd):
1047 list = obd.getElementsByTagName('device')
1051 size = get_attr_int(dev, 'size', 0)
1052 return dev.firstChild.data, size
1055 # Get the text content from the first matching child
1056 # If there is no content (or it is all whitespace), return
1058 def get_text(dom_node, tag, default=""):
1059 list = dom_node.getElementsByTagName(tag)
1062 dom_node.normalize()
1063 if dom_node.firstChild:
1064 txt = string.strip(dom_node.firstChild.data)
1069 def get_text_int(dom_node, tag, default=0):
1070 list = dom_node.getElementsByTagName(tag)
1074 dom_node.normalize()
1075 if dom_node.firstChild:
1076 txt = string.strip(dom_node.firstChild.data)
1081 panic("text value is not integer:", txt)
1084 def get_attr(dom_node, attr, default=""):
1085 v = dom_node.getAttribute(attr)
1090 def get_attr_int(dom_node, attr, default=0):
1092 v = dom_node.getAttribute(attr)
1097 panic("attr value is not integer", v)
1100 def get_first_ref(dom_node, tag):
1101 """ Get the first uuidref of the type TAG. Used one only
1102 one is expected. Returns the uuid."""
1104 refname = '%s_ref' % tag
1105 list = dom_node.getElementsByTagName(refname)
1107 uuid = getRef(list[0])
1110 def get_all_refs(dom_node, tag):
1111 """ Get all the refs of type TAG. Returns list of uuids. """
1113 refname = '%s_ref' % tag
1114 list = dom_node.getElementsByTagName(refname)
1117 uuids.append(getRef(i))
1120 def get_ost_net(dom_node, uuid):
1121 ost = lookup(dom_node, uuid)
1122 uuid = get_first_ref(ost, 'network')
1125 return lookup(dom_node, uuid)
def nid2server(dom_node, nid):
    """Return a Network for the first <network> element whose server text
    equals nid; falls through (None) when no network advertises that nid."""
    for net_node in dom_node.getElementsByTagName('network'):
        if get_text(net_node, 'server') == nid:
            return Network(net_node)
1134 def lookup(dom_node, uuid):
1135 for n in dom_node.childNodes:
1136 if n.nodeType == n.ELEMENT_NODE:
1137 if getUUID(n) == uuid:
def getName(dom_node):
    """Return the 'name' attribute of dom_node ("" when absent)."""
    return dom_node.getAttribute('name')
def getRef(dom_node):
    """Return the 'uuidref' attribute of dom_node ("" when absent)."""
    return dom_node.getAttribute('uuidref')
def getUUID(dom_node):
    """Return the 'uuid' attribute of dom_node ("" when absent)."""
    return dom_node.getAttribute('uuid')
# The element's tag name encodes the service type.
# FIXME: this should do some checks to make sure dom_node is a service node.
def getServiceType(dom_node):
    """Return the service type of dom_node, i.e. its XML tag name."""
    return dom_node.nodeName
1161 # determine what "level" a particular node is at.
1162 # the order of initialization is based on level.
1163 def getServiceLevel(dom_node):
1164 type = getServiceType(dom_node)
1165 if type in ('network',):
1167 elif type in ('device', 'ldlm'):
1169 elif type in ('obd', 'mdd'):
1171 elif type in ('mds','ost'):
1173 elif type in ('mdc','osc'):
1175 elif type in ('lov', 'lovconfig'):
1177 elif type in ('mountpoint',):
1182 # return list of services in a profile. list is a list of tuples
1183 # [(level, dom_node),]
1184 def getServices(lustreNode, profileNode):
1186 for n in profileNode.childNodes:
1187 if n.nodeType == n.ELEMENT_NODE:
1188 servNode = lookup(lustreNode, getRef(n))
1191 panic('service not found: ' + getRef(n))
1192 level = getServiceLevel(servNode)
1193 list.append((level, servNode))
1197 def getByName(lustreNode, name, tag):
1198 ndList = lustreNode.getElementsByTagName(tag)
1200 if getName(nd) == name:
1205 ############################################################
1207 # FIXME: clean this mess up!
1210 def prepare_mdc(dom_node, mds_uuid):
1212 mds_node = lookup(dom_node, mds_uuid);
1214 panic("no mds:", mds_uuid)
1223 def cleanup_mdc(dom_node, mds_uuid):
1225 mds_node = lookup(dom_node, mds_uuid);
1227 panic("no mds:", mds_uuid)
1235 ############################################################
1236 # routing ("rooting")
def init_node(dom_node):
    """Record every (network type, server nid) pair of this node in the
    module-level local_node list."""
    global local_node, router_flag
    for dom_net in dom_node.getElementsByTagName('network'):
        # renamed from 'type'/'gw' to avoid shadowing the builtin
        net_type = get_attr(dom_net, 'type')
        server = get_text(dom_net, 'server')
        local_node.append((net_type, server))
1250 def node_needs_router():
1253 def get_routes(type, gw, dom_net):
1254 """ Return the routes as a list of tuples of the form:
1255 [(type, gw, lo, hi),]"""
1257 tbl = dom_net.getElementsByTagName('route_tbl')
1259 routes = t.getElementsByTagName('route')
1261 lo = get_attr(r, 'lo')
1262 hi = get_attr(r, 'hi', '')
1263 res.append((type, gw, lo, hi))
1267 def init_route_config(lustre):
1268 """ Scan the lustre config looking for routers. Build list of
1270 global routes, router_flag
1272 list = lustre.getElementsByTagName('node')
1274 if get_attr(node, 'router'):
1276 for (local_type, local_nid) in local_node:
1278 netlist = node.getElementsByTagName('network')
1279 for dom_net in netlist:
1280 if local_type == get_attr(dom_net, 'type'):
1281 gw = get_text(dom_net, 'server')
1285 for dom_net in netlist:
1286 if local_type != get_attr(dom_net, 'type'):
1287 for route in get_routes(local_type, gw, dom_net):
1288 routes.append(route)
1293 for iface in local_node:
1294 if net.net_type == iface[0]:
1298 def find_route(net):
1299 global local_node, routes
1300 frm_type = local_node[0][0]
1301 to_type = net.net_type
1303 debug ('looking for route to', to_type,to)
1312 ############################################################
1315 def startService(dom_node, module_flag):
1316 type = getServiceType(dom_node)
1317 debug('Service:', type, getName(dom_node), getUUID(dom_node))
1318 # there must be a more dynamic way of doing this...
1324 elif type == 'lovconfig':
1325 n = LOVConfig(dom_node)
1326 elif type == 'network':
1327 n = Network(dom_node)
1338 elif type == 'mountpoint':
1339 n = Mountpoint(dom_node)
1341 panic ("unknown service type:", type)
1346 if config.cleanup():
1351 if config.nosetup():
1353 if config.cleanup():
1359 # Prepare the system to run lustre using a particular profile
1360 # in a the configuration.
1361 # * load & the modules
1362 # * setup networking for the current node
1363 # * make sure partitions are in place and prepared
1364 # * initialize devices with lctl
1365 # Levels is important, and needs to be enforced.
1366 def startProfile(lustreNode, profileNode, module_flag):
1368 panic("profile:", profile, "not found.")
1369 services = getServices(lustreNode, profileNode)
1370 if config.cleanup():
1373 startService(s[1], module_flag)
1378 def doHost(lustreNode, hosts):
1382 dom_node = getByName(lustreNode, h, 'node')
1387 print 'No host entry found.'
1390 if not get_attr(dom_node, 'router'):
1392 init_route_config(lustreNode)
1397 # Two step process: (1) load modules, (2) setup lustre
1398 # if not cleaning, load modules first.
1399 module_flag = not config.cleanup()
1400 reflist = dom_node.getElementsByTagName('profile')
1401 for profile in reflist:
1402 startProfile(lustreNode, profile, module_flag)
1404 if not config.cleanup():
1405 sys_set_debug_path()
1406 script = config.gdb_script()
1407 run(lctl.lctl, ' modules >', script)
1409 # dump /tmp/ogdb and sleep/pause here
1410 log ("The GDB module script is in", script)
1413 module_flag = not module_flag
1414 for profile in reflist:
1415 startProfile(lustreNode, profile, module_flag)
1417 ############################################################
1418 # Command line processing
1420 def parse_cmdline(argv):
1422 long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
1423 "portals=", "makeldiff", "cleanup", "noexec",
1424 "help", "node=", "get=", "nomod", "nosetup",
1429 opts, args = getopt.getopt(argv, short_opts, long_opts)
1430 except getopt.error:
1435 if o in ("-h", "--help"):
1437 if o in ("-d","--cleanup"):
1439 if o in ("-v", "--verbose"):
1441 if o in ("-n", "--noexec"):
1444 if o == "--portals":
1448 if o == "--reformat":
1458 if o == "--nosetup":
1468 s = urllib.urlopen(url)
def setupModulePath(cmd):
    """If cmd lives inside a source tree (a Makefile sits beside it),
    point config.src_dir at the tree root two levels up."""
    base = os.path.dirname(cmd)
    if os.access(base + "/Makefile", os.R_OK):
        config.src_dir(base + "/../../")
1479 def sys_set_debug_path():
1480 debug("debug path: ", config.debug_path())
1484 fp = open('/proc/sys/portals/debug_path', 'w')
1485 fp.write(config.debug_path())
1490 #/proc/sys/net/core/rmem_max
1491 #/proc/sys/net/core/wmem_max
1492 def sys_set_netmem_max(path, max):
1493 debug("setting", path, "to at least", max)
1501 fp = open(path, 'w')
1502 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the /dev/portals and /dev/obd character device nodes if missing."""
    for node, minor in (('/dev/portals', 240), ('/dev/obd', 241)):
        if not os.access(node, os.R_OK):
            run('mknod %s c 10 %d' % (node, minor))
1512 # Initialize or shutdown lustre according to a configuration file
1513 # * prepare the system for lustre
1514 # * configure devices with lctl
1515 # Shutdown does steps in reverse
1518 global TCP_ACCEPTOR, lctl, MAXTCPBUF
1519 host = socket.gethostname()
1521 args = parse_cmdline(sys.argv[1:])
1523 if not os.access(args[0], os.R_OK | os.W_OK):
1524 print 'File not found:', args[0]
1526 dom = xml.dom.minidom.parse(args[0])
1528 xmldata = fetch(config.url())
1529 dom = xml.dom.minidom.parseString(xmldata)
1535 node_list.append(config.node())
1538 node_list.append(host)
1539 node_list.append('localhost')
1540 debug("configuring for host: ", node_list)
1543 config._debug_path = config._debug_path + '-' + host
1544 config._gdb_script = config._gdb_script + '-' + host
1546 TCP_ACCEPTOR = find_prog('acceptor')
1547 if not TCP_ACCEPTOR:
1549 TCP_ACCEPTOR = 'acceptor'
1550 debug('! acceptor not found')
1552 panic('acceptor not found')
1554 lctl = LCTLInterface('lctl')
1556 setupModulePath(sys.argv[0])
1558 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
1559 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
1560 doHost(dom.documentElement, node_list)
1562 if __name__ == "__main__":
1565 except LconfError, e:
1567 except CommandError, e: