3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
29 import string, os, stat, popen2, socket, time
31 import xml.dom.minidom
37 # Maximum number of devices to search for.
38 # (the /dev/loop* nodes need to be created beforehand)
39 MAX_LOOP_DEVICES = 256
43 print """usage: lconf config.xml
45 config.xml Lustre configuration in xml format.
46 --get <url> URL to fetch a config file
47 --node <nodename> Load config for <nodename>
48 -d | --cleanup Cleans up config. (Shutdown)
49 -v | --verbose Print system commands as they are run
50 -h | --help Print this help
51 --gdb Prints message after creating gdb module script
52 and sleeps for 5 seconds.
53 -n | --noexec Prints the commands and steps that will be run for a
54 config without executing them. This can used to check if a
55 config file is doing what it should be doing. (Implies -v)
56 --nomod Skip load/unload module step.
57 --nosetup Skip device setup/cleanup step.
58 --reformat Reformat all devices (without question)
61 --ldap server LDAP server with lustre config database
62 --makeldiff Translate xml source to LDIFF
63 This are perhaps not needed:
64 --lustre="src dir" Base directory of lustre sources. Used to search
66 --portals=src Portals source
70 # ============================================================
71 # Config parameters, encapsulated in a class
86 self._gdb_script = '/tmp/ogdb'
87 self._debug_path = '/tmp/lustre-log'
88 self._dump_file = None
91 def verbose(self, flag = None):
92 if flag: self._verbose = flag
95 def noexec(self, flag = None):
96 if flag: self._noexec = flag
99 def reformat(self, flag = None):
100 if flag: self._reformat = flag
101 return self._reformat
103 def cleanup(self, flag = None):
104 if flag: self._cleanup = flag
107 def gdb(self, flag = None):
108 if flag: self._gdb = flag
111 def nomod(self, flag = None):
112 if flag: self._nomod = flag
115 def nosetup(self, flag = None):
116 if flag: self._nosetup = flag
119 def node(self, val = None):
120 if val: self._node = val
123 def url(self, val = None):
124 if val: self._url = val
127 def gdb_script(self):
128 if os.path.isdir('/r'):
129 return '/r' + self._gdb_script
131 return self._gdb_script
133 def debug_path(self):
134 if os.path.isdir('/r'):
135 return '/r' + self._debug_path
137 return self._debug_path
139 def src_dir(self, val = None):
140 if val: self._src_dir = val
143 def dump_file(self, val = None):
144 if val: self._dump_file = val
145 return self._dump_file
149 # ============================================================
150 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort with an LconfError reporting that *msg* is not implemented yet."""
    # Use exception-call syntax instead of the deprecated 'raise E, arg'
    # form; also fixes the "implmemented" typo in the message.
    raise LconfError(msg + ' not implemented yet.')
156 msg = string.join(map(str,args))
157 if not config.noexec():
158 raise LconfError(msg)
163 msg = string.join(map(str,args))
168 print string.strip(s)
172 msg = string.join(map(str,args))
175 # ============================================================
176 # locally defined exceptions
177 class CommandError (exceptions.Exception):
178 def __init__(self, cmd_name, cmd_err, rc=None):
179 self.cmd_name = cmd_name
180 self.cmd_err = cmd_err
185 if type(self.cmd_err) == types.StringType:
187 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
189 print "! %s: %s" % (self.cmd_name, self.cmd_err)
190 elif type(self.cmd_err) == types.ListType:
192 print "! %s (error %d):" % (self.cmd_name, self.rc)
194 print "! %s:" % (self.cmd_name)
195 for s in self.cmd_err:
196 print "> %s" %(string.strip(s))
200 class LconfError (exceptions.Exception):
201 def __init__(self, args):
205 # ============================================================
206 # handle lctl interface
209 Manage communication with lctl
212 def __init__(self, cmd):
214 Initialize close by finding the lctl binary.
216 self.lctl = find_prog(cmd)
219 debug('! lctl not found')
222 raise CommandError('lctl', "unable to find lctl binary.")
227 the cmds are written to stdin of lctl
228 lctl doesn't return errors when run in script mode, so
230 should modify command line to accept multiple commands, or
231 create complex command line options
233 debug("+", self.lctl, cmds)
234 if config.noexec(): return (0, [])
235 p = popen2.Popen3(self.lctl, 1)
236 p.tochild.write(cmds + "\n")
238 out = p.fromchild.readlines()
239 err = p.childerr.readlines()
242 raise CommandError(self.lctl, err, ret)
246 def network(self, net, nid):
247 """ initialized network and add "self" """
248 # Idea: "mynid" could be used for all network types to add "self," and then
249 # this special case would be gone and the "self" hack would be hidden.
255 quit""" % (net, nid, nid)
264 # create a new connection
265 def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
273 quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
279 quit""" % (net, servuuid, nid, nid, port, )
283 # add a route to a range
284 def add_route(self, net, gw, lo, hi):
288 quit """ % (net, gw, lo, hi)
292 # add a route to a range
293 def del_route(self, net, gw, lo, hi):
300 # add a route to a host
301 def add_route_host(self, net, uuid, gw, tgt):
306 quit """ % (net, uuid, tgt, gw, tgt)
309 # disconnect one connection
310 def disconnect(self, net, nid, port, servuuid):
315 quit""" % (net, nid, servuuid)
318 # disconnect all connections
319 def disconnectAll(self, net):
327 # create a new device with lctl
328 def newdev(self, attach, setup = ""):
333 quit""" % (attach, setup)
337 def cleanup(self, name, uuid):
346 def lovconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
350 lovconfig %s %d %d %d %s %s
351 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
355 def dump(self, dump_file):
358 quit""" % (dump_file)
361 # ============================================================
362 # Various system-level functions
363 # (ideally moved to their own module)
365 # Run a command and return the output and status.
366 # stderr is sent to /dev/null, could use popen3 to
367 # save it if necessary
369 cmd = string.join(map(str,args))
371 if config.noexec(): return (0, [])
372 f = os.popen(cmd + ' 2>&1')
381 # Run a command in the background.
382 def run_daemon(*args):
383 cmd = string.join(map(str,args))
385 if config.noexec(): return 0
386 f = os.popen(cmd + ' 2>&1')
394 # Determine full path to use for an external command
395 # searches dirname(argv[0]) first, then PATH
397 syspath = string.split(os.environ['PATH'], ':')
398 cmdpath = os.path.dirname(sys.argv[0])
399 syspath.insert(0, cmdpath);
400 syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
402 prog = os.path.join(d,cmd)
403 if os.access(prog, os.X_OK):
407 # Recursively look for file starting at base dir
408 def do_find_file(base, mod):
409 fullname = os.path.join(base, mod)
410 if os.access(fullname, os.R_OK):
412 for d in os.listdir(base):
413 dir = os.path.join(base,d)
414 if os.path.isdir(dir):
415 module = do_find_file(dir, mod)
419 def find_module(src_dir, dev_dir, modname):
420 mod = '%s.o' % (modname)
421 module = src_dir +'/'+ dev_dir +'/'+ mod
423 if os.access(module, os.R_OK):
429 # is the path a block device?
436 return stat.S_ISBLK(s[stat.ST_MODE])
438 # build fs according to type
440 def mkfs(fstype, dev):
441 if(fstype in ('ext3', 'extN')):
442 mkfs = 'mkfs.ext2 -j -b 4096'
444 print 'unsupported fs type: ', fstype
445 if not is_block(dev):
449 (ret, out) = run (mkfs, force, dev)
451 panic("Unable to build fs:", dev)
452 # enable hash tree indexing on fs
454 htree = 'echo "feature FEATURE_C5" | debugfs -w'
455 (ret, out) = run (htree, dev)
457 panic("Unable to enable htree:", dev)
459 # some systems use /dev/loopN, some /dev/loop/N
463 if not os.access(loop + str(0), os.R_OK):
465 if not os.access(loop + str(0), os.R_OK):
466 panic ("can't access loop devices")
# find the loop device assigned to the file
472 for n in xrange(0, MAX_LOOP_DEVICES):
474 if os.access(dev, os.R_OK):
475 (stat, out) = run('losetup', dev)
476 if (out and stat == 0):
477 m = re.search(r'\((.*)\)', out[0])
478 if m and file == m.group(1):
484 # create file if necessary and assign the first free loop device
485 def init_loop(file, size, fstype):
486 dev = find_loop(file)
488 print 'WARNING file:', file, 'already mapped to', dev
490 if not os.access(file, os.R_OK | os.W_OK):
491 run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
493 # find next free loop
494 for n in xrange(0, MAX_LOOP_DEVICES):
496 if os.access(dev, os.R_OK):
497 (stat, out) = run('losetup', dev)
499 run('losetup', dev, file)
502 print "out of loop devices"
504 print "out of loop devices"
507 # undo loop assignment
508 def clean_loop(file):
509 dev = find_loop(file)
511 ret, out = run('losetup -d', dev)
513 log('unable to clean loop device:', dev, 'for file:', file)
516 # determine if dev is formatted as a <fstype> filesystem
517 def need_format(fstype, dev):
518 # FIXME don't know how to implement this
521 # initialize a block device if needed
522 def block_dev(dev, size, fstype, format):
523 if config.noexec(): return dev
524 if not is_block(dev):
525 dev = init_loop(dev, size, fstype)
526 if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
530 # panic("device:", dev,
531 # "not prepared, and autoformat is not set.\n",
532 # "Rerun with --reformat option to format ALL filesystems")
536 def get_local_address(net_type):
537 """Return the local address for the network type."""
539 if net_type == 'tcp':
541 host = socket.gethostname()
542 local = socket.gethostbyname(host)
543 elif net_type == 'elan':
544 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
546 fp = open('/proc/elan/device0/position', 'r')
547 lines = fp.readlines()
556 elif net_type == 'gm':
557 fixme("automatic local address for GM")
562 # ============================================================
563 # Classes to prepare and cleanup the various objects
566 """ Base class for the rest of the modules. The default cleanup method is
567 defined here, as well as some utilitiy funcs.
569 def __init__(self, module_name, dom_node):
570 self.dom_node = dom_node
571 self.module_name = module_name
572 self.name = get_attr(dom_node, 'name')
573 self.uuid = get_attr(dom_node, 'uuid')
574 self.kmodule_list = []
578 def info(self, *args):
579 msg = string.join(map(str,args))
580 print self.module_name + ":", self.name, self.uuid, msg
583 def lookup_server(self, srv_uuid):
584 """ Lookup a server's network information """
585 net = get_ost_net(self.dom_node.parentNode, srv_uuid)
586 self._server = Network(net)
588 def get_server(self):
592 """ default cleanup, used for most modules """
594 srv = self.get_server()
595 if srv and local_net(srv):
597 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
598 except CommandError, e:
599 log(self.module_name, "disconnect failed: ", self.name)
602 lctl.cleanup(self.name, self.uuid)
603 except CommandError, e:
604 log(self.module_name, "cleanup failed: ", self.name)
607 def add_module(self, dev_dir, modname):
608 """Append a module to list of modules to load."""
609 self.kmodule_list.append((dev_dir, modname))
611 def mod_loaded(self, modname):
612 """Check if a module is already loaded. Look in /proc/modules for it."""
613 fp = open('/proc/modules')
614 lines = fp.readlines()
616 # please forgive my tired fingers for this one
617 ret = filter(lambda word, mod=modname: word == mod,
618 map(lambda line: string.split(line)[0], lines))
621 def load_module(self):
622 """Load all the modules in the list in the order they appear."""
623 for dev_dir, mod in self.kmodule_list:
624 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
625 if self.mod_loaded(mod) and not config.noexec():
627 log ('loading module:', mod)
629 module = find_module(config.src_dir(),dev_dir, mod)
631 panic('module not found:', mod)
632 (rc, out) = run('/sbin/insmod', module)
634 raise CommandError('insmod', out, rc)
636 (rc, out) = run('/sbin/modprobe', mod)
638 raise CommandError('modprobe', out, rc)
640 def cleanup_module(self):
641 """Unload the modules in the list in reverse order."""
642 rev = self.kmodule_list
644 for dev_dir, mod in rev:
645 if not self.mod_loaded(mod):
648 if mod == 'portals' and config.dump_file():
649 lctl.dump(config.dump_file())
650 log('unloading module:', mod)
653 (rc, out) = run('/sbin/rmmod', mod)
655 log('! unable to unload module:', mod)
659 class Network(Module):
660 def __init__(self,dom_node):
661 Module.__init__(self, 'NETWORK', dom_node)
662 self.net_type = get_attr(dom_node,'type')
663 self.nid = get_text(dom_node, 'server', '*')
664 self.port = get_text_int(dom_node, 'port', 0)
665 self.send_mem = get_text_int(dom_node, 'send_mem', 65536)
666 self.recv_mem = get_text_int(dom_node, 'recv_mem', 65536)
668 self.nid = get_local_address(self.net_type)
670 panic("unable to set nid for", self.net_type)
672 self.add_module('portals/linux/oslib/', 'portals')
673 if node_needs_router():
674 self.add_module('portals/linux/router', 'kptlrouter')
675 if self.net_type == 'tcp':
676 self.add_module('portals/linux/socknal', 'ksocknal')
677 if self.net_type == 'elan':
678 self.add_module('portals/linux/rqswnal', 'kqswnal')
679 if self.net_type == 'gm':
680 self.add_module('portals/linux/gmnal', 'kgmnal')
681 self.add_module('lustre/obdclass', 'obdclass')
682 self.add_module('lustre/ptlrpc', 'ptlrpc')
685 self.info(self.net_type, self.nid, self.port)
686 if self.net_type == 'tcp':
687 ret = run_daemon(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, self.port)
689 raise CommandError(TCP_ACCEPTOR, 'failed', ret)
690 ret = self.dom_node.getElementsByTagName('route_tbl')
692 for r in a.getElementsByTagName('route'):
693 net_type = get_attr(r, 'type')
694 gw = get_attr(r, 'gw')
695 lo = get_attr(r, 'lo')
696 hi = get_attr(r,'hi', '')
697 lctl.add_route(net_type, gw, lo, hi)
698 if self.net_type == 'tcp' and hi == '':
699 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
701 panic("no server for nid", lo)
703 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
706 lctl.network(self.net_type, self.nid)
707 lctl.newdev(attach = "ptlrpc RPCDEV")
710 self.info(self.net_type, self.nid, self.port)
711 ret = self.dom_node.getElementsByTagName('route_tbl')
713 for r in a.getElementsByTagName('route'):
714 lo = get_attr(r, 'lo')
715 hi = get_attr(r,'hi', '')
716 if self.net_type == 'tcp' and hi == '':
717 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
719 panic("no server for nid", lo)
722 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
723 except CommandError, e:
724 print "disconnect failed: ", self.name
727 lctl.del_route(self.net_type, self.nid, lo, hi)
728 except CommandError, e:
729 print "del_route failed: ", self.name
733 lctl.cleanup("RPCDEV", "")
734 except CommandError, e:
735 print "cleanup failed: ", self.name
738 lctl.disconnectAll(self.net_type)
739 except CommandError, e:
740 print "disconnectAll failed: ", self.name
742 if self.net_type == 'tcp':
743 # yikes, this ugly! need to save pid in /var/something
744 run("killall acceptor")
747 def __init__(self,dom_node):
748 Module.__init__(self, 'LDLM', dom_node)
749 self.add_module('lustre/ldlm', 'ldlm')
752 lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
756 def __init__(self,dom_node):
757 Module.__init__(self, 'LOV', dom_node)
758 self.mdsuuid = get_first_ref(dom_node, 'mds')
759 mds= lookup(dom_node.parentNode, self.mdsuuid)
760 self.mdsname = getName(mds)
761 devs = dom_node.getElementsByTagName('devices')
764 self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
765 self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
766 self.pattern = get_attr_int(dev_node, 'pattern', 0)
767 self.devlist = get_all_refs(dev_node, 'osc')
768 self.stripe_cnt = len(self.devlist)
771 self.info(self.mdsuuid, self.stripe_cnt, self.stripe_sz, self.stripe_off, self.pattern,
772 self.devlist, self.mdsname)
773 lctl.lovconfig(self.uuid, self.mdsname, self.stripe_cnt,
774 self.stripe_sz, self.stripe_off, self.pattern,
775 string.join(self.devlist))
779 def __init__(self,dom_node):
780 Module.__init__(self, 'MDS', dom_node)
781 self.devname, self.size = get_device(dom_node)
782 self.fstype = get_text(dom_node, 'fstype')
783 self.format = get_text(dom_node, 'autoformat', "no")
784 if self.fstype == 'extN':
785 self.add_module('lustre/extN', 'extN')
786 self.add_module('lustre/mds', 'mds')
787 self.add_module('lustre/mds', 'mds_%s' % (self.fstype))
790 self.info(self.devname, self.fstype, self.format)
791 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
792 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
793 setup ="%s %s" %(blkdev, self.fstype))
796 clean_loop(self.devname)
799 def __init__(self,dom_node):
800 Module.__init__(self, 'MDC', dom_node)
801 self.mds_uuid = get_first_ref(dom_node, 'mds')
802 self.lookup_server(self.mds_uuid)
803 self.add_module('lustre/mdc', 'mdc')
806 self.info(self.mds_uuid)
807 srv = self.get_server()
808 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
809 lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
810 setup ="%s %s" %(self.mds_uuid, srv.uuid))
813 def __init__(self, dom_node):
814 Module.__init__(self, 'OBD', dom_node)
815 self.obdtype = get_attr(dom_node, 'type')
816 self.devname, self.size = get_device(dom_node)
817 self.fstype = get_text(dom_node, 'fstype')
818 self.format = get_text(dom_node, 'autoformat', 'yes')
819 if self.fstype == 'extN':
820 self.add_module('lustre/extN', 'extN')
821 self.add_module('lustre/' + self.obdtype, self.obdtype)
823 # need to check /proc/mounts and /etc/mtab before
824 # formatting anything.
825 # FIXME: check if device is already formatted.
827 self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
828 if self.obdtype == 'obdecho':
831 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
832 lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
833 setup ="%s %s" %(blkdev, self.fstype))
836 if not self.obdtype == 'obdecho':
837 clean_loop(self.devname)
840 def __init__(self,dom_node):
841 Module.__init__(self, 'OST', dom_node)
842 self.obd_uuid = get_first_ref(dom_node, 'obd')
843 self.add_module('lustre/ost', 'ost')
846 self.info(self.obd_uuid)
847 lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
848 setup ="%s" % (self.obd_uuid))
851 def __init__(self,dom_node):
852 Module.__init__(self, 'OSC', dom_node)
853 self.obd_uuid = get_first_ref(dom_node, 'obd')
854 self.ost_uuid = get_first_ref(dom_node, 'ost')
855 self.lookup_server(self.ost_uuid)
856 self.add_module('lustre/osc', 'osc')
859 self.info(self.obd_uuid, self.ost_uuid)
860 srv = self.get_server()
862 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
866 lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
868 panic ("no route to", srv.nid)
870 lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
871 setup ="%s %s" %(self.obd_uuid, srv.uuid))
874 class Mountpoint(Module):
875 def __init__(self,dom_node):
876 Module.__init__(self, 'MTPT', dom_node)
877 self.path = get_text(dom_node, 'path')
878 self.mdc_uuid = get_first_ref(dom_node, 'mdc')
879 self.lov_uuid = get_first_ref(dom_node, 'osc')
880 self.add_module('lustre/osc', 'osc')
881 # should add lov only if needed
882 self.add_module('lustre/lov', 'lov')
883 self.add_module('lustre/llite', 'llite')
886 l = lookup(self.dom_node.parentNode, self.lov_uuid)
887 if l.nodeName == 'lov':
889 for osc_uuid in lov.devlist:
890 osc = lookup(self.dom_node.parentNode, osc_uuid)
895 panic('osc not found:', osc_uuid)
896 lctl.newdev(attach="lov %s %s" % (lov.name, lov.uuid),
897 setup ="%s" % (self.mdc_uuid))
902 self.info(self.path, self.mdc_uuid,self.lov_uuid)
903 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
904 (self.lov_uuid, self.mdc_uuid, self.path)
905 run("mkdir", self.path)
908 panic("mount failed:", self.path)
910 self.info(self.path, self.mdc_uuid,self.lov_uuid)
911 (rc, out) = run("umount", self.path)
913 log("umount failed, cleanup will most likely not work.")
914 l = lookup(self.dom_node.parentNode, self.lov_uuid)
915 if l.nodeName == 'lov':
917 for osc_uuid in lov.devlist:
918 osc = lookup(self.dom_node.parentNode, osc_uuid)
923 panic('osc not found:', osc_uuid)
930 # ============================================================
931 # XML processing and query
# TODO: Change query funcs to use XPath, which is much cleaner
935 list = obd.getElementsByTagName('device')
939 size = get_attr_int(dev, 'size', 0)
940 return dev.firstChild.data, size
943 # Get the text content from the first matching child
944 # If there is no content (or it is all whitespace), return
946 def get_text(dom_node, tag, default=""):
947 list = dom_node.getElementsByTagName(tag)
951 if dom_node.firstChild:
952 txt = string.strip(dom_node.firstChild.data)
957 def get_text_int(dom_node, tag, default=0):
958 list = dom_node.getElementsByTagName(tag)
963 if dom_node.firstChild:
964 txt = string.strip(dom_node.firstChild.data)
969 panic("text value is not integer:", txt)
972 def get_attr(dom_node, attr, default=""):
973 v = dom_node.getAttribute(attr)
978 def get_attr_int(dom_node, attr, default=0):
980 v = dom_node.getAttribute(attr)
985 panic("attr value is not integer", v)
988 def get_first_ref(dom_node, tag):
989 """ Get the first uuidref of the type TAG. Used one only
990 one is expected. Returns the uuid."""
992 refname = '%s_ref' % tag
993 list = dom_node.getElementsByTagName(refname)
995 uuid = getRef(list[0])
998 def get_all_refs(dom_node, tag):
999 """ Get all the refs of type TAG. Returns list of uuids. """
1001 refname = '%s_ref' % tag
1002 list = dom_node.getElementsByTagName(refname)
1005 uuids.append(getRef(i))
1008 def get_ost_net(dom_node, uuid):
1009 ost = lookup(dom_node, uuid)
1010 uuid = get_first_ref(ost, 'network')
1013 return lookup(dom_node, uuid)
1015 def nid2server(dom_node, nid):
1016 netlist = dom_node.getElementsByTagName('network')
1017 for net_node in netlist:
1018 if get_text(net_node, 'server') == nid:
1019 return Network(net_node)
1022 def lookup(dom_node, uuid):
1023 for n in dom_node.childNodes:
1024 if n.nodeType == n.ELEMENT_NODE:
1025 if getUUID(n) == uuid:
1032 # Get name attribute of dom_node
def getName(dom_node):
    """Return the 'name' attribute of dom_node ('' when absent)."""
    name = dom_node.getAttribute('name')
    return name
def getRef(dom_node):
    """Return the 'uuidref' attribute of dom_node ('' when absent)."""
    ref = dom_node.getAttribute('uuidref')
    return ref
1039 # Get name attribute of dom_node
def getUUID(dom_node):
    """Return the 'uuid' attribute of dom_node ('' when absent)."""
    uuid = dom_node.getAttribute('uuid')
    return uuid
1043 # the tag name is the service type
1044 # fixme: this should do some checks to make sure the dom_node is a service
def getServiceType(dom_node):
    """Return the service type of dom_node, which is simply its tag name."""
    tag = dom_node.nodeName
    return tag
1049 # determine what "level" a particular node is at.
# the order of initialization is based on level.
1051 def getServiceLevel(dom_node):
1052 type = getServiceType(dom_node)
1053 if type in ('network',):
1055 elif type in ('device', 'ldlm'):
1057 elif type in ('obd', 'mdd'):
1059 elif type in ('mds','ost'):
1061 elif type in ('mdc','osc'):
1063 elif type in ('lov',):
1065 elif type in ('mountpoint',):
1070 # return list of services in a profile. list is a list of tuples
1071 # [(level, dom_node),]
1072 def getServices(lustreNode, profileNode):
1074 for n in profileNode.childNodes:
1075 if n.nodeType == n.ELEMENT_NODE:
1076 servNode = lookup(lustreNode, getRef(n))
1079 panic('service not found: ' + getRef(n))
1080 level = getServiceLevel(servNode)
1081 list.append((level, servNode))
1085 def getByName(lustreNode, name, tag):
1086 ndList = lustreNode.getElementsByTagName(tag)
1088 if getName(nd) == name:
1095 ############################################################
1096 # routing ("rooting")
def init_node(dom_node):
    """Record each of this node's network interfaces as a (type, server)
    tuple in the global local_node list."""
    global local_node, router_flag
    for net_node in dom_node.getElementsByTagName('network'):
        net_type = get_attr(net_node, 'type')
        server = get_text(net_node, 'server')
        local_node.append((net_type, server))
1110 def node_needs_router():
1113 def get_routes(type, gw, dom_net):
1114 """ Return the routes as a list of tuples of the form:
1115 [(type, gw, lo, hi),]"""
1117 tbl = dom_net.getElementsByTagName('route_tbl')
1119 routes = t.getElementsByTagName('route')
1121 lo = get_attr(r, 'lo')
1122 hi = get_attr(r, 'hi', '')
1123 res.append((type, gw, lo, hi))
1127 def init_route_config(lustre):
1128 """ Scan the lustre config looking for routers. Build list of
1130 global routes, router_flag
1132 list = lustre.getElementsByTagName('node')
1134 if get_attr(node, 'router'):
1136 for (local_type, local_nid) in local_node:
1138 netlist = node.getElementsByTagName('network')
1139 for dom_net in netlist:
1140 if local_type == get_attr(dom_net, 'type'):
1141 gw = get_text(dom_net, 'server')
1145 for dom_net in netlist:
1146 if local_type != get_attr(dom_net, 'type'):
1147 for route in get_routes(local_type, gw, dom_net):
1148 routes.append(route)
1153 for iface in local_node:
1154 if net.net_type == iface[0]:
1158 def find_route(net):
1159 global local_node, routes
1160 frm_type = local_node[0][0]
1161 to_type = net.net_type
1163 debug ('looking for route to', to_type,to)
1172 ############################################################
1175 def startService(dom_node, module_flag):
1176 type = getServiceType(dom_node)
1177 debug('Service:', type, getName(dom_node), getUUID(dom_node))
1178 # there must be a more dynamic way of doing this...
1184 elif type == 'network':
1185 n = Network(dom_node)
1196 elif type == 'mountpoint':
1197 n = Mountpoint(dom_node)
1199 panic ("unknown service type:", type)
1204 if config.cleanup():
1209 if config.nosetup():
1211 if config.cleanup():
1217 # Prepare the system to run lustre using a particular profile
1218 # in a the configuration.
1219 # * load & the modules
1220 # * setup networking for the current node
1221 # * make sure partitions are in place and prepared
1222 # * initialize devices with lctl
1223 # Levels is important, and needs to be enforced.
1224 def startProfile(lustreNode, profileNode, module_flag):
1226 panic("profile:", profile, "not found.")
1227 services = getServices(lustreNode, profileNode)
1228 if config.cleanup():
1231 startService(s[1], module_flag)
1236 def doHost(lustreNode, hosts):
1240 dom_node = getByName(lustreNode, h, 'node')
1245 print 'No host entry found.'
1248 if not get_attr(dom_node, 'router'):
1250 init_route_config(lustreNode)
1255 # Two step process: (1) load modules, (2) setup lustre
1256 # if not cleaning, load modules first.
1257 module_flag = not config.cleanup()
1258 reflist = dom_node.getElementsByTagName('profile')
1259 for profile in reflist:
1260 startProfile(lustreNode, profile, module_flag)
1262 if not config.cleanup():
1263 sys_set_debug_path()
1264 script = config.gdb_script()
1265 run(lctl.lctl, ' modules >', script)
1267 # dump /tmp/ogdb and sleep/pause here
1268 log ("The GDB module script is in", script)
1271 module_flag = not module_flag
1272 for profile in reflist:
1273 startProfile(lustreNode, profile, module_flag)
1275 ############################################################
1276 # Command line processing
1278 def parse_cmdline(argv):
1280 long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
1281 "portals=", "makeldiff", "cleanup", "noexec",
1282 "help", "node=", "get=", "nomod", "nosetup",
1287 opts, args = getopt.getopt(argv, short_opts, long_opts)
1288 except getopt.error:
1293 if o in ("-h", "--help"):
1295 if o in ("-d","--cleanup"):
1297 if o in ("-v", "--verbose"):
1299 if o in ("-n", "--noexec"):
1302 if o == "--portals":
1306 if o == "--reformat":
1316 if o == "--nosetup":
1326 s = urllib.urlopen(url)
def setupModulePath(cmd):
    """If cmd lives inside a build tree (its directory has a Makefile),
    point the config's source dir two levels up from it."""
    cmd_dir = os.path.dirname(cmd)
    if os.access(cmd_dir + "/Makefile", os.R_OK):
        config.src_dir(cmd_dir + "/../../")
1337 def sys_set_debug_path():
1338 debug("debug path: ", config.debug_path())
1342 fp = open('/proc/sys/portals/debug_path', 'w')
1343 fp.write(config.debug_path())
1348 #/proc/sys/net/core/rmem_max
1349 #/proc/sys/net/core/wmem_max
1350 def sys_set_netmem_max(path, max):
1351 debug("setting", path, "to at least", max)
1359 fp = open(path, 'w')
1360 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the lustre character device nodes if they do not yet exist."""
    for node, minor in (('/dev/portals', 240), ('/dev/obd', 241)):
        if not os.access(node, os.R_OK):
            run('mknod %s c 10 %d' % (node, minor))
1370 # Initialize or shutdown lustre according to a configuration file
1371 # * prepare the system for lustre
1372 # * configure devices with lctl
1373 # Shutdown does steps in reverse
1376 global TCP_ACCEPTOR, lctl, MAXTCPBUF
1377 host = socket.gethostname()
1379 args = parse_cmdline(sys.argv[1:])
1381 if not os.access(args[0], os.R_OK | os.W_OK):
1382 print 'File not found:', args[0]
1384 dom = xml.dom.minidom.parse(args[0])
1386 xmldata = fetch(config.url())
1387 dom = xml.dom.minidom.parseString(xmldata)
1393 node_list.append(config.node())
1396 node_list.append(host)
1397 node_list.append('localhost')
1398 debug("configuring for host: ", node_list)
1401 config._debug_path = config._debug_path + '-' + host
1402 config._gdb_script = config._gdb_script + '-' + host
1404 TCP_ACCEPTOR = find_prog('acceptor')
1405 if not TCP_ACCEPTOR:
1407 TCP_ACCEPTOR = 'acceptor'
1408 debug('! acceptor not found')
1410 panic('acceptor not found')
1412 lctl = LCTLInterface('lctl')
1414 setupModulePath(sys.argv[0])
1416 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
1417 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
1418 doHost(dom.documentElement, node_list)
1420 if __name__ == "__main__":
1423 except LconfError, e:
1425 except CommandError, e: