3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
29 import string, os, stat, popen2, socket, time
31 import xml.dom.minidom
37 # Maximum number of devices to search for.
38 # (the /dev/loop* nodes need to be created beforehand)
39 MAX_LOOP_DEVICES = 256
43 print """usage: lconf config.xml
45 config.xml Lustre configuration in xml format.
46 --get <url> URL to fetch a config file
47 --node <nodename> Load config for <nodename>
48 -d | --cleanup Cleans up config. (Shutdown)
49 -v | --verbose Print system commands as they are run
50 -h | --help Print this help
51 --gdb Prints message after creating gdb module script
52 and sleeps for 5 seconds.
53 -n | --noexec Prints the commands and steps that will be run for a
54 config without executing them. This can used to check if a
55 config file is doing what it should be doing. (Implies -v)
56 --nomod Skip load/unload module step.
57 --nosetup Skip device setup/cleanup step.
58 --reformat Reformat all devices (without question)
61 --ldap server LDAP server with lustre config database
62 --makeldiff Translate xml source to LDIFF
63 This are perhaps not needed:
64 --lustre="src dir" Base directory of lustre sources. Used to search
66 --portals=src Portals source
70 # ============================================================
71 # Config parameters, encapsulated in a class
86 self._gdb_script = '/tmp/ogdb'
87 self._debug_path = '/tmp/lustre-log'
90 def verbose(self, flag = None):
91 if flag: self._verbose = flag
94 def noexec(self, flag = None):
95 if flag: self._noexec = flag
def reformat(self, flag = None):
    """Combined getter/setter for the --reformat flag.

    Called with a truthy flag it stores it; called with no argument it
    just reads the current value.  NOTE: because of the truthiness test,
    a falsy value (False, 0) can never clear the flag once set -- this
    matches the sibling accessors in this config class.
    """
    if flag: self._reformat = flag
    return self._reformat
102 def cleanup(self, flag = None):
103 if flag: self._cleanup = flag
106 def gdb(self, flag = None):
107 if flag: self._gdb = flag
110 def nomod(self, flag = None):
111 if flag: self._nomod = flag
114 def nosetup(self, flag = None):
115 if flag: self._nosetup = flag
118 def node(self, val = None):
119 if val: self._node = val
122 def url(self, val = None):
123 if val: self._url = val
126 def gdb_script(self):
127 if os.path.isdir('/r'):
128 return '/r' + self._gdb_script
130 return self._gdb_script
132 def debug_path(self):
133 if os.path.isdir('/r'):
134 return '/r' + self._debug_path
136 return self._debug_path
def src_dir(self, val = None):
    """Combined getter/setter for the lustre source directory.

    BUG FIX: the original assigned ``self._url`` here (copy-paste from
    the ``url`` accessor), silently clobbering the config URL and never
    recording the source dir.  Store in ``self._src_dir`` instead.
    """
    if val: self._src_dir = val
    return self._src_dir
144 # ============================================================
145 # debugging and error funcs
def fixme(msg = "this feature"):
    """Raise LconfError for a feature that is not implemented yet.

    Fixes the typo "implmemented" in the user-visible error message and
    uses the call form of raise, which works in both Python 2 and 3.
    """
    raise LconfError(msg + ' not implemented yet.')
151 msg = string.join(map(str,args))
152 if not config.noexec():
153 raise LconfError(msg)
158 msg = string.join(map(str,args))
163 print string.strip(s)
167 msg = string.join(map(str,args))
170 # ============================================================
171 # locally defined exceptions
172 class CommandError (exceptions.Exception):
173 def __init__(self, cmd_name, cmd_err, rc=None):
174 self.cmd_name = cmd_name
175 self.cmd_err = cmd_err
180 if type(self.cmd_err) == types.StringType:
182 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
184 print "! %s: %s" % (self.cmd_name, self.cmd_err)
185 elif type(self.cmd_err) == types.ListType:
187 print "! %s (error %d):" % (self.cmd_name, self.rc)
189 print "! %s:" % (self.cmd_name)
190 for s in self.cmd_err:
191 print "> %s" %(string.strip(s))
195 class LconfError (exceptions.Exception):
196 def __init__(self, args):
200 # ============================================================
201 # handle lctl interface
204 Manage communication with lctl
207 def __init__(self, cmd):
209 Initialize close by finding the lctl binary.
211 self.lctl = find_prog(cmd)
214 debug('! lctl not found')
217 raise CommandError('lctl', "unable to find lctl binary.")
222 the cmds are written to stdin of lctl
223 lctl doesn't return errors when run in script mode, so
225 should modify command line to accept multiple commands, or
226 create complex command line options
228 debug("+", self.lctl, cmds)
229 if config.noexec(): return (0, [])
230 p = popen2.Popen3(self.lctl, 1)
231 p.tochild.write(cmds + "\n")
233 out = p.fromchild.readlines()
234 err = p.childerr.readlines()
237 raise CommandError(self.lctl, err, ret)
241 def network(self, net, nid):
242 """ initialized network and add "self" """
243 # Idea: "mynid" could be used for all network types to add "self," and then
244 # this special case would be gone and the "self" hack would be hidden.
250 quit""" % (net, nid, nid)
259 # create a new connection
260 def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
268 quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
274 quit""" % (net, servuuid, nid, nid, port, )
278 # add a route to a range
279 def add_route(self, net, gw, lo, hi):
283 quit """ % (net, gw, lo, hi)
287 # add a route to a range
288 def del_route(self, net, gw, lo, hi):
295 # add a route to a host
296 def add_route_host(self, net, uuid, gw, tgt):
301 quit """ % (net, uuid, tgt, gw, tgt)
304 # disconnect one connection
305 def disconnect(self, net, nid, port, servuuid):
310 quit""" % (net, nid, servuuid)
313 # disconnect all connections
314 def disconnectAll(self, net):
322 # create a new device with lctl
323 def newdev(self, attach, setup = ""):
328 quit""" % (attach, setup)
332 def cleanup(self, name, uuid):
341 def lovconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
345 lovconfig %s %d %d %d %s %s
346 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
349 # ============================================================
350 # Various system-level functions
351 # (ideally moved to their own module)
353 # Run a command and return the output and status.
354 # stderr is sent to /dev/null, could use popen3 to
355 # save it if necessary
357 cmd = string.join(map(str,args))
359 if config.noexec(): return (0, [])
360 f = os.popen(cmd + ' 2>&1')
369 # Run a command in the background.
370 def run_daemon(*args):
371 cmd = string.join(map(str,args))
373 if config.noexec(): return 0
374 f = os.popen(cmd + ' 2>&1')
382 # Determine full path to use for an external command
383 # searches dirname(argv[0]) first, then PATH
# NOTE(review): this numbered listing is missing lines of the definition
# (e.g. the "def find_prog(cmd):" header, the "for d in syspath:" loop line
# before line 390, and the return statements) -- recovered from a partial
# dump; confirm the full body against the original lconf source.
385 syspath = string.split(os.environ['PATH'], ':')
386 cmdpath = os.path.dirname(sys.argv[0])
# search the script's own directory (and the in-tree portals utils dir)
# ahead of the system PATH
387 syspath.insert(0, cmdpath);
388 syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
390 prog = os.path.join(d,cmd)
# only accept a candidate the current user may execute
391 if os.access(prog, os.X_OK):
395 # Recursively look for file starting at base dir
396 def do_find_file(base, mod):
397 fullname = os.path.join(base, mod)
398 if os.access(fullname, os.R_OK):
400 for d in os.listdir(base):
401 dir = os.path.join(base,d)
402 if os.path.isdir(dir):
403 module = do_find_file(dir, mod)
407 def find_module(src_dir, dev_dir, modname):
408 mod = '%s.o' % (modname)
409 module = src_dir +'/'+ dev_dir +'/'+ mod
411 if os.access(module, os.R_OK):
417 # is the path a block device?
424 return stat.S_ISBLK(s[stat.ST_MODE])
426 # build fs according to type
428 def mkfs(fstype, dev):
429 if(fstype in ('ext3', 'extN')):
430 mkfs = 'mkfs.ext2 -j -b 4096'
432 print 'unsupported fs type: ', fstype
433 if not is_block(dev):
437 (ret, out) = run (mkfs, force, dev)
439 panic("Unable to build fs:", dev)
440 # enable hash tree indexing on fs
442 htree = 'echo "feature FEATURE_C5" | debugfs -w'
443 (ret, out) = run (htree, dev)
445 panic("Unable to enable htree:", dev)
447 # some systems use /dev/loopN, some /dev/loop/N
451 if not os.access(loop + str(0), os.R_OK):
453 if not os.access(loop + str(0), os.R_OK):
454 panic ("can't access loop devices")
457 # find loop device assigned to thefile
460 for n in xrange(0, MAX_LOOP_DEVICES):
462 if os.access(dev, os.R_OK):
463 (stat, out) = run('losetup', dev)
464 if (out and stat == 0):
465 m = re.search(r'\((.*)\)', out[0])
466 if m and file == m.group(1):
472 # create file if necessary and assign the first free loop device
473 def init_loop(file, size, fstype):
474 dev = find_loop(file)
476 print 'WARNING file:', file, 'already mapped to', dev
478 if not os.access(file, os.R_OK | os.W_OK):
479 run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
481 # find next free loop
482 for n in xrange(0, MAX_LOOP_DEVICES):
484 if os.access(dev, os.R_OK):
485 (stat, out) = run('losetup', dev)
487 run('losetup', dev, file)
490 print "out of loop devices"
492 print "out of loop devices"
495 # undo loop assignment
496 def clean_loop(file):
497 dev = find_loop(file)
499 ret, out = run('losetup -d', dev)
501 log('unable to clean loop device:', dev, 'for file:', file)
504 # determine if dev is formatted as a <fstype> filesystem
505 def need_format(fstype, dev):
506 # FIXME don't know how to implement this
509 # initialize a block device if needed
510 def block_dev(dev, size, fstype, format):
511 if config.noexec(): return dev
512 if not is_block(dev):
513 dev = init_loop(dev, size, fstype)
514 if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
518 # panic("device:", dev,
519 # "not prepared, and autoformat is not set.\n",
520 # "Rerun with --reformat option to format ALL filesystems")
524 def get_local_address(net_type):
525 """Return the local address for the network type."""
527 if net_type == 'tcp':
529 host = socket.gethostname()
530 local = socket.gethostbyname(host)
531 elif net_type == 'elan':
532 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
534 fp = open('/proc/elan/device0/position', 'r')
535 lines = fp.readlines()
544 elif net_type == 'gm':
545 fixme("automatic local address for GM")
550 # ============================================================
551 # Classes to prepare and cleanup the various objects
554 """ Base class for the rest of the modules. The default cleanup method is
555 defined here, as well as some utilitiy funcs.
557 def __init__(self, module_name, dom_node):
558 self.dom_node = dom_node
559 self.module_name = module_name
560 self.name = get_attr(dom_node, 'name')
561 self.uuid = get_attr(dom_node, 'uuid')
562 self.kmodule_list = []
566 def info(self, *args):
567 msg = string.join(map(str,args))
568 print self.module_name + ":", self.name, self.uuid, msg
571 def lookup_server(self, srv_uuid):
572 """ Lookup a server's network information """
573 net = get_ost_net(self.dom_node.parentNode, srv_uuid)
574 self._server = Network(net)
576 def get_server(self):
580 """ default cleanup, used for most modules """
582 srv = self.get_server()
583 if srv and local_net(srv):
585 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
586 except CommandError, e:
587 log(self.module_name, "disconnect failed: ", self.name)
590 lctl.cleanup(self.name, self.uuid)
591 except CommandError, e:
592 log(self.module_name, "cleanup failed: ", self.name)
595 def add_module(self, dev_dir, modname):
596 """Append a module to list of modules to load."""
597 self.kmodule_list.append((dev_dir, modname))
599 def mod_loaded(self, modname):
600 """Check if a module is already loaded. Look in /proc/modules for it."""
601 fp = open('/proc/modules')
602 lines = fp.readlines()
604 # please forgive my tired fingers for this one
605 ret = filter(lambda word, mod=modname: word == mod,
606 map(lambda line: string.split(line)[0], lines))
609 def load_module(self):
610 """Load all the modules in the list in the order they appear."""
611 for dev_dir, mod in self.kmodule_list:
612 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
613 if self.mod_loaded(mod) and not config.noexec():
615 log ('loading module:', mod)
617 module = find_module(config.src_dir(),dev_dir, mod)
619 panic('module not found:', mod)
620 (rc, out) = run('/sbin/insmod', module)
622 raise CommandError('insmod', out, rc)
624 (rc, out) = run('/sbin/modprobe', mod)
626 raise CommandError('modprobe', out, rc)
628 def cleanup_module(self):
629 """Unload the modules in the list in reverse order."""
630 rev = self.kmodule_list
632 for dev_dir, mod in rev:
633 if not self.mod_loaded(mod):
635 log('unloading module:', mod)
638 (rc, out) = run('/sbin/rmmod', mod)
640 log('! unable to unload module:', mod)
644 class Network(Module):
645 def __init__(self,dom_node):
646 Module.__init__(self, 'NETWORK', dom_node)
647 self.net_type = get_attr(dom_node,'type')
648 self.nid = get_text(dom_node, 'server', '*')
649 self.port = get_text_int(dom_node, 'port', 0)
650 self.send_mem = get_text_int(dom_node, 'send_mem', 65536)
651 self.recv_mem = get_text_int(dom_node, 'recv_mem', 65536)
653 self.nid = get_local_address(self.net_type)
655 panic("unable to set nid for", self.net_type)
657 self.add_module('portals/linux/oslib/', 'portals')
658 if node_needs_router():
659 self.add_module('portals/linux/router', 'kptlrouter')
660 if self.net_type == 'tcp':
661 self.add_module('portals/linux/socknal', 'ksocknal')
662 if self.net_type == 'elan':
663 self.add_module('portals/linux/rqswnal', 'kqswnal')
664 if self.net_type == 'gm':
665 self.add_module('portals/linux/gmnal', 'kgmnal')
666 self.add_module('lustre/obdclass', 'obdclass')
667 self.add_module('lustre/ptlrpc', 'ptlrpc')
670 self.info(self.net_type, self.nid, self.port)
671 if self.net_type == 'tcp':
672 ret = run_daemon(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, self.port)
674 raise CommandError(TCP_ACCEPTOR, 'failed', ret)
675 ret = self.dom_node.getElementsByTagName('route_tbl')
677 for r in a.getElementsByTagName('route'):
678 net_type = get_attr(r, 'type')
679 gw = get_attr(r, 'gw')
680 lo = get_attr(r, 'lo')
681 hi = get_attr(r,'hi', '')
682 lctl.add_route(net_type, gw, lo, hi)
683 if self.net_type == 'tcp' and hi == '':
684 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
686 panic("no server for nid", lo)
688 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
691 lctl.network(self.net_type, self.nid)
692 lctl.newdev(attach = "ptlrpc RPCDEV")
695 self.info(self.net_type, self.nid, self.port)
696 ret = self.dom_node.getElementsByTagName('route_tbl')
698 for r in a.getElementsByTagName('route'):
699 lo = get_attr(r, 'lo')
700 hi = get_attr(r,'hi', '')
701 if self.net_type == 'tcp' and hi == '':
702 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
704 panic("no server for nid", lo)
707 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
708 except CommandError, e:
709 print "disconnect failed: ", self.name
712 lctl.del_route(self.net_type, self.nid, lo, hi)
713 except CommandError, e:
714 print "del_route failed: ", self.name
718 lctl.cleanup("RPCDEV", "")
719 except CommandError, e:
720 print "cleanup failed: ", self.name
723 lctl.disconnectAll(self.net_type)
724 except CommandError, e:
725 print "disconnectAll failed: ", self.name
727 if self.net_type == 'tcp':
728 # yikes, this ugly! need to save pid in /var/something
729 run("killall acceptor")
732 def __init__(self,dom_node):
733 Module.__init__(self, 'LDLM', dom_node)
734 self.add_module('lustre/ldlm', 'ldlm')
737 lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
741 def __init__(self,dom_node):
742 Module.__init__(self, 'LOV', dom_node)
743 self.mdsuuid = get_first_ref(dom_node, 'mds')
744 mds= lookup(dom_node.parentNode, self.mdsuuid)
745 self.mdsname = getName(mds)
746 devs = dom_node.getElementsByTagName('devices')
749 self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
750 self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
751 self.pattern = get_attr_int(dev_node, 'pattern', 0)
752 self.devlist = get_all_refs(dev_node, 'osc')
753 self.stripe_cnt = len(self.devlist)
756 self.info(self.mdsuuid, self.stripe_cnt, self.stripe_sz, self.stripe_off, self.pattern,
757 self.devlist, self.mdsname)
758 lctl.lovconfig(self.uuid, self.mdsname, self.stripe_cnt,
759 self.stripe_sz, self.stripe_off, self.pattern,
760 string.join(self.devlist))
764 def __init__(self,dom_node):
765 Module.__init__(self, 'MDS', dom_node)
766 self.devname, self.size = get_device(dom_node)
767 self.fstype = get_text(dom_node, 'fstype')
768 self.format = get_text(dom_node, 'autoformat', "no")
769 if self.fstype == 'extN':
770 self.add_module('lustre/extN', 'extN')
771 self.add_module('lustre/mds', 'mds')
772 self.add_module('lustre/mds', 'mds_%s' % (self.fstype))
775 self.info(self.devname, self.fstype, self.format)
776 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
777 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
778 setup ="%s %s" %(blkdev, self.fstype))
781 clean_loop(self.devname)
784 def __init__(self,dom_node):
785 Module.__init__(self, 'MDC', dom_node)
786 self.mds_uuid = get_first_ref(dom_node, 'mds')
787 self.lookup_server(self.mds_uuid)
788 self.add_module('lustre/mdc', 'mdc')
791 self.info(self.mds_uuid)
792 srv = self.get_server()
793 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
794 lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
795 setup ="%s %s" %(self.mds_uuid, srv.uuid))
798 def __init__(self, dom_node):
799 Module.__init__(self, 'OBD', dom_node)
800 self.obdtype = get_attr(dom_node, 'type')
801 self.devname, self.size = get_device(dom_node)
802 self.fstype = get_text(dom_node, 'fstype')
803 self.format = get_text(dom_node, 'autoformat', 'yes')
804 if self.fstype == 'extN':
805 self.add_module('lustre/extN', 'extN')
806 self.add_module('lustre/' + self.obdtype, self.obdtype)
808 # need to check /proc/mounts and /etc/mtab before
809 # formatting anything.
810 # FIXME: check if device is already formatted.
812 self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
813 if self.obdtype == 'obdecho':
816 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
817 lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
818 setup ="%s %s" %(blkdev, self.fstype))
821 if not self.obdtype == 'obdecho':
822 clean_loop(self.devname)
825 def __init__(self,dom_node):
826 Module.__init__(self, 'OST', dom_node)
827 self.obd_uuid = get_first_ref(dom_node, 'obd')
828 self.add_module('lustre/ost', 'ost')
831 self.info(self.obd_uuid)
832 lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
833 setup ="%s" % (self.obd_uuid))
836 def __init__(self,dom_node):
837 Module.__init__(self, 'OSC', dom_node)
838 self.obd_uuid = get_first_ref(dom_node, 'obd')
839 self.ost_uuid = get_first_ref(dom_node, 'ost')
840 self.lookup_server(self.ost_uuid)
841 self.add_module('lustre/osc', 'osc')
844 self.info(self.obd_uuid, self.ost_uuid)
845 srv = self.get_server()
847 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
851 lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
853 panic ("no route to", srv.nid)
855 lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
856 setup ="%s %s" %(self.obd_uuid, srv.uuid))
859 class Mountpoint(Module):
860 def __init__(self,dom_node):
861 Module.__init__(self, 'MTPT', dom_node)
862 self.path = get_text(dom_node, 'path')
863 self.mdc_uuid = get_first_ref(dom_node, 'mdc')
864 self.lov_uuid = get_first_ref(dom_node, 'osc')
865 self.add_module('lustre/osc', 'osc')
866 # should add lov only if needed
867 self.add_module('lustre/lov', 'lov')
868 self.add_module('lustre/llite', 'llite')
871 l = lookup(self.dom_node.parentNode, self.lov_uuid)
872 if l.nodeName == 'lov':
874 for osc_uuid in lov.devlist:
875 osc = lookup(self.dom_node.parentNode, osc_uuid)
880 panic('osc not found:', osc_uuid)
881 lctl.newdev(attach="lov %s %s" % (lov.name, lov.uuid),
882 setup ="%s" % (self.mdc_uuid))
887 self.info(self.path, self.mdc_uuid,self.lov_uuid)
888 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
889 (self.lov_uuid, self.mdc_uuid, self.path)
890 run("mkdir", self.path)
893 panic("mount failed:", self.path)
895 self.info(self.path, self.mdc_uuid,self.lov_uuid)
896 (rc, out) = run("umount", self.path)
898 log("umount failed, cleanup will most likely not work.")
899 l = lookup(self.dom_node.parentNode, self.lov_uuid)
900 if l.nodeName == 'lov':
902 for osc_uuid in lov.devlist:
903 osc = lookup(self.dom_node.parentNode, osc_uuid)
908 panic('osc not found:', osc_uuid)
914 # ============================================================
915 # XML processing and query
# TODO: Change query funcs to use XPath, which is much cleaner
919 list = obd.getElementsByTagName('device')
923 size = get_attr_int(dev, 'size', 0)
924 return dev.firstChild.data, size
927 # Get the text content from the first matching child
928 # If there is no content (or it is all whitespace), return
930 def get_text(dom_node, tag, default=""):
931 list = dom_node.getElementsByTagName(tag)
935 if dom_node.firstChild:
936 txt = string.strip(dom_node.firstChild.data)
941 def get_text_int(dom_node, tag, default=0):
942 list = dom_node.getElementsByTagName(tag)
947 if dom_node.firstChild:
948 txt = string.strip(dom_node.firstChild.data)
953 panic("text value is not integer:", txt)
956 def get_attr(dom_node, attr, default=""):
957 v = dom_node.getAttribute(attr)
962 def get_attr_int(dom_node, attr, default=0):
964 v = dom_node.getAttribute(attr)
969 panic("attr value is not integer", v)
972 def get_first_ref(dom_node, tag):
973 """ Get the first uuidref of the type TAG. Used one only
974 one is expected. Returns the uuid."""
976 refname = '%s_ref' % tag
977 list = dom_node.getElementsByTagName(refname)
979 uuid = getRef(list[0])
982 def get_all_refs(dom_node, tag):
983 """ Get all the refs of type TAG. Returns list of uuids. """
985 refname = '%s_ref' % tag
986 list = dom_node.getElementsByTagName(refname)
989 uuids.append(getRef(i))
992 def get_ost_net(dom_node, uuid):
993 ost = lookup(dom_node, uuid)
994 uuid = get_first_ref(ost, 'network')
997 return lookup(dom_node, uuid)
def nid2server(dom_node, nid):
    """Find the server whose network id matches nid.

    Scans every <network> element below dom_node and returns a Network
    object wrapping the first one whose <server> text equals nid, or
    None when no network matches (made explicit here).
    """
    for net_node in dom_node.getElementsByTagName('network'):
        if get_text(net_node, 'server') == nid:
            return Network(net_node)
    return None
1006 def lookup(dom_node, uuid):
1007 for n in dom_node.childNodes:
1008 if n.nodeType == n.ELEMENT_NODE:
1009 if getUUID(n) == uuid:
# Get the 'name' attribute of dom_node ("" if the attribute is absent,
# per the DOM getAttribute contract).
def getName(dom_node):
    return dom_node.getAttribute('name')
def getRef(dom_node):
    """Return the 'uuidref' attribute of dom_node ("" if absent)."""
    return dom_node.getAttribute('uuidref')
# Get the 'uuid' attribute of dom_node ("" if absent).
# (The original comment said "name" -- copy-paste from getName.)
def getUUID(dom_node):
    return dom_node.getAttribute('uuid')
# The element's tag name doubles as its service type (e.g. 'mds', 'ost').
# FIXME: should verify dom_node really is a service element before use.
def getServiceType(dom_node):
    return dom_node.nodeName
# determine what "level" a particular node is at.
# the order of initialization is based on level.
1035 def getServiceLevel(dom_node):
1036 type = getServiceType(dom_node)
1037 if type in ('network',):
1039 elif type in ('device', 'ldlm'):
1041 elif type in ('obd', 'mdd'):
1043 elif type in ('mds','ost'):
1045 elif type in ('mdc','osc'):
1047 elif type in ('lov',):
1049 elif type in ('mountpoint',):
1054 # return list of services in a profile. list is a list of tuples
1055 # [(level, dom_node),]
1056 def getServices(lustreNode, profileNode):
1058 for n in profileNode.childNodes:
1059 if n.nodeType == n.ELEMENT_NODE:
1060 servNode = lookup(lustreNode, getRef(n))
1063 panic('service not found: ' + getRef(n))
1064 level = getServiceLevel(servNode)
1065 list.append((level, servNode))
1069 def getByName(lustreNode, name, tag):
1070 ndList = lustreNode.getElementsByTagName(tag)
1072 if getName(nd) == name:
1079 ############################################################
1080 # routing ("rooting")
def init_node(dom_node):
    """Record this node's network interfaces in the global local_node list.

    Each entry is a (net_type, server_nid) tuple taken from the <network>
    elements under dom_node.  router_flag is named in the global statement
    (as in the original) but is not modified here.
    Renamed the local from 'type' to 'net_type' to stop shadowing the builtin.
    """
    global local_node, router_flag
    netlist = dom_node.getElementsByTagName('network')
    for dom_net in netlist:
        net_type = get_attr(dom_net, 'type')
        gw = get_text(dom_net, 'server')
        local_node.append((net_type, gw))
1094 def node_needs_router():
1097 def get_routes(type, gw, dom_net):
1098 """ Return the routes as a list of tuples of the form:
1099 [(type, gw, lo, hi),]"""
1101 tbl = dom_net.getElementsByTagName('route_tbl')
1103 routes = t.getElementsByTagName('route')
1105 lo = get_attr(r, 'lo')
1106 hi = get_attr(r, 'hi', '')
1107 res.append((type, gw, lo, hi))
1111 def init_route_config(lustre):
1112 """ Scan the lustre config looking for routers. Build list of
1114 global routes, router_flag
1116 list = lustre.getElementsByTagName('node')
1118 if get_attr(node, 'router'):
1120 for (local_type, local_nid) in local_node:
1122 netlist = node.getElementsByTagName('network')
1123 for dom_net in netlist:
1124 if local_type == get_attr(dom_net, 'type'):
1125 gw = get_text(dom_net, 'server')
1129 for dom_net in netlist:
1130 if local_type != get_attr(dom_net, 'type'):
1131 for route in get_routes(local_type, gw, dom_net):
1132 routes.append(route)
1137 for iface in local_node:
1138 if net.net_type == iface[0]:
1142 def find_route(net):
1143 global local_node, routes
1144 frm_type = local_node[0][0]
1145 to_type = net.net_type
1147 debug ('looking for route to', to_type,to)
1156 ############################################################
1159 def startService(dom_node, module_flag):
1160 type = getServiceType(dom_node)
1161 debug('Service:', type, getName(dom_node), getUUID(dom_node))
1162 # there must be a more dynamic way of doing this...
1168 elif type == 'network':
1169 n = Network(dom_node)
1180 elif type == 'mountpoint':
1181 n = Mountpoint(dom_node)
1183 panic ("unknown service type:", type)
1188 if config.cleanup():
1193 if config.nosetup():
1195 if config.cleanup():
1201 # Prepare the system to run lustre using a particular profile
1202 # in a the configuration.
1203 # * load & the modules
1204 # * setup networking for the current node
1205 # * make sure partitions are in place and prepared
1206 # * initialize devices with lctl
1207 # Levels is important, and needs to be enforced.
1208 def startProfile(lustreNode, profileNode, module_flag):
1210 panic("profile:", profile, "not found.")
1211 services = getServices(lustreNode, profileNode)
1212 if config.cleanup():
1215 startService(s[1], module_flag)
1220 def doHost(lustreNode, hosts):
1224 dom_node = getByName(lustreNode, h, 'node')
1229 print 'No host entry found.'
1232 if not get_attr(dom_node, 'router'):
1234 init_route_config(lustreNode)
1239 # Two step process: (1) load modules, (2) setup lustre
1240 # if not cleaning, load modules first.
1241 module_flag = not config.cleanup()
1242 reflist = dom_node.getElementsByTagName('profile')
1243 for profile in reflist:
1244 startProfile(lustreNode, profile, module_flag)
1246 if not config.cleanup():
1247 sys_set_debug_path()
1248 script = config.gdb_script()
1249 run(lctl.lctl, ' modules >', script)
1251 # dump /tmp/ogdb and sleep/pause here
1252 log ("The GDB module script is in", script)
1255 module_flag = not module_flag
1256 for profile in reflist:
1257 startProfile(lustreNode, profile, module_flag)
1259 ############################################################
1260 # Command line processing
1262 def parse_cmdline(argv):
1264 long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
1265 "portals=", "makeldiff", "cleanup", "noexec",
1266 "help", "node=", "get=", "nomod", "nosetup"]
1270 opts, args = getopt.getopt(argv, short_opts, long_opts)
1271 except getopt.error:
1276 if o in ("-h", "--help"):
1278 if o in ("-d","--cleanup"):
1280 if o in ("-v", "--verbose"):
1282 if o in ("-n", "--noexec"):
1285 if o == "--portals":
1289 if o == "--reformat":
1299 if o == "--nosetup":
1307 s = urllib.urlopen(url)
def setupModulePath(cmd):
    """Infer the Lustre source tree from the location of this script.

    cmd is sys.argv[0]; if a Makefile sits next to the script we are
    running out of a source checkout, so point the config's source dir
    two levels up (utils/lconf -> tree root).
    """
    base = os.path.dirname(cmd)
    if os.access(base + "/Makefile", os.R_OK):
        config.src_dir(base + "/../../")
1318 def sys_set_debug_path():
1319 debug("debug path: ", config.debug_path())
1323 fp = open('/proc/sys/portals/debug_path', 'w')
1324 fp.write(config.debug_path())
1329 #/proc/sys/net/core/rmem_max
1330 #/proc/sys/net/core/wmem_max
1331 def sys_set_netmem_max(path, max):
1332 debug("setting", path, "to at least", max)
1340 fp = open(path, 'w')
1341 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the lustre character device nodes if they are missing.

    /dev/portals is char major 10 minor 240 and /dev/obd is 10/241; the
    os.access checks keep mknod from failing on nodes that already exist.
    """
    if not os.access('/dev/portals', os.R_OK):
        run('mknod /dev/portals c 10 240')
    if not os.access('/dev/obd', os.R_OK):
        run('mknod /dev/obd c 10 241')
1351 # Initialize or shutdown lustre according to a configuration file
1352 # * prepare the system for lustre
1353 # * configure devices with lctl
1354 # Shutdown does steps in reverse
1357 global TCP_ACCEPTOR, lctl, MAXTCPBUF
1358 host = socket.gethostname()
1360 args = parse_cmdline(sys.argv[1:])
1362 if not os.access(args[0], os.R_OK | os.W_OK):
1363 print 'File not found:', args[0]
1365 dom = xml.dom.minidom.parse(args[0])
1367 xmldata = fetch(config.url())
1368 dom = xml.dom.minidom.parseString(xmldata)
1374 node_list.append(config.node())
1377 node_list.append(host)
1378 node_list.append('localhost')
1379 debug("configuring for host: ", node_list)
1382 config._debug_path = '/tmp/lustre-log-' + host
1384 TCP_ACCEPTOR = find_prog('acceptor')
1385 if not TCP_ACCEPTOR:
1387 TCP_ACCEPTOR = 'acceptor'
1388 debug('! acceptor not found')
1390 panic('acceptor not found')
1392 lctl = LCTLInterface('lctl')
1394 setupModulePath(sys.argv[0])
1396 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
1397 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
1398 doHost(dom.documentElement, node_list)
1400 if __name__ == "__main__":
1403 except LconfError, e:
1405 except CommandError, e: