3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # This file is part of Lustre, http://www.lustre.org.
7 # Lustre is free software; you can redistribute it and/or
8 # modify it under the terms of version 2 of the GNU General Public
9 # License as published by the Free Software Foundation.
11 # Lustre is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with Lustre; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # lconf - lustre configuration tool
22 # lconf is the main driver script for starting and stopping
23 # lustre filesystem services.
25 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import string, os, stat, popen2, socket, time
30 import xml.dom.minidom
# NOTE(review): partially-elided numbered listing -- original line numbers are
# embedded at the start of each line and some lines are missing. Code below is
# kept verbatim; only comments were added.
# Default socket buffer size (bytes) for tcp connections -- TODO confirm units.
35 DEFAULT_TCPBUF = 1048576
37 # Maximum number of devices to search for.
38 # (the /dev/loop* nodes need to be created beforehand)
39 MAX_LOOP_DEVICES = 256
# Holds the exit status of the first failed cleanup step; reported as the
# process exit code in the __main__ block at the bottom of the file.
41 first_cleanup_error = 0
def cleanup_error(rc):
    """Record the first non-zero cleanup status.

    Later failures are ignored so the overall exit code reflects the
    first thing that went wrong during cleanup.
    """
    global first_cleanup_error
    if first_cleanup_error:
        return
    first_cleanup_error = rc
# NOTE(review): the enclosing 'def usage():' header (orig line ~48) is elided
# from this numbered listing; the triple-quoted usage string below is kept
# verbatim -- no comments may be inserted inside the string literal.
49 print """usage: lconf config.xml
51 config.xml Lustre configuration in xml format.
52 --get <url> URL to fetch a config file
53 --node <nodename> Load config for <nodename>
54 -d | --cleanup Cleans up config. (Shutdown)
55 -f | --force Unmount with \"umount -f\" during shutdown
56 -v | --verbose Print system commands as they are run
57 -h | --help Print this help
58 --gdb Prints message after creating gdb module script
59 and sleeps for 5 seconds.
60 -n | --noexec Prints the commands and steps that will be run for a
61 config without executing them. This can used to check if a
62 config file is doing what it should be doing. (Implies -v)
63 --nomod Skip load/unload module step.
64 --nosetup Skip device setup/cleanup step.
65 --reformat Reformat all devices (without question)
66 --dump <file> Dump the kernel debug log before portals is unloaded
69 --ldap server LDAP server with lustre config database
70 --makeldiff Translate xml source to LDIFF
71 This are perhaps not needed:
72 --lustre="src dir" Base directory of lustre sources. Used to search
74 --portals=src Portals source
78 # ============================================================
79 # Config parameters, encapsulated in a class
# NOTE(review): the 'class Config' header and its __init__ header (orig lines
# ~80-94) are elided; only a fragment of __init__ and the accessor methods
# survive. Code kept verbatim; comments only added.
# Remaining __init__ defaults: gdb script path, debug log path, dump file.
95 self._gdb_script = '/tmp/ogdb'
96 self._debug_path = '/tmp/lustre-log'
97 self._dump_file = None
# Each accessor doubles as a setter: passing a truthy value stores it.
# The 'return self._x' lines of most accessors are elided from this listing.
100 def verbose(self, flag = None):
101 if flag: self._verbose = flag
104 def noexec(self, flag = None):
105 if flag: self._noexec = flag
108 def reformat(self, flag = None):
109 if flag: self._reformat = flag
110 return self._reformat
112 def cleanup(self, flag = None):
113 if flag: self._cleanup = flag
116 def gdb(self, flag = None):
117 if flag: self._gdb = flag
120 def nomod(self, flag = None):
121 if flag: self._nomod = flag
124 def nosetup(self, flag = None):
125 if flag: self._nosetup = flag
128 def force(self, flag = None):
129 if flag: self._force = flag
132 def node(self, val = None):
133 if val: self._node = val
136 def url(self, val = None):
137 if val: self._url = val
# If a '/r' directory exists, paths are prefixed with it -- presumably a
# chroot/alternate-root setup; TODO confirm against deployment docs.
140 def gdb_script(self):
141 if os.path.isdir('/r'):
142 return '/r' + self._gdb_script
144 return self._gdb_script
146 def debug_path(self):
147 if os.path.isdir('/r'):
148 return '/r' + self._debug_path
150 return self._debug_path
152 def src_dir(self, val = None):
153 if val: self._src_dir = val
156 def dump_file(self, val = None):
157 if val: self._dump_file = val
158 return self._dump_file
162 # ============================================================
163 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort with an LconfError for a code path that is not written yet.

    msg names the missing feature; defaults to a generic placeholder.
    Raises: LconfError, always.
    """
    # Use call-style raise for consistency with the other raise sites in this
    # file (e.g. 'raise LconfError(msg)' in panic), and fix the
    # 'implmemented' typo in the message text.
    raise LconfError(msg + ' not implemented yet.')
# NOTE(review): the def headers for panic/debug/log (orig lines ~168, ~175,
# ~184) are elided from this listing; only body fragments survive, kept
# verbatim. All three appear to join their args into one message string.
169 msg = string.join(map(str,args))
170 if not config.noexec():
171 raise LconfError(msg)
176 msg = string.join(map(str,args))
181 print string.strip(s)
185 msg = string.join(map(str,args))
188 # ============================================================
189 # locally defined exceptions
# NOTE(review): elided numbered listing; code kept verbatim, comments only
# added. The 'self.rc = rc' line (orig ~194) and the dump() method header
# (orig ~196-197) are missing from this view.
190 class CommandError (exceptions.Exception):
191 def __init__(self, cmd_name, cmd_err, rc=None):
192 self.cmd_name = cmd_name
193 self.cmd_err = cmd_err
# Pretty-print the failure: cmd_err may be a single string or a list of
# output lines; rc is included when available.
198 if type(self.cmd_err) == types.StringType:
200 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
202 print "! %s: %s" % (self.cmd_name, self.cmd_err)
203 elif type(self.cmd_err) == types.ListType:
205 print "! %s (error %d):" % (self.cmd_name, self.rc)
207 print "! %s:" % (self.cmd_name)
208 for s in self.cmd_err:
209 print "> %s" %(string.strip(s))
# Generic configuration error; body of __init__ (orig ~215) is elided.
213 class LconfError (exceptions.Exception):
214 def __init__(self, args):
218 # ============================================================
219 # handle lctl interface
# NOTE(review): heavily elided numbered listing -- the class header, the run()
# method header, and the opening lines of most lctl command template strings
# are missing, so no comments are inserted below (they could land inside a
# string literal whose opening line is elided). Each method appears to feed a
# small script of lctl commands, terminated by 'quit', to the lctl binary.
222 Manage communication with lctl
225 def __init__(self, cmd):
227 Initialize close by finding the lctl binary.
229 self.lctl = find_prog(cmd)
232 debug('! lctl not found')
235 raise CommandError('lctl', "unable to find lctl binary.")
240 the cmds are written to stdin of lctl
241 lctl doesn't return errors when run in script mode, so
243 should modify command line to accept multiple commands, or
244 create complex command line options
246 debug("+", self.lctl, cmds)
247 if config.noexec(): return (0, [])
248 p = popen2.Popen3(self.lctl, 1)
249 p.tochild.write(cmds + "\n")
251 out = p.fromchild.readlines()
252 err = p.childerr.readlines()
254 if os.WIFEXITED(ret):
255 rc = os.WEXITSTATUS(ret)
259 raise CommandError(self.lctl, err, rc)
263 def network(self, net, nid):
264 """ initialized network and add "self" """
265 # Idea: "mynid" could be used for all network types to add "self," and then
266 # this special case would be gone and the "self" hack would be hidden.
272 quit""" % (net, nid, nid)
281 # create a new connection
282 def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
290 quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
296 quit""" % (net, servuuid, nid, nid, port, )
300 # add a route to a range
301 def add_route(self, net, gw, lo, hi):
305 quit """ % (net, gw, lo, hi)
309 # add a route to a range
310 def del_route(self, net, gw, lo, hi):
318 # add a route to a host
319 def add_route_host(self, net, uuid, gw, tgt):
324 quit """ % (net, uuid, tgt, gw, tgt)
327 # add a route to a range
328 def del_route_host(self, net, uuid, gw, tgt):
334 quit """ % (net, uuid, tgt)
337 # disconnect one connection
338 def disconnect(self, net, nid, port, servuuid):
344 quit""" % (net, nid, servuuid)
348 def disconnectAll(self, net):
357 # create a new device with lctl
358 def newdev(self, attach, setup = ""):
363 quit""" % (attach, setup)
367 def cleanup(self, name, uuid):
377 def lovconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
381 lovconfig %s %d %d %d %s %s
382 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
386 def dump(self, dump_file):
389 quit""" % (dump_file)
392 # ============================================================
393 # Various system-level functions
394 # (ideally moved to their own module)
# NOTE(review): elided numbered listing; code kept verbatim, comments only
# added. The 'def run(*args):' header and each function's return lines are
# missing from this view.
396 # Run a command and return the output and status.
397 # stderr is sent to /dev/null, could use popen3 to
398 # save it if necessary
400 cmd = string.join(map(str,args))
402 if config.noexec(): return (0, [])
403 f = os.popen(cmd + ' 2>&1')
412 # Run a command in the background.
413 def run_daemon(*args):
414 cmd = string.join(map(str,args))
416 if config.noexec(): return 0
417 f = os.popen(cmd + ' 2>&1')
425 # Determine full path to use for an external command
426 # searches dirname(argv[0]) first, then PATH
# (def find_prog(cmd): header elided -- the loop over syspath follows.)
428 syspath = string.split(os.environ['PATH'], ':')
429 cmdpath = os.path.dirname(sys.argv[0])
430 syspath.insert(0, cmdpath);
431 syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
433 prog = os.path.join(d,cmd)
434 if os.access(prog, os.X_OK):
438 # Recursively look for file starting at base dir
# NOTE(review): elided numbered listing; return statements of both functions
# (orig ~442, ~447-448, ~453, ~455+) are missing. Code kept verbatim.
439 def do_find_file(base, mod):
440 fullname = os.path.join(base, mod)
441 if os.access(fullname, os.R_OK):
443 for d in os.listdir(base):
444 dir = os.path.join(base,d)
445 if os.path.isdir(dir):
446 module = do_find_file(dir, mod)
# Look for the built kernel module <modname>.o under src_dir/dev_dir first,
# presumably falling back to a recursive search (fallback lines elided).
450 def find_module(src_dir, dev_dir, modname):
451 mod = '%s.o' % (modname)
452 module = src_dir +'/'+ dev_dir +'/'+ mod
454 if os.access(module, os.R_OK):
460 # is the path a block device?
# NOTE(review): elided numbered listing; def headers for is_block, loop_base
# and find_loop plus several branch lines are missing. Code kept verbatim.
467 return stat.S_ISBLK(s[stat.ST_MODE])
469 # build fs according to type
471 def mkfs(fstype, dev):
472 if(fstype in ('ext3', 'extN')):
473 mkfs = 'mkfs.ext2 -j -b 4096'
475 print 'unsupported fs type: ', fstype
476 if not is_block(dev):
480 (ret, out) = run (mkfs, force, dev)
482 panic("Unable to build fs:", dev)
483 # enable hash tree indexing on fs
485 htree = 'echo "feature FEATURE_C5" | debugfs -w'
486 (ret, out) = run (htree, dev)
488 panic("Unable to enable htree:", dev)
490 # some systems use /dev/loopN, some /dev/loop/N
494 if not os.access(loop + str(0), os.R_OK):
496 if not os.access(loop + str(0), os.R_OK):
497 panic ("can't access loop devices")
500 # find loop device assigned to thefile
# Scans losetup output for each /dev/loop node looking for 'file' -- note the
# local 'stat' here shadows the imported stat module (pre-existing smell).
503 for n in xrange(0, MAX_LOOP_DEVICES):
505 if os.access(dev, os.R_OK):
506 (stat, out) = run('losetup', dev)
507 if (out and stat == 0):
508 m = re.search(r'\((.*)\)', out[0])
509 if m and file == m.group(1):
515 # create file if necessary and assign the first free loop device
# NOTE(review): elided numbered listing; return lines and some branches are
# missing. Code kept verbatim; comments only added.
516 def init_loop(file, size, fstype):
517 dev = find_loop(file)
519 print 'WARNING file:', file, 'already mapped to', dev
# Sparse-create the backing file ('count=0 seek=size') when reformatting or
# when the file does not yet exist/is not writable.
521 if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
522 run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
524 # find next free loop
525 for n in xrange(0, MAX_LOOP_DEVICES):
527 if os.access(dev, os.R_OK):
528 (stat, out) = run('losetup', dev)
530 run('losetup', dev, file)
533 print "out of loop devices"
535 print "out of loop devices"
538 # undo loop assignment
539 def clean_loop(file):
540 dev = find_loop(file)
542 ret, out = run('losetup -d', dev)
544 log('unable to clean loop device:', dev, 'for file:', file)
547 # determine if dev is formatted as a <fstype> filesystem
# NOTE(review): elided numbered listing; need_format's return (orig ~550-551)
# and block_dev's format call + return (orig ~558-560) are missing. Code
# kept verbatim.
548 def need_format(fstype, dev):
549 # FIXME don't know how to implement this
552 # initialize a block device if needed
553 def block_dev(dev, size, fstype, format):
554 if config.noexec(): return dev
555 if not is_block(dev):
556 dev = init_loop(dev, size, fstype)
557 if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
561 # panic("device:", dev,
562 # "not prepared, and autoformat is not set.\n",
563 # "Rerun with --reformat option to format ALL filesystems")
# NOTE(review): elided numbered listing -- the 'def if2addr(iface):' header
# (orig ~567) and several branch/return lines are missing. Code kept verbatim.
568 """lookup IP address for an interface"""
569 rc, out = run("/sbin/ifconfig", iface)
572 addr = string.split(out[1])[1]
573 ip = string.split(addr, ':')[1]
576 def get_local_address(net_type, wildcard):
577 """Return the local address for the network type."""
579 if net_type == 'tcp':
581 iface, star = string.split(wildcard, ':')
582 local = if2addr(iface)
584 panic ("unable to determine ip for:", wildcard)
586 host = socket.gethostname()
587 local = socket.gethostbyname(host)
588 elif net_type == 'elan':
589 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
591 fp = open('/proc/elan/device0/position', 'r')
592 lines = fp.readlines()
601 elif net_type == 'gm':
602 fixme("automatic local address for GM")
607 # ============================================================
608 # Classes to prepare and cleanup the various objects
# NOTE(review): elided numbered listing -- the 'class Module' header, the
# cleanup() method header and various try/return lines are missing. Code
# kept verbatim; comments only added.
611 """ Base class for the rest of the modules. The default cleanup method is
612 defined here, as well as some utilitiy funcs.
614 def __init__(self, module_name, dom_node):
615 self.dom_node = dom_node
616 self.module_name = module_name
617 self.name = get_attr(dom_node, 'name')
618 self.uuid = get_attr(dom_node, 'uuid')
619 self.kmodule_list = []
623 def info(self, *args):
624 msg = string.join(map(str,args))
625 print self.module_name + ":", self.name, self.uuid, msg
628 def lookup_server(self, srv_uuid):
629 """ Lookup a server's network information """
630 net = get_ost_net(self.dom_node.parentNode, srv_uuid)
632 panic ("Unable to find a server for:", srv_uuid)
633 self._server = Network(net)
635 def get_server(self):
639 """ default cleanup, used for most modules """
641 srv = self.get_server()
642 if srv and local_net(srv):
644 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
645 except CommandError, e:
646 log(self.module_name, "disconnect failed: ", self.name)
650 lctl.cleanup(self.name, self.uuid)
651 except CommandError, e:
652 log(self.module_name, "cleanup failed: ", self.name)
656 def add_module(self, dev_dir, modname):
657 """Append a module to list of modules to load."""
658 self.kmodule_list.append((dev_dir, modname))
660 def mod_loaded(self, modname):
661 """Check if a module is already loaded. Look in /proc/modules for it."""
662 fp = open('/proc/modules')
663 lines = fp.readlines()
665 # please forgive my tired fingers for this one
666 ret = filter(lambda word, mod=modname: word == mod,
667 map(lambda line: string.split(line)[0], lines))
670 def load_module(self):
671 """Load all the modules in the list in the order they appear."""
672 for dev_dir, mod in self.kmodule_list:
673 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
674 if self.mod_loaded(mod) and not config.noexec():
676 log ('loading module:', mod)
# Prefer insmod of a module found in the source tree; presumably falls back
# to modprobe when no source build is available (branch lines elided).
678 module = find_module(config.src_dir(),dev_dir, mod)
680 panic('module not found:', mod)
681 (rc, out) = run('/sbin/insmod', module)
683 raise CommandError('insmod', out, rc)
685 (rc, out) = run('/sbin/modprobe', mod)
687 raise CommandError('modprobe', out, rc)
689 def cleanup_module(self):
690 """Unload the modules in the list in reverse order."""
691 rev = self.kmodule_list
693 for dev_dir, mod in rev:
694 if not self.mod_loaded(mod):
# Dump the kernel debug log right before portals goes away, if requested.
697 if mod == 'portals' and config.dump_file():
698 lctl.dump(config.dump_file())
699 log('unloading module:', mod)
702 (rc, out) = run('/sbin/rmmod', mod)
704 log('! unable to unload module:', mod)
# NOTE(review): elided numbered listing; several lines of prepare()/cleanup()
# (headers, try blocks, conditionals) are missing. Code kept verbatim;
# comments only added.
708 class Network(Module):
709 def __init__(self,dom_node):
710 Module.__init__(self, 'NETWORK', dom_node)
711 self.net_type = get_attr(dom_node,'type')
712 self.nid = get_text(dom_node, 'server', '*')
713 self.port = get_text_int(dom_node, 'port', 0)
714 self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
715 self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
# A '*' nid is resolved to this host's address for the network type.
717 self.nid = get_local_address(self.net_type, self.nid)
719 panic("unable to set nid for", self.net_type, self.nid)
720 debug("nid:", self.nid)
# Kernel modules needed for this network type, loaded in list order.
722 self.add_module('portals/linux/oslib/', 'portals')
723 if node_needs_router():
724 self.add_module('portals/linux/router', 'kptlrouter')
725 if self.net_type == 'tcp':
726 self.add_module('portals/linux/socknal', 'ksocknal')
727 if self.net_type == 'elan':
728 self.add_module('portals/linux/rqswnal', 'kqswnal')
729 if self.net_type == 'gm':
730 self.add_module('portals/linux/gmnal', 'kgmnal')
731 self.add_module('lustre/obdclass', 'obdclass')
732 self.add_module('lustre/ptlrpc', 'ptlrpc')
735 self.info(self.net_type, self.nid, self.port)
736 if self.net_type == 'tcp':
737 ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, self.port)
739 raise CommandError(TCP_ACCEPTOR, out, ret)
740 ret = self.dom_node.getElementsByTagName('route_tbl')
742 for r in a.getElementsByTagName('route'):
743 net_type = get_attr(r, 'type')
744 gw = get_attr(r, 'gw')
745 lo = get_attr(r, 'lo')
746 hi = get_attr(r,'hi', '')
747 lctl.add_route(net_type, gw, lo, hi)
748 if net_type == 'tcp' and net_type == self.net_type and hi == '':
749 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
751 panic("no server for nid", lo)
753 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
756 lctl.network(self.net_type, self.nid)
757 lctl.newdev(attach = "ptlrpc RPCDEV")
760 self.info(self.net_type, self.nid, self.port)
761 ret = self.dom_node.getElementsByTagName('route_tbl')
763 for r in a.getElementsByTagName('route'):
764 lo = get_attr(r, 'lo')
765 hi = get_attr(r,'hi', '')
766 if self.net_type == 'tcp' and hi == '':
767 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
769 panic("no server for nid", lo)
772 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
773 except CommandError, e:
774 print "disconnect failed: ", self.name
778 lctl.del_route(self.net_type, self.nid, lo, hi)
779 except CommandError, e:
780 print "del_route failed: ", self.name
785 lctl.cleanup("RPCDEV", "")
786 except CommandError, e:
787 print "cleanup failed: ", self.name
791 lctl.disconnectAll(self.net_type)
792 except CommandError, e:
793 print "disconnectAll failed: ", self.name
796 if self.net_type == 'tcp':
797 # yikes, this ugly! need to save pid in /var/something
798 run("killall acceptor")
# NOTE(review): elided numbered listing -- the class headers for LDLM, LOV
# and MDS (orig ~800, ~809, ~895) and many method headers/branches are
# missing. Code kept verbatim; comments only added.
801 def __init__(self,dom_node):
802 Module.__init__(self, 'LDLM', dom_node)
803 self.add_module('lustre/ldlm', 'ldlm')
806 lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
# LOV: striping configuration plus the list of OSC uuids it aggregates.
810 def __init__(self,dom_node):
811 Module.__init__(self, 'LOV', dom_node)
812 self.mds_uuid = get_first_ref(dom_node, 'mds')
813 mds= lookup(dom_node.parentNode, self.mds_uuid)
814 self.mds_name = getName(mds)
815 devs = dom_node.getElementsByTagName('devices')
818 self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
819 self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
820 self.pattern = get_attr_int(dev_node, 'pattern', 0)
821 self.devlist = get_all_refs(dev_node, 'osc')
822 self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
823 self.add_module('lustre/mdc', 'mdc')
824 self.add_module('lustre/lov', 'lov')
827 for osc_uuid in self.devlist:
828 osc = lookup(self.dom_node.parentNode, osc_uuid)
833 panic('osc not found:', osc_uuid)
834 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
835 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
836 self.stripe_off, self.pattern, self.devlist, self.mds_name)
837 lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
838 setup ="%s" % (mdc_uuid))
841 for osc_uuid in self.devlist:
842 osc = lookup(self.dom_node.parentNode, osc_uuid)
847 panic('osc not found:', osc_uuid)
849 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
852 def load_module(self):
853 for osc_uuid in self.devlist:
854 osc = lookup(self.dom_node.parentNode, osc_uuid)
860 panic('osc not found:', osc_uuid)
861 Module.load_module(self)
864 def cleanup_module(self):
865 Module.cleanup_module(self)
866 for osc_uuid in self.devlist:
867 osc = lookup(self.dom_node.parentNode, osc_uuid)
873 panic('osc not found:', osc_uuid)
# LOVConfig: pushes the LOV striping parameters to the MDS via lctl.
875 class LOVConfig(Module):
876 def __init__(self,dom_node):
877 Module.__init__(self, 'LOVConfig', dom_node)
878 self.lov_uuid = get_first_ref(dom_node, 'lov')
879 l = lookup(dom_node.parentNode, self.lov_uuid)
884 self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
885 lov.pattern, lov.devlist, lov.mds_name)
886 lctl.lovconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
887 lov.stripe_sz, lov.stripe_off, lov.pattern,
888 string.join(lov.devlist))
# MDS: metadata server backed by a (possibly loop-mounted) block device.
896 def __init__(self,dom_node):
897 Module.__init__(self, 'MDS', dom_node)
898 self.devname, self.size = get_device(dom_node)
899 self.fstype = get_text(dom_node, 'fstype')
900 self.format = get_text(dom_node, 'autoformat', "no")
901 if self.fstype == 'extN':
902 self.add_module('lustre/extN', 'extN')
903 self.add_module('lustre/mds', 'mds')
904 self.add_module('lustre/mds', 'mds_%s' % (self.fstype))
907 self.info(self.devname, self.fstype, self.format)
908 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
909 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
910 setup ="%s %s" %(blkdev, self.fstype))
913 clean_loop(self.devname)
915 # Very unusual case, as there is no MDC element in the XML anymore
916 # Builds itself from an MDS node
# NOTE(review): elided numbered listing -- the class headers for MDC, OBD,
# OST, VOSC and OSC plus various method headers/branches are missing. Code
# kept verbatim; comments only added.
918 def __init__(self,dom_node):
919 self.mds = MDS(dom_node)
920 self.dom_node = dom_node
921 self.module_name = 'MDC'
922 self.kmodule_list = []
# MDC name/uuid are synthesized from the local hostname, not from the XML.
926 host = socket.gethostname()
927 self.name = 'MDC_'+host
928 self.uuid = self.name+'_UUID'
930 self.lookup_server(self.mds.uuid)
931 self.add_module('lustre/mdc', 'mdc')
934 self.info(self.mds.uuid)
935 srv = self.get_server()
936 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
937 lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
938 setup ="%s %s" %(self.mds.uuid, srv.uuid))
# OBD: object storage target device; 'obdecho' needs no backing device.
941 def __init__(self, dom_node):
942 Module.__init__(self, 'OBD', dom_node)
943 self.obdtype = get_attr(dom_node, 'type')
944 self.devname, self.size = get_device(dom_node)
945 self.fstype = get_text(dom_node, 'fstype')
946 self.format = get_text(dom_node, 'autoformat', 'yes')
947 if self.fstype == 'extN':
948 self.add_module('lustre/extN', 'extN')
949 self.add_module('lustre/' + self.obdtype, self.obdtype)
951 # need to check /proc/mounts and /etc/mtab before
952 # formatting anything.
953 # FIXME: check if device is already formatted.
955 self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
956 if self.obdtype == 'obdecho':
959 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
960 lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
961 setup ="%s %s" %(blkdev, self.fstype))
964 if not self.obdtype == 'obdecho':
965 clean_loop(self.devname)
# OST: serves one OBD to the network.
968 def __init__(self,dom_node):
969 Module.__init__(self, 'OST', dom_node)
970 self.obd_uuid = get_first_ref(dom_node, 'obd')
971 self.add_module('lustre/ost', 'ost')
974 self.info(self.obd_uuid)
975 lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
976 setup ="%s" % (self.obd_uuid))
979 # virtual interface for OSC and LOV
# VOSC delegates everything to either a LOV or an OSC, chosen by node name.
981 def __init__(self,dom_node):
982 Module.__init__(self, 'VOSC', dom_node)
983 if dom_node.nodeName == 'lov':
984 self.osc = LOV(dom_node)
986 self.osc = OSC(dom_node)
991 def load_module(self):
992 self.osc.load_module()
993 def cleanup_module(self):
994 self.osc.cleanup_module()
# OSC: client-side connection to one OST.
998 def __init__(self,dom_node):
999 Module.__init__(self, 'OSC', dom_node)
1000 self.obd_uuid = get_first_ref(dom_node, 'obd')
1001 self.ost_uuid = get_first_ref(dom_node, 'ost')
1002 self.lookup_server(self.ost_uuid)
1003 self.add_module('lustre/osc', 'osc')
1006 self.info(self.obd_uuid, self.ost_uuid)
1007 srv = self.get_server()
1009 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1013 lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
1015 panic ("no route to", srv.nid)
1017 lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
1018 setup ="%s %s" %(self.obd_uuid, srv.uuid))
1021 srv = self.get_server()
1023 Module.cleanup(self)
1025 self.info(self.obd_uuid, self.ost_uuid)
1029 lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
1030 except CommandError, e:
1031 print "del_route failed: ", self.name
1034 Module.cleanup(self)
# Mountpoint: client mount of the filesystem via lustre_lite.
1037 class Mountpoint(Module):
1038 def __init__(self,dom_node):
1039 Module.__init__(self, 'MTPT', dom_node)
1040 self.path = get_text(dom_node, 'path')
1041 self.mds_uuid = get_first_ref(dom_node, 'mds')
1042 self.lov_uuid = get_first_ref(dom_node, 'osc')
1043 self.add_module('lustre/mdc', 'mdc')
1044 self.add_module('lustre/llite', 'llite')
1045 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1050 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
1052 self.info(self.path, self.mds_uuid,self.lov_uuid)
1053 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
1054 (self.lov_uuid, mdc_uuid, self.path)
1055 run("mkdir", self.path)
1058 panic("mount failed:", self.path)
1061 self.info(self.path, self.mds_uuid,self.lov_uuid)
1063 (rc, out) = run("umount -f", self.path)
1065 (rc, out) = run("umount", self.path)
1067 log("umount failed, cleanup will most likely not work.")
1068 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1070 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
1072 def load_module(self):
1073 self.osc.load_module()
1074 Module.load_module(self)
1075 def cleanup_module(self):
1076 Module.cleanup_module(self)
1077 self.osc.cleanup_module()
1080 # ============================================================
1081 # XML processing and query
1082 # TODO: Change query funcs to use XPath, which is much cleaner
# NOTE(review): elided numbered listing; default-return branches of most of
# these query helpers are missing. Code kept verbatim; comments only added.
1084 def get_device(obd):
1085 list = obd.getElementsByTagName('device')
1089 size = get_attr_int(dev, 'size', 0)
1090 return dev.firstChild.data, size
1093 # Get the text content from the first matching child
1094 # If there is no content (or it is all whitespace), return
1096 def get_text(dom_node, tag, default=""):
1097 list = dom_node.getElementsByTagName(tag)
1100 dom_node.normalize()
1101 if dom_node.firstChild:
1102 txt = string.strip(dom_node.firstChild.data)
1107 def get_text_int(dom_node, tag, default=0):
1108 list = dom_node.getElementsByTagName(tag)
1112 dom_node.normalize()
1113 if dom_node.firstChild:
1114 txt = string.strip(dom_node.firstChild.data)
1119 panic("text value is not integer:", txt)
1122 def get_attr(dom_node, attr, default=""):
1123 v = dom_node.getAttribute(attr)
1128 def get_attr_int(dom_node, attr, default=0):
1130 v = dom_node.getAttribute(attr)
1135 panic("attr value is not integer", v)
1138 def get_first_ref(dom_node, tag):
1139 """ Get the first uuidref of the type TAG. Used when only
1140 one is expected. Returns the uuid."""
1142 refname = '%s_ref' % tag
1143 list = dom_node.getElementsByTagName(refname)
1145 uuid = getRef(list[0])
1148 def get_all_refs(dom_node, tag):
1149 """ Get all the refs of type TAG. Returns list of uuids. """
1151 refname = '%s_ref' % tag
1152 list = dom_node.getElementsByTagName(refname)
1155 uuids.append(getRef(i))
# Resolve an OST/MDS uuid to its <network> element via its 'network' ref.
1158 def get_ost_net(dom_node, uuid):
1159 ost = lookup(dom_node, uuid)
1160 uuid = get_first_ref(ost, 'network')
1163 return lookup(dom_node, uuid)
# Find the Network whose <server> text equals nid; falls through (returns
# None) when no match -- trailing lines elided.
1165 def nid2server(dom_node, nid):
1166 netlist = dom_node.getElementsByTagName('network')
1167 for net_node in netlist:
1168 if get_text(net_node, 'server') == nid:
1169 return Network(net_node)
# Linear scan of child elements for a matching uuid; return lines elided.
1172 def lookup(dom_node, uuid):
1173 for n in dom_node.childNodes:
1174 if n.nodeType == n.ELEMENT_NODE:
1175 if getUUID(n) == uuid:
1182 # Get name attribute of dom_node
def getName(dom_node):
    """Return the node's 'name' attribute ('' when the attribute is absent)."""
    name = dom_node.getAttribute('name')
    return name
def getRef(dom_node):
    """Return the node's 'uuidref' attribute ('' when absent)."""
    ref = dom_node.getAttribute('uuidref')
    return ref
1189 # Get name attribute of dom_node
def getUUID(dom_node):
    """Return the node's 'uuid' attribute ('' when absent)."""
    uuid = dom_node.getAttribute('uuid')
    return uuid
1193 # the tag name is the service type
1194 # fixme: this should do some checks to make sure the dom_node is a service
def getServiceType(dom_node):
    """A service element's tag name doubles as its service type."""
    service_type = dom_node.nodeName
    return service_type
1199 # determine what "level" a particular node is at.
1200 # the order of initialization is based on level.
# NOTE(review): elided numbered listing; the numeric return of each branch
# (orig ~1204, ~1206, ...) and various fallthroughs are missing. Code kept
# verbatim; comments only added.
1201 def getServiceLevel(dom_node):
1202 type = getServiceType(dom_node)
1203 if type in ('network',):
1205 elif type in ('device', 'ldlm'):
1207 elif type in ('obd', 'mdd'):
1209 elif type in ('mds','ost'):
1211 elif type in ('mdc','osc'):
1213 elif type in ('lov', 'lovconfig'):
1215 elif type in ('mountpoint',):
1220 # return list of services in a profile. list is a list of tuples
1221 # [(level, dom_node),]
1222 def getServices(lustreNode, profileNode):
1224 for n in profileNode.childNodes:
1225 if n.nodeType == n.ELEMENT_NODE:
1226 servNode = lookup(lustreNode, getRef(n))
1229 panic('service not found: ' + getRef(n))
1230 level = getServiceLevel(servNode)
1231 list.append((level, servNode))
# Look up an element by tag and name attribute; return lines elided.
1235 def getByName(lustreNode, name, tag):
1236 ndList = lustreNode.getElementsByTagName(tag)
1238 if getName(nd) == name:
1243 ############################################################
1245 # FIXME: clean this mess up!
# prepare_mdc/cleanup_mdc: build the implicit MDC from the referenced MDS
# node (see the MDC class above); bodies largely elided in this listing.
1248 def prepare_mdc(dom_node, mds_uuid):
1250 mds_node = lookup(dom_node, mds_uuid);
1252 panic("no mds:", mds_uuid)
1261 def cleanup_mdc(dom_node, mds_uuid):
1263 mds_node = lookup(dom_node, mds_uuid);
1265 panic("no mds:", mds_uuid)
1273 ############################################################
1274 # routing ("rooting")
# NOTE(review): elided numbered listing; module globals (local_node, routes,
# router_flag declarations) and several branches/returns are missing. Code
# kept verbatim; comments only added.
# Record this node's (net_type, address) pairs in the local_node global.
1280 def init_node(dom_node):
1281 global local_node, router_flag
1282 netlist = dom_node.getElementsByTagName('network')
1283 for dom_net in netlist:
1284 type = get_attr(dom_net, 'type')
1285 gw = get_text(dom_net, 'server')
1286 local_node.append((type, gw))
1288 def node_needs_router():
1291 def get_routes(type, gw, dom_net):
1292 """ Return the routes as a list of tuples of the form:
1293 [(type, gw, lo, hi),]"""
1295 tbl = dom_net.getElementsByTagName('route_tbl')
1297 routes = t.getElementsByTagName('route')
1299 lo = get_attr(r, 'lo')
1300 hi = get_attr(r, 'hi', '')
1301 res.append((type, gw, lo, hi))
1305 def init_route_config(lustre):
1306 """ Scan the lustre config looking for routers. Build list of
1308 global routes, router_flag
1310 list = lustre.getElementsByTagName('node')
1312 if get_attr(node, 'router'):
1314 for (local_type, local_nid) in local_node:
1316 netlist = node.getElementsByTagName('network')
1317 for dom_net in netlist:
1318 if local_type == get_attr(dom_net, 'type'):
1319 gw = get_text(dom_net, 'server')
# Collect routes reachable via this gateway on other network types.
1323 for dom_net in netlist:
1324 if local_type != get_attr(dom_net, 'type'):
1325 for route in get_routes(local_type, gw, dom_net):
1326 routes.append(route)
# (local_net fragment: true when the target net type matches a local iface.)
1331 for iface in local_node:
1332 if net.net_type == iface[0]:
1336 def find_route(net):
1337 global local_node, routes
1338 frm_type = local_node[0][0]
1339 to_type = net.net_type
1341 debug ('looking for route to', to_type,to)
1350 ############################################################
# NOTE(review): elided numbered listing; most of the type-dispatch branches
# of startService and the prepare/cleanup calls are missing. Code kept
# verbatim; comments only added.
# Instantiate the right Module subclass for a service node, then (in elided
# lines) either load/unload its kernel modules or setup/clean up the device.
1353 def startService(dom_node, module_flag):
1354 type = getServiceType(dom_node)
1355 debug('Service:', type, getName(dom_node), getUUID(dom_node))
1356 # there must be a more dynamic way of doing this...
1362 elif type == 'lovconfig':
1363 n = LOVConfig(dom_node)
1364 elif type == 'network':
1365 n = Network(dom_node)
1376 elif type == 'mountpoint':
1377 n = Mountpoint(dom_node)
1379 panic ("unknown service type:", type)
1384 if config.cleanup():
1389 if config.nosetup():
1391 if config.cleanup():
1397 # Prepare the system to run lustre using a particular profile
1398 # in the configuration.
1399 # * load the modules
1400 # * setup networking for the current node
1401 # * make sure partitions are in place and prepared
1402 # * initialize devices with lctl
1403 # Levels are important, and need to be enforced.
1404 def startProfile(lustreNode, profileNode, module_flag):
1406 panic("profile:", profile, "not found.")
1407 services = getServices(lustreNode, profileNode)
# For cleanup the service list is presumably processed in reverse level
# order (ordering lines elided) -- TODO confirm.
1408 if config.cleanup():
1411 startService(s[1], module_flag)
1416 def doHost(lustreNode, hosts):
1420 dom_node = getByName(lustreNode, h, 'node')
1425 print 'No host entry found.'
1428 if not get_attr(dom_node, 'router'):
1430 init_route_config(lustreNode)
1435 # Two step process: (1) load modules, (2) setup lustre
1436 # if not cleaning, load modules first.
1437 module_flag = not config.cleanup()
1438 reflist = dom_node.getElementsByTagName('profile')
1439 for profile in reflist:
1440 startProfile(lustreNode, profile, module_flag)
1442 if not config.cleanup():
1443 sys_set_debug_path()
1444 script = config.gdb_script()
1445 run(lctl.lctl, ' modules >', script)
1447 # dump /tmp/ogdb and sleep/pause here
1448 log ("The GDB module script is in", script)
# Second pass with the flag flipped: setup after load, or unload after clean.
1451 module_flag = not module_flag
1452 for profile in reflist:
1453 startProfile(lustreNode, profile, module_flag)
1455 ############################################################
1456 # Command line processing
# NOTE(review): elided numbered listing; most option-handling bodies, the
# fetch() header, and file-close/return lines are missing. Code kept
# verbatim; comments only added.
1458 def parse_cmdline(argv):
1459 short_opts = "hdnvf"
1460 long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
1461 "portals=", "makeldiff", "cleanup", "noexec",
1462 "help", "node=", "nomod", "nosetup",
1467 opts, args = getopt.getopt(argv, short_opts, long_opts)
1468 except getopt.error:
1473 if o in ("-h", "--help"):
1475 if o in ("-d","--cleanup"):
1477 if o in ("-v", "--verbose"):
1479 if o in ("-n", "--noexec"):
1482 if o == "--portals":
1486 if o == "--reformat":
1494 if o == "--nosetup":
1498 if o in ("-f", "--force"):
# (fetch fragment: downloads the config XML from a URL.)
1506 s = urllib.urlopen(url)
# Use the source tree two levels up when run from a build directory.
1512 def setupModulePath(cmd):
1513 base = os.path.dirname(cmd)
1514 if os.access(base+"/Makefile", os.R_OK):
1515 config.src_dir(base + "/../../")
1517 def sys_set_debug_path():
1518 debug("debug path: ", config.debug_path())
1522 fp = open('/proc/sys/portals/debug_path', 'w')
1523 fp.write(config.debug_path())
1528 #/proc/sys/net/core/rmem_max
1529 #/proc/sys/net/core/wmem_max
1530 def sys_set_netmem_max(path, max):
1531 debug("setting", path, "to at least", max)
1539 fp = open(path, 'w')
1540 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the portals/obd character device nodes when they are missing."""
    # Same nodes and major/minor numbers as before; mknod only when absent.
    for node, mknod_cmd in (('/dev/portals', 'mknod /dev/portals c 10 240'),
                            ('/dev/obd', 'mknod /dev/obd c 10 241')):
        if not os.access(node, os.R_OK):
            run(mknod_cmd)
1551 # Add dir to the global PATH, if not already there.
# NOTE(review): elided numbered listing; the early 'return' of add_to_path
# (orig ~1555), the main() header, try/except bodies and several branches
# are missing. Code kept verbatim; comments only added.
1552 def add_to_path(new_dir):
1553 syspath = string.split(os.environ['PATH'], ':')
1554 if new_dir in syspath:
1556 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
1559 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
1560 # ensure basic elements are in the system path
1561 def sanitise_path():
1562 for dir in DEFAULT_PATH:
1565 # Initialize or shutdown lustre according to a configuration file
1566 # * prepare the system for lustre
1567 # * configure devices with lctl
1568 # Shutdown does steps in reverse
# (main() header elided; body follows.)
1571 global TCP_ACCEPTOR, lctl, MAXTCPBUF
1572 host = socket.gethostname()
1576 args = parse_cmdline(sys.argv[1:])
# Config comes either from a local XML file argument or a fetched URL.
1578 if not os.access(args[0], os.R_OK):
1579 print 'File not found or readable:', args[0]
1581 dom = xml.dom.minidom.parse(args[0])
1583 xmldata = fetch(config.url())
1584 dom = xml.dom.minidom.parseString(xmldata)
# Candidate node names to configure: --node value, else hostname/localhost.
1590 node_list.append(config.node())
1593 node_list.append(host)
1594 node_list.append('localhost')
1595 debug("configuring for host: ", node_list)
# Per-host suffix keeps debug/gdb files from colliding across nodes.
1598 config._debug_path = config._debug_path + '-' + host
1599 config._gdb_script = config._gdb_script + '-' + host
1601 TCP_ACCEPTOR = find_prog('acceptor')
1602 if not TCP_ACCEPTOR:
1604 TCP_ACCEPTOR = 'acceptor'
1605 debug('! acceptor not found')
1607 panic('acceptor not found')
1609 lctl = LCTLInterface('lctl')
1611 setupModulePath(sys.argv[0])
1613 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
1614 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
1615 doHost(dom.documentElement, node_list)
# Entry point: run main() (elided), map errors to the process exit status.
1617 if __name__ == "__main__":
1620 except LconfError, e:
1622 except CommandError, e:
1626 if first_cleanup_error:
1627 sys.exit(first_cleanup_error)