3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # This file is part of Lustre, http://www.lustre.org.
7 # Lustre is free software; you can redistribute it and/or
8 # modify it under the terms of version 2 of the GNU General Public
9 # License as published by the Free Software Foundation.
11 # Lustre is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with Lustre; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # lconf - lustre configuration tool
22 # lconf is the main driver script for starting and stopping
23 # lustre filesystem services.
25 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import string, os, stat, popen2, socket, time
30 import xml.dom.minidom
35 DEFAULT_TCPBUF = 1048576
37 # Maximum number of devices to search for.
38 # (the /dev/loop* nodes need to be created beforehand)
39 MAX_LOOP_DEVICES = 256
# Remember only the FIRST failing cleanup return code; later errors are
# ignored so the earliest failure is what the process finally reports.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record rc as the eventual exit status unless an error was already seen."""
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
49 print """usage: lconf config.xml
51 config.xml Lustre configuration in xml format.
52 --get <url> URL to fetch a config file
53 --node <nodename> Load config for <nodename>
54 -d | --cleanup Cleans up config. (Shutdown)
55 -f | --force Forced unmounting and/or obd detach during cleanup
56 -v | --verbose Print system commands as they are run
57 -h | --help Print this help
58 --gdb Prints message after creating gdb module script
59 and sleeps for 5 seconds.
60 -n | --noexec Prints the commands and steps that will be run for a
61 config without executing them. This can used to check if a
62 config file is doing what it should be doing. (Implies -v)
63 --nomod Skip load/unload module step.
64 --nosetup Skip device setup/cleanup step.
65 --reformat Reformat all devices (without question)
66 --dump <file> Dump the kernel debug log before portals is unloaded
67 --startlevel <num> Specify the level of services to start with (default 0)
68 --endlevel <num> Specify the level of services to end with (default 100)
69 Levels are aproximatly like:
79 --ldap server LDAP server with lustre config database
80 --makeldiff Translate xml source to LDIFF
81 This are perhaps not needed:
82 --lustre="src dir" Base directory of lustre sources. Used to search
84 --portals=src Portals source
88 # ============================================================
89 # Config parameters, encapsulated in a class
105 self._gdb_script = '/tmp/ogdb'
106 self._debug_path = '/tmp/lustre-log'
107 self._dump_file = None
109 self._start_level = 0
110 self._end_level = 100
112 def verbose(self, flag = None):
113 if flag: self._verbose = flag
116 def noexec(self, flag = None):
117 if flag: self._noexec = flag
120 def reformat(self, flag = None):
121 if flag: self._reformat = flag
122 return self._reformat
124 def cleanup(self, flag = None):
125 if flag: self._cleanup = flag
128 def gdb(self, flag = None):
129 if flag: self._gdb = flag
132 def nomod(self, flag = None):
133 if flag: self._nomod = flag
136 def nosetup(self, flag = None):
137 if flag: self._nosetup = flag
140 def force(self, flag = None):
141 if flag: self._force = flag
144 def node(self, val = None):
145 if val: self._node = val
148 def url(self, val = None):
149 if val: self._url = val
152 def gdb_script(self):
153 if os.path.isdir('/r'):
154 return '/r' + self._gdb_script
156 return self._gdb_script
158 def debug_path(self):
159 if os.path.isdir('/r'):
160 return '/r' + self._debug_path
162 return self._debug_path
164 def src_dir(self, val = None):
165 if val: self._src_dir = val
168 def dump_file(self, val = None):
169 if val: self._dump_file = val
170 return self._dump_file
172 def startlevel(self, val = None):
173 if val: self._start_level = int(val)
174 return self._start_level
176 def endlevel(self, val = None):
177 if val: self._end_level = int(val)
178 return self._end_level
184 # ============================================================
185 # debugging and error funcs
def fixme(msg = "this feature"):
    """Raise LconfError for a feature that is not implemented yet.

    msg names the missing feature.  Uses the call form of raise (valid
    in both Python 2 and 3, unlike the original `raise E, msg` syntax)
    and fixes the "implmemented" typo in the message.
    """
    raise LconfError(msg + ' not implemented yet.')
191 msg = string.join(map(str,args))
192 if not config.noexec():
193 raise LconfError(msg)
198 msg = string.join(map(str,args))
203 print string.strip(s)
207 msg = string.join(map(str,args))
210 # ============================================================
211 # locally defined exceptions
212 class CommandError (exceptions.Exception):
213 def __init__(self, cmd_name, cmd_err, rc=None):
214 self.cmd_name = cmd_name
215 self.cmd_err = cmd_err
220 if type(self.cmd_err) == types.StringType:
222 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
224 print "! %s: %s" % (self.cmd_name, self.cmd_err)
225 elif type(self.cmd_err) == types.ListType:
227 print "! %s (error %d):" % (self.cmd_name, self.rc)
229 print "! %s:" % (self.cmd_name)
230 for s in self.cmd_err:
231 print "> %s" %(string.strip(s))
235 class LconfError (exceptions.Exception):
236 def __init__(self, args):
240 # ============================================================
241 # handle lctl interface
244 Manage communication with lctl
247 def __init__(self, cmd):
249 Initialize close by finding the lctl binary.
251 self.lctl = find_prog(cmd)
254 debug('! lctl not found')
257 raise CommandError('lctl', "unable to find lctl binary.")
262 the cmds are written to stdin of lctl
263 lctl doesn't return errors when run in script mode, so
265 should modify command line to accept multiple commands, or
266 create complex command line options
268 debug("+", self.lctl, cmds)
269 if config.noexec(): return (0, [])
270 p = popen2.Popen3(self.lctl, 1)
271 p.tochild.write(cmds + "\n")
273 out = p.fromchild.readlines()
274 err = p.childerr.readlines()
276 if os.WIFEXITED(ret):
277 rc = os.WEXITSTATUS(ret)
281 raise CommandError(self.lctl, err, rc)
285 def network(self, net, nid):
286 """ initialized network and add "self" """
287 # Idea: "mynid" could be used for all network types to add "self," and then
288 # this special case would be gone and the "self" hack would be hidden.
294 quit""" % (net, nid, nid)
303 # create a new connection
304 def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
312 quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
318 quit""" % (net, servuuid, nid, nid, port, )
322 # add a route to a range
323 def add_route(self, net, gw, lo, hi):
327 quit """ % (net, gw, lo, hi)
331 def del_route(self, net, gw, lo, hi):
339 # add a route to a host
340 def add_route_host(self, net, uuid, gw, tgt):
345 quit """ % (net, uuid, tgt, gw, tgt)
348 # add a route to a range
349 def del_route_host(self, net, uuid, gw, tgt):
355 quit """ % (net, uuid, tgt)
358 # disconnect one connection
359 def disconnect(self, net, nid, port, servuuid):
365 quit""" % (net, nid, servuuid)
369 def disconnectAll(self, net):
378 # create a new device with lctl
379 def newdev(self, attach, setup = ""):
384 quit""" % (attach, setup)
388 def cleanup(self, name, uuid):
394 quit""" % (name, ('', 'force')[config.force()])
398 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
402 lov_setconfig %s %d %d %d %s %s
403 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
407 def dump(self, dump_file):
410 quit""" % (dump_file)
413 # ============================================================
414 # Various system-level functions
415 # (ideally moved to their own module)
417 # Run a command and return the output and status.
418 # stderr is sent to /dev/null, could use popen3 to
419 # save it if necessary
421 cmd = string.join(map(str,args))
423 if config.noexec(): return (0, [])
424 f = os.popen(cmd + ' 2>&1')
433 # Run a command in the background.
434 def run_daemon(*args):
435 cmd = string.join(map(str,args))
437 if config.noexec(): return 0
438 f = os.popen(cmd + ' 2>&1')
446 # Determine full path to use for an external command
447 # searches dirname(argv[0]) first, then PATH
449 syspath = string.split(os.environ['PATH'], ':')
450 cmdpath = os.path.dirname(sys.argv[0])
451 syspath.insert(0, cmdpath);
452 syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
454 prog = os.path.join(d,cmd)
455 if os.access(prog, os.X_OK):
459 # Recursively look for file starting at base dir
460 def do_find_file(base, mod):
461 fullname = os.path.join(base, mod)
462 if os.access(fullname, os.R_OK):
464 for d in os.listdir(base):
465 dir = os.path.join(base,d)
466 if os.path.isdir(dir):
467 module = do_find_file(dir, mod)
471 def find_module(src_dir, dev_dir, modname):
472 mod = '%s.o' % (modname)
473 module = src_dir +'/'+ dev_dir +'/'+ mod
475 if os.access(module, os.R_OK):
481 # is the path a block device?
488 return stat.S_ISBLK(s[stat.ST_MODE])
490 # build fs according to type
492 def mkfs(fstype, dev):
493 if(fstype in ('ext3', 'extN')):
494 mkfs = 'mkfs.ext2 -j -b 4096'
496 print 'unsupported fs type: ', fstype
497 if not is_block(dev):
501 (ret, out) = run (mkfs, force, dev)
503 panic("Unable to build fs:", dev)
504 # enable hash tree indexing on fs
506 htree = 'echo "feature FEATURE_C5" | debugfs -w'
507 (ret, out) = run (htree, dev)
509 panic("Unable to enable htree:", dev)
511 # some systems use /dev/loopN, some /dev/loop/N
515 if not os.access(loop + str(0), os.R_OK):
517 if not os.access(loop + str(0), os.R_OK):
518 panic ("can't access loop devices")
521 # find loop device assigned to the file
524 for n in xrange(0, MAX_LOOP_DEVICES):
526 if os.access(dev, os.R_OK):
527 (stat, out) = run('losetup', dev)
528 if (out and stat == 0):
529 m = re.search(r'\((.*)\)', out[0])
530 if m and file == m.group(1):
536 # create file if necessary and assign the first free loop device
537 def init_loop(file, size, fstype):
538 dev = find_loop(file)
540 print 'WARNING file:', file, 'already mapped to', dev
542 if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
543 run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
545 # find next free loop
546 for n in xrange(0, MAX_LOOP_DEVICES):
548 if os.access(dev, os.R_OK):
549 (stat, out) = run('losetup', dev)
551 run('losetup', dev, file)
554 print "out of loop devices"
556 print "out of loop devices"
559 # undo loop assignment
560 def clean_loop(file):
561 dev = find_loop(file)
563 ret, out = run('losetup -d', dev)
565 log('unable to clean loop device:', dev, 'for file:', file)
568 # determine if dev is formatted as a <fstype> filesystem
569 def need_format(fstype, dev):
570 # FIXME don't know how to implement this
573 # initialize a block device if needed
574 def block_dev(dev, size, fstype, format):
575 if config.noexec(): return dev
576 if not is_block(dev):
577 dev = init_loop(dev, size, fstype)
578 if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
582 # panic("device:", dev,
583 # "not prepared, and autoformat is not set.\n",
584 # "Rerun with --reformat option to format ALL filesystems")
589 """lookup IP address for an interface"""
590 rc, out = run("/sbin/ifconfig", iface)
593 addr = string.split(out[1])[1]
594 ip = string.split(addr, ':')[1]
597 def get_local_address(net_type, wildcard):
598 """Return the local address for the network type."""
600 if net_type == 'tcp':
602 iface, star = string.split(wildcard, ':')
603 local = if2addr(iface)
605 panic ("unable to determine ip for:", wildcard)
607 host = socket.gethostname()
608 local = socket.gethostbyname(host)
609 elif net_type == 'elan':
610 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
612 fp = open('/proc/elan/device0/position', 'r')
613 lines = fp.readlines()
622 elif net_type == 'gm':
623 fixme("automatic local address for GM")
628 # ============================================================
629 # Classes to prepare and cleanup the various objects
632 """ Base class for the rest of the modules. The default cleanup method is
633 defined here, as well as some utilitiy funcs.
635 def __init__(self, module_name, dom_node):
636 self.dom_node = dom_node
637 self.module_name = module_name
638 self.name = get_attr(dom_node, 'name')
639 self.uuid = get_attr(dom_node, 'uuid')
640 self.kmodule_list = []
644 def info(self, *args):
645 msg = string.join(map(str,args))
646 print self.module_name + ":", self.name, self.uuid, msg
649 def lookup_server(self, srv_uuid):
650 """ Lookup a server's network information """
651 net = get_ost_net(self.dom_node.parentNode, srv_uuid)
653 panic ("Unable to find a server for:", srv_uuid)
654 self._server = Network(net)
656 def get_server(self):
660 """ default cleanup, used for most modules """
662 srv = self.get_server()
663 if srv and local_net(srv):
665 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
666 except CommandError, e:
667 log(self.module_name, "disconnect failed: ", self.name)
671 lctl.cleanup(self.name, self.uuid)
672 except CommandError, e:
673 log(self.module_name, "cleanup failed: ", self.name)
677 def add_module(self, dev_dir, modname):
678 """Append a module to list of modules to load."""
679 self.kmodule_list.append((dev_dir, modname))
681 def mod_loaded(self, modname):
682 """Check if a module is already loaded. Look in /proc/modules for it."""
683 fp = open('/proc/modules')
684 lines = fp.readlines()
686 # please forgive my tired fingers for this one
687 ret = filter(lambda word, mod=modname: word == mod,
688 map(lambda line: string.split(line)[0], lines))
691 def load_module(self):
692 """Load all the modules in the list in the order they appear."""
693 for dev_dir, mod in self.kmodule_list:
694 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
695 if self.mod_loaded(mod) and not config.noexec():
697 log ('loading module:', mod)
699 module = find_module(config.src_dir(),dev_dir, mod)
701 panic('module not found:', mod)
702 (rc, out) = run('/sbin/insmod', module)
704 raise CommandError('insmod', out, rc)
706 (rc, out) = run('/sbin/modprobe', mod)
708 raise CommandError('modprobe', out, rc)
710 def cleanup_module(self):
711 """Unload the modules in the list in reverse order."""
712 rev = self.kmodule_list
714 for dev_dir, mod in rev:
715 if not self.mod_loaded(mod):
718 if mod == 'portals' and config.dump_file():
719 lctl.dump(config.dump_file())
720 log('unloading module:', mod)
723 (rc, out) = run('/sbin/rmmod', mod)
725 log('! unable to unload module:', mod)
729 class Network(Module):
730 def __init__(self,dom_node):
731 Module.__init__(self, 'NETWORK', dom_node)
732 self.net_type = get_attr(dom_node,'type')
733 self.nid = get_text(dom_node, 'server', '*')
734 self.port = get_text_int(dom_node, 'port', 0)
735 self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
736 self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
738 self.nid = get_local_address(self.net_type, self.nid)
740 panic("unable to set nid for", self.net_type, self.nid)
741 debug("nid:", self.nid)
743 self.add_module('portals/linux/oslib/', 'portals')
744 if node_needs_router():
745 self.add_module('portals/linux/router', 'kptlrouter')
746 if self.net_type == 'tcp':
747 self.add_module('portals/linux/socknal', 'ksocknal')
748 if self.net_type == 'elan':
749 self.add_module('portals/linux/rqswnal', 'kqswnal')
750 if self.net_type == 'gm':
751 self.add_module('portals/linux/gmnal', 'kgmnal')
752 self.add_module('lustre/obdclass', 'obdclass')
753 self.add_module('lustre/ptlrpc', 'ptlrpc')
756 self.info(self.net_type, self.nid, self.port)
757 if self.net_type in ('tcp', 'toe'):
758 nal_id = '' # default is socknal
759 if self.net_type == 'toe':
761 ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, nal_id, self.port)
763 raise CommandError(TCP_ACCEPTOR, out, ret)
764 ret = self.dom_node.getElementsByTagName('route_tbl')
766 for r in a.getElementsByTagName('route'):
767 net_type = get_attr(r, 'type')
768 gw = get_attr(r, 'gw')
769 lo = get_attr(r, 'lo')
770 hi = get_attr(r,'hi', '')
771 lctl.add_route(net_type, gw, lo, hi)
772 if net_type == 'tcp' and net_type == self.net_type and hi == '':
773 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
775 panic("no server for nid", lo)
777 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
780 lctl.network(self.net_type, self.nid)
781 lctl.newdev(attach = "ptlrpc RPCDEV")
784 self.info(self.net_type, self.nid, self.port)
785 ret = self.dom_node.getElementsByTagName('route_tbl')
787 for r in a.getElementsByTagName('route'):
788 lo = get_attr(r, 'lo')
789 hi = get_attr(r,'hi', '')
790 if self.net_type == 'tcp' and hi == '':
791 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
793 panic("no server for nid", lo)
796 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
797 except CommandError, e:
798 print "disconnect failed: ", self.name
802 lctl.del_route(self.net_type, self.nid, lo, hi)
803 except CommandError, e:
804 print "del_route failed: ", self.name
809 lctl.cleanup("RPCDEV", "")
810 except CommandError, e:
811 print "cleanup failed: ", self.name
815 lctl.disconnectAll(self.net_type)
816 except CommandError, e:
817 print "disconnectAll failed: ", self.name
820 if self.net_type == 'tcp':
821 # yikes, this ugly! need to save pid in /var/something
822 run("killall acceptor")
825 def __init__(self,dom_node):
826 Module.__init__(self, 'LDLM', dom_node)
827 self.add_module('lustre/ldlm', 'ldlm')
830 lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
834 def __init__(self,dom_node):
835 Module.__init__(self, 'LOV', dom_node)
836 self.mds_uuid = get_first_ref(dom_node, 'mds')
837 mds= lookup(dom_node.parentNode, self.mds_uuid)
838 self.mds_name = getName(mds)
839 devs = dom_node.getElementsByTagName('devices')
842 self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
843 self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
844 self.pattern = get_attr_int(dev_node, 'pattern', 0)
845 self.devlist = get_all_refs(dev_node, 'osc')
846 self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
847 self.add_module('lustre/mdc', 'mdc')
848 self.add_module('lustre/lov', 'lov')
851 for osc_uuid in self.devlist:
852 osc = lookup(self.dom_node.parentNode, osc_uuid)
857 panic('osc not found:', osc_uuid)
858 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
859 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
860 self.stripe_off, self.pattern, self.devlist, self.mds_name)
861 lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
862 setup ="%s" % (mdc_uuid))
865 for osc_uuid in self.devlist:
866 osc = lookup(self.dom_node.parentNode, osc_uuid)
871 panic('osc not found:', osc_uuid)
873 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
876 def load_module(self):
877 for osc_uuid in self.devlist:
878 osc = lookup(self.dom_node.parentNode, osc_uuid)
884 panic('osc not found:', osc_uuid)
885 Module.load_module(self)
888 def cleanup_module(self):
889 Module.cleanup_module(self)
890 for osc_uuid in self.devlist:
891 osc = lookup(self.dom_node.parentNode, osc_uuid)
897 panic('osc not found:', osc_uuid)
899 class LOVConfig(Module):
900 def __init__(self,dom_node):
901 Module.__init__(self, 'LOVConfig', dom_node)
902 self.lov_uuid = get_first_ref(dom_node, 'lov')
903 l = lookup(dom_node.parentNode, self.lov_uuid)
908 self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
909 lov.pattern, lov.devlist, lov.mds_name)
910 lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
911 lov.stripe_sz, lov.stripe_off, lov.pattern,
912 string.join(lov.devlist))
920 def __init__(self,dom_node):
921 Module.__init__(self, 'MDS', dom_node)
922 self.devname, self.size = get_device(dom_node)
923 self.fstype = get_text(dom_node, 'fstype')
924 self.format = get_text(dom_node, 'autoformat', "no")
925 if self.fstype == 'extN':
926 self.add_module('lustre/extN', 'extN')
927 self.add_module('lustre/mds', 'mds')
928 self.add_module('lustre/mds', 'mds_%s' % (self.fstype))
931 self.info(self.devname, self.fstype, self.format)
932 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
933 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
934 setup ="%s %s" %(blkdev, self.fstype))
937 clean_loop(self.devname)
939 # Very unusual case, as there is no MDC element in the XML anymore
940 # Builds itself from an MDS node
942 def __init__(self,dom_node):
943 self.mds = MDS(dom_node)
944 self.dom_node = dom_node
945 self.module_name = 'MDC'
946 self.kmodule_list = []
950 host = socket.gethostname()
951 self.name = 'MDC_%s_%s' % ( host, self.mds.name )
952 self.uuid = self.name + '_UUID'
954 self.lookup_server(self.mds.uuid)
955 self.add_module('lustre/mdc', 'mdc')
958 self.info(self.mds.uuid)
959 srv = self.get_server()
960 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
961 lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
962 setup ="%s %s" %(self.mds.uuid, srv.uuid))
965 def __init__(self, dom_node):
966 Module.__init__(self, 'OBD', dom_node)
967 self.obdtype = get_attr(dom_node, 'type')
968 self.devname, self.size = get_device(dom_node)
969 self.fstype = get_text(dom_node, 'fstype')
970 self.format = get_text(dom_node, 'autoformat', 'yes')
971 if self.fstype == 'extN':
972 self.add_module('lustre/extN', 'extN')
973 self.add_module('lustre/' + self.obdtype, self.obdtype)
975 # need to check /proc/mounts and /etc/mtab before
976 # formatting anything.
977 # FIXME: check if device is already formatted.
979 self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
980 if self.obdtype == 'obdecho':
983 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
984 lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
985 setup ="%s %s" %(blkdev, self.fstype))
988 if not self.obdtype == 'obdecho':
989 clean_loop(self.devname)
992 def __init__(self,dom_node):
993 Module.__init__(self, 'OST', dom_node)
994 self.obd_uuid = get_first_ref(dom_node, 'obd')
995 self.add_module('lustre/ost', 'ost')
998 self.info(self.obd_uuid)
999 lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
1000 setup ="%s" % (self.obd_uuid))
1003 # virtual interface for OSC and LOV
1005 def __init__(self,dom_node):
1006 Module.__init__(self, 'VOSC', dom_node)
1007 if dom_node.nodeName == 'lov':
1008 self.osc = LOV(dom_node)
1010 self.osc = OSC(dom_node)
1015 def load_module(self):
1016 self.osc.load_module()
1017 def cleanup_module(self):
1018 self.osc.cleanup_module()
1022 def __init__(self,dom_node):
1023 Module.__init__(self, 'OSC', dom_node)
1024 self.obd_uuid = get_first_ref(dom_node, 'obd')
1025 self.ost_uuid = get_first_ref(dom_node, 'ost')
1026 self.lookup_server(self.ost_uuid)
1027 self.add_module('lustre/osc', 'osc')
1030 self.info(self.obd_uuid, self.ost_uuid)
1031 srv = self.get_server()
1033 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1037 lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
1039 panic ("no route to", srv.nid)
1041 lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
1042 setup ="%s %s" %(self.obd_uuid, srv.uuid))
1045 srv = self.get_server()
1047 Module.cleanup(self)
1049 self.info(self.obd_uuid, self.ost_uuid)
1053 lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
1054 except CommandError, e:
1055 print "del_route failed: ", self.name
1058 Module.cleanup(self)
1061 class Mountpoint(Module):
1062 def __init__(self,dom_node):
1063 Module.__init__(self, 'MTPT', dom_node)
1064 self.path = get_text(dom_node, 'path')
1065 self.mds_uuid = get_first_ref(dom_node, 'mds')
1066 self.lov_uuid = get_first_ref(dom_node, 'osc')
1067 self.add_module('lustre/mdc', 'mdc')
1068 self.add_module('lustre/llite', 'llite')
1069 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1074 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
1075 self.info(self.path, self.mds_uuid, self.lov_uuid)
1076 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
1077 (self.lov_uuid, mdc_uuid, self.path)
1078 run("mkdir", self.path)
1081 panic("mount failed:", self.path)
1084 self.info(self.path, self.mds_uuid,self.lov_uuid)
1086 (rc, out) = run("umount -f", self.path)
1088 (rc, out) = run("umount", self.path)
1090 log("umount failed, cleanup will most likely not work.")
1091 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1093 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
1095 def load_module(self):
1096 self.osc.load_module()
1097 Module.load_module(self)
1098 def cleanup_module(self):
1099 Module.cleanup_module(self)
1100 self.osc.cleanup_module()
1103 # ============================================================
1104 # XML processing and query
1105 # TODO: Change query funcs to use XPath, which is much cleaner
1107 def get_device(obd):
1108 list = obd.getElementsByTagName('device')
1112 size = get_attr_int(dev, 'size', 0)
1113 return dev.firstChild.data, size
1116 # Get the text content from the first matching child
1117 # If there is no content (or it is all whitespace), return
1119 def get_text(dom_node, tag, default=""):
1120 list = dom_node.getElementsByTagName(tag)
1123 dom_node.normalize()
1124 if dom_node.firstChild:
1125 txt = string.strip(dom_node.firstChild.data)
1130 def get_text_int(dom_node, tag, default=0):
1131 list = dom_node.getElementsByTagName(tag)
1135 dom_node.normalize()
1136 if dom_node.firstChild:
1137 txt = string.strip(dom_node.firstChild.data)
1142 panic("text value is not integer:", txt)
1145 def get_attr(dom_node, attr, default=""):
1146 v = dom_node.getAttribute(attr)
1151 def get_attr_int(dom_node, attr, default=0):
1153 v = dom_node.getAttribute(attr)
1158 panic("attr value is not integer", v)
1161 def get_first_ref(dom_node, tag):
1162 """ Get the first uuidref of the type TAG. Used one only
1163 one is expected. Returns the uuid."""
1165 refname = '%s_ref' % tag
1166 list = dom_node.getElementsByTagName(refname)
1168 uuid = getRef(list[0])
1171 def get_all_refs(dom_node, tag):
1172 """ Get all the refs of type TAG. Returns list of uuids. """
1174 refname = '%s_ref' % tag
1175 list = dom_node.getElementsByTagName(refname)
1178 uuids.append(getRef(i))
1181 def get_ost_net(dom_node, uuid):
1182 ost = lookup(dom_node, uuid)
1183 uuid = get_first_ref(ost, 'network')
1186 return lookup(dom_node, uuid)
def nid2server(dom_node, nid):
    """Find the <network> element whose server text equals nid; wrap it as Network.

    Returns None implicitly when no network element matches.
    """
    for net_node in dom_node.getElementsByTagName('network'):
        if get_text(net_node, 'server') == nid:
            return Network(net_node)
1195 def lookup(dom_node, uuid):
1196 for n in dom_node.childNodes:
1197 if n.nodeType == n.ELEMENT_NODE:
1198 if getUUID(n) == uuid:
1205 # Get name attribute of dom_node
def getName(dom_node):
    """Return the node's 'name' attribute ('' when the attribute is absent)."""
    name_attr = dom_node.getAttribute('name')
    return name_attr
def getRef(dom_node):
    """Return the node's 'uuidref' attribute ('' when the attribute is absent)."""
    uuidref = dom_node.getAttribute('uuidref')
    return uuidref
1212 # Get name attribute of dom_node
def getUUID(dom_node):
    """Return the node's 'uuid' attribute ('' when the attribute is absent)."""
    uuid_attr = dom_node.getAttribute('uuid')
    return uuid_attr
1216 # the tag name is the service type
1217 # fixme: this should do some checks to make sure the dom_node is a service
def getServiceType(dom_node):
    """Return the service type, which is simply the element's tag name."""
    return dom_node.nodeName
1222 # determine what "level" a particular node is at.
1223 # the order of initialization is based on level.
1224 def getServiceLevel(dom_node):
1225 type = getServiceType(dom_node)
1227 if type in ('network',):
1229 elif type in ('device', 'ldlm'):
1231 elif type in ('obd', 'mdd'):
1233 elif type in ('mds','ost'):
1235 elif type in ('mdc','osc'):
1237 elif type in ('lov', 'lovconfig'):
1239 elif type in ('mountpoint',):
1242 if ret < config.startlevel() or ret > config.endlevel():
1247 # return list of services in a profile. list is a list of tuples
1248 # [(level, dom_node),]
1249 def getServices(lustreNode, profileNode):
1251 for n in profileNode.childNodes:
1252 if n.nodeType == n.ELEMENT_NODE:
1253 servNode = lookup(lustreNode, getRef(n))
1256 panic('service not found: ' + getRef(n))
1257 level = getServiceLevel(servNode)
1259 list.append((level, servNode))
1263 def getByName(lustreNode, name, tag):
1264 ndList = lustreNode.getElementsByTagName(tag)
1266 if getName(nd) == name:
1271 ############################################################
1273 # FIXME: clean this mess up!
1276 def prepare_mdc(dom_node, mds_uuid):
1278 mds_node = lookup(dom_node, mds_uuid);
1280 panic("no mds:", mds_uuid)
1281 if saved_mdc.has_key(mds_uuid):
1282 return saved_mdc[mds_uuid]
1285 saved_mdc[mds_uuid] = mdc.uuid
1289 def cleanup_mdc(dom_node, mds_uuid):
1291 mds_node = lookup(dom_node, mds_uuid);
1293 panic("no mds:", mds_uuid)
1301 ############################################################
1302 # routing ("rooting")
def init_node(dom_node):
    """Record this node's (net_type, server-nid) pairs in the global local_node list."""
    global local_node, router_flag
    for dom_net in dom_node.getElementsByTagName('network'):
        net_type = get_attr(dom_net, 'type')
        server = get_text(dom_net, 'server')
        local_node.append((net_type, server))
1316 def node_needs_router():
1319 def get_routes(type, gw, dom_net):
1320 """ Return the routes as a list of tuples of the form:
1321 [(type, gw, lo, hi),]"""
1323 tbl = dom_net.getElementsByTagName('route_tbl')
1325 routes = t.getElementsByTagName('route')
1327 lo = get_attr(r, 'lo')
1328 hi = get_attr(r, 'hi', '')
1329 res.append((type, gw, lo, hi))
1333 def init_route_config(lustre):
1334 """ Scan the lustre config looking for routers. Build list of
1336 global routes, router_flag
1338 list = lustre.getElementsByTagName('node')
1340 if get_attr(node, 'router'):
1342 for (local_type, local_nid) in local_node:
1344 netlist = node.getElementsByTagName('network')
1345 for dom_net in netlist:
1346 if local_type == get_attr(dom_net, 'type'):
1347 gw = get_text(dom_net, 'server')
1351 for dom_net in netlist:
1352 if local_type != get_attr(dom_net, 'type'):
1353 for route in get_routes(local_type, gw, dom_net):
1354 routes.append(route)
1359 for iface in local_node:
1360 if net.net_type == iface[0]:
1364 def find_route(net):
1365 global local_node, routes
1366 frm_type = local_node[0][0]
1367 to_type = net.net_type
1369 debug ('looking for route to', to_type,to)
1378 ############################################################
1381 def startService(dom_node, module_flag):
1382 type = getServiceType(dom_node)
1383 debug('Service:', type, getName(dom_node), getUUID(dom_node))
1384 # there must be a more dynamic way of doing this...
1390 elif type == 'lovconfig':
1391 n = LOVConfig(dom_node)
1392 elif type == 'network':
1393 n = Network(dom_node)
1404 elif type == 'mountpoint':
1405 n = Mountpoint(dom_node)
1407 panic ("unknown service type:", type)
1412 if config.cleanup():
1417 if config.nosetup():
1419 if config.cleanup():
1425 # Prepare the system to run lustre using a particular profile
1426 # in a the configuration.
1427 # * load & the modules
1428 # * setup networking for the current node
1429 # * make sure partitions are in place and prepared
1430 # * initialize devices with lctl
1431 # Levels is important, and needs to be enforced.
1432 def startProfile(lustreNode, profileNode, module_flag):
1434 panic("profile:", profile, "not found.")
1435 services = getServices(lustreNode, profileNode)
1436 if config.cleanup():
1439 startService(s[1], module_flag)
1444 def doHost(lustreNode, hosts):
1448 dom_node = getByName(lustreNode, h, 'node')
1453 print 'No host entry found.'
1456 if not get_attr(dom_node, 'router'):
1458 init_route_config(lustreNode)
1463 # Two step process: (1) load modules, (2) setup lustre
1464 # if not cleaning, load modules first.
1465 module_flag = not config.cleanup()
1466 reflist = dom_node.getElementsByTagName('profile')
1467 for profile in reflist:
1468 startProfile(lustreNode, profile, module_flag)
1470 if not config.cleanup():
1471 sys_set_debug_path()
1472 script = config.gdb_script()
1473 run(lctl.lctl, ' modules >', script)
1475 # dump /tmp/ogdb and sleep/pause here
1476 log ("The GDB module script is in", script)
1479 module_flag = not module_flag
1480 for profile in reflist:
1481 startProfile(lustreNode, profile, module_flag)
1483 ############################################################
1484 # Command line processing
1486 def parse_cmdline(argv):
1487 short_opts = "hdnvf"
1488 long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
1489 "portals=", "makeldiff", "cleanup", "noexec",
1490 "help", "node=", "nomod", "nosetup",
1491 "dump=", "force", "startlevel=", "endlevel="]
1495 opts, args = getopt.getopt(argv, short_opts, long_opts)
1496 except getopt.error:
1501 if o in ("-h", "--help"):
1503 if o in ("-d","--cleanup"):
1505 if o in ("-v", "--verbose"):
1507 if o in ("-n", "--noexec"):
1510 if o == "--portals":
1514 if o == "--reformat":
1522 if o == "--nosetup":
1526 if o in ("-f", "--force"):
1528 if o in ("--startlevel",):
1529 config.startlevel(a)
1530 if o in ("--endlevel",):
1539 s = urllib.urlopen(url)
def setupModulePath(cmd):
    """If cmd lives inside a source tree (a Makefile sits beside it),
    point the global config's src_dir at the tree root two levels up."""
    base = os.path.dirname(cmd)
    if not os.access(base + "/Makefile", os.R_OK):
        return
    config.src_dir(base + "/../../")
1550 def sys_set_debug_path():
1551 debug("debug path: ", config.debug_path())
1555 fp = open('/proc/sys/portals/debug_path', 'w')
1556 fp.write(config.debug_path())
1561 #/proc/sys/net/core/rmem_max
1562 #/proc/sys/net/core/wmem_max
1563 def sys_set_netmem_max(path, max):
1564 debug("setting", path, "to at least", max)
1572 fp = open(path, 'w')
1573 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the portals/obd character device nodes if they do not exist yet."""
    for node, minor in (('/dev/portals', 240), ('/dev/obd', 241)):
        if not os.access(node, os.R_OK):
            run('mknod %s c 10 %d' % (node, minor))
1584 # Add dir to the global PATH, if not already there.
1585 def add_to_path(new_dir):
1586 syspath = string.split(os.environ['PATH'], ':')
1587 if new_dir in syspath:
1589 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
1592 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
1593 # ensure basic elements are in the system path
1594 def sanitise_path():
1595 for dir in DEFAULT_PATH:
1598 # Initialize or shutdown lustre according to a configuration file
1599 # * prepare the system for lustre
1600 # * configure devices with lctl
1601 # Shutdown does steps in reverse
1604 global TCP_ACCEPTOR, lctl, MAXTCPBUF
1605 host = socket.gethostname()
1609 args = parse_cmdline(sys.argv[1:])
1611 if not os.access(args[0], os.R_OK):
1612 print 'File not found or readable:', args[0]
1614 dom = xml.dom.minidom.parse(args[0])
1616 xmldata = fetch(config.url())
1617 dom = xml.dom.minidom.parseString(xmldata)
1623 node_list.append(config.node())
1626 node_list.append(host)
1627 node_list.append('localhost')
1628 debug("configuring for host: ", node_list)
1631 config._debug_path = config._debug_path + '-' + host
1632 config._gdb_script = config._gdb_script + '-' + host
1634 TCP_ACCEPTOR = find_prog('acceptor')
1635 if not TCP_ACCEPTOR:
1637 TCP_ACCEPTOR = 'acceptor'
1638 debug('! acceptor not found')
1640 panic('acceptor not found')
1642 lctl = LCTLInterface('lctl')
1644 setupModulePath(sys.argv[0])
1646 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
1647 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
1648 doHost(dom.documentElement, node_list)
1650 if __name__ == "__main__":
1653 except LconfError, e:
1655 except CommandError, e:
1659 if first_cleanup_error:
1660 sys.exit(first_cleanup_error)