3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # This file is part of Lustre, http://www.lustre.org.
7 # Lustre is free software; you can redistribute it and/or
8 # modify it under the terms of version 2 of the GNU General Public
9 # License as published by the Free Software Foundation.
11 # Lustre is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with Lustre; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # lconf - lustre configuration tool
22 # lconf is the main driver script for starting and stopping
23 # lustre filesystem services.
25 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import string, os, stat, popen2, socket, time
30 import xml.dom.minidom
35 DEFAULT_TCPBUF = 1048576
37 # Maximum number of devices to search for.
38 # (the /dev/loop* nodes need to be created beforehand)
39 MAX_LOOP_DEVICES = 256
first_cleanup_error = 0

def cleanup_error(rc):
    """Remember the first non-zero cleanup return code; later errors are ignored."""
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
49 print """usage: lconf config.xml
51 config.xml Lustre configuration in xml format.
52 --get <url> URL to fetch a config file
53 --node <nodename> Load config for <nodename>
54 -d | --cleanup Cleans up config. (Shutdown)
55 -f | --force Forced unmounting and/or obd detach during cleanup
56 -v | --verbose Print system commands as they are run
57 -h | --help Print this help
58 --gdb Prints message after creating gdb module script
59 and sleeps for 5 seconds.
60 -n | --noexec Prints the commands and steps that will be run for a
61 config without executing them. This can used to check if a
62 config file is doing what it should be doing. (Implies -v)
63 --nomod Skip load/unload module step.
64 --nosetup Skip device setup/cleanup step.
65 --reformat Reformat all devices (without question)
66 --dump <file> Dump the kernel debug log before portals is unloaded
67 --startlevel <num> Specify the level of services to start with (default 0)
68 --endlevel <num> Specify the level of services to end with (default 100)
69 Levels are aproximatly like:
79 --ldap server LDAP server with lustre config database
80 --makeldiff Translate xml source to LDIFF
81 This are perhaps not needed:
82 --lustre="src dir" Base directory of lustre sources. Used to search
84 --portals=src Portals source
88 # ============================================================
89 # Config parameters, encapsulated in a class
105 self._gdb_script = '/tmp/ogdb'
106 self._debug_path = '/tmp/lustre-log'
107 self._dump_file = None
109 self._start_level = 0
110 self._end_level = 100
112 def verbose(self, flag = None):
113 if flag: self._verbose = flag
116 def noexec(self, flag = None):
117 if flag: self._noexec = flag
def reformat(self, flag = None):
    # Combined getter/setter: a truthy flag is stored, then the current
    # value is returned (same pattern as the other Config accessors).
    if flag: self._reformat = flag
    return self._reformat
124 def cleanup(self, flag = None):
125 if flag: self._cleanup = flag
128 def gdb(self, flag = None):
129 if flag: self._gdb = flag
132 def nomod(self, flag = None):
133 if flag: self._nomod = flag
136 def nosetup(self, flag = None):
137 if flag: self._nosetup = flag
140 def force(self, flag = None):
141 if flag: self._force = flag
144 def node(self, val = None):
145 if val: self._node = val
148 def url(self, val = None):
149 if val: self._url = val
152 def gdb_script(self):
153 if os.path.isdir('/r'):
154 return '/r' + self._gdb_script
156 return self._gdb_script
158 def debug_path(self):
159 if os.path.isdir('/r'):
160 return '/r' + self._debug_path
162 return self._debug_path
164 def src_dir(self, val = None):
165 if val: self._src_dir = val
def dump_file(self, val = None):
    # Getter/setter for the kernel debug-log dump target (--dump <file>).
    # A truthy val is stored; the current value is always returned.
    if val: self._dump_file = val
    return self._dump_file
def startlevel(self, val = None):
    # Getter/setter for the first service level to act on (--startlevel).
    # val is coerced with int(), so command-line strings are accepted.
    if val: self._start_level = int(val)
    return self._start_level
def endlevel(self, val = None):
    # Getter/setter for the last service level to act on (--endlevel).
    # val is coerced with int(), so command-line strings are accepted.
    if val: self._end_level = int(val)
    return self._end_level
184 # ============================================================
185 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort with an LconfError reporting that msg is not implemented.

    Uses the call-form raise (valid in both Python 2 and 3, and consistent
    with the `raise CommandError(...)` style used elsewhere in this file)
    instead of the Python-2-only `raise Cls, arg` statement.  Also fixes
    the "implmemented" typo in the message.
    """
    raise LconfError(msg + ' not implemented yet.')
191 msg = string.join(map(str,args))
192 if not config.noexec():
193 raise LconfError(msg)
198 msg = string.join(map(str,args))
203 print string.strip(s)
207 msg = string.join(map(str,args))
210 # ============================================================
211 # locally defined exceptions
212 class CommandError (exceptions.Exception):
213 def __init__(self, cmd_name, cmd_err, rc=None):
214 self.cmd_name = cmd_name
215 self.cmd_err = cmd_err
220 if type(self.cmd_err) == types.StringType:
222 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
224 print "! %s: %s" % (self.cmd_name, self.cmd_err)
225 elif type(self.cmd_err) == types.ListType:
227 print "! %s (error %d):" % (self.cmd_name, self.rc)
229 print "! %s:" % (self.cmd_name)
230 for s in self.cmd_err:
231 print "> %s" %(string.strip(s))
235 class LconfError (exceptions.Exception):
236 def __init__(self, args):
240 # ============================================================
241 # handle lctl interface
244 Manage communication with lctl
247 def __init__(self, cmd):
249 Initialize close by finding the lctl binary.
251 self.lctl = find_prog(cmd)
254 debug('! lctl not found')
257 raise CommandError('lctl', "unable to find lctl binary.")
262 the cmds are written to stdin of lctl
263 lctl doesn't return errors when run in script mode, so
265 should modify command line to accept multiple commands, or
266 create complex command line options
268 debug("+", self.lctl, cmds)
269 if config.noexec(): return (0, [])
270 p = popen2.Popen3(self.lctl, 1)
271 p.tochild.write(cmds + "\n")
273 out = p.fromchild.readlines()
274 err = p.childerr.readlines()
276 if os.WIFEXITED(ret):
277 rc = os.WEXITSTATUS(ret)
281 raise CommandError(self.lctl, err, rc)
285 def network(self, net, nid):
286 """ initialized network and add "self" """
287 # Idea: "mynid" could be used for all network types to add "self," and then
288 # this special case would be gone and the "self" hack would be hidden.
294 quit""" % (net, nid, nid)
303 # create a new connection
304 def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
312 quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
318 quit""" % (net, servuuid, nid, nid, port, )
322 # add a route to a range
323 def add_route(self, net, gw, lo, hi):
327 quit """ % (net, gw, lo, hi)
331 # add a route to a range
332 def del_route(self, net, gw, lo, hi):
340 # add a route to a host
341 def add_route_host(self, net, uuid, gw, tgt):
346 quit """ % (net, uuid, tgt, gw, tgt)
349 # add a route to a range
350 def del_route_host(self, net, uuid, gw, tgt):
356 quit """ % (net, uuid, tgt)
359 # disconnect one connection
360 def disconnect(self, net, nid, port, servuuid):
366 quit""" % (net, nid, servuuid)
370 def disconnectAll(self, net):
379 # create a new device with lctl
380 def newdev(self, attach, setup = ""):
385 quit""" % (attach, setup)
389 def cleanup(self, name, uuid):
395 quit""" % (name, ('', 'force')[config.force()])
399 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
403 lov_setconfig %s %d %d %d %s %s
404 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
408 def dump(self, dump_file):
411 quit""" % (dump_file)
414 # ============================================================
415 # Various system-level functions
416 # (ideally moved to their own module)
418 # Run a command and return the output and status.
419 # stderr is sent to /dev/null, could use popen3 to
420 # save it if necessary
422 cmd = string.join(map(str,args))
424 if config.noexec(): return (0, [])
425 f = os.popen(cmd + ' 2>&1')
434 # Run a command in the background.
435 def run_daemon(*args):
436 cmd = string.join(map(str,args))
438 if config.noexec(): return 0
439 f = os.popen(cmd + ' 2>&1')
447 # Determine full path to use for an external command
448 # searches dirname(argv[0]) first, then PATH
450 syspath = string.split(os.environ['PATH'], ':')
451 cmdpath = os.path.dirname(sys.argv[0])
452 syspath.insert(0, cmdpath);
453 syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
455 prog = os.path.join(d,cmd)
456 if os.access(prog, os.X_OK):
460 # Recursively look for file starting at base dir
461 def do_find_file(base, mod):
462 fullname = os.path.join(base, mod)
463 if os.access(fullname, os.R_OK):
465 for d in os.listdir(base):
466 dir = os.path.join(base,d)
467 if os.path.isdir(dir):
468 module = do_find_file(dir, mod)
472 def find_module(src_dir, dev_dir, modname):
473 mod = '%s.o' % (modname)
474 module = src_dir +'/'+ dev_dir +'/'+ mod
476 if os.access(module, os.R_OK):
482 # is the path a block device?
489 return stat.S_ISBLK(s[stat.ST_MODE])
491 # build fs according to type
493 def mkfs(fstype, dev):
494 if(fstype in ('ext3', 'extN')):
495 mkfs = 'mkfs.ext2 -j -b 4096'
497 print 'unsupported fs type: ', fstype
498 if not is_block(dev):
502 (ret, out) = run (mkfs, force, dev)
504 panic("Unable to build fs:", dev)
505 # enable hash tree indexing on fs
507 htree = 'echo "feature FEATURE_C5" | debugfs -w'
508 (ret, out) = run (htree, dev)
510 panic("Unable to enable htree:", dev)
512 # some systems use /dev/loopN, some /dev/loop/N
516 if not os.access(loop + str(0), os.R_OK):
518 if not os.access(loop + str(0), os.R_OK):
519 panic ("can't access loop devices")
# find loop device assigned to the file
525 for n in xrange(0, MAX_LOOP_DEVICES):
527 if os.access(dev, os.R_OK):
528 (stat, out) = run('losetup', dev)
529 if (out and stat == 0):
530 m = re.search(r'\((.*)\)', out[0])
531 if m and file == m.group(1):
537 # create file if necessary and assign the first free loop device
538 def init_loop(file, size, fstype):
539 dev = find_loop(file)
541 print 'WARNING file:', file, 'already mapped to', dev
543 if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
544 run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
546 # find next free loop
547 for n in xrange(0, MAX_LOOP_DEVICES):
549 if os.access(dev, os.R_OK):
550 (stat, out) = run('losetup', dev)
552 run('losetup', dev, file)
555 print "out of loop devices"
557 print "out of loop devices"
560 # undo loop assignment
561 def clean_loop(file):
562 dev = find_loop(file)
564 ret, out = run('losetup -d', dev)
566 log('unable to clean loop device:', dev, 'for file:', file)
569 # determine if dev is formatted as a <fstype> filesystem
570 def need_format(fstype, dev):
571 # FIXME don't know how to implement this
574 # initialize a block device if needed
575 def block_dev(dev, size, fstype, format):
576 if config.noexec(): return dev
577 if not is_block(dev):
578 dev = init_loop(dev, size, fstype)
579 if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
583 # panic("device:", dev,
584 # "not prepared, and autoformat is not set.\n",
585 # "Rerun with --reformat option to format ALL filesystems")
590 """lookup IP address for an interface"""
591 rc, out = run("/sbin/ifconfig", iface)
594 addr = string.split(out[1])[1]
595 ip = string.split(addr, ':')[1]
598 def get_local_address(net_type, wildcard):
599 """Return the local address for the network type."""
601 if net_type == 'tcp':
603 iface, star = string.split(wildcard, ':')
604 local = if2addr(iface)
606 panic ("unable to determine ip for:", wildcard)
608 host = socket.gethostname()
609 local = socket.gethostbyname(host)
610 elif net_type == 'elan':
611 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
613 fp = open('/proc/elan/device0/position', 'r')
614 lines = fp.readlines()
623 elif net_type == 'gm':
624 fixme("automatic local address for GM")
629 # ============================================================
630 # Classes to prepare and cleanup the various objects
633 """ Base class for the rest of the modules. The default cleanup method is
634 defined here, as well as some utilitiy funcs.
636 def __init__(self, module_name, dom_node):
637 self.dom_node = dom_node
638 self.module_name = module_name
639 self.name = get_attr(dom_node, 'name')
640 self.uuid = get_attr(dom_node, 'uuid')
641 self.kmodule_list = []
645 def info(self, *args):
646 msg = string.join(map(str,args))
647 print self.module_name + ":", self.name, self.uuid, msg
650 def lookup_server(self, srv_uuid):
651 """ Lookup a server's network information """
652 net = get_ost_net(self.dom_node.parentNode, srv_uuid)
654 panic ("Unable to find a server for:", srv_uuid)
655 self._server = Network(net)
657 def get_server(self):
661 """ default cleanup, used for most modules """
663 srv = self.get_server()
664 if srv and local_net(srv):
666 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
667 except CommandError, e:
668 log(self.module_name, "disconnect failed: ", self.name)
672 lctl.cleanup(self.name, self.uuid)
673 except CommandError, e:
674 log(self.module_name, "cleanup failed: ", self.name)
678 def add_module(self, dev_dir, modname):
679 """Append a module to list of modules to load."""
680 self.kmodule_list.append((dev_dir, modname))
682 def mod_loaded(self, modname):
683 """Check if a module is already loaded. Look in /proc/modules for it."""
684 fp = open('/proc/modules')
685 lines = fp.readlines()
687 # please forgive my tired fingers for this one
688 ret = filter(lambda word, mod=modname: word == mod,
689 map(lambda line: string.split(line)[0], lines))
692 def load_module(self):
693 """Load all the modules in the list in the order they appear."""
694 for dev_dir, mod in self.kmodule_list:
695 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
696 if self.mod_loaded(mod) and not config.noexec():
698 log ('loading module:', mod)
700 module = find_module(config.src_dir(),dev_dir, mod)
702 panic('module not found:', mod)
703 (rc, out) = run('/sbin/insmod', module)
705 raise CommandError('insmod', out, rc)
707 (rc, out) = run('/sbin/modprobe', mod)
709 raise CommandError('modprobe', out, rc)
711 def cleanup_module(self):
712 """Unload the modules in the list in reverse order."""
713 rev = self.kmodule_list
715 for dev_dir, mod in rev:
716 if not self.mod_loaded(mod):
719 if mod == 'portals' and config.dump_file():
720 lctl.dump(config.dump_file())
721 log('unloading module:', mod)
724 (rc, out) = run('/sbin/rmmod', mod)
726 log('! unable to unload module:', mod)
730 class Network(Module):
731 def __init__(self,dom_node):
732 Module.__init__(self, 'NETWORK', dom_node)
733 self.net_type = get_attr(dom_node,'type')
734 self.nid = get_text(dom_node, 'server', '*')
735 self.port = get_text_int(dom_node, 'port', 0)
736 self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
737 self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
739 self.nid = get_local_address(self.net_type, self.nid)
741 panic("unable to set nid for", self.net_type, self.nid)
742 debug("nid:", self.nid)
744 self.add_module('portals/linux/oslib/', 'portals')
745 if node_needs_router():
746 self.add_module('portals/linux/router', 'kptlrouter')
747 if self.net_type == 'tcp':
748 self.add_module('portals/linux/socknal', 'ksocknal')
749 if self.net_type == 'elan':
750 self.add_module('portals/linux/rqswnal', 'kqswnal')
751 if self.net_type == 'gm':
752 self.add_module('portals/linux/gmnal', 'kgmnal')
753 self.add_module('lustre/obdclass', 'obdclass')
754 self.add_module('lustre/ptlrpc', 'ptlrpc')
757 self.info(self.net_type, self.nid, self.port)
758 if self.net_type == 'tcp':
759 ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, self.port)
761 raise CommandError(TCP_ACCEPTOR, out, ret)
762 ret = self.dom_node.getElementsByTagName('route_tbl')
764 for r in a.getElementsByTagName('route'):
765 net_type = get_attr(r, 'type')
766 gw = get_attr(r, 'gw')
767 lo = get_attr(r, 'lo')
768 hi = get_attr(r,'hi', '')
769 lctl.add_route(net_type, gw, lo, hi)
770 if net_type == 'tcp' and net_type == self.net_type and hi == '':
771 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
773 panic("no server for nid", lo)
775 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
778 lctl.network(self.net_type, self.nid)
779 lctl.newdev(attach = "ptlrpc RPCDEV")
782 self.info(self.net_type, self.nid, self.port)
783 ret = self.dom_node.getElementsByTagName('route_tbl')
785 for r in a.getElementsByTagName('route'):
786 lo = get_attr(r, 'lo')
787 hi = get_attr(r,'hi', '')
788 if self.net_type == 'tcp' and hi == '':
789 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
791 panic("no server for nid", lo)
794 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
795 except CommandError, e:
796 print "disconnect failed: ", self.name
800 lctl.del_route(self.net_type, self.nid, lo, hi)
801 except CommandError, e:
802 print "del_route failed: ", self.name
807 lctl.cleanup("RPCDEV", "")
808 except CommandError, e:
809 print "cleanup failed: ", self.name
813 lctl.disconnectAll(self.net_type)
814 except CommandError, e:
815 print "disconnectAll failed: ", self.name
818 if self.net_type == 'tcp':
819 # yikes, this ugly! need to save pid in /var/something
820 run("killall acceptor")
823 def __init__(self,dom_node):
824 Module.__init__(self, 'LDLM', dom_node)
825 self.add_module('lustre/ldlm', 'ldlm')
828 lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
832 def __init__(self,dom_node):
833 Module.__init__(self, 'LOV', dom_node)
834 self.mds_uuid = get_first_ref(dom_node, 'mds')
835 mds= lookup(dom_node.parentNode, self.mds_uuid)
836 self.mds_name = getName(mds)
837 devs = dom_node.getElementsByTagName('devices')
840 self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
841 self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
842 self.pattern = get_attr_int(dev_node, 'pattern', 0)
843 self.devlist = get_all_refs(dev_node, 'osc')
844 self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
845 self.add_module('lustre/mdc', 'mdc')
846 self.add_module('lustre/lov', 'lov')
849 for osc_uuid in self.devlist:
850 osc = lookup(self.dom_node.parentNode, osc_uuid)
855 panic('osc not found:', osc_uuid)
856 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
857 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
858 self.stripe_off, self.pattern, self.devlist, self.mds_name)
859 lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
860 setup ="%s" % (mdc_uuid))
863 for osc_uuid in self.devlist:
864 osc = lookup(self.dom_node.parentNode, osc_uuid)
869 panic('osc not found:', osc_uuid)
871 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
874 def load_module(self):
875 for osc_uuid in self.devlist:
876 osc = lookup(self.dom_node.parentNode, osc_uuid)
882 panic('osc not found:', osc_uuid)
883 Module.load_module(self)
886 def cleanup_module(self):
887 Module.cleanup_module(self)
888 for osc_uuid in self.devlist:
889 osc = lookup(self.dom_node.parentNode, osc_uuid)
895 panic('osc not found:', osc_uuid)
897 class LOVConfig(Module):
898 def __init__(self,dom_node):
899 Module.__init__(self, 'LOVConfig', dom_node)
900 self.lov_uuid = get_first_ref(dom_node, 'lov')
901 l = lookup(dom_node.parentNode, self.lov_uuid)
906 self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
907 lov.pattern, lov.devlist, lov.mds_name)
908 lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
909 lov.stripe_sz, lov.stripe_off, lov.pattern,
910 string.join(lov.devlist))
918 def __init__(self,dom_node):
919 Module.__init__(self, 'MDS', dom_node)
920 self.devname, self.size = get_device(dom_node)
921 self.fstype = get_text(dom_node, 'fstype')
922 self.format = get_text(dom_node, 'autoformat', "no")
923 if self.fstype == 'extN':
924 self.add_module('lustre/extN', 'extN')
925 self.add_module('lustre/mds', 'mds')
926 self.add_module('lustre/mds', 'mds_%s' % (self.fstype))
929 self.info(self.devname, self.fstype, self.format)
930 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
931 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
932 setup ="%s %s" %(blkdev, self.fstype))
935 clean_loop(self.devname)
937 # Very unusual case, as there is no MDC element in the XML anymore
938 # Builds itself from an MDS node
940 def __init__(self,dom_node):
941 self.mds = MDS(dom_node)
942 self.dom_node = dom_node
943 self.module_name = 'MDC'
944 self.kmodule_list = []
948 host = socket.gethostname()
949 self.name = 'MDC_'+host
950 self.uuid = self.name+'_UUID'
952 self.lookup_server(self.mds.uuid)
953 self.add_module('lustre/mdc', 'mdc')
956 self.info(self.mds.uuid)
957 srv = self.get_server()
958 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
959 lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
960 setup ="%s %s" %(self.mds.uuid, srv.uuid))
963 def __init__(self, dom_node):
964 Module.__init__(self, 'OBD', dom_node)
965 self.obdtype = get_attr(dom_node, 'type')
966 self.devname, self.size = get_device(dom_node)
967 self.fstype = get_text(dom_node, 'fstype')
968 self.format = get_text(dom_node, 'autoformat', 'yes')
969 if self.fstype == 'extN':
970 self.add_module('lustre/extN', 'extN')
971 self.add_module('lustre/' + self.obdtype, self.obdtype)
973 # need to check /proc/mounts and /etc/mtab before
974 # formatting anything.
975 # FIXME: check if device is already formatted.
977 self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
978 if self.obdtype == 'obdecho':
981 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
982 lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
983 setup ="%s %s" %(blkdev, self.fstype))
986 if not self.obdtype == 'obdecho':
987 clean_loop(self.devname)
990 def __init__(self,dom_node):
991 Module.__init__(self, 'OST', dom_node)
992 self.obd_uuid = get_first_ref(dom_node, 'obd')
993 self.add_module('lustre/ost', 'ost')
996 self.info(self.obd_uuid)
997 lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
998 setup ="%s" % (self.obd_uuid))
1001 # virtual interface for OSC and LOV
1003 def __init__(self,dom_node):
1004 Module.__init__(self, 'VOSC', dom_node)
1005 if dom_node.nodeName == 'lov':
1006 self.osc = LOV(dom_node)
1008 self.osc = OSC(dom_node)
1013 def load_module(self):
1014 self.osc.load_module()
1015 def cleanup_module(self):
1016 self.osc.cleanup_module()
1020 def __init__(self,dom_node):
1021 Module.__init__(self, 'OSC', dom_node)
1022 self.obd_uuid = get_first_ref(dom_node, 'obd')
1023 self.ost_uuid = get_first_ref(dom_node, 'ost')
1024 self.lookup_server(self.ost_uuid)
1025 self.add_module('lustre/osc', 'osc')
1028 self.info(self.obd_uuid, self.ost_uuid)
1029 srv = self.get_server()
1031 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1035 lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
1037 panic ("no route to", srv.nid)
1039 lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
1040 setup ="%s %s" %(self.obd_uuid, srv.uuid))
1043 srv = self.get_server()
1045 Module.cleanup(self)
1047 self.info(self.obd_uuid, self.ost_uuid)
1051 lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
1052 except CommandError, e:
1053 print "del_route failed: ", self.name
1056 Module.cleanup(self)
1059 class Mountpoint(Module):
1060 def __init__(self,dom_node):
1061 Module.__init__(self, 'MTPT', dom_node)
1062 self.path = get_text(dom_node, 'path')
1063 self.mds_uuid = get_first_ref(dom_node, 'mds')
1064 self.lov_uuid = get_first_ref(dom_node, 'osc')
1065 self.add_module('lustre/mdc', 'mdc')
1066 self.add_module('lustre/llite', 'llite')
1067 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1072 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
1074 self.info(self.path, self.mds_uuid,self.lov_uuid)
1075 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
1076 (self.lov_uuid, mdc_uuid, self.path)
1077 run("mkdir", self.path)
1080 panic("mount failed:", self.path)
1083 self.info(self.path, self.mds_uuid,self.lov_uuid)
1085 (rc, out) = run("umount -f", self.path)
1087 (rc, out) = run("umount", self.path)
1089 log("umount failed, cleanup will most likely not work.")
1090 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1092 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
1094 def load_module(self):
1095 self.osc.load_module()
1096 Module.load_module(self)
1097 def cleanup_module(self):
1098 Module.cleanup_module(self)
1099 self.osc.cleanup_module()
1102 # ============================================================
1103 # XML processing and query
# TODO: Change query funcs to use XPath, which is much cleaner
1106 def get_device(obd):
1107 list = obd.getElementsByTagName('device')
1111 size = get_attr_int(dev, 'size', 0)
1112 return dev.firstChild.data, size
1115 # Get the text content from the first matching child
1116 # If there is no content (or it is all whitespace), return
1118 def get_text(dom_node, tag, default=""):
1119 list = dom_node.getElementsByTagName(tag)
1122 dom_node.normalize()
1123 if dom_node.firstChild:
1124 txt = string.strip(dom_node.firstChild.data)
1129 def get_text_int(dom_node, tag, default=0):
1130 list = dom_node.getElementsByTagName(tag)
1134 dom_node.normalize()
1135 if dom_node.firstChild:
1136 txt = string.strip(dom_node.firstChild.data)
1141 panic("text value is not integer:", txt)
1144 def get_attr(dom_node, attr, default=""):
1145 v = dom_node.getAttribute(attr)
1150 def get_attr_int(dom_node, attr, default=0):
1152 v = dom_node.getAttribute(attr)
1157 panic("attr value is not integer", v)
1160 def get_first_ref(dom_node, tag):
1161 """ Get the first uuidref of the type TAG. Used one only
1162 one is expected. Returns the uuid."""
1164 refname = '%s_ref' % tag
1165 list = dom_node.getElementsByTagName(refname)
1167 uuid = getRef(list[0])
1170 def get_all_refs(dom_node, tag):
1171 """ Get all the refs of type TAG. Returns list of uuids. """
1173 refname = '%s_ref' % tag
1174 list = dom_node.getElementsByTagName(refname)
1177 uuids.append(getRef(i))
1180 def get_ost_net(dom_node, uuid):
1181 ost = lookup(dom_node, uuid)
1182 uuid = get_first_ref(ost, 'network')
1185 return lookup(dom_node, uuid)
def nid2server(dom_node, nid):
    # Search every <network> element below dom_node and return a Network
    # wrapper for the first one whose <server> text equals nid.
    # NOTE(review): the trailing source lines are not visible in this
    # extract; presumably the function falls through returning None when
    # no match is found — confirm against the full file.
    netlist = dom_node.getElementsByTagName('network')
    for net_node in netlist:
        if get_text(net_node, 'server') == nid:
            return Network(net_node)
1194 def lookup(dom_node, uuid):
1195 for n in dom_node.childNodes:
1196 if n.nodeType == n.ELEMENT_NODE:
1197 if getUUID(n) == uuid:
# Fetch the name attribute of a DOM element.
def getName(dom_node):
    """Return the value of dom_node's 'name' attribute ("" when absent)."""
    name_val = dom_node.getAttribute('name')
    return name_val
def getRef(dom_node):
    """Return the value of dom_node's 'uuidref' attribute ("" when absent)."""
    ref_val = dom_node.getAttribute('uuidref')
    return ref_val
# Fetch the uuid attribute of a DOM element.
# (the copied-over comment in the original wrongly said "name")
def getUUID(dom_node):
    """Return the value of dom_node's 'uuid' attribute ("" when absent)."""
    uuid_val = dom_node.getAttribute('uuid')
    return uuid_val
# A service element's tag name doubles as its service type.
# fixme: this should do some checks to make sure the dom_node is a service
def getServiceType(dom_node):
    """Return the service type of dom_node, i.e. its element tag name."""
    tag = dom_node.nodeName
    return tag
1221 # determine what "level" a particular node is at.
# the order of initialization is based on level.
1223 def getServiceLevel(dom_node):
1224 type = getServiceType(dom_node)
1226 if type in ('network',):
1228 elif type in ('device', 'ldlm'):
1230 elif type in ('obd', 'mdd'):
1232 elif type in ('mds','ost'):
1234 elif type in ('mdc','osc'):
1236 elif type in ('lov', 'lovconfig'):
1238 elif type in ('mountpoint',):
1241 if ret < config.startlevel() or ret > config.endlevel():
1246 # return list of services in a profile. list is a list of tuples
1247 # [(level, dom_node),]
1248 def getServices(lustreNode, profileNode):
1250 for n in profileNode.childNodes:
1251 if n.nodeType == n.ELEMENT_NODE:
1252 servNode = lookup(lustreNode, getRef(n))
1255 panic('service not found: ' + getRef(n))
1256 level = getServiceLevel(servNode)
1258 list.append((level, servNode))
1262 def getByName(lustreNode, name, tag):
1263 ndList = lustreNode.getElementsByTagName(tag)
1265 if getName(nd) == name:
1270 ############################################################
1272 # FIXME: clean this mess up!
1275 def prepare_mdc(dom_node, mds_uuid):
1277 mds_node = lookup(dom_node, mds_uuid);
1279 panic("no mds:", mds_uuid)
1288 def cleanup_mdc(dom_node, mds_uuid):
1290 mds_node = lookup(dom_node, mds_uuid);
1292 panic("no mds:", mds_uuid)
1300 ############################################################
1301 # routing ("rooting")
def init_node(dom_node):
    # Record one (net_type, server_nid) pair per <network> element of this
    # node in the global local_node list, for later routing decisions.
    # NOTE(review): local_node and router_flag are initialized on source
    # lines not visible in this extract — confirm against the full file.
    global local_node, router_flag
    netlist = dom_node.getElementsByTagName('network')
    for dom_net in netlist:
        type = get_attr(dom_net, 'type')
        gw = get_text(dom_net, 'server')
        local_node.append((type, gw))
1315 def node_needs_router():
1318 def get_routes(type, gw, dom_net):
1319 """ Return the routes as a list of tuples of the form:
1320 [(type, gw, lo, hi),]"""
1322 tbl = dom_net.getElementsByTagName('route_tbl')
1324 routes = t.getElementsByTagName('route')
1326 lo = get_attr(r, 'lo')
1327 hi = get_attr(r, 'hi', '')
1328 res.append((type, gw, lo, hi))
1332 def init_route_config(lustre):
1333 """ Scan the lustre config looking for routers. Build list of
1335 global routes, router_flag
1337 list = lustre.getElementsByTagName('node')
1339 if get_attr(node, 'router'):
1341 for (local_type, local_nid) in local_node:
1343 netlist = node.getElementsByTagName('network')
1344 for dom_net in netlist:
1345 if local_type == get_attr(dom_net, 'type'):
1346 gw = get_text(dom_net, 'server')
1350 for dom_net in netlist:
1351 if local_type != get_attr(dom_net, 'type'):
1352 for route in get_routes(local_type, gw, dom_net):
1353 routes.append(route)
1358 for iface in local_node:
1359 if net.net_type == iface[0]:
1363 def find_route(net):
1364 global local_node, routes
1365 frm_type = local_node[0][0]
1366 to_type = net.net_type
1368 debug ('looking for route to', to_type,to)
1377 ############################################################
1380 def startService(dom_node, module_flag):
1381 type = getServiceType(dom_node)
1382 debug('Service:', type, getName(dom_node), getUUID(dom_node))
1383 # there must be a more dynamic way of doing this...
1389 elif type == 'lovconfig':
1390 n = LOVConfig(dom_node)
1391 elif type == 'network':
1392 n = Network(dom_node)
1403 elif type == 'mountpoint':
1404 n = Mountpoint(dom_node)
1406 panic ("unknown service type:", type)
1411 if config.cleanup():
1416 if config.nosetup():
1418 if config.cleanup():
1424 # Prepare the system to run lustre using a particular profile
1425 # in a the configuration.
1426 # * load & the modules
1427 # * setup networking for the current node
1428 # * make sure partitions are in place and prepared
1429 # * initialize devices with lctl
1430 # Levels is important, and needs to be enforced.
1431 def startProfile(lustreNode, profileNode, module_flag):
1433 panic("profile:", profile, "not found.")
1434 services = getServices(lustreNode, profileNode)
1435 if config.cleanup():
1438 startService(s[1], module_flag)
1443 def doHost(lustreNode, hosts):
1447 dom_node = getByName(lustreNode, h, 'node')
1452 print 'No host entry found.'
1455 if not get_attr(dom_node, 'router'):
1457 init_route_config(lustreNode)
1462 # Two step process: (1) load modules, (2) setup lustre
1463 # if not cleaning, load modules first.
1464 module_flag = not config.cleanup()
1465 reflist = dom_node.getElementsByTagName('profile')
1466 for profile in reflist:
1467 startProfile(lustreNode, profile, module_flag)
1469 if not config.cleanup():
1470 sys_set_debug_path()
1471 script = config.gdb_script()
1472 run(lctl.lctl, ' modules >', script)
1474 # dump /tmp/ogdb and sleep/pause here
1475 log ("The GDB module script is in", script)
1478 module_flag = not module_flag
1479 for profile in reflist:
1480 startProfile(lustreNode, profile, module_flag)
1482 ############################################################
1483 # Command line processing
1485 def parse_cmdline(argv):
1486 short_opts = "hdnvf"
1487 long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
1488 "portals=", "makeldiff", "cleanup", "noexec",
1489 "help", "node=", "nomod", "nosetup",
1490 "dump=", "force", "startlevel=", "endlevel="]
1494 opts, args = getopt.getopt(argv, short_opts, long_opts)
1495 except getopt.error:
1500 if o in ("-h", "--help"):
1502 if o in ("-d","--cleanup"):
1504 if o in ("-v", "--verbose"):
1506 if o in ("-n", "--noexec"):
1509 if o == "--portals":
1513 if o == "--reformat":
1521 if o == "--nosetup":
1525 if o in ("-f", "--force"):
1527 if o in ("--startlevel",):
1528 config.startlevel(a)
1529 if o in ("--endlevel",):
1538 s = urllib.urlopen(url)
def setupModulePath(cmd):
    # If lconf is run from a build tree (a Makefile sits beside the
    # script), point config.src_dir two levels up so kernel modules are
    # searched for in the source tree rather than installed locations.
    base = os.path.dirname(cmd)
    if os.access(base+"/Makefile", os.R_OK):
        config.src_dir(base + "/../../")
1549 def sys_set_debug_path():
1550 debug("debug path: ", config.debug_path())
1554 fp = open('/proc/sys/portals/debug_path', 'w')
1555 fp.write(config.debug_path())
1560 #/proc/sys/net/core/rmem_max
1561 #/proc/sys/net/core/wmem_max
1562 def sys_set_netmem_max(path, max):
1563 debug("setting", path, "to at least", max)
1571 fp = open(path, 'w')
1572 fp.write('%d\n' %(max))
def sys_make_devices():
    # Create the portals and obd character device nodes (char major 10,
    # minors 240/241) if they are not already present/readable.
    if not os.access('/dev/portals', os.R_OK):
        run('mknod /dev/portals c 10 240')
    if not os.access('/dev/obd', os.R_OK):
        run('mknod /dev/obd c 10 241')
# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    """Append new_dir to os.environ['PATH'] unless it is already listed.

    The visible source was missing the early return inside the membership
    test, which would have re-appended directories already on PATH; the
    guard is restored here.  str.split is used instead of the deprecated
    string.split (identical behavior, works on Python 2 and 3).
    """
    syspath = os.environ['PATH'].split(':')
    if new_dir in syspath:
        return  # already on PATH; nothing to do
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
1591 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
1592 # ensure basic elements are in the system path
1593 def sanitise_path():
1594 for dir in DEFAULT_PATH:
1597 # Initialize or shutdown lustre according to a configuration file
1598 # * prepare the system for lustre
1599 # * configure devices with lctl
1600 # Shutdown does steps in reverse
1603 global TCP_ACCEPTOR, lctl, MAXTCPBUF
1604 host = socket.gethostname()
1608 args = parse_cmdline(sys.argv[1:])
1610 if not os.access(args[0], os.R_OK):
1611 print 'File not found or readable:', args[0]
1613 dom = xml.dom.minidom.parse(args[0])
1615 xmldata = fetch(config.url())
1616 dom = xml.dom.minidom.parseString(xmldata)
1622 node_list.append(config.node())
1625 node_list.append(host)
1626 node_list.append('localhost')
1627 debug("configuring for host: ", node_list)
1630 config._debug_path = config._debug_path + '-' + host
1631 config._gdb_script = config._gdb_script + '-' + host
1633 TCP_ACCEPTOR = find_prog('acceptor')
1634 if not TCP_ACCEPTOR:
1636 TCP_ACCEPTOR = 'acceptor'
1637 debug('! acceptor not found')
1639 panic('acceptor not found')
1641 lctl = LCTLInterface('lctl')
1643 setupModulePath(sys.argv[0])
1645 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
1646 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
1647 doHost(dom.documentElement, node_list)
1649 if __name__ == "__main__":
1652 except LconfError, e:
1654 except CommandError, e:
1658 if first_cleanup_error:
1659 sys.exit(first_cleanup_error)