3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # This file is part of Lustre, http://www.lustre.org.
7 # Lustre is free software; you can redistribute it and/or
8 # modify it under the terms of version 2 of the GNU General Public
9 # License as published by the Free Software Foundation.
11 # Lustre is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with Lustre; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # lconf - lustre configuration tool
22 # lconf is the main driver script for starting and stopping
23 # lustre filesystem services.
25 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import string, os, stat, popen2, socket, time
30 import xml.dom.minidom
# Default TCP send/receive socket buffer size in bytes (1 MB).
35 DEFAULT_TCPBUF = 1048576
37 # Maximum number of devices to search for.
38 # (the /dev/loop* nodes need to be created beforehand)
39 MAX_LOOP_DEVICES = 256
# Exit status of the first cleanup step that failed (0 = no failure yet).
# Later failures are ignored so the original cause of the problem wins.
first_cleanup_error = 0
def cleanup_error(rc):
    """Record rc as the overall cleanup status, keeping only the first
    non-zero status that was ever reported."""
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
# NOTE(review): tail of the usage() help printer; the enclosing
# "def usage():" line and the closing triple quote are outside this view.
# No comments are inserted among the lines below because they all sit
# inside one string literal.
49 print """usage: lconf config.xml
51 config.xml Lustre configuration in xml format.
52 --get <url> URL to fetch a config file
53 --node <nodename> Load config for <nodename>
54 -d | --cleanup Cleans up config. (Shutdown)
55 -f | --force Forced unmounting and/or obd detach during cleanup
56 -v | --verbose Print system commands as they are run
57 -h | --help Print this help
58 --gdb Prints message after creating gdb module script
59 and sleeps for 5 seconds.
60 -n | --noexec Prints the commands and steps that will be run for a
61 config without executing them. This can used to check if a
62 config file is doing what it should be doing. (Implies -v)
63 --nomod Skip load/unload module step.
64 --nosetup Skip device setup/cleanup step.
65 --reformat Reformat all devices (without question)
66 --dump <file> Dump the kernel debug log before portals is unloaded
67 --startlevel <num> Specify the level of services to start with (default 0)
68 --endlevel <num> Specify the level of services to end with (default 100)
69 Levels are aproximatly like:
79 --ldap server LDAP server with lustre config database
80 --makeldiff Translate xml source to LDIFF
81 This are perhaps not needed:
82 --lustre="src dir" Base directory of lustre sources. Used to search
84 --portals=src Portals source
88 # ============================================================
89 # Config parameters, encapsulated in a class
# NOTE(review): the "class Config:" header and the start of __init__ are
# not visible in this chunk; the assignments below are __init__ fields.
105 self._gdb_script = '/tmp/ogdb'
106 self._debug_path = '/tmp/lustre-log'
107 self._dump_file = None
109 self._start_level = 0
110 self._end_level = 100
# Combined getter/setters: pass a value to set, call with no argument to
# read. Because of the "if flag:" truthiness test, a setting can never be
# changed back to a falsy value (0, '', None) through these accessors.
112 def verbose(self, flag = None):
113 if flag: self._verbose = flag
116 def noexec(self, flag = None):
117 if flag: self._noexec = flag
120 def reformat(self, flag = None):
121 if flag: self._reformat = flag
122 return self._reformat
124 def cleanup(self, flag = None):
125 if flag: self._cleanup = flag
128 def gdb(self, flag = None):
129 if flag: self._gdb = flag
132 def nomod(self, flag = None):
133 if flag: self._nomod = flag
136 def nosetup(self, flag = None):
137 if flag: self._nosetup = flag
140 def force(self, flag = None):
141 if flag: self._force = flag
144 def node(self, val = None):
145 if val: self._node = val
148 def url(self, val = None):
149 if val: self._url = val
# When a /r directory exists, paths get prefixed with it -- presumably a
# chroot/test root; TODO confirm.
152 def gdb_script(self):
153 if os.path.isdir('/r'):
154 return '/r' + self._gdb_script
156 return self._gdb_script
158 def debug_path(self):
159 if os.path.isdir('/r'):
160 return '/r' + self._debug_path
162 return self._debug_path
164 def src_dir(self, val = None):
165 if val: self._src_dir = val
# dump_file/startlevel/endlevel follow the same accessor pattern; the
# level values are coerced to int on set.
168 def dump_file(self, val = None):
169 if val: self._dump_file = val
170 return self._dump_file
172 def startlevel(self, val = None):
173 if val: self._start_level = int(val)
174 return self._start_level
176 def endlevel(self, val = None):
177 if val: self._end_level = int(val)
178 return self._end_level
# ============================================================
# debugging and error funcs
def fixme(msg = "this feature"):
    """Placeholder for unimplemented functionality.

    Raises:
        LconfError: always, stating that *msg* is not implemented yet.
    """
    # Call-style raise for consistency with the LconfError(msg) raise used
    # elsewhere in this file; also fixes the 'implmemented' message typo.
    raise LconfError(msg + ' not implemented yet.')
# NOTE(review): fragments of the error/logging helpers; the enclosing
# "def" lines (panic/log/debug by the look of the bodies) are not visible
# in this chunk. Each joins its *args into one space-separated message.
191 msg = string.join(map(str,args))
192 if not config.noexec():
193 raise LconfError(msg)
198 msg = string.join(map(str,args))
203 print string.strip(s)
207 msg = string.join(map(str,args))
210 # ============================================================
211 # locally defined exceptions
# Raised when an external command (lctl, insmod, mount, ...) fails.
# cmd_err may be a plain string or a list of output lines; rc is the
# command's exit status (None when unknown).
212 class CommandError (exceptions.Exception):
213 def __init__(self, cmd_name, cmd_err, rc=None):
214 self.cmd_name = cmd_name
215 self.cmd_err = cmd_err
# NOTE(review): original lines 216-219 are not visible here; self.rc is
# referenced below, so it is presumably assigned there, and the printing
# code below is presumably the body of a dump()-style method -- confirm.
220 if type(self.cmd_err) == types.StringType:
222 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
224 print "! %s: %s" % (self.cmd_name, self.cmd_err)
225 elif type(self.cmd_err) == types.ListType:
227 print "! %s (error %d):" % (self.cmd_name, self.rc)
229 print "! %s:" % (self.cmd_name)
230 for s in self.cmd_err:
231 print "> %s" %(string.strip(s))
# Generic configuration error raised by lconf itself (see the panic-style
# helper above, which raises it).
235 class LconfError (exceptions.Exception):
236 def __init__(self, args):
# NOTE(review): the constructor body (presumably "self.args = args") is
# not visible in this chunk.
240 # ============================================================
241 # handle lctl interface
# NOTE(review): the class statement for this lctl wrapper and many body
# lines are missing from this chunk. Each method below builds a small
# lctl command script and feeds it to the lctl binary on stdin.
244 Manage communication with lctl
247 def __init__(self, cmd):
249 Initialize close by finding the lctl binary.
251 self.lctl = find_prog(cmd)
254 debug('! lctl not found')
257 raise CommandError('lctl', "unable to find lctl binary.")
262 the cmds are written to stdin of lctl
263 lctl doesn't return errors when run in script mode, so
265 should modify command line to accept multiple commands, or
266 create complex command line options
# Pipe the script into lctl via popen2 and collect stdout/stderr lines;
# under --noexec nothing is run and (0, []) is returned.
268 debug("+", self.lctl, cmds)
269 if config.noexec(): return (0, [])
270 p = popen2.Popen3(self.lctl, 1)
271 p.tochild.write(cmds + "\n")
273 out = p.fromchild.readlines()
274 err = p.childerr.readlines()
276 if os.WIFEXITED(ret):
277 rc = os.WEXITSTATUS(ret)
281 raise CommandError(self.lctl, err, rc)
# runcmd: invoke lctl with arguments on its command line instead of stdin.
284 def runcmd(self, *args):
286 run lctl using the command line
288 cmd = string.join(map(str,args))
289 debug("+", self.lctl, cmd)
290 rc, out = run(self.lctl, cmd)
292 raise CommandError(self.lctl, out, rc)
296 def network(self, net, nid):
297 """ initialized network and add "self" """
298 # Idea: "mynid" could be used for all network types to add "self," and then
299 # this special case would be gone and the "self" hack would be hidden.
305 quit""" % (net, nid, nid)
314 # create a new connection
315 def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
323 quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
329 quit""" % (net, servuuid, nid, nid, port, )
333 # add a route to a range
334 def add_route(self, net, gw, lo, hi):
338 quit """ % (net, gw, lo, hi)
342 def del_route(self, net, gw, lo, hi):
350 # add a route to a host
351 def add_route_host(self, net, uuid, gw, tgt):
356 quit """ % (net, uuid, tgt, gw, tgt)
359 # add a route to a range
360 def del_route_host(self, net, uuid, gw, tgt):
366 quit """ % (net, uuid, tgt)
369 # disconnect one connection
370 def disconnect(self, net, nid, port, servuuid):
376 quit""" % (net, nid, servuuid)
380 def disconnectAll(self, net):
389 # create a new device with lctl
390 def newdev(self, attach, setup = ""):
395 quit""" % (attach, setup)
# cleanup: detach the named device, appending "force" under --force.
399 def cleanup(self, name, uuid):
405 quit""" % (name, ('', 'force')[config.force()])
# NOTE(review): in lov_setconfig the visible template line (413) shows six
# %-specifiers while the tuple on line 414 has seven values, and the tuple
# leads with mdsuuid although the signature leads with uuid. Lines 410-412
# are not visible -- verify the format string matches its arguments.
409 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
413 lov_setconfig %s %d %d %d %s %s
414 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
418 def dump(self, dump_file):
421 quit""" % (dump_file)
424 # get list of devices
425 def device_list(self):
426 rc, out = self.runcmd('device_list')
429 # ============================================================
430 # Various system-level functions
431 # (ideally moved to their own module)
433 # Run a command and return the output and status.
434 # stderr is sent to /dev/null, could use popen3 to
435 # save it if necessary
# NOTE(review): contrary to the comment above, '2>&1' merges stderr into
# stdout rather than discarding it. The "def run(*args):" line itself is
# not visible in this chunk.
437 cmd = string.join(map(str,args))
439 if config.noexec(): return (0, [])
440 f = os.popen(cmd + ' 2>&1')
449 # Run a command in the background.
# Returns 0 immediately under --noexec; otherwise launches the command
# via a shell pipe (completion handling is in lines not visible here).
450 def run_daemon(*args):
451 cmd = string.join(map(str,args))
453 if config.noexec(): return 0
454 f = os.popen(cmd + ' 2>&1')
462 # Determine full path to use for an external command
463 # searches dirname(argv[0]) first, then PATH
# NOTE(review): the "def find_prog(cmd):" line is not visible in this
# chunk. The relative portals utils build directory is pushed to the
# front of the search path, ahead of dirname(argv[0]) and PATH.
465 syspath = string.split(os.environ['PATH'], ':')
466 cmdpath = os.path.dirname(sys.argv[0])
467 syspath.insert(0, cmdpath);
468 syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
# Return the first candidate that exists and is executable.
470 prog = os.path.join(d,cmd)
471 if os.access(prog, os.X_OK):
475 # Recursively look for file starting at base dir
476 def do_find_file(base, mod):
477 fullname = os.path.join(base, mod)
478 if os.access(fullname, os.R_OK):
# Recurse into each subdirectory until the module file is found.
480 for d in os.listdir(base):
481 dir = os.path.join(base,d)
482 if os.path.isdir(dir):
483 module = do_find_file(dir, mod)
# find_module: look for <modname>.o, first at the expected build path
# src_dir/dev_dir/, then (in lines not visible here) by recursive search.
487 def find_module(src_dir, dev_dir, modname):
488 mod = '%s.o' % (modname)
489 module = src_dir +'/'+ dev_dir +'/'+ mod
491 if os.access(module, os.R_OK):
497 # is the path a block device?
# NOTE(review): only the tail of is_block() is visible; "s" is presumably
# an os.stat() result guarded in the missing lines -- confirm.
504 return stat.S_ISBLK(s[stat.ST_MODE])
506 # build fs according to type
508 def mkfs(fstype, dev):
# NOTE(review): the local name "mkfs" shadows this function's own name;
# extN is formatted with 'mkfs.ext2 -j' (i.e. as ext3 with a journal).
509 if(fstype in ('ext3', 'extN')):
510 mkfs = 'mkfs.ext2 -j -b 4096'
512 print 'unsupported fs type: ', fstype
513 if not is_block(dev):
517 (ret, out) = run (mkfs, force, dev)
519 panic("Unable to build fs:", dev)
520 # enable hash tree indexing on fsswe
521 # FIXME: this check can probably go away on 2.5
523 htree = 'echo "feature FEATURE_C5" | debugfs -w'
524 (ret, out) = run (htree, dev)
526 panic("Unable to enable htree:", dev)
528 # some systems use /dev/loopN, some /dev/loop/N
# NOTE(review): tail of a loop-device-base probe; panics when neither
# naming scheme yields an accessible loop0 node.
532 if not os.access(loop + str(0), os.R_OK):
534 if not os.access(loop + str(0), os.R_OK):
535 panic ("can't access loop devices")
538 # find loop device assigned to thefile
# NOTE(review): the "def find_loop(file):" line is not visible. The local
# names "stat" and "file" shadow the stat module and the file builtin.
541 for n in xrange(0, MAX_LOOP_DEVICES):
543 if os.access(dev, os.R_OK):
# Parse "losetup <dev>" output; the backing file appears in parentheses.
544 (stat, out) = run('losetup', dev)
545 if (out and stat == 0):
546 m = re.search(r'\((.*)\)', out[0])
547 if m and file == m.group(1):
553 # create file if necessary and assign the first free loop device
554 def init_loop(file, size, fstype):
555 dev = find_loop(file)
557 print 'WARNING file:', file, 'already mapped to', dev
# Create a sparse backing file of <size> KB (dd with count=0, seek=size)
# when reformatting or when the file is missing/unreadable.
559 if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
560 run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
562 # find next free loop
563 for n in xrange(0, MAX_LOOP_DEVICES):
565 if os.access(dev, os.R_OK):
566 (stat, out) = run('losetup', dev)
568 run('losetup', dev, file)
571 print "out of loop devices"
573 print "out of loop devices"
576 # undo loop assignment
577 def clean_loop(file):
578 dev = find_loop(file)
580 ret, out = run('losetup -d', dev)
582 log('unable to clean loop device:', dev, 'for file:', file)
585 # determine if dev is formatted as a <fstype> filesystem
586 def need_format(fstype, dev):
587 # FIXME don't know how to implement this
590 # initialize a block device if needed
# Map a regular file onto a loop device when needed, then (in lines not
# visible here) format it when --reformat is set or autoformat is 'yes'.
591 def block_dev(dev, size, fstype, format):
592 if config.noexec(): return dev
593 if not is_block(dev):
594 dev = init_loop(dev, size, fstype)
595 if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
599 # panic("device:", dev,
600 # "not prepared, and autoformat is not set.\n",
601 # "Rerun with --reformat option to format ALL filesystems")
# NOTE(review): if2addr() parses "ifconfig <iface>" output; the second
# output line is expected to look like "inet addr:x.x.x.x ..." -- this is
# fragile across ifconfig versions.
606 """lookup IP address for an interface"""
607 rc, out = run("/sbin/ifconfig", iface)
610 addr = string.split(out[1])[1]
611 ip = string.split(addr, ':')[1]
614 def get_local_address(net_type, wildcard):
615 """Return the local address for the network type."""
# tcp: either resolve an explicit "iface:*" wildcard via ifconfig, or
# fall back to the address the hostname resolves to.
617 if net_type == 'tcp':
619 iface, star = string.split(wildcard, ':')
620 local = if2addr(iface)
622 panic ("unable to determine ip for:", wildcard)
624 host = socket.gethostname()
625 local = socket.gethostbyname(host)
626 elif net_type == 'elan':
627 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
# elan: read the node id out of /proc (the parsing lines are not visible
# in this chunk).
629 fp = open('/proc/elan/device0/position', 'r')
630 lines = fp.readlines()
639 elif net_type == 'gm':
640 fixme("automatic local address for GM")
644 def is_prepared(uuid):
645 """Return true if a device exists for the uuid"""
646 # expect this format:
647 # 1 UP ldlm ldlm ldlm_UUID 2
# Scan the lctl device_list output; column 5 holds the device uuid.
648 out = lctl.device_list()
650 if uuid == string.split(s)[4]:
655 # ============================================================
656 # Classes to prepare and cleanup the various objects
# NOTE(review): the "class Module:" statement is not visible in this
# chunk; this is the base class for all the service wrappers below.
659 """ Base class for the rest of the modules. The default cleanup method is
660 defined here, as well as some utility funcs.
662 def __init__(self, module_name, dom_node):
663 self.dom_node = dom_node
664 self.module_name = module_name
665 self.name = get_attr(dom_node, 'name')
666 self.uuid = get_attr(dom_node, 'uuid')
# Ordered list of (source-subdir, module-name) kernel modules to load.
667 self.kmodule_list = []
671 def info(self, *args):
672 msg = string.join(map(str,args))
673 print self.module_name + ":", self.name, self.uuid, msg
676 def lookup_server(self, srv_uuid):
677 """ Lookup a server's network information """
678 net = get_ost_net(self.dom_node.parentNode, srv_uuid)
680 panic ("Unable to find a server for:", srv_uuid)
681 self._server = Network(net)
683 def get_server(self):
# Default cleanup: disconnect from the server (when it is on a local net)
# and detach the device, logging -- not raising -- on CommandError.
687 """ default cleanup, used for most modules """
688 if not is_prepared(self.uuid):
691 srv = self.get_server()
692 if srv and local_net(srv):
694 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
695 except CommandError, e:
696 log(self.module_name, "disconnect failed: ", self.name)
700 lctl.cleanup(self.name, self.uuid)
701 except CommandError, e:
702 log(self.module_name, "cleanup failed: ", self.name)
706 def add_module(self, dev_dir, modname):
707 """Append a module to list of modules to load."""
708 self.kmodule_list.append((dev_dir, modname))
710 def mod_loaded(self, modname):
711 """Check if a module is already loaded. Look in /proc/modules for it."""
712 fp = open('/proc/modules')
713 lines = fp.readlines()
715 # please forgive my tired fingers for this one
716 ret = filter(lambda word, mod=modname: word == mod,
717 map(lambda line: string.split(line)[0], lines))
720 def load_module(self):
721 """Load all the modules in the list in the order they appear."""
722 for dev_dir, mod in self.kmodule_list:
723 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
# Already-loaded modules are skipped (except under --noexec, which only
# prints what would happen).
724 if self.mod_loaded(mod) and not config.noexec():
726 log ('loading module:', mod)
728 module = find_module(config.src_dir(),dev_dir, mod)
730 panic('module not found:', mod)
731 (rc, out) = run('/sbin/insmod', module)
733 raise CommandError('insmod', out, rc)
735 (rc, out) = run('/sbin/modprobe', mod)
737 raise CommandError('modprobe', out, rc)
739 def cleanup_module(self):
740 """Unload the modules in the list in reverse order."""
# NOTE(review): line 742 is not visible; if it is "rev.reverse()", that
# reverses self.kmodule_list in place since rev is not a copy -- verify.
741 rev = self.kmodule_list
743 for dev_dir, mod in rev:
744 if not self.mod_loaded(mod):
# Dump the kernel debug log before portals goes away, if requested.
747 if mod == 'portals' and config.dump_file():
748 lctl.dump(config.dump_file())
749 log('unloading module:', mod)
752 (rc, out) = run('/sbin/rmmod', mod)
754 log('! unable to unload module:', mod)
# Wraps one <network> element: picks the NAL kernel modules for the
# network type and brings connections/routes up and down via lctl.
758 class Network(Module):
759 def __init__(self,dom_node):
760 Module.__init__(self, 'NETWORK', dom_node)
761 self.net_type = get_attr(dom_node,'type')
762 self.nid = get_text(dom_node, 'server', '*')
763 self.port = get_text_int(dom_node, 'port', 0)
764 self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
765 self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
# A wildcard nid is resolved to this host's address for the net type.
767 self.nid = get_local_address(self.net_type, self.nid)
769 panic("unable to set nid for", self.net_type, self.nid)
770 debug("nid:", self.nid)
772 self.add_module('portals/linux/oslib/', 'portals')
773 if node_needs_router():
774 self.add_module('portals/linux/router', 'kptlrouter')
775 if self.net_type == 'tcp':
776 self.add_module('portals/linux/socknal', 'ksocknal')
777 if self.net_type == 'toe':
778 self.add_module('portals/linux/toenal', 'ktoenal')
779 if self.net_type == 'elan':
780 self.add_module('portals/linux/rqswnal', 'kqswnal')
781 if self.net_type == 'gm':
782 self.add_module('portals/linux/gmnal', 'kgmnal')
783 self.add_module('lustre/obdclass', 'obdclass')
784 self.add_module('lustre/ptlrpc', 'ptlrpc')
# prepare(): start the tcp/toe acceptor, install routes from route_tbl,
# connect to direct tcp peers, then register the network with lctl.
787 self.info(self.net_type, self.nid, self.port)
788 if self.net_type in ('tcp', 'toe'):
789 nal_id = '' # default is socknal
790 if self.net_type == 'toe':
792 ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, nal_id, self.port)
794 raise CommandError(TCP_ACCEPTOR, out, ret)
795 ret = self.dom_node.getElementsByTagName('route_tbl')
797 for r in a.getElementsByTagName('route'):
798 net_type = get_attr(r, 'type')
799 gw = get_attr(r, 'gw')
800 lo = get_attr(r, 'lo')
801 hi = get_attr(r,'hi', '')
802 lctl.add_route(net_type, gw, lo, hi)
# A tcp route on our own net type with no 'hi' bound names a single
# peer: connect to it directly.
803 if net_type == 'tcp' and net_type == self.net_type and hi == '':
804 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
806 panic("no server for nid", lo)
808 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
811 lctl.network(self.net_type, self.nid)
812 lctl.newdev(attach = "ptlrpc RPCDEV RPCDEV_UUID")
# cleanup(): mirror of prepare -- tear down connections, routes, the
# RPCDEV device, and finally kill the tcp acceptor.
815 self.info(self.net_type, self.nid, self.port)
816 ret = self.dom_node.getElementsByTagName('route_tbl')
818 for r in a.getElementsByTagName('route'):
819 lo = get_attr(r, 'lo')
820 hi = get_attr(r,'hi', '')
821 if self.net_type == 'tcp' and hi == '':
822 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
824 panic("no server for nid", lo)
827 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
828 except CommandError, e:
829 print "disconnect failed: ", self.name
833 lctl.del_route(self.net_type, self.nid, lo, hi)
834 except CommandError, e:
835 print "del_route failed: ", self.name
840 lctl.cleanup("RPCDEV", "RPCDEV_UUID")
841 except CommandError, e:
842 print "cleanup failed: ", self.name
846 lctl.disconnectAll(self.net_type)
847 except CommandError, e:
848 print "disconnectAll failed: ", self.name
851 if self.net_type == 'tcp':
852 # yikes, this ugly! need to save pid in /var/something
853 run("killall acceptor")
# NOTE(review): the "class LDLM(Module):" header is not visible; below
# are its __init__ and the attach call from its prepare() method.
856 def __init__(self,dom_node):
857 Module.__init__(self, 'LDLM', dom_node)
858 self.add_module('lustre/ldlm', 'ldlm')
860 if is_prepared(self.uuid):
863 lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
# NOTE(review): the "class LOV(Module):" header is not visible. A LOV
# aggregates a list of member OSCs and is configured against one MDS.
867 def __init__(self,dom_node):
868 Module.__init__(self, 'LOV', dom_node)
869 self.mds_uuid = get_first_ref(dom_node, 'mds')
870 mds= lookup(dom_node.parentNode, self.mds_uuid)
871 self.mds_name = getName(mds)
872 devs = dom_node.getElementsByTagName('devices')
875 self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
876 self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
877 self.pattern = get_attr_int(dev_node, 'pattern', 0)
878 self.devlist = get_all_refs(dev_node, 'osc')
# Default stripe count is "over all member OSCs".
879 self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
880 self.add_module('lustre/mdc', 'mdc')
881 self.add_module('lustre/lov', 'lov')
# prepare(): bring up each member OSC, then the mdc, then attach the lov.
884 if is_prepared(self.uuid):
886 for osc_uuid in self.devlist:
887 osc = lookup(self.dom_node.parentNode, osc_uuid)
892 panic('osc not found:', osc_uuid)
893 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
894 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
895 self.stripe_off, self.pattern, self.devlist, self.mds_name)
896 lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
897 setup ="%s" % (mdc_uuid))
# cleanup(): tear down member OSCs and the mdc (inverse of prepare).
900 if not is_prepared(self.uuid):
902 for osc_uuid in self.devlist:
903 osc = lookup(self.dom_node.parentNode, osc_uuid)
908 panic('osc not found:', osc_uuid)
910 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
913 def load_module(self):
914 for osc_uuid in self.devlist:
915 osc = lookup(self.dom_node.parentNode, osc_uuid)
921 panic('osc not found:', osc_uuid)
922 Module.load_module(self)
925 def cleanup_module(self):
926 Module.cleanup_module(self)
927 for osc_uuid in self.devlist:
928 osc = lookup(self.dom_node.parentNode, osc_uuid)
934 panic('osc not found:', osc_uuid)
# LOVConfig: pushes an existing LOV's striping parameters to the MDS.
936 class LOVConfig(Module):
937 def __init__(self,dom_node):
938 Module.__init__(self, 'LOVConfig', dom_node)
939 self.lov_uuid = get_first_ref(dom_node, 'lov')
940 l = lookup(dom_node.parentNode, self.lov_uuid)
945 self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
946 lov.pattern, lov.devlist, lov.mds_name)
# NOTE(review): lov.mds_name is passed where lov_setconfig declares a
# mdsuuid parameter -- verify against the lctl lov_setconfig signature.
947 lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
948 lov.stripe_sz, lov.stripe_off, lov.pattern,
949 string.join(lov.devlist))
# NOTE(review): the "class MDS(Module):" header is not visible.
957 def __init__(self,dom_node):
958 Module.__init__(self, 'MDS', dom_node)
959 self.devname, self.size = get_device(dom_node)
960 self.fstype = get_text(dom_node, 'fstype')
961 # FIXME: if fstype not set, then determine based on kernel version
962 self.format = get_text(dom_node, 'autoformat', "no")
963 if self.fstype == 'extN':
964 self.add_module('lustre/extN', 'extN')
965 self.add_module('lustre/mds', 'mds')
966 self.add_module('lustre/mds', 'mds_%s' % (self.fstype))
# prepare(): format/attach the backing device, create the shared MDT
# device once per node, then attach this MDS on top of it.
969 if is_prepared(self.uuid):
971 self.info(self.devname, self.fstype, self.format)
972 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
973 if not is_prepared('MDT_UUID'):
974 lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
976 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
977 setup ="%s %s" %(blkdev, self.fstype))
# cleanup(): remove the shared MDT first, then this MDS and its loop dev.
979 if is_prepared('MDT_UUID'):
981 lctl.cleanup("MDT", "MDT_UUID")
982 except CommandError, e:
983 print "cleanup failed: ", self.name
986 if not is_prepared(self.uuid):
989 clean_loop(self.devname)
991 # Very unusual case, as there is no MDC element in the XML anymore
992 # Builds itself from an MDS node
# NOTE(review): the MDC class header is not visible; the base-class
# __init__ is deliberately not called -- fields are filled in by hand.
994 def __init__(self,dom_node):
995 self.mds = MDS(dom_node)
996 self.dom_node = dom_node
997 self.module_name = 'MDC'
998 self.kmodule_list = []
# The MDC is named after this host and its MDS.
1002 host = socket.gethostname()
1003 self.name = 'MDC_%s_%s' % ( host, self.mds.name )
1004 self.uuid = self.name + '_UUID'
1006 self.lookup_server(self.mds.uuid)
1007 self.add_module('lustre/mdc', 'mdc')
1010 if is_prepared(self.uuid):
1012 self.info(self.mds.uuid)
1013 srv = self.get_server()
1014 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1015 lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
1016 setup ="%s %s" %(self.mds.uuid, srv.uuid))
# NOTE(review): the "class OBD(Module):" header is not visible.
1019 def __init__(self, dom_node):
1020 Module.__init__(self, 'OBD', dom_node)
1021 self.obdtype = get_attr(dom_node, 'type')
1022 self.devname, self.size = get_device(dom_node)
1023 self.fstype = get_text(dom_node, 'fstype')
1024 # FIXME: if fstype not set, then determine based on kernel version
1025 self.format = get_text(dom_node, 'autoformat', 'yes')
1026 if self.fstype == 'extN':
1027 self.add_module('lustre/extN', 'extN')
1028 self.add_module('lustre/' + self.obdtype, self.obdtype)
1030 # need to check /proc/mounts and /etc/mtab before
1031 # formatting anything.
1032 # FIXME: check if device is already formatted.
1034 if is_prepared(self.uuid):
1036 self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
# obdecho is a memory-only echo device: no backing block device needed.
1037 if self.obdtype == 'obdecho':
1040 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
1041 lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
1042 setup ="%s %s" %(blkdev, self.fstype))
1044 if not is_prepared(self.uuid):
1046 Module.cleanup(self)
1047 if not self.obdtype == 'obdecho':
1048 clean_loop(self.devname)
# NOTE(review): the "class OST(Module):" header is not visible; an OST
# exports one OBD, referenced by uuid.
1051 def __init__(self,dom_node):
1052 Module.__init__(self, 'OST', dom_node)
1053 self.obd_uuid = get_first_ref(dom_node, 'obd')
1054 self.add_module('lustre/ost', 'ost')
1057 if is_prepared(self.uuid):
1059 self.info(self.obd_uuid)
1060 lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
1061 setup ="%s" % (self.obd_uuid))
1064 # virtual interface for OSC and LOV
# Chooses LOV or OSC at construction time from the XML tag and forwards
# the Module operations to the chosen object.
1066 def __init__(self,dom_node):
1067 Module.__init__(self, 'VOSC', dom_node)
1068 if dom_node.nodeName == 'lov':
1069 self.osc = LOV(dom_node)
1071 self.osc = OSC(dom_node)
1076 def load_module(self):
1077 self.osc.load_module()
1078 def cleanup_module(self):
1079 self.osc.cleanup_module()
# NOTE(review): the "class OSC(Module):" header is not visible.
1083 def __init__(self,dom_node):
1084 Module.__init__(self, 'OSC', dom_node)
1085 self.obd_uuid = get_first_ref(dom_node, 'obd')
1086 self.ost_uuid = get_first_ref(dom_node, 'ost')
1087 self.lookup_server(self.ost_uuid)
1088 self.add_module('lustre/osc', 'osc')
# prepare(): connect (directly, or through a routed gateway found via
# find_route) to the OST's server, then attach the osc device.
1091 if is_prepared(self.uuid):
1093 self.info(self.obd_uuid, self.ost_uuid)
1094 srv = self.get_server()
1096 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1100 lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
1102 panic ("no route to", srv.nid)
1104 lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
1105 setup ="%s %s" %(self.obd_uuid, srv.uuid))
1108 if not is_prepared(self.uuid):
1110 srv = self.get_server()
1112 Module.cleanup(self)
# For routed setups, also remove the host route added in prepare().
1114 self.info(self.obd_uuid, self.ost_uuid)
1118 lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
1119 except CommandError, e:
1120 print "del_route failed: ", self.name
1123 Module.cleanup(self)
# Client-side mountpoint: mounts a lustre_lite filesystem assembled from
# an osc (or lov) and an mdc.
1126 class Mountpoint(Module):
1127 def __init__(self,dom_node):
1128 Module.__init__(self, 'MTPT', dom_node)
1129 self.path = get_text(dom_node, 'path')
1130 self.mds_uuid = get_first_ref(dom_node, 'mds')
1131 self.lov_uuid = get_first_ref(dom_node, 'osc')
1132 self.add_module('lustre/mdc', 'mdc')
1133 self.add_module('lustre/llite', 'llite')
# NOTE(review): self.osc (used by load_module below) is presumably
# assigned from this lookup in lines not visible here -- confirm.
1134 l = lookup(self.dom_node.parentNode, self.lov_uuid)
# prepare(): bring up the mdc, create the mount directory, and mount.
1139 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
1140 self.info(self.path, self.mds_uuid, self.lov_uuid)
1141 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
1142 (self.lov_uuid, mdc_uuid, self.path)
1143 run("mkdir", self.path)
1146 panic("mount failed:", self.path)
# cleanup(): forced umount first (under --force), plain umount otherwise.
1149 self.info(self.path, self.mds_uuid,self.lov_uuid)
1151 (rc, out) = run("umount -f", self.path)
1153 (rc, out) = run("umount", self.path)
1155 log("umount failed, cleanup will most likely not work.")
1156 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1158 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
1160 def load_module(self):
1161 self.osc.load_module()
1162 Module.load_module(self)
1163 def cleanup_module(self):
1164 Module.cleanup_module(self)
1165 self.osc.cleanup_module()
1168 # ============================================================
1169 # XML processing and query
1170 # TODO: Change query funcs to use XPath, which is much cleaner
# Return (device-path, size) read from a node's <device> child.
1172 def get_device(obd):
1173 list = obd.getElementsByTagName('device')
1177 size = get_attr_int(dev, 'size', 0)
1178 return dev.firstChild.data, size
1181 # Get the text content from the first matching child
1182 # If there is no content (or it is all whitespace), return
1184 def get_text(dom_node, tag, default=""):
1185 list = dom_node.getElementsByTagName(tag)
# NOTE(review): the visible code normalizes and reads dom_node.firstChild,
# not the matched child from "list"; lines 1186-1187 are missing, so
# whether dom_node is rebound to the child first cannot be confirmed here.
1188 dom_node.normalize()
1189 if dom_node.firstChild:
1190 txt = string.strip(dom_node.firstChild.data)
1195 def get_text_int(dom_node, tag, default=0):
1196 list = dom_node.getElementsByTagName(tag)
1200 dom_node.normalize()
1201 if dom_node.firstChild:
1202 txt = string.strip(dom_node.firstChild.data)
1207 panic("text value is not integer:", txt)
# Attribute accessors with defaults; the int variant panics on non-ints.
1210 def get_attr(dom_node, attr, default=""):
1211 v = dom_node.getAttribute(attr)
1216 def get_attr_int(dom_node, attr, default=0):
1218 v = dom_node.getAttribute(attr)
1223 panic("attr value is not integer", v)
1226 def get_first_ref(dom_node, tag):
1227 """ Get the first uuidref of the type TAG. Used one only
1228 one is expected. Returns the uuid."""
1230 refname = '%s_ref' % tag
1231 list = dom_node.getElementsByTagName(refname)
1233 uuid = getRef(list[0])
1236 def get_all_refs(dom_node, tag):
1237 """ Get all the refs of type TAG. Returns list of uuids. """
1239 refname = '%s_ref' % tag
1240 list = dom_node.getElementsByTagName(refname)
1243 uuids.append(getRef(i))
1246 def get_ost_net(dom_node, uuid):
1247 ost = lookup(dom_node, uuid)
1248 uuid = get_first_ref(ost, 'network')
1251 return lookup(dom_node, uuid)
# Find the <network> element whose <server> text equals nid.
1253 def nid2server(dom_node, nid):
1254 netlist = dom_node.getElementsByTagName('network')
1255 for net_node in netlist:
1256 if get_text(net_node, 'server') == nid:
1257 return Network(net_node)
# Search child elements for one whose uuid attribute matches.
1260 def lookup(dom_node, uuid):
1261 for n in dom_node.childNodes:
1262 if n.nodeType == n.ELEMENT_NODE:
1263 if getUUID(n) == uuid:
# Fetch the "name" attribute from a DOM element.
def getName(dom_node):
    """Return the value of dom_node's 'name' attribute ('' when unset)."""
    name_attr = dom_node.getAttribute('name')
    return name_attr
def getRef(dom_node):
    """Return the value of dom_node's 'uuidref' attribute ('' when unset)."""
    ref = dom_node.getAttribute('uuidref')
    return ref
# Fetch the "uuid" attribute from a DOM element.
def getUUID(dom_node):
    """Return the value of dom_node's 'uuid' attribute ('' when unset)."""
    return dom_node.getAttribute('uuid')
# An element's tag name doubles as its service type (e.g. "mds", "ost").
# fixme: this should do some checks to make sure the dom_node is a service
def getServiceType(dom_node):
    """Return the service type of dom_node, i.e. its XML tag name."""
    tag = dom_node.nodeName
    return tag
1287 # determine what "level" a particular node is at.
1288 # the order of initialization is based on level.
# NOTE(review): the "ret = <level>" assignments for each branch are not
# visible in this chunk; levels outside [startlevel, endlevel] are
# filtered at the end.
1289 def getServiceLevel(dom_node):
1290 type = getServiceType(dom_node)
1292 if type in ('network',):
1294 elif type in ('device', 'ldlm'):
1296 elif type in ('obd', 'mdd'):
1298 elif type in ('mds','ost'):
1300 elif type in ('mdc','osc'):
1302 elif type in ('lov', 'lovconfig'):
1304 elif type in ('mountpoint',):
1307 if ret < config.startlevel() or ret > config.endlevel():
1312 # return list of services in a profile. list is a list of tuples
1313 # [(level, dom_node),]
1314 def getServices(lustreNode, profileNode):
1316 for n in profileNode.childNodes:
1317 if n.nodeType == n.ELEMENT_NODE:
1318 servNode = lookup(lustreNode, getRef(n))
1321 panic('service not found: ' + getRef(n))
1322 level = getServiceLevel(servNode)
1324 list.append((level, servNode))
# Find a node of the given tag whose name attribute matches.
1328 def getByName(lustreNode, name, tag):
1329 ndList = lustreNode.getElementsByTagName(tag)
1331 if getName(nd) == name:
1336 ############################################################
1338 # FIXME: clean this mess up!
# prepare_mdc: create (at most once per MDS) the client-side MDC for the
# given MDS uuid, memoized in the saved_mdc dict; returns the mdc uuid.
1341 def prepare_mdc(dom_node, mds_uuid):
1343 mds_node = lookup(dom_node, mds_uuid);
1345 panic("no mds:", mds_uuid)
1346 if saved_mdc.has_key(mds_uuid):
1347 return saved_mdc[mds_uuid]
1350 saved_mdc[mds_uuid] = mdc.uuid
1353 def cleanup_mdc(dom_node, mds_uuid):
1355 mds_node = lookup(dom_node, mds_uuid);
1357 panic("no mds:", mds_uuid)
1358 if not saved_mdc.has_key(mds_uuid):
# NOTE(review): re-registering the mdc during *cleanup* looks suspicious;
# one would expect the saved_mdc entry to be removed here -- verify.
1361 saved_mdc[mds_uuid] = mdc.uuid
1364 ############################################################
1365 # routing ("rooting")
# Record this node's (net_type, nid) pairs in the global local_node list.
1371 def init_node(dom_node):
1372 global local_node, router_flag
1373 netlist = dom_node.getElementsByTagName('network')
1374 for dom_net in netlist:
1375 type = get_attr(dom_net, 'type')
1376 gw = get_text(dom_net, 'server')
1377 local_node.append((type, gw))
1379 def node_needs_router():
# Collect (type, gw, lo, hi) route tuples from a network's route_tbl.
1382 def get_routes(type, gw, dom_net):
1383 """ Return the routes as a list of tuples of the form:
1384 [(type, gw, lo, hi),]"""
1386 tbl = dom_net.getElementsByTagName('route_tbl')
1388 routes = t.getElementsByTagName('route')
1390 lo = get_attr(r, 'lo')
1391 hi = get_attr(r, 'hi', '')
1392 res.append((type, gw, lo, hi))
1396 def init_route_config(lustre):
1397 """ Scan the lustre config looking for routers. Build list of
1399 global routes, router_flag
1401 list = lustre.getElementsByTagName('node')
1403 if get_attr(node, 'router'):
1405 for (local_type, local_nid) in local_node:
1407 netlist = node.getElementsByTagName('network')
# First pass: find the router's gateway address on our own net type.
1408 for dom_net in netlist:
1409 if local_type == get_attr(dom_net, 'type'):
1410 gw = get_text(dom_net, 'server')
# Second pass: collect routes reachable through that gateway on the
# router's other networks.
1414 for dom_net in netlist:
1415 if local_type != get_attr(dom_net, 'type'):
1416 for route in get_routes(local_type, gw, dom_net):
1417 routes.append(route)
# local_net(net): true when this host has an interface of net's type.
1422 for iface in local_node:
1423 if net.net_type == iface[0]:
1427 def find_route(net):
1428 global local_node, routes
1429 frm_type = local_node[0][0]
1430 to_type = net.net_type
1432 debug ('looking for route to', to_type,to)
1441 ############################################################
1444 def startService(dom_node, module_flag):
1445 type = getServiceType(dom_node)
1446 debug('Service:', type, getName(dom_node), getUUID(dom_node))
1447 # there must be a more dynamic way of doing this...
1453 elif type == 'lovconfig':
1454 n = LOVConfig(dom_node)
1455 elif type == 'network':
1456 n = Network(dom_node)
1467 elif type == 'mountpoint':
1468 n = Mountpoint(dom_node)
1470 panic ("unknown service type:", type)
1475 if config.cleanup():
1480 if config.nosetup():
1482 if config.cleanup():
1488 # Prepare the system to run lustre using a particular profile
1489 # in the configuration.
1490 # * load the modules
1491 # * setup networking for the current node
1492 # * make sure partitions are in place and prepared
1493 # * initialize devices with lctl
1494 # Level ordering is important, and needs to be enforced.
1495 def startProfile(lustreNode, profileNode, module_flag):
1497 panic("profile:", profile, "not found.")
1498 services = getServices(lustreNode, profileNode)
1499 if config.cleanup():
1502 startService(s[1], module_flag)
1507 def doHost(lustreNode, hosts):
1511 dom_node = getByName(lustreNode, h, 'node')
1516 print 'No host entry found.'
1519 if not get_attr(dom_node, 'router'):
1521 init_route_config(lustreNode)
1526 # Two step process: (1) load modules, (2) setup lustre
1527 # if not cleaning, load modules first.
1528 module_flag = not config.cleanup()
1529 reflist = dom_node.getElementsByTagName('profile')
1530 for profile in reflist:
1531 startProfile(lustreNode, profile, module_flag)
1533 if not config.cleanup():
1534 sys_set_debug_path()
1535 script = config.gdb_script()
1536 run(lctl.lctl, ' modules >', script)
1538 # dump /tmp/ogdb and sleep/pause here
1539 log ("The GDB module script is in", script)
1542 module_flag = not module_flag
1543 for profile in reflist:
1544 startProfile(lustreNode, profile, module_flag)
1546 ############################################################
1547 # Command line processing
1549 def parse_cmdline(argv):
1550 short_opts = "hdnvf"
1551 long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
1552 "portals=", "makeldiff", "cleanup", "noexec",
1553 "help", "node=", "nomod", "nosetup",
1554 "dump=", "force", "startlevel=", "endlevel="]
1558 opts, args = getopt.getopt(argv, short_opts, long_opts)
1559 except getopt.error:
1564 if o in ("-h", "--help"):
1566 if o in ("-d","--cleanup"):
1568 if o in ("-v", "--verbose"):
1570 if o in ("-n", "--noexec"):
1573 if o == "--portals":
1577 if o == "--reformat":
1585 if o == "--nosetup":
1589 if o in ("-f", "--force"):
1591 if o in ("--startlevel",):
1592 config.startlevel(a)
1593 if o in ("--endlevel",):
1602 s = urllib.urlopen(url)
def setupModulePath(cmd):
    """Detect whether *cmd* is being run out of a lustre source tree.

    If a readable Makefile sits in the same directory as cmd, assume we
    are inside the build tree and point config.src_dir at the tree root
    two levels up, so modules/binaries are found in-tree.
    """
    base = os.path.dirname(cmd)
    # os.path.join instead of base+"/Makefile": when cmd has no
    # directory component, base is "" and the old concatenation probed
    # "/Makefile" at the filesystem root instead of "Makefile" in cwd.
    if os.access(os.path.join(base, "Makefile"), os.R_OK):
        config.src_dir(base + "/../../")
1613 def sys_set_debug_path():
1614 debug("debug path: ", config.debug_path())
1618 fp = open('/proc/sys/portals/debug_path', 'w')
1619 fp.write(config.debug_path())
1624 #/proc/sys/net/core/rmem_max
1625 #/proc/sys/net/core/wmem_max
1626 def sys_set_netmem_max(path, max):
1627 debug("setting", path, "to at least", max)
1635 fp = open(path, 'w')
1636 fp.write('%d\n' %(max))
def sys_make_devices():
    """Ensure the portals and obd character device nodes exist.

    Creates each missing /dev node with mknod; both use major 10.
    """
    for devpath, minor in (('/dev/portals', 240), ('/dev/obd', 241)):
        if not os.access(devpath, os.R_OK):
            run('mknod %s c 10 %d' % (devpath, minor))
1647 # Add dir to the global PATH, if not already there.
1648 def add_to_path(new_dir):
1649 syspath = string.split(os.environ['PATH'], ':')
1650 if new_dir in syspath:
1652 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
1655 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
1656 # ensure basic elements are in the system path
1657 def sanitise_path():
1658 for dir in DEFAULT_PATH:
1661 # Initialize or shutdown lustre according to a configuration file
1662 # * prepare the system for lustre
1663 # * configure devices with lctl
1664 # Shutdown does steps in reverse
1667 global TCP_ACCEPTOR, lctl, MAXTCPBUF
1668 host = socket.gethostname()
1672 args = parse_cmdline(sys.argv[1:])
1674 if not os.access(args[0], os.R_OK):
1675 print 'File not found or readable:', args[0]
1677 dom = xml.dom.minidom.parse(args[0])
1679 xmldata = fetch(config.url())
1680 dom = xml.dom.minidom.parseString(xmldata)
1686 node_list.append(config.node())
1689 node_list.append(host)
1690 node_list.append('localhost')
1691 debug("configuring for host: ", node_list)
1694 config._debug_path = config._debug_path + '-' + host
1695 config._gdb_script = config._gdb_script + '-' + host
1697 TCP_ACCEPTOR = find_prog('acceptor')
1698 if not TCP_ACCEPTOR:
1700 TCP_ACCEPTOR = 'acceptor'
1701 debug('! acceptor not found')
1703 panic('acceptor not found')
1705 lctl = LCTLInterface('lctl')
1707 setupModulePath(sys.argv[0])
1709 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
1710 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
1711 doHost(dom.documentElement, node_list)
1713 if __name__ == "__main__":
1716 except LconfError, e:
1718 except CommandError, e:
1722 if first_cleanup_error:
1723 sys.exit(first_cleanup_error)