3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # This file is part of Lustre, http://www.lustre.org.
7 # Lustre is free software; you can redistribute it and/or
8 # modify it under the terms of version 2 of the GNU General Public
9 # License as published by the Free Software Foundation.
11 # Lustre is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with Lustre; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # lconf - lustre configuration tool
22 # lconf is the main driver script for starting and stopping
23 # lustre filesystem services.
25 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import string, os, stat, popen2, socket, time, random
30 import xml.dom.minidom
35 DEFAULT_TCPBUF = 1048576
37 # Maximum number of devices to search for.
38 # (the /dev/loop* nodes need to be created beforehand)
39 MAX_LOOP_DEVICES = 256
# Overall cleanup status for the run: the FIRST non-zero return code wins,
# later failures are deliberately ignored so the earliest cause is reported.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record *rc* as the run's cleanup status unless one is already set."""
    global first_cleanup_error
    if first_cleanup_error:
        return
    first_cleanup_error = rc
# NOTE(review): this span is the interior of usage()'s triple-quoted help
# string; the enclosing "def usage():" line and the closing quotes fall in
# gaps of this numbered listing, so no comments can be placed inside the
# string itself without corrupting the runtime text. Kept verbatim.
49 print """usage: lconf config.xml
51 config.xml Lustre configuration in xml format.
52 --get <url> URL to fetch a config file
53 --node <nodename> Load config for <nodename>
54 -d | --cleanup Cleans up config. (Shutdown)
55 -f | --force Forced unmounting and/or obd detach during cleanup
56 -v | --verbose Print system commands as they are run
57 -h | --help Print this help
58 --gdb Prints message after creating gdb module script
59 and sleeps for 5 seconds.
60 -n | --noexec Prints the commands and steps that will be run for a
61 config without executing them. This can used to check if a
62 config file is doing what it should be doing. (Implies -v)
63 --nomod Skip load/unload module step.
64 --nosetup Skip device setup/cleanup step.
65 --reformat Reformat all devices (without question)
66 --dump <file> Dump the kernel debug log before portals is unloaded
67 --minlevel <num> Specify the minimum level of services to configure/cleanup (default 0)
68 --maxlevel <num> Specify the maximum level of services to configure/cleanup (default 100)
69 Levels are aproximatly like:
76 70 - mountpoint, echo_client
79 --ldap server LDAP server with lustre config database
80 --makeldiff Translate xml source to LDIFF
81 This are perhaps not needed:
82 --lustre="src dir" Base directory of lustre sources. Used to search
84 --portals=src Portals source
88 # ============================================================
89 # Config parameters, encapsulated in a class
# NOTE(review): the class statement and most of __init__ fall in a listing
# gap; only a few default-attribute assignments and the first get/set
# accessors are visible here. Kept byte-identical.
105 self._gdb_script = '/tmp/ogdb'
106 self._debug_path = '/tmp/lustre-log'
107 self._dump_file = None
# Accessor pattern used throughout: a truthy argument stores a new value,
# then (on a missing line here) the current value is returned.
112 def verbose(self, flag = None):
113 if flag: self._verbose = flag
116 def noexec(self, flag = None):
117 if flag: self._noexec = flag
def reformat(self, flag = None):
    """Get/set the reformat option: a truthy *flag* is stored first,
    then the current value is returned."""
    self._reformat = flag if flag else self._reformat
    return self._reformat
# NOTE(review): each accessor below is missing its trailing "return self._X"
# (and gdb_script/debug_path their "else:") in this gapped listing; the
# visible set-if-truthy halves are kept byte-identical.
124 def cleanup(self, flag = None):
125 if flag: self._cleanup = flag
128 def gdb(self, flag = None):
129 if flag: self._gdb = flag
132 def nomod(self, flag = None):
133 if flag: self._nomod = flag
136 def nosetup(self, flag = None):
137 if flag: self._nosetup = flag
140 def force(self, flag = None):
141 if flag: self._force = flag
144 def node(self, val = None):
145 if val: self._node = val
148 def url(self, val = None):
149 if val: self._url = val
# gdb_script/debug_path are prefixed with '/r' when that directory exists —
# presumably a chroot/ramdisk convention on test nodes; TODO confirm.
152 def gdb_script(self):
153 if os.path.isdir('/r'):
154 return '/r' + self._gdb_script
156 return self._gdb_script
158 def debug_path(self):
159 if os.path.isdir('/r'):
160 return '/r' + self._debug_path
162 return self._debug_path
164 def src_dir(self, val = None):
165 if val: self._src_dir = val
def dump_file(self, val = None):
    """Get/set the kernel-debug dump file path; a truthy *val* updates it."""
    if not val:
        return self._dump_file
    self._dump_file = val
    return self._dump_file
def minlevel(self, val = None):
    """Get/set the minimum service level (coerced to int on set)."""
    if not val:
        return self._minlevel
    self._minlevel = int(val)
    return self._minlevel
def maxlevel(self, val = None):
    """Get/set the maximum service level (coerced to int on set)."""
    if not val:
        return self._maxlevel
    self._maxlevel = int(val)
    return self._maxlevel
184 # ============================================================
185 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort with a standard "not implemented" error for *msg*.

    Raises:
        LconfError: always, with *msg* folded into the message.
    """
    # Fixes the typo "implmemented" in the user-visible message and uses the
    # call form of raise (equivalent in Python 2, portable going forward).
    raise LconfError(msg + ' not implemented yet.')
# NOTE(review): fragments of panic()/debug()/log() — each def line sits in a
# listing gap. Shared pattern: join all args into one message string.
# panic(): only raises when commands are actually being executed (noexec off).
191 msg = string.join(map(str,args))
192 if not config.noexec():
193 raise LconfError(msg)
198 msg = string.join(map(str,args))
203 print string.strip(s)
207 msg = string.join(map(str,args))
210 # ============================================================
211 # locally defined exceptions
# CommandError: carries the failing command name, its error output (either a
# single string or a list of output lines), and an optional return code.
212 class CommandError (exceptions.Exception):
213 def __init__(self, cmd_name, cmd_err, rc=None):
214 self.cmd_name = cmd_name
215 self.cmd_err = cmd_err
# NOTE(review): the "self.rc = rc" assignment and the dump() def line fall in
# listing gaps here; the printing below branches on string vs list output and
# on whether a return code is known.
220 if type(self.cmd_err) == types.StringType:
222 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
224 print "! %s: %s" % (self.cmd_name, self.cmd_err)
225 elif type(self.cmd_err) == types.ListType:
227 print "! %s (error %d):" % (self.cmd_name, self.rc)
229 print "! %s:" % (self.cmd_name)
230 for s in self.cmd_err:
231 print "> %s" %(string.strip(s))
# LconfError: generic configuration error; __init__ body is in a gap
# (presumably stores args — TODO confirm).
235 class LconfError (exceptions.Exception):
236 def __init__(self, args):
240 # ============================================================
241 # handle lctl interface
# NOTE(review): the class statement and many method bodies of the lctl
# wrapper fall in listing gaps; the visible lines are kept byte-identical.
# The class drives the lctl binary either via stdin script mode (run) or the
# command line (runcmd).
244 Manage communication with lctl
247 def __init__(self, cmd):
249 Initialize close by finding the lctl binary.
251 self.lctl = find_prog(cmd)
252 if not self.lctl:
254 debug('! lctl not found')
257 raise CommandError('lctl', "unable to find lctl binary.")
# run(): feeds a multi-line command script to lctl's stdin via popen2;
# lctl does not report script-mode errors, hence the stderr capture below.
262 the cmds are written to stdin of lctl
263 lctl doesn't return errors when run in script mode, so
265 should modify command line to accept multiple commands, or
266 create complex command line options
268 debug("+", self.lctl, cmds)
269 if config.noexec(): return (0, [])
270 p = popen2.Popen3(self.lctl, 1)
271 p.tochild.write(cmds + "\n")
273 out = p.fromchild.readlines()
274 err = p.childerr.readlines()
276 if os.WIFEXITED(ret):
277 rc = os.WEXITSTATUS(ret)
281 raise CommandError(self.lctl, err, rc)
284 def runcmd(self, *args):
286 run lctl using the command line
288 cmd = string.join(map(str,args))
289 debug("+", self.lctl, cmd)
290 rc, out = run(self.lctl, cmd)
292 raise CommandError(self.lctl, out, rc)
# The methods below each build an lctl script (mostly missing from this
# listing — only the closing "quit" interpolation lines remain) and run it.
296 def network(self, net, nid):
297 """ initialized network and add "self" """
298 # Idea: "mynid" could be used for all network types to add "self," and then
299 # this special case would be gone and the "self" hack would be hidden.
300 if net in ('tcp', 'toe'):
305 quit""" % (net, nid, nid)
314 # create a new connection
315 def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
316 if net in ('tcp', 'toe'):
323 quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
329 quit""" % (net, servuuid, nid, nid, port, )
333 # add a route to a range
334 def add_route(self, net, gw, lo, hi):
338 quit """ % (net, gw, lo, hi)
342 def del_route(self, net, gw, lo, hi):
350 # add a route to a host
351 def add_route_host(self, net, uuid, gw, tgt):
356 quit """ % (net, uuid, tgt, gw, tgt)
359 # add a route to a range
360 def del_route_host(self, net, uuid, gw, tgt):
366 quit """ % (net, uuid, tgt)
369 # disconnect one connection
370 def disconnect(self, net, nid, port, servuuid):
376 quit""" % (net, nid, servuuid)
380 def disconnectAll(self, net):
389 # create a new device with lctl
390 def newdev(self, attach, setup = ""):
395 quit""" % (attach, setup)
399 def cleanup(self, name, uuid):
# ('', 'force')[bool] — py2 idiom selecting the force flag when set.
405 quit""" % (name, ('', 'force')[config.force()])
409 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
413 lov_setconfig %s %d %d %d %s %s
414 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
418 def dump(self, dump_file):
421 quit""" % (dump_file)
424 # get list of devices
425 def device_list(self):
426 rc, out = self.runcmd('device_list')
430 def lustre_version(self):
431 rc, out = self.runcmd('version')
434 # ============================================================
435 # Various system-level functions
436 # (ideally moved to their own module)
438 # Run a command and return the output and status.
439 # stderr is sent to /dev/null, could use popen3 to
440 # save it if necessary
# NOTE(review): run()'s def line and the read/close/return tail are in
# listing gaps; stderr is actually merged into stdout via '2>&1' despite the
# comment above.
442 cmd = string.join(map(str,args))
444 if config.noexec(): return (0, [])
445 f = os.popen(cmd + ' 2>&1')
454 # Run a command in the background.
455 def run_daemon(*args):
456 cmd = string.join(map(str,args))
458 if config.noexec(): return 0
459 f = os.popen(cmd + ' 2>&1')
467 # Determine full path to use for an external command
468 # searches dirname(argv[0]) first, then PATH
# find_prog fragment: prepends the script's own directory and the portals
# utils tree to PATH, then probes each candidate for execute permission.
470 syspath = string.split(os.environ['PATH'], ':')
471 cmdpath = os.path.dirname(sys.argv[0])
472 syspath.insert(0, cmdpath);
473 syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
475 prog = os.path.join(d,cmd)
476 if os.access(prog, os.X_OK):
480 # Recursively look for file starting at base dir
# NOTE(review): do_find_file's return statements are in listing gaps; the
# visible shape is a readable-file check, then recursion into subdirs.
481 def do_find_file(base, mod):
482 fullname = os.path.join(base, mod)
483 if os.access(fullname, os.R_OK):
485 for d in os.listdir(base):
486 dir = os.path.join(base,d)
487 if os.path.isdir(dir):
488 module = do_find_file(dir, mod)
# find_module: try the expected <src>/<dev_dir>/<mod>.o location first;
# fall-through handling is in a gap.
492 def find_module(src_dir, dev_dir, modname):
493 mod = '%s.o' % (modname)
494 module = src_dir +'/'+ dev_dir +'/'+ mod
496 if os.access(module, os.R_OK):
502 # is the path a block device?
# is_block fragment — its def/stat lines are missing; checks S_ISBLK on the
# os.stat result.
509 return stat.S_ISBLK(s[stat.ST_MODE])
511 # build fs according to type
# mkfs: pick the mkfs command per fstype; ext3/extN get a journal and 4k
# blocks. Non-block (loop file) targets gain a force flag on missing lines.
513 def mkfs(fstype, dev):
514 if(fstype in ('ext3', 'extN')):
515 mkfs = 'mkfs.ext2 -j -b 4096'
516 elif (fstype == 'reiserfs'):
517 mkfs = 'mkfs.reiserfs -f'
519 print 'unsupported fs type: ', fstype
520 if not is_block(dev):
521 if(fstype in ('ext3', 'extN')):
523 elif (fstype == 'reiserfs'):
526 print 'unsupported fs type: ', fstype
529 (ret, out) = run (mkfs, force, dev)
531 panic("Unable to build fs:", dev)
532 # enable hash tree indexing on fsswe
533 # FIXME: this check can probably go away on 2.5
535 htree = 'echo "feature FEATURE_C5" | debugfs -w'
536 (ret, out) = run (htree, dev)
538 panic("Unable to enable htree:", dev)
540 # some systems use /dev/loopN, some /dev/loop/N
# loop_base fragment: probe both naming schemes before giving up.
544 if not os.access(loop + str(0), os.R_OK):
546 if not os.access(loop + str(0), os.R_OK):
547 panic ("can't access loop devices")
550 # find loop device assigned to thefile
# find_loop fragment: parse `losetup` output, matching the "(backing file)"
# note against the requested file.
553 for n in xrange(0, MAX_LOOP_DEVICES):
555 if os.access(dev, os.R_OK):
556 (stat, out) = run('losetup', dev)
557 if (out and stat == 0):
558 m = re.search(r'\((.*)\)', out[0])
559 if m and file == m.group(1):
565 # create file if necessary and assign the first free loop device
# NOTE(review): several assignment/return lines of init_loop are in listing
# gaps; visible flow: reuse an existing mapping, (re)create the sparse
# backing file, then claim the first free loop device.
566 def init_loop(file, size, fstype):
567 dev = find_loop(file)
569 print 'WARNING file:', file, 'already mapped to', dev
571 if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
# dd with count=0/seek=size creates a sparse file of `size` KB.
572 run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
574 # find next free loop
575 for n in xrange(0, MAX_LOOP_DEVICES):
577 if os.access(dev, os.R_OK):
578 (stat, out) = run('losetup', dev)
580 run('losetup', dev, file)
583 print "out of loop devices"
585 print "out of loop devices"
588 # undo loop assignment
# clean_loop: best-effort `losetup -d`; failure is only logged, not fatal.
589 def clean_loop(file):
590 dev = find_loop(file)
592 ret, out = run('losetup -d', dev)
594 log('unable to clean loop device:', dev, 'for file:', file)
597 # determine if dev is formatted as a <fstype> filesystem
598 def need_format(fstype, dev):
599 # FIXME don't know how to implement this
602 # initialize a block device if needed
# block_dev: map a regular file onto a loop device, then format when
# --reformat was given or autoformat allows it.
603 def block_dev(dev, size, fstype, format):
604 if config.noexec(): return dev
605 if not is_block(dev):
606 dev = init_loop(dev, size, fstype)
607 if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
611 # panic("device:", dev,
612 # "not prepared, and autoformat is not set.\n",
613 # "Rerun with --reformat option to format ALL filesystems")
# if2addr fragment (def line in a gap): scrape the IP out of ifconfig's
# second output line ("inet addr:x.x.x.x ...").
618 """lookup IP address for an interface"""
619 rc, out = run("/sbin/ifconfig", iface)
622 addr = string.split(out[1])[1]
623 ip = string.split(addr, ':')[1]
# NOTE(review): several branch bodies below are in listing gaps.
626 def get_local_address(net_type, wildcard):
627 """Return the local address for the network type."""
# tcp/toe: a wildcard like "eth0:*" resolves via ifconfig; otherwise the
# hostname's address is used.
629 if net_type in ('tcp', 'toe'):
631 iface, star = string.split(wildcard, ':')
632 local = if2addr(iface)
634 panic ("unable to determine ip for:", wildcard)
636 host = socket.gethostname()
637 local = socket.gethostbyname(host)
638 elif net_type == 'elan':
639 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
641 fp = open('/proc/elan/device0/position', 'r')
642 lines = fp.readlines()
651 elif net_type == 'gm':
652 fixme("automatic local address for GM")
656 def is_prepared(uuid):
657 """Return true if a device exists for the uuid"""
658 # expect this format:
659 # 1 UP ldlm ldlm ldlm_UUID 2
# Scans `lctl device_list` output; column 5 is the device uuid. A
# CommandError is tolerated (handler body is in a gap).
661 out = lctl.device_list()
663 if uuid == string.split(s)[4]:
665 except CommandError, e:
670 # ============================================================
671 # Classes to prepare and cleanup the various objects
# NOTE(review): the "class Module..." statement is in a listing gap; this is
# the base class all service wrappers derive from.
674 """ Base class for the rest of the modules. The default cleanup method is
675 defined here, as well as some utilitiy funcs.
# __init__: bind the XML service node and pull its name/uuid attributes;
# kmodule_list collects (dev_dir, modname) pairs for load_module().
677 def __init__(self, module_name, dom_node):
678 self.dom_node = dom_node
679 self.module_name = module_name
680 self.name = get_attr(dom_node, 'name')
681 self.uuid = get_attr(dom_node, 'uuid')
682 self.kmodule_list = []
# info(): one-line status print prefixed with module type, name and uuid.
686 def info(self, *args):
687 msg = string.join(map(str,args))
688 print self.module_name + ":", self.name, self.uuid, msg
691 def lookup_server(self, srv_uuid):
692 """ Lookup a server's network information """
693 net = get_ost_net(self.dom_node.parentNode, srv_uuid)
695 panic ("Unable to find a server for:", srv_uuid)
696 self._server = Network(net)
698 def get_server(self):
# Default cleanup: disconnect from the server if it is reachable on a local
# net, then tear the device down; failures are logged, not fatal.
702 """ default cleanup, used for most modules """
704 srv = self.get_server()
705 if srv and local_net(srv):
707 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
708 except CommandError, e:
709 log(self.module_name, "disconnect failed: ", self.name)
713 lctl.cleanup(self.name, self.uuid)
714 except CommandError, e:
715 log(self.module_name, "cleanup failed: ", self.name)
def add_module(self, dev_dir, modname):
    """Queue (dev_dir, modname) for later loading by load_module()."""
    entry = (dev_dir, modname)
    self.kmodule_list.append(entry)
723 def mod_loaded(self, modname):
724 """Check if a module is already loaded. Look in /proc/modules for it."""
725 fp = open('/proc/modules')
726 lines = fp.readlines()
728 # please forgive my tired fingers for this one
# Filter: first whitespace-delimited token of each /proc/modules line is the
# module name; match against modname. (The return line is in a listing gap.)
729 ret = filter(lambda word, mod=modname: word == mod,
730 map(lambda line: string.split(line)[0], lines))
733 def load_module(self):
734 """Load all the modules in the list in the order they appear."""
735 for dev_dir, mod in self.kmodule_list:
736 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
737 if self.mod_loaded(mod) and not config.noexec():
739 log ('loading module:', mod)
# Prefer an insmod of the module built in the source tree; fall back to
# modprobe (the branch line between is in a listing gap).
741 module = find_module(config.src_dir(),dev_dir, mod)
743 panic('module not found:', mod)
744 (rc, out) = run('/sbin/insmod', module)
746 raise CommandError('insmod', out, rc)
748 (rc, out) = run('/sbin/modprobe', mod)
750 raise CommandError('modprobe', out, rc)
752 def cleanup_module(self):
753 """Unload the modules in the list in reverse order."""
# NOTE(review): the reverse() call itself is in a listing gap — rev is only
# aliased here; presumably reversed on the missing line.
754 rev = self.kmodule_list
756 for dev_dir, mod in rev:
757 if not self.mod_loaded(mod):
# Dump the kernel debug log before portals goes away, else it is lost.
760 if mod == 'portals' and config.dump_file():
761 lctl.dump(config.dump_file())
762 log('unloading module:', mod)
765 (rc, out) = run('/sbin/rmmod', mod)
767 log('! unable to unload module:', mod)
771 class Network(Module):
# Wraps a <network> node: resolves the local nid, queues the portals stack
# and the per-net-type NAL module, and manages acceptor/route setup.
772 def __init__(self,dom_node):
773 Module.__init__(self, 'NETWORK', dom_node)
774 self.net_type = get_attr(dom_node,'type')
775 self.nid = get_text(dom_node, 'server', '*')
776 self.port = get_text_int(dom_node, 'port', 0)
777 self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
778 self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
# A '*' nid (line in a gap) is replaced by the autodetected local address.
780 self.nid = get_local_address(self.net_type, self.nid)
782 panic("unable to set nid for", self.net_type, self.nid)
783 debug("nid:", self.nid)
785 self.add_module('portals/linux/oslib/', 'portals')
786 if node_needs_router():
787 self.add_module('portals/linux/router', 'kptlrouter')
# One NAL per network type; order matters for module load order.
788 if self.net_type == 'tcp':
789 self.add_module('portals/linux/socknal', 'ksocknal')
790 if self.net_type == 'toe':
791 self.add_module('portals/linux/toenal', 'ktoenal')
792 if self.net_type == 'elan':
793 self.add_module('portals/linux/rqswnal', 'kqswnal')
794 if self.net_type == 'gm':
795 self.add_module('portals/linux/gmnal', 'kgmnal')
796 self.add_module('lustre/obdclass', 'obdclass')
797 self.add_module('lustre/ptlrpc', 'ptlrpc')
# prepare (def line in a gap): start the TCP acceptor, install routes, then
# bring the network up and attach the RPCDEV ptlrpc device.
800 self.info(self.net_type, self.nid, self.port)
801 if self.net_type in ('tcp', 'toe'):
802 nal_id = '' # default is socknal
803 if self.net_type == 'toe':
805 ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, nal_id, self.port)
807 raise CommandError(TCP_ACCEPTOR, out, ret)
808 ret = self.dom_node.getElementsByTagName('route_tbl')
810 for r in a.getElementsByTagName('route'):
811 net_type = get_attr(r, 'type')
812 gw = get_attr(r, 'gw')
813 lo = get_attr(r, 'lo')
814 hi = get_attr(r,'hi', '')
815 lctl.add_route(net_type, gw, lo, hi)
# Single-host tcp/toe routes on our own net also need a connection to the
# gateway's server.
816 if net_type in ('tcp', 'toe') and net_type == self.net_type and hi == '':
817 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
819 panic("no server for nid", lo)
821 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
824 lctl.network(self.net_type, self.nid)
825 lctl.newdev(attach = "ptlrpc RPCDEV RPCDEV_UUID")
# cleanup (def line in a gap): undo prepare — disconnect route gateways,
# delete routes, detach RPCDEV, drop all connections, kill the acceptor.
828 self.info(self.net_type, self.nid, self.port)
829 ret = self.dom_node.getElementsByTagName('route_tbl')
831 for r in a.getElementsByTagName('route'):
832 lo = get_attr(r, 'lo')
833 hi = get_attr(r,'hi', '')
834 if self.net_type in ('tcp', 'toe') and hi == '':
835 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
837 panic("no server for nid", lo)
840 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
841 except CommandError, e:
842 print "disconnect failed: ", self.name
846 lctl.del_route(self.net_type, self.nid, lo, hi)
847 except CommandError, e:
848 print "del_route failed: ", self.name
853 lctl.cleanup("RPCDEV", "RPCDEV_UUID")
854 except CommandError, e:
855 print "cleanup failed: ", self.name
859 lctl.disconnectAll(self.net_type)
860 except CommandError, e:
861 print "disconnectAll failed: ", self.name
864 if self.net_type in ('tcp', 'toe'):
865 # yikes, this ugly! need to save pid in /var/something
866 run("killall acceptor")
def __init__(self, dom_node):
    """Wrap an <ldlm> service node and queue the ldlm kernel module."""
    Module.__init__(self, 'LDLM', dom_node)
    self.add_module('lustre/ldlm', 'ldlm')
# LDLM.prepare fragment: idempotent — skip if the device already exists.
873 if is_prepared(self.uuid):
876 lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
# LOV.__init__ (class statement in a listing gap): resolves its MDS and
# striping parameters from the <devices> child, and records the OSC refs.
880 def __init__(self,dom_node):
881 Module.__init__(self, 'LOV', dom_node)
882 self.mds_uuid = get_first_ref(dom_node, 'mds')
883 mds= lookup(dom_node.parentNode, self.mds_uuid)
884 self.mds_name = getName(mds)
885 devs = dom_node.getElementsByTagName('devices')
888 self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
889 self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
890 self.pattern = get_attr_int(dev_node, 'pattern', 0)
891 self.devlist = get_all_refs(dev_node, 'osc')
892 self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
893 self.add_module('lustre/mdc', 'mdc')
894 self.add_module('lustre/lov', 'lov')
# LOV.prepare: bring up each member OSC (connection failures tolerated),
# then the MDC, then attach the lov device itself.
897 if is_prepared(self.uuid):
899 for osc_uuid in self.devlist:
900 osc = lookup(self.dom_node.parentNode, osc_uuid)
904 # Ignore connection failures, because the LOV will DTRT with
905 # an unconnected OSC.
906 n.prepare(ignore_connect_failure=1)
908 print "Error preparing OSC %s (inactive)\n" % osc_uuid
910 panic('osc not found:', osc_uuid)
911 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
912 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
913 self.stripe_off, self.pattern, self.devlist, self.mds_name)
914 lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
915 setup ="%s" % (mdc_uuid))
# LOV.cleanup / load_module / cleanup_module: fan the call out to every
# member OSC (several lines in listing gaps).
918 if not is_prepared(self.uuid):
920 for osc_uuid in self.devlist:
921 osc = lookup(self.dom_node.parentNode, osc_uuid)
926 panic('osc not found:', osc_uuid)
928 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
931 def load_module(self):
932 for osc_uuid in self.devlist:
933 osc = lookup(self.dom_node.parentNode, osc_uuid)
939 panic('osc not found:', osc_uuid)
940 Module.load_module(self)
943 def cleanup_module(self):
944 Module.cleanup_module(self)
945 for osc_uuid in self.devlist:
946 osc = lookup(self.dom_node.parentNode, osc_uuid)
952 panic('osc not found:', osc_uuid)
# LOVConfig: pushes the LOV striping description to the MDS via lctl.
954 class LOVConfig(Module):
955 def __init__(self,dom_node):
956 Module.__init__(self, 'LOVConfig', dom_node)
957 self.lov_uuid = get_first_ref(dom_node, 'lov')
958 l = lookup(dom_node.parentNode, self.lov_uuid)
963 self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
964 lov.pattern, lov.devlist, lov.mds_name)
965 lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
966 lov.stripe_sz, lov.stripe_off, lov.pattern,
967 string.join(lov.devlist))
def __init__(self, dom_node):
    """Describe an MDS service: backing device, fs type, and modules."""
    Module.__init__(self, 'MDS', dom_node)
    self.devname, self.size = get_device(dom_node)
    self.fstype = get_text(dom_node, 'fstype')
    # FIXME: if fstype not set, then determine based on kernel version
    self.format = get_text(dom_node, 'autoformat', "no")
    # extN ships its own module and must be queued before the mds driver
    if self.fstype == 'extN':
        self.add_module('lustre/extN', 'extN')
    self.add_module('lustre/mds', 'mds')
    self.add_module('lustre/mds', 'mds_' + self.fstype)
# MDS.prepare fragment: idempotent; sets up the backing block device, makes
# sure the shared MDT device exists, then attaches this mds device.
987 if is_prepared(self.uuid):
989 self.info(self.devname, self.fstype, self.format)
990 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
991 if not is_prepared('MDT_UUID'):
992 lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
994 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
995 setup ="%s %s" %(blkdev, self.fstype))
# MDS.cleanup fragment: tear down the shared MDT first, then this device and
# its loop mapping.
997 if is_prepared('MDT_UUID'):
999 lctl.cleanup("MDT", "MDT_UUID")
1000 except CommandError, e:
1001 print "cleanup failed: ", self.name
1004 if not is_prepared(self.uuid):
1006 Module.cleanup(self)
1007 clean_loop(self.devname)
1009 # Very unusual case, as there is no MDC element in the XML anymore
1010 # Builds itself from an MDS node
# MDC (class statement in a gap): synthesized from the MDS node; does NOT
# call Module.__init__, so it fills the Module fields by hand, including a
# randomized uuid to keep per-mount MDCs distinct.
1012 def __init__(self,dom_node):
1013 self.mds = MDS(dom_node)
1014 self.dom_node = dom_node
1015 self.module_name = 'MDC'
1016 self.kmodule_list = []
1020 host = socket.gethostname()
1021 self.name = 'MDC_%s' % (self.mds.name)
1022 self.uuid = '%s_%05x_%05x' % (self.name, int(random.random() * 1048576),
1023 int(random.random() * 1048576))
1025 self.lookup_server(self.mds.uuid)
1026 self.add_module('lustre/mdc', 'mdc')
# MDC.prepare fragment: connect to the MDS's server, then attach the mdc.
1029 if is_prepared(self.uuid):
1031 self.info(self.mds.uuid)
1032 srv = self.get_server()
1033 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1034 lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
1035 setup ="%s %s" %(self.mds.uuid, srv.uuid))
def __init__(self, dom_node):
    """Describe an OBD: driver type, backing device, fs type, and modules."""
    Module.__init__(self, 'OBD', dom_node)
    self.obdtype = get_attr(dom_node, 'type')
    self.devname, self.size = get_device(dom_node)
    self.fstype = get_text(dom_node, 'fstype')
    # FIXME: if fstype not set, then determine based on kernel version
    self.format = get_text(dom_node, 'autoformat', 'yes')
    # extN ships its own module and must be queued before the obd driver
    if self.fstype == 'extN':
        self.add_module('lustre/extN', 'extN')
    self.add_module('lustre/' + self.obdtype, self.obdtype)
1049 # need to check /proc/mounts and /etc/mtab before
1050 # formatting anything.
1051 # FIXME: check if device is already formatted.
# OBD.prepare fragment (def line in a listing gap): obdecho needs no backing
# device; everything else gets a block/loop device set up first.
1053 if is_prepared(self.uuid):
1055 self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
1056 if self.obdtype == 'obdecho':
1059 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
1060 lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
1061 setup ="%s %s" %(blkdev, self.fstype))
# OBD.cleanup fragment: detach, then release the loop device (none for
# obdecho).
1063 if not is_prepared(self.uuid):
1065 Module.cleanup(self)
1066 if not self.obdtype == 'obdecho':
1067 clean_loop(self.devname)
def __init__(self, dom_node):
    """Wrap an <ost> service node: remember its OBD ref, queue the ost module."""
    Module.__init__(self, 'OST', dom_node)
    # the OST fronts exactly one OBD; its uuid is needed at setup time
    self.obd_uuid = get_first_ref(dom_node, 'obd')
    self.add_module('lustre/ost', 'ost')
# OST.prepare fragment: idempotent attach of the ost device over its OBD.
1076 if is_prepared(self.uuid):
1078 self.info(self.obd_uuid)
1079 lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
1080 setup ="%s" % (self.obd_uuid))
1083 # virtual interface for OSC and LOV
# VOSC.__init__ fragment (class statement and else-line in listing gaps):
# wraps either a LOV or a plain OSC depending on the node's tag name.
1085 def __init__(self,dom_node):
1086 Module.__init__(self, 'VOSC', dom_node)
1087 if dom_node.nodeName == 'lov':
1088 self.osc = LOV(dom_node)
1090 self.osc = OSC(dom_node)
def load_module(self):
    """Forward kernel-module loading to the wrapped osc/lov object."""
    target = self.osc
    target.load_module()
def cleanup_module(self):
    """Forward kernel-module unloading to the wrapped osc/lov object."""
    target = self.osc
    target.cleanup_module()
def __init__(self, dom_node):
    """Describe an OSC: the OBD it exports and the OST that serves it."""
    Module.__init__(self, 'OSC', dom_node)
    self.obd_uuid = get_first_ref(dom_node, 'obd')
    self.ost_uuid = get_first_ref(dom_node, 'ost')
    # resolve the OST's network information now so prepare() can connect
    self.lookup_server(self.ost_uuid)
    self.add_module('lustre/osc', 'osc')
# OSC.prepare: connect to the serving OST (optionally via a routed gateway);
# connection failures may be ignored so a LOV can run with inactive OSCs.
1109 def prepare(self, ignore_connect_failure = 0):
1110 if is_prepared(self.uuid):
1112 self.info(self.obd_uuid, self.ost_uuid)
1113 srv = self.get_server()
1116 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1120 lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
1122 panic ("no route to", srv.nid)
1123 except CommandError:
1124 if (ignore_connect_failure == 0):
1127 lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
1128 setup ="%s %s" %(self.obd_uuid, srv.uuid))
# OSC.cleanup fragment: detach, remove any host routes installed above.
1131 if not is_prepared(self.uuid):
1133 srv = self.get_server()
1135 Module.cleanup(self)
1137 self.info(self.obd_uuid, self.ost_uuid)
1141 lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
1142 except CommandError, e:
1143 print "del_route failed: ", self.name
1146 Module.cleanup(self)
# ECHO_CLIENT: test client stacked on an osc/lov (the lines resolving the
# wrapped object into self.osc are in a listing gap).
1149 class ECHO_CLIENT(Module):
1150 def __init__(self,dom_node):
1151 Module.__init__(self, 'ECHO_CLIENT', dom_node)
1152 self.add_module('lustre/obdecho', 'obdecho')
1153 self.lov_uuid = get_first_ref(dom_node, 'osc')
1154 l = lookup(self.dom_node.parentNode, self.lov_uuid)
# prepare fragment: bring the underlying osc up first, then attach.
1158 if is_prepared(self.uuid):
1160 self.osc.prepare() # XXX This is so cheating. -p
1161 self.info(self.lov_uuid)
1163 lctl.newdev(attach="echo_client %s %s" % (self.name, self.uuid),
1164 setup = self.lov_uuid)
1167 if not is_prepared(self.uuid):
def load_module(self):
    """Load the stacked osc/lov modules first, then this module's own list."""
    self.osc.load_module()
    Module.load_module(self)
def cleanup_module(self):
    """Unload in reverse of load order: own modules first, then the osc/lov."""
    Module.cleanup_module(self)
    self.osc.cleanup_module()
1179 class Mountpoint(Module):
# Client mountpoint: mounts lustre_lite at self.path over an osc/lov + mdc
# pair. (The lines resolving self.osc are in a listing gap.)
1180 def __init__(self,dom_node):
1181 Module.__init__(self, 'MTPT', dom_node)
1182 self.path = get_text(dom_node, 'path')
1183 self.mds_uuid = get_first_ref(dom_node, 'mds')
1184 self.lov_uuid = get_first_ref(dom_node, 'osc')
1185 self.add_module('lustre/mdc', 'mdc')
1186 self.add_module('lustre/llite', 'llite')
1187 l = lookup(self.dom_node.parentNode, self.lov_uuid)
# prepare fragment: set up the MDC, mkdir the target, then mount.
1192 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
1193 self.info(self.path, self.mds_uuid, self.lov_uuid)
1194 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
1195 (self.lov_uuid, mdc_uuid, self.path)
1196 run("mkdir", self.path)
1199 panic("mount failed:", self.path)
# cleanup fragment: forced umount first (under --force, presumably — the
# branch line is missing), then a plain umount as fallback.
1202 self.info(self.path, self.mds_uuid,self.lov_uuid)
1204 (rc, out) = run("umount -f", self.path)
1206 (rc, out) = run("umount", self.path)
1208 log("umount failed, cleanup will most likely not work.")
1209 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1211 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
def load_module(self):
    """Load the stacked osc/lov modules first, then this module's own list."""
    self.osc.load_module()
    Module.load_module(self)
def cleanup_module(self):
    """Unload in reverse of load order: own modules first, then the osc/lov."""
    Module.cleanup_module(self)
    self.osc.cleanup_module()
1221 # ============================================================
1222 # XML processing and query
1223 # TODO: Change query funcs to use XPath, which is muc cleaner
# get_device: first <device> child's text is the device path; 'size' attr
# defaults to 0. (Selection of list[0] is in a listing gap.)
1225 def get_device(obd):
1226 list = obd.getElementsByTagName('device')
1230 size = get_attr_int(dev, 'size', 0)
1231 return dev.firstChild.data, size
1234 # Get the text content from the first matching child
1235 # If there is no content (or it is all whitespace), return
1237 def get_text(dom_node, tag, default=""):
1238 list = dom_node.getElementsByTagName(tag)
# NOTE(review): dom_node appears to be rebound to list[0] on a missing line;
# normalize() merges adjacent text nodes before reading firstChild.
1241 dom_node.normalize()
1242 if dom_node.firstChild:
1243 txt = string.strip(dom_node.firstChild.data)
# get_text_int: same as get_text but coerces to int, panicking on junk.
1248 def get_text_int(dom_node, tag, default=0):
1249 list = dom_node.getElementsByTagName(tag)
1253 dom_node.normalize()
1254 if dom_node.firstChild:
1255 txt = string.strip(dom_node.firstChild.data)
1260 panic("text value is not integer:", txt)
1263 def get_attr(dom_node, attr, default=""):
1264 v = dom_node.getAttribute(attr)
1269 def get_attr_int(dom_node, attr, default=0):
1271 v = dom_node.getAttribute(attr)
1276 panic("attr value is not integer", v)
1279 def get_first_ref(dom_node, tag):
1280 """ Get the first uuidref of the type TAG. Used one only
1281 one is expected. Returns the uuid."""
1283 refname = '%s_ref' % tag
1284 list = dom_node.getElementsByTagName(refname)
1286 uuid = getRef(list[0])
1289 def get_all_refs(dom_node, tag):
1290 """ Get all the refs of type TAG. Returns list of uuids. """
1292 refname = '%s_ref' % tag
1293 list = dom_node.getElementsByTagName(refname)
1296 uuids.append(getRef(i))
# get_ost_net: follow a service uuid to its <network> node.
1299 def get_ost_net(dom_node, uuid):
1300 ost = lookup(dom_node, uuid)
1301 uuid = get_first_ref(ost, 'network')
1304 return lookup(dom_node, uuid)
# nid2server: find the Network whose <server> text equals nid.
1306 def nid2server(dom_node, nid):
1307 netlist = dom_node.getElementsByTagName('network')
1308 for net_node in netlist:
1309 if get_text(net_node, 'server') == nid:
1310 return Network(net_node)
# lookup: linear scan of direct children for a matching uuid attribute.
1313 def lookup(dom_node, uuid):
1314 for n in dom_node.childNodes:
1315 if n.nodeType == n.ELEMENT_NODE:
1316 if getUUID(n) == uuid:
# Fetch a node's "name" attribute.
def getName(dom_node):
    """Return the element's name attribute ("" when absent)."""
    value = dom_node.getAttribute('name')
    return value
def getRef(dom_node):
    """Return the element's uuidref attribute ("" when absent)."""
    ref = dom_node.getAttribute('uuidref')
    return ref
# Fetch a node's "uuid" attribute (the original comment said "name" — a
# copy/paste slip).
def getUUID(dom_node):
    """Return the element's uuid attribute ("" when absent)."""
    uuid = dom_node.getAttribute('uuid')
    return uuid
# The element's tag name doubles as the service type.
# fixme: this should do some checks to make sure the dom_node is a service
def getServiceType(dom_node):
    """Return the service type, i.e. the node's tag name."""
    return dom_node.nodeName
1340 # determine what "level" a particular node is at.
1341 # the order of iniitailization is based on level.
# getServiceLevel: maps service type to a startup ordering bucket; the
# numeric assignments for each branch are in listing gaps. Levels outside
# [minlevel, maxlevel] are filtered out (returned as 0, presumably).
1342 def getServiceLevel(dom_node):
1343 type = getServiceType(dom_node)
1345 if type in ('network',):
1347 elif type in ('device', 'ldlm'):
1349 elif type in ('obd', 'mdd'):
1351 elif type in ('mds','ost'):
1353 elif type in ('mdc','osc'):
1355 elif type in ('lov', 'lovconfig'):
1357 elif type in ('mountpoint', 'echo_client'):
1360 if ret < config.minlevel() or ret > config.maxlevel():
1365 # return list of services in a profile. list is a list of tuples
1366 # [(level, dom_node),]
1367 def getServices(lustreNode, profileNode):
1369 for n in profileNode.childNodes:
1370 if n.nodeType == n.ELEMENT_NODE:
1371 servNode = lookup(lustreNode, getRef(n))
1374 panic('service not found: ' + getRef(n))
1375 level = getServiceLevel(servNode)
1377 list.append((level, servNode))
1381 def getByName(lustreNode, name, tag):
1382 ndList = lustreNode.getElementsByTagName(tag)
1384 if getName(nd) == name:
1389 ############################################################
1391 # FIXME: clean this mess up!
# prepare_mdc/cleanup_mdc: one shared MDC per MDS uuid, memoized in the
# saved_mdc dict (its definition is in a listing gap).
1394 def prepare_mdc(dom_node, mds_uuid):
1396 mds_node = lookup(dom_node, mds_uuid);
1398 panic("no mds:", mds_uuid)
1399 if saved_mdc.has_key(mds_uuid):
1400 return saved_mdc[mds_uuid]
1403 saved_mdc[mds_uuid] = mdc.uuid
1406 def cleanup_mdc(dom_node, mds_uuid):
1408 mds_node = lookup(dom_node, mds_uuid);
1410 panic("no mds:", mds_uuid)
1411 if not saved_mdc.has_key(mds_uuid):
1414 saved_mdc[mds_uuid] = mdc.uuid
1417 ############################################################
1418 # routing ("rooting")
def init_node(dom_node):
    """Record each of this node's networks in the global local_node list
    as (type, server-address) tuples."""
    global local_node, router_flag
    for dom_net in dom_node.getElementsByTagName('network'):
        net_type = get_attr(dom_net, 'type')
        server = get_text(dom_net, 'server')
        local_node.append((net_type, server))
# Predicate used when deciding whether portals routing must be
# configured for this node; its body is elided in this listing.
1432 def node_needs_router():
1435 def get_routes(type, gw, dom_net):
1436 """ Return the routes as a list of tuples of the form:
1437 [(type, gw, lo, hi),]"""
# Walk every <route> entry under each <route_tbl> of this network and
# pair it with the caller-supplied net type and gateway nid.
1439 tbl = dom_net.getElementsByTagName('route_tbl')
1441 routes = t.getElementsByTagName('route')
1443 lo = get_attr(r, 'lo')
# 'hi' is optional: empty string marks a single-nid route rather
# than a nid range.
1444 hi = get_attr(r, 'hi', '')
1445 res.append((type, gw, lo, hi))
1449 def init_route_config(lustre):
1450 """ Scan the lustre config looking for routers. Build list of
# (docstring continuation elided in this listing)
1452 global routes, router_flag
1454 list = lustre.getElementsByTagName('node')
# Only nodes flagged as routers contribute routes.
1456 if get_attr(node, 'router'):
# For each of our local interface types, find the router's network
# of the same type: its server nid is our gateway to that router...
1458 for (local_type, local_nid) in local_node:
1460 netlist = node.getElementsByTagName('network')
1461 for dom_net in netlist:
1462 if local_type == get_attr(dom_net, 'type'):
1463 gw = get_text(dom_net, 'server')
# ...then every route the router advertises on its *other* network
# types becomes reachable through that gateway.
1467 for dom_net in netlist:
1468 if local_type != get_attr(dom_net, 'type'):
1469 for route in get_routes(local_type, gw, dom_net):
1470 routes.append(route)
1475 for iface in local_node:
1476 if net.net_type == iface[0]:
1480 def find_route(net):
# Look for a route from this host's first local interface type to the
# given network; the table scan and return lines are elided in this
# listing.
1481 global local_node, routes
1482 frm_type = local_node[0][0]
1483 to_type = net.net_type
# NOTE(review): 'to' is used below but its assignment is not visible
# in this listing — presumably set from net on an elided line.
1485 debug ('looking for route to', to_type,to)
1494 ############################################################
1497 def startService(dom_node, module_flag):
# Instantiate the wrapper object for one service node, then either
# handle its kernel modules (module_flag true) or run its setup or
# cleanup (module_flag false).  Several elif branches and the bodies
# of the module/setup dispatch are elided in this listing.
1498 type = getServiceType(dom_node)
1499 debug('Service:', type, getName(dom_node), getUUID(dom_node))
1500 # there must be a more dynamic way of doing this...
1506 elif type == 'lovconfig':
1507 n = LOVConfig(dom_node)
1508 elif type == 'network':
1509 n = Network(dom_node)
1520 elif type == 'mountpoint':
1521 n = Mountpoint(dom_node)
1522 elif type == 'echo_client':
1523 n = ECHO_CLIENT(dom_node)
# An unrecognized service type means a malformed config: fatal.
1525 panic ("unknown service type:", type)
# Module phase: unload on cleanup, load otherwise (bodies elided).
1530 if config.cleanup():
# Setup phase: skipped entirely under --nosetup.
1535 if config.nosetup():
1537 if config.cleanup():
1543 # Prepare the system to run lustre using a particular profile
1544 # in the configuration.
1545 # * load the modules
1546 # * setup networking for the current node
1547 # * make sure partitions are in place and prepared
1548 # * initialize devices with lctl
1549 # Level ordering is important, and needs to be enforced.
1550 def startProfile(lustreNode, profileNode, module_flag):
# A missing profile reference is fatal.
1552 panic("profile:", profile, "not found.")
# Services are (level, node) tuples; setup runs in ascending level
# order and cleanup presumably reverses the list (the sort/reverse
# lines are elided in this listing).
1553 services = getServices(lustreNode, profileNode)
1554 if config.cleanup():
1557 startService(s[1], module_flag)
1562 def doHost(lustreNode, hosts):
# Configure (or clean up) lustre on this host: find the first entry
# in `hosts` with a <node> element, build routing info, then run every
# profile twice (module pass + setup pass).  Interior lines — the loop
# over hosts and some guards — are elided in this listing.
1566 dom_node = getByName(lustreNode, h, 'node')
1571 print 'No host entry found.'
# Non-router nodes need the route table derived from the config.
1574 if not get_attr(dom_node, 'router'):
1576 init_route_config(lustreNode)
1581 # Two step process: (1) load modules, (2) setup lustre
1582 # if not cleaning, load modules first.
1583 module_flag = not config.cleanup()
1584 reflist = dom_node.getElementsByTagName('profile')
1585 for profile in reflist:
1586 startProfile(lustreNode, profile, module_flag)
1588 if not config.cleanup():
1589 sys_set_debug_path()
1590 script = config.gdb_script()
# Generate the gdb helper script listing loaded module addresses.
1591 run(lctl.lctl, ' modules >', script)
1593 # dump /tmp/ogdb and sleep/pause here
1594 log ("The GDB module script is in", script)
# Second pass with the flag inverted: device setup after module load,
# or module unload after device cleanup.
1597 module_flag = not module_flag
1598 for profile in reflist:
1599 startProfile(lustreNode, profile, module_flag)
1601 ############################################################
1602 # Command line processing
1604 def parse_cmdline(argv):
# Parse lconf's command-line options with getopt and record them in
# the global config; returns the leftover positional args (the XML
# config file).  Most option-handler bodies are elided in this listing.
1605 short_opts = "hdnvf"
# NOTE(review): the usage text advertises --get <url>, but no "get="
# entry appears in long_opts below — verify whether --get is actually
# accepted (perhaps under another spelling) before trusting the help.
1606 long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
1607 "portals=", "makeldiff", "cleanup", "noexec",
1608 "help", "node=", "nomod", "nosetup",
1609 "dump=", "force", "minlevel=", "maxlevel="]
1613 opts, args = getopt.getopt(argv, short_opts, long_opts)
# Bad options fall through to the usage message (handler elided).
1614 except getopt.error:
1619 if o in ("-h", "--help"):
1621 if o in ("-d","--cleanup"):
1623 if o in ("-v", "--verbose"):
1625 if o in ("-n", "--noexec"):
1628 if o == "--portals":
1632 if o == "--reformat":
1640 if o == "--nosetup":
1644 if o in ("-f", "--force"):
1646 if o in ("--minlevel",):
1648 if o in ("--maxlevel",):
1657 s = urllib.urlopen(url)
1663 def setupModulePath(cmd):
# If lconf runs from a build tree (a readable Makefile sits next to
# the script), point config.src_dir two levels up so modules are taken
# from the build tree instead of the installed location.
1664 base = os.path.dirname(cmd)
1665 if os.access(base+"/Makefile", os.R_OK):
1666 config.src_dir(base + "/../../")
1668 def sys_set_debug_path():
# Tell the portals layer where to write its debug dump.  The lines
# between debug() and open() are elided in this listing — presumably a
# noexec guard and/or error handling around the /proc write.
1669 debug("debug path: ", config.debug_path())
1673 fp = open('/proc/sys/portals/debug_path', 'w')
1674 fp.write(config.debug_path())
1679 #/proc/sys/net/core/rmem_max
1680 #/proc/sys/net/core/wmem_max
1681 def sys_set_netmem_max(path, max):
# Raise a kernel socket-buffer limit (one of the /proc paths above)
# to at least `max` bytes.  The read-current-value / early-return
# lines are elided in this listing.
1682 debug("setting", path, "to at least", max)
1690 fp = open(path, 'w')
1691 fp.write('%d\n' %(max))
1695 def sys_make_devices():
# Create the portals and obd character device nodes if they are
# missing (misc major 10, minors 240 and 241).
1696 if not os.access('/dev/portals', os.R_OK):
1697 run('mknod /dev/portals c 10 240')
1698 if not os.access('/dev/obd', os.R_OK):
1699 run('mknod /dev/obd c 10 241')
1702 # Add dir to the global PATH, if not already there.
1703 def add_to_path(new_dir):
1704 syspath = string.split(os.environ['PATH'], ':')
# Already on PATH: nothing to do (the early return is elided in this
# listing).
1705 if new_dir in syspath:
1707 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
1710 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
1711 # ensure basic elements are in the system path
1712 def sanitise_path():
# The per-directory check/add lines are elided in this listing;
# presumably each DEFAULT_PATH entry is handed to add_to_path().
1713 for dir in DEFAULT_PATH:
1716 # Initialize or shutdown lustre according to a configuration file
1717 # * prepare the system for lustre
1718 # * configure devices with lctl
1719 # Shutdown does steps in reverse
# NOTE(review): the def line of the main driver function is elided in
# this listing; the statements below are its body.
1722 global TCP_ACCEPTOR, lctl, MAXTCPBUF
1723 host = socket.gethostname()
1725 # the PRNG is normally seeded with time(), which is not so good for starting
1726 # time-synchronized clusters
1727 input = open('/dev/urandom', 'r')
1729 print 'Unable to open /dev/urandom!'
1731 seed = input.read(32)
1737 args = parse_cmdline(sys.argv[1:])
# A positional argument names the local XML config file; otherwise the
# config is fetched from a URL and parsed from the string.
1739 if not os.access(args[0], os.R_OK):
1740 print 'File not found or readable:', args[0]
1742 dom = xml.dom.minidom.parse(args[0])
1744 xmldata = fetch(config.url())
1745 dom = xml.dom.minidom.parseString(xmldata)
# Candidate node names to match against the config, in priority order:
# explicit --node, then the hostname, then 'localhost'.
1751 node_list.append(config.node())
1754 node_list.append(host)
1755 node_list.append('localhost')
1756 debug("configuring for host: ", node_list)
# Per-host suffixes keep debug/gdb output files distinct per machine.
1759 config._debug_path = config._debug_path + '-' + host
1760 config._gdb_script = config._gdb_script + '-' + host
1762 TCP_ACCEPTOR = find_prog('acceptor')
1763 if not TCP_ACCEPTOR:
# A missing acceptor binary is tolerated in one branch (debug only)
# and fatal in the other; the branch conditions are elided in this
# listing — presumably the noexec flag selects between them.
1765 TCP_ACCEPTOR = 'acceptor'
1766 debug('! acceptor not found')
1768 panic('acceptor not found')
1770 lctl = LCTLInterface('lctl')
1772 setupModulePath(sys.argv[0])
# Make sure the kernel will allow Lustre's large TCP buffers.
1774 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
1775 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
1776 doHost(dom.documentElement, node_list)
1778 if __name__ == "__main__":
# Entry point: run the driver, mapping lconf/command errors to an
# exit status (handler bodies elided in this listing).  If any cleanup
# step failed along the way, exit with the first recorded error code.
1781 except LconfError, e:
1783 except CommandError, e:
1787 if first_cleanup_error:
1788 sys.exit(first_cleanup_error)