3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # This file is part of Lustre, http://www.lustre.org.
7 # Lustre is free software; you can redistribute it and/or
8 # modify it under the terms of version 2 of the GNU General Public
9 # License as published by the Free Software Foundation.
11 # Lustre is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with Lustre; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # lconf - lustre configuration tool
22 # lconf is the main driver script for starting and stopping
23 # lustre filesystem services.
25 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import string, os, stat, popen2, socket, time, random
30 import xml.dom.minidom
35 DEFAULT_TCPBUF = 1048576
37 # Maximum number of devices to search for.
38 # (the /dev/loop* nodes need to be created beforehand)
39 MAX_LOOP_DEVICES = 256
# Remember the first non-zero return code seen during cleanup so the
# overall exit status reflects the earliest failure, not the last one.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record rc as the cleanup exit status unless one was already kept."""
    global first_cleanup_error
    if first_cleanup_error:
        return
    first_cleanup_error = rc
49 print """usage: lconf config.xml
51 config.xml Lustre configuration in xml format.
52 --get <url> URL to fetch a config file
53 --node <nodename> Load config for <nodename>
54 -d | --cleanup Cleans up config. (Shutdown)
55 -f | --force Forced unmounting and/or obd detach during cleanup
56 -v | --verbose Print system commands as they are run
57 -h | --help Print this help
58 --gdb Prints message after creating gdb module script
59 and sleeps for 5 seconds.
60 -n | --noexec Prints the commands and steps that will be run for a
61 config without executing them. This can used to check if a
62 config file is doing what it should be doing. (Implies -v)
63 --nomod Skip load/unload module step.
64 --nosetup Skip device setup/cleanup step.
65 --reformat Reformat all devices (without question)
66 --dump <file> Dump the kernel debug log before portals is unloaded
67 --minlevel <num> Specify the minimum level of services to configure/cleanup (default 0)
68 --maxlevel <num> Specify the maximum level of services to configure/cleanup (default 100)
69 Levels are aproximatly like:
76 70 - mountpoint, echo_client
79 --ldap server LDAP server with lustre config database
80 --makeldiff Translate xml source to LDIFF
81 This are perhaps not needed:
82 --lustre="src dir" Base directory of lustre sources. Used to search
84 --portals=src Portals source
88 # ============================================================
89 # Config parameters, encapsulated in a class
105 self._gdb_script = '/tmp/ogdb'
106 self._debug_path = '/tmp/lustre-log'
107 self._dump_file = None
112 def verbose(self, flag = None):
113 if flag: self._verbose = flag
116 def noexec(self, flag = None):
117 if flag: self._noexec = flag
def reformat(self, flag = None):
    """Combined getter/setter for the reformat flag.

    A truthy *flag* replaces the stored value; the current value is
    always returned.
    """
    if flag:
        self._reformat = flag
    return self._reformat
124 def cleanup(self, flag = None):
125 if flag: self._cleanup = flag
128 def gdb(self, flag = None):
129 if flag: self._gdb = flag
132 def nomod(self, flag = None):
133 if flag: self._nomod = flag
136 def nosetup(self, flag = None):
137 if flag: self._nosetup = flag
140 def force(self, flag = None):
141 if flag: self._force = flag
144 def node(self, val = None):
145 if val: self._node = val
148 def url(self, val = None):
149 if val: self._url = val
152 def gdb_script(self):
153 if os.path.isdir('/r'):
154 return '/r' + self._gdb_script
156 return self._gdb_script
158 def debug_path(self):
159 if os.path.isdir('/r'):
160 return '/r' + self._debug_path
162 return self._debug_path
164 def src_dir(self, val = None):
165 if val: self._src_dir = val
def dump_file(self, val = None):
    """Combined getter/setter for the kernel-debug dump file path.

    Passing a truthy *val* stores it; the current value is returned.
    """
    if val:
        self._dump_file = val
    return self._dump_file
def minlevel(self, val = None):
    """Combined getter/setter for the minimum service level.

    A truthy *val* is coerced with int() before being stored; the
    current setting is always returned.
    """
    if val:
        self._minlevel = int(val)
    return self._minlevel
def maxlevel(self, val = None):
    """Combined getter/setter for the maximum service level.

    A truthy *val* is coerced with int() before being stored; the
    current setting is always returned.
    """
    if val:
        self._maxlevel = int(val)
    return self._maxlevel
184 # ============================================================
185 # debugging and error funcs
def fixme(msg = "this feature"):
    """Raise LconfError reporting that *msg* is not implemented.

    Placeholder for configuration paths that are not yet supported
    (e.g. automatic local-address detection for some network types).
    """
    # Use the parenthesized raise form (valid in Python 2 and 3) instead
    # of the Python-2-only "raise E, arg", and fix the misspelled error
    # message ("implmemented" -> "implemented").
    raise LconfError(msg + ' not implemented yet.')
191 msg = string.join(map(str,args))
192 if not config.noexec():
193 raise LconfError(msg)
198 msg = string.join(map(str,args))
203 print string.strip(s)
207 msg = string.join(map(str,args))
210 # ============================================================
211 # locally defined exceptions
# Raised when an external command (lctl, insmod, mount, ...) fails.
# Carries the command name, its error output (a string or a list of
# lines) and an optional return code; the method below pretty-prints it.
212 class CommandError (exceptions.Exception):
213     def __init__(self, cmd_name, cmd_err, rc=None):
214         self.cmd_name = cmd_name
215         self.cmd_err = cmd_err
# NOTE(review): original lines 216-219 are missing from this listing;
# self.rc is read below, so the missing span presumably stores rc and
# opens the printing method -- confirm against the full source.
220         if type(self.cmd_err) == types.StringType:
# String error output: include the return code when one was recorded.
222             print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
224             print "! %s: %s" % (self.cmd_name, self.cmd_err)
225         elif type(self.cmd_err) == types.ListType:
# List error output: header line with the code, then each captured line.
227             print "! %s (error %d):" % (self.cmd_name, self.rc)
229             print "! %s:" % (self.cmd_name)
230             for s in self.cmd_err:
231                 print "> %s" %(string.strip(s))
# Generic lconf failure; raised by panic()/fixme() paths.
235 class LconfError (exceptions.Exception):
236     def __init__(self, args):
240 # ============================================================
241 # handle lctl interface
244 Manage communication with lctl
# Locate the lctl binary; raises CommandError when it cannot be found.
247     def __init__(self, cmd):
249         Initialize close by finding the lctl binary.
251         self.lctl = find_prog(cmd)
252         debug('! lctl not found')
257         raise CommandError('lctl', "unable to find lctl binary.")
# run(cmds): feed a multi-line command script to lctl on stdin.
262         the cmds are written to stdin of lctl
263         lctl doesn't return errors when run in script mode, so
265         should modify command line to accept multiple commands, or
266         create complex command line options
268         debug("+", self.lctl, cmds)
# --noexec: pretend success without launching anything.
269         if config.noexec(): return (0, [])
270         p = popen2.Popen3(self.lctl, 1)
271         p.tochild.write(cmds + "\n")
273         out = p.fromchild.readlines()
274         err = p.childerr.readlines()
# NOTE(review): the line that assigns ret (presumably ret = p.wait())
# is missing from this listing (original line 275) -- confirm.
276         if os.WIFEXITED(ret):
277             rc = os.WEXITSTATUS(ret)
# Non-zero exit: surface stderr lines through CommandError.
281             raise CommandError(self.lctl, err, rc)
# runcmd(*args): run lctl once with the args joined on the command line.
284     def runcmd(self, *args):
286         run lctl using the command line
288         cmd = string.join(map(str,args))
289         debug("+", self.lctl, cmd)
290         rc, out = run(self.lctl, cmd)
292         raise CommandError(self.lctl, out, rc)
296 def network(self, net, nid):
297 """ initialized network and add "self" """
298 # Idea: "mynid" could be used for all network types to add "self," and then
299 # this special case would be gone and the "self" hack would be hidden.
300 if net in ('tcp', 'toe'):
305 quit""" % (net, nid, nid)
314 # create a new connection
315 def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
316 if net in ('tcp', 'toe'):
323 quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
329 quit""" % (net, servuuid, nid, nid, port, )
333 # add a route to a range
334 def add_route(self, net, gw, lo, hi):
338 quit """ % (net, gw, lo, hi)
342 def del_route(self, net, gw, lo, hi):
350 # add a route to a host
351 def add_route_host(self, net, uuid, gw, tgt):
356 quit """ % (net, uuid, tgt, gw, tgt)
359 # add a route to a range
360 def del_route_host(self, net, uuid, gw, tgt):
366 quit """ % (net, uuid, tgt)
369 # disconnect one connection
370 def disconnect(self, net, nid, port, servuuid):
376 quit""" % (net, nid, servuuid)
380 def disconnectAll(self, net):
389 # create a new device with lctl
390 def newdev(self, attach, setup = ""):
395 quit""" % (attach, setup)
399 def cleanup(self, name, uuid):
405 quit""" % (name, ('', 'force')[config.force()])
409 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
413 lov_setconfig %s %d %d %d %s %s
414 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
418 def dump(self, dump_file):
421 quit""" % (dump_file)
424 # get list of devices
425 def device_list(self):
426 rc, out = self.runcmd('device_list')
430 def lustre_version(self):
431 rc, out = self.runcmd('version')
434 # ============================================================
435 # Various system-level functions
436 # (ideally moved to their own module)
438 # Run a command and return the output and status.
439 # stderr is sent to /dev/null, could use popen3 to
440 # save it if necessary
# NOTE(review): the def line of this function (original line 441) is
# missing; the body joins args into a shell command and captures
# combined stdout/stderr via os.popen.
442     cmd = string.join(map(str,args))
444     if config.noexec(): return (0, [])
445     f = os.popen(cmd + ' 2>&1')
454 # Run a command in the background.
455 def run_daemon(*args):
456     cmd = string.join(map(str,args))
# --noexec: report success without spawning the daemon.
458     if config.noexec(): return 0
459     f = os.popen(cmd + ' 2>&1')
467 # Determine full path to use for an external command
468 # searches dirname(argv[0]) first, then PATH
# Build the search path: PATH entries, preceded by the directory this
# script was run from and the in-tree portals utils directory.
470     syspath = string.split(os.environ['PATH'], ':')
471     cmdpath = os.path.dirname(sys.argv[0])
472     syspath.insert(0, cmdpath);
473     syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
# Return the first candidate that is executable.
475         prog = os.path.join(d,cmd)
476         if os.access(prog, os.X_OK):
480 # Recursively look for file starting at base dir
481 def do_find_file(base, mod):
482 fullname = os.path.join(base, mod)
483 if os.access(fullname, os.R_OK):
485 for d in os.listdir(base):
486 dir = os.path.join(base,d)
487 if os.path.isdir(dir):
488 module = do_find_file(dir, mod)
492 def find_module(src_dir, dev_dir, modname):
493 mod = '%s.o' % (modname)
494 module = src_dir +'/'+ dev_dir +'/'+ mod
496 if os.access(module, os.R_OK):
502 # is the path a block device?
509 return stat.S_ISBLK(s[stat.ST_MODE])
511 # build fs according to type
513 def mkfs(fstype, dev):
514 if(fstype in ('ext3', 'extN')):
515 mkfs = 'mkfs.ext2 -j -b 4096'
517 print 'unsupported fs type: ', fstype
518 if not is_block(dev):
522 (ret, out) = run (mkfs, force, dev)
524 panic("Unable to build fs:", dev)
525 # enable hash tree indexing on fsswe
526 # FIXME: this check can probably go away on 2.5
528 htree = 'echo "feature FEATURE_C5" | debugfs -w'
529 (ret, out) = run (htree, dev)
531 panic("Unable to enable htree:", dev)
533 # some systems use /dev/loopN, some /dev/loop/N
# Probe both naming styles for loop device 0; give up if neither exists.
537     if not os.access(loop + str(0), os.R_OK):
539         if not os.access(loop + str(0), os.R_OK):
540             panic ("can't access loop devices")
546 # find loop device assigned to the file
# Scan every loop device and parse `losetup` output, which shows the
# backing file in parentheses, to find the one bound to this file.
546     for n in xrange(0, MAX_LOOP_DEVICES):
548         if os.access(dev, os.R_OK):
549             (stat, out) = run('losetup', dev)
550             if (out and stat == 0):
551                 m = re.search(r'\((.*)\)', out[0])
552                 if m and file == m.group(1):
558 # create file if necessary and assign the first free loop device
559 def init_loop(file, size, fstype):
560     dev = find_loop(file)
# Already bound to a loop device: warn and (presumably) reuse it.
562         print 'WARNING file:', file, 'already mapped to', dev
# Sparse-create the backing file (seek past `size` KB, write nothing).
564     if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
565         run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
567     # find next free loop
568     for n in xrange(0, MAX_LOOP_DEVICES):
570         if os.access(dev, os.R_OK):
571             (stat, out) = run('losetup', dev)
# A non-zero losetup status here presumably means the device is free;
# bind it to the file -- confirm against the full source.
573                 run('losetup', dev, file)
576                 print "out of loop devices"
578     print "out of loop devices"
581 # undo loop assignment
582 def clean_loop(file):
583     dev = find_loop(file)
# Best-effort detach; failure is only logged, not fatal.
585         ret, out = run('losetup -d', dev)
587             log('unable to clean loop device:', dev, 'for file:', file)
590 # determine if dev is formatted as a <fstype> filesystem
591 def need_format(fstype, dev):
592 # FIXME don't know how to implement this
595 # initialize a block device if needed
596 def block_dev(dev, size, fstype, format):
597 if config.noexec(): return dev
598 if not is_block(dev):
599 dev = init_loop(dev, size, fstype)
600 if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
604 # panic("device:", dev,
605 # "not prepared, and autoformat is not set.\n",
606 # "Rerun with --reformat option to format ALL filesystems")
611 """lookup IP address for an interface"""
612 rc, out = run("/sbin/ifconfig", iface)
615 addr = string.split(out[1])[1]
616 ip = string.split(addr, ':')[1]
619 def get_local_address(net_type, wildcard):
620 """Return the local address for the network type."""
622 if net_type in ('tcp', 'toe'):
624 iface, star = string.split(wildcard, ':')
625 local = if2addr(iface)
627 panic ("unable to determine ip for:", wildcard)
629 host = socket.gethostname()
630 local = socket.gethostbyname(host)
631 elif net_type == 'elan':
632 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
634 fp = open('/proc/elan/device0/position', 'r')
635 lines = fp.readlines()
644 elif net_type == 'gm':
645 fixme("automatic local address for GM")
649 def is_prepared(uuid):
650     """Return true if a device exists for the uuid"""
651     # expect this format:
652     # 1 UP ldlm ldlm ldlm_UUID 2
# Column 5 of each device_list line is the device UUID.
654         out = lctl.device_list()
656             if uuid == string.split(s)[4]:
# A CommandError from lctl is treated as "not prepared" (gapped here).
658     except CommandError, e:
663 # ============================================================
664 # Classes to prepare and cleanup the various objects
667     """ Base class for the rest of the modules. The default cleanup method is
668     defined here, as well as some utility funcs.
# Capture the XML node plus its name/uuid attributes; kmodule_list
# collects (dev_dir, modname) pairs for load_module()/cleanup_module().
670     def __init__(self, module_name, dom_node):
671         self.dom_node = dom_node
672         self.module_name = module_name
673         self.name = get_attr(dom_node, 'name')
674         self.uuid = get_attr(dom_node, 'uuid')
675         self.kmodule_list = []
# Log a tagged status line for this module instance.
679     def info(self, *args):
680         msg = string.join(map(str,args))
681         print self.module_name + ":", self.name, self.uuid, msg
684     def lookup_server(self, srv_uuid):
685         """ Lookup a server's network information """
686         net = get_ost_net(self.dom_node.parentNode, srv_uuid)
688             panic ("Unable to find a server for:", srv_uuid)
689         self._server = Network(net)
691     def get_server(self):
695         """ default cleanup, used for most modules """
# Disconnect from the server first (when one is on a local net), then
# tear the device down; both steps are best-effort and only logged.
697         srv = self.get_server()
698         if srv and local_net(srv):
700                 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
701             except CommandError, e:
702                 log(self.module_name, "disconnect failed: ", self.name)
706             lctl.cleanup(self.name, self.uuid)
707         except CommandError, e:
708             log(self.module_name, "cleanup failed: ", self.name)
def add_module(self, dev_dir, modname):
    """Queue a kernel module for loading.

    The (source directory, module name) pair is appended to
    kmodule_list, which load_module()/cleanup_module() walk later.
    """
    pair = (dev_dir, modname)
    self.kmodule_list.append(pair)
716     def mod_loaded(self, modname):
717         """Check if a module is already loaded. Look in /proc/modules for it."""
718         fp = open('/proc/modules')
719         lines = fp.readlines()
721         # please forgive my tired fingers for this one
# Filter the first word of each /proc/modules line for an exact match.
722         ret = filter(lambda word, mod=modname: word == mod,
723                      map(lambda line: string.split(line)[0], lines))
726     def load_module(self):
727         """Load all the modules in the list in the order they appear."""
728         for dev_dir, mod in self.kmodule_list:
729             # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
# Skip modules that are already loaded (unless in --noexec dry-run).
730             if self.mod_loaded(mod) and not config.noexec():
732             log ('loading module:', mod)
# Prefer an in-tree build (insmod); the modprobe branch below is
# presumably the fallback when no in-tree module is found -- the
# branching lines are missing from this listing.
734             module = find_module(config.src_dir(),dev_dir, mod)
736                 panic('module not found:', mod)
737             (rc, out)  = run('/sbin/insmod', module)
739                 raise CommandError('insmod', out, rc)
741             (rc, out) = run('/sbin/modprobe', mod)
743                 raise CommandError('modprobe', out, rc)
745     def cleanup_module(self):
746         """Unload the modules in the list in reverse order."""
# NOTE(review): the line reversing `rev` (original 748) is missing here.
747         rev = self.kmodule_list
749         for dev_dir, mod in rev:
750             if not self.mod_loaded(mod):
# Dump the kernel debug log before portals goes away, if requested.
753             if mod == 'portals' and config.dump_file():
754                 lctl.dump(config.dump_file())
755             log('unloading module:', mod)
# Unload failures are only logged; cleanup continues.
758             (rc, out) = run('/sbin/rmmod', mod)
760                 log('! unable to unload module:', mod)
764 class Network(Module):
765 def __init__(self,dom_node):
766 Module.__init__(self, 'NETWORK', dom_node)
767 self.net_type = get_attr(dom_node,'type')
768 self.nid = get_text(dom_node, 'server', '*')
769 self.port = get_text_int(dom_node, 'port', 0)
770 self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
771 self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
773 self.nid = get_local_address(self.net_type, self.nid)
775 panic("unable to set nid for", self.net_type, self.nid)
776 debug("nid:", self.nid)
778 self.add_module('portals/linux/oslib/', 'portals')
779 if node_needs_router():
780 self.add_module('portals/linux/router', 'kptlrouter')
781 if self.net_type == 'tcp':
782 self.add_module('portals/linux/socknal', 'ksocknal')
783 if self.net_type == 'toe':
784 self.add_module('portals/linux/toenal', 'ktoenal')
785 if self.net_type == 'elan':
786 self.add_module('portals/linux/rqswnal', 'kqswnal')
787 if self.net_type == 'gm':
788 self.add_module('portals/linux/gmnal', 'kgmnal')
789 self.add_module('lustre/obdclass', 'obdclass')
790 self.add_module('lustre/ptlrpc', 'ptlrpc')
793 self.info(self.net_type, self.nid, self.port)
794 if self.net_type in ('tcp', 'toe'):
795 nal_id = '' # default is socknal
796 if self.net_type == 'toe':
798 ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, nal_id, self.port)
800 raise CommandError(TCP_ACCEPTOR, out, ret)
801 ret = self.dom_node.getElementsByTagName('route_tbl')
803 for r in a.getElementsByTagName('route'):
804 net_type = get_attr(r, 'type')
805 gw = get_attr(r, 'gw')
806 lo = get_attr(r, 'lo')
807 hi = get_attr(r,'hi', '')
808 lctl.add_route(net_type, gw, lo, hi)
809 if net_type in ('tcp', 'toe') and net_type == self.net_type and hi == '':
810 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
812 panic("no server for nid", lo)
814 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
817 lctl.network(self.net_type, self.nid)
818 lctl.newdev(attach = "ptlrpc RPCDEV RPCDEV_UUID")
821 self.info(self.net_type, self.nid, self.port)
822 ret = self.dom_node.getElementsByTagName('route_tbl')
824 for r in a.getElementsByTagName('route'):
825 lo = get_attr(r, 'lo')
826 hi = get_attr(r,'hi', '')
827 if self.net_type in ('tcp', 'toe') and hi == '':
828 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
830 panic("no server for nid", lo)
833 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
834 except CommandError, e:
835 print "disconnect failed: ", self.name
839 lctl.del_route(self.net_type, self.nid, lo, hi)
840 except CommandError, e:
841 print "del_route failed: ", self.name
846 lctl.cleanup("RPCDEV", "RPCDEV_UUID")
847 except CommandError, e:
848 print "cleanup failed: ", self.name
852 lctl.disconnectAll(self.net_type)
853 except CommandError, e:
854 print "disconnectAll failed: ", self.name
857 if self.net_type in ('tcp', 'toe'):
858 # yikes, this ugly! need to save pid in /var/something
859 run("killall acceptor")
862 def __init__(self,dom_node):
863 Module.__init__(self, 'LDLM', dom_node)
864 self.add_module('lustre/ldlm', 'ldlm')
866 if is_prepared(self.uuid):
869 lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
873 def __init__(self,dom_node):
874 Module.__init__(self, 'LOV', dom_node)
875 self.mds_uuid = get_first_ref(dom_node, 'mds')
876 mds= lookup(dom_node.parentNode, self.mds_uuid)
877 self.mds_name = getName(mds)
878 devs = dom_node.getElementsByTagName('devices')
881 self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
882 self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
883 self.pattern = get_attr_int(dev_node, 'pattern', 0)
884 self.devlist = get_all_refs(dev_node, 'osc')
885 self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
886 self.add_module('lustre/mdc', 'mdc')
887 self.add_module('lustre/lov', 'lov')
890 if is_prepared(self.uuid):
892 for osc_uuid in self.devlist:
893 osc = lookup(self.dom_node.parentNode, osc_uuid)
897 # Ignore connection failures, because the LOV will DTRT with
898 # an unconnected OSC.
899 n.prepare(ignore_connect_failure=1)
901 print "Error preparing OSC %s (inactive)\n" % osc_uuid
903 panic('osc not found:', osc_uuid)
904 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
905 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
906 self.stripe_off, self.pattern, self.devlist, self.mds_name)
907 lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
908 setup ="%s" % (mdc_uuid))
911 if not is_prepared(self.uuid):
913 for osc_uuid in self.devlist:
914 osc = lookup(self.dom_node.parentNode, osc_uuid)
919 panic('osc not found:', osc_uuid)
921 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
924 def load_module(self):
925 for osc_uuid in self.devlist:
926 osc = lookup(self.dom_node.parentNode, osc_uuid)
932 panic('osc not found:', osc_uuid)
933 Module.load_module(self)
936 def cleanup_module(self):
937 Module.cleanup_module(self)
938 for osc_uuid in self.devlist:
939 osc = lookup(self.dom_node.parentNode, osc_uuid)
945 panic('osc not found:', osc_uuid)
947 class LOVConfig(Module):
948 def __init__(self,dom_node):
949 Module.__init__(self, 'LOVConfig', dom_node)
950 self.lov_uuid = get_first_ref(dom_node, 'lov')
951 l = lookup(dom_node.parentNode, self.lov_uuid)
956 self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
957 lov.pattern, lov.devlist, lov.mds_name)
958 lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
959 lov.stripe_sz, lov.stripe_off, lov.pattern,
960 string.join(lov.devlist))
968 def __init__(self,dom_node):
969 Module.__init__(self, 'MDS', dom_node)
970 self.devname, self.size = get_device(dom_node)
971 self.fstype = get_text(dom_node, 'fstype')
972 # FIXME: if fstype not set, then determine based on kernel version
973 self.format = get_text(dom_node, 'autoformat', "no")
974 if self.fstype == 'extN':
975 self.add_module('lustre/extN', 'extN')
976 self.add_module('lustre/mds', 'mds')
977 self.add_module('lustre/mds', 'mds_%s' % (self.fstype))
980 if is_prepared(self.uuid):
982 self.info(self.devname, self.fstype, self.format)
983 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
984 if not is_prepared('MDT_UUID'):
985 lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
987 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
988 setup ="%s %s" %(blkdev, self.fstype))
990 if is_prepared('MDT_UUID'):
992 lctl.cleanup("MDT", "MDT_UUID")
993 except CommandError, e:
994 print "cleanup failed: ", self.name
997 if not is_prepared(self.uuid):
1000 clean_loop(self.devname)
1002 # Very unusual case, as there is no MDC element in the XML anymore
1003 # Builds itself from an MDS node
1005 def __init__(self,dom_node):
1006 self.mds = MDS(dom_node)
1007 self.dom_node = dom_node
1008 self.module_name = 'MDC'
1009 self.kmodule_list = []
1013 host = socket.gethostname()
1014 self.name = 'MDC_%s' % (self.mds.name)
1015 self.uuid = '%s_%05x_%05x' % (self.name, int(random.random() * 1048576),
1016 int(random.random() * 1048576))
1018 self.lookup_server(self.mds.uuid)
1019 self.add_module('lustre/mdc', 'mdc')
1022 if is_prepared(self.uuid):
1024 self.info(self.mds.uuid)
1025 srv = self.get_server()
1026 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1027 lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
1028 setup ="%s %s" %(self.mds.uuid, srv.uuid))
1031 def __init__(self, dom_node):
1032 Module.__init__(self, 'OBD', dom_node)
1033 self.obdtype = get_attr(dom_node, 'type')
1034 self.devname, self.size = get_device(dom_node)
1035 self.fstype = get_text(dom_node, 'fstype')
1036 # FIXME: if fstype not set, then determine based on kernel version
1037 self.format = get_text(dom_node, 'autoformat', 'yes')
1038 if self.fstype == 'extN':
1039 self.add_module('lustre/extN', 'extN')
1040 self.add_module('lustre/' + self.obdtype, self.obdtype)
1042 # need to check /proc/mounts and /etc/mtab before
1043 # formatting anything.
1044 # FIXME: check if device is already formatted.
1046 if is_prepared(self.uuid):
1048 self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
1049 if self.obdtype == 'obdecho':
1052 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
1053 lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
1054 setup ="%s %s" %(blkdev, self.fstype))
1056 if not is_prepared(self.uuid):
1058 Module.cleanup(self)
1059 if not self.obdtype == 'obdecho':
1060 clean_loop(self.devname)
1063 def __init__(self,dom_node):
1064 Module.__init__(self, 'OST', dom_node)
1065 self.obd_uuid = get_first_ref(dom_node, 'obd')
1066 self.add_module('lustre/ost', 'ost')
1069 if is_prepared(self.uuid):
1071 self.info(self.obd_uuid)
1072 lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
1073 setup ="%s" % (self.obd_uuid))
1076 # virtual interface for OSC and LOV
1078 def __init__(self,dom_node):
1079 Module.__init__(self, 'VOSC', dom_node)
1080 if dom_node.nodeName == 'lov':
1081 self.osc = LOV(dom_node)
1083 self.osc = OSC(dom_node)
1088 def load_module(self):
1089 self.osc.load_module()
1090 def cleanup_module(self):
1091 self.osc.cleanup_module()
# OSC: client-side object storage connection; resolves its OBD and OST
# references and the server network info at construction time.
1095     def __init__(self,dom_node):
1096         Module.__init__(self, 'OSC', dom_node)
1097         self.obd_uuid = get_first_ref(dom_node, 'obd')
1098         self.ost_uuid = get_first_ref(dom_node, 'ost')
1099         self.lookup_server(self.ost_uuid)
1100         self.add_module('lustre/osc', 'osc')
# prepare(): connect to the server (directly or via a route) and attach
# the osc device.  ignore_connect_failure lets a LOV tolerate inactive
# OSCs.
1102     def prepare(self, ignore_connect_failure = 0):
1103         if is_prepared(self.uuid):
1105         self.info(self.obd_uuid, self.ost_uuid)
1106         srv = self.get_server()
1109             lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
# Non-local server: add a host route (the route lookup lines are
# missing from this listing).
1113                 lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
1115                 panic ("no route to", srv.nid)
1116         except CommandError:
1117             if (ignore_connect_failure == 0):
1120         lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
1121                     setup ="%s %s" %(self.obd_uuid, srv.uuid))
# cleanup(): tear down the device and remove any host route; route
# removal failures are only printed, not fatal.
1124         if not is_prepared(self.uuid):
1126         srv = self.get_server()
1128             Module.cleanup(self)
1130             self.info(self.obd_uuid, self.ost_uuid)
1134                 lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
1135             except CommandError, e:
1136                 print "del_route failed: ", self.name
1139             Module.cleanup(self)
1142 class ECHO_CLIENT(Module):
1143 def __init__(self,dom_node):
1144 Module.__init__(self, 'ECHO_CLIENT', dom_node)
1145 self.add_module('lustre/obdecho', 'obdecho')
1146 self.lov_uuid = get_first_ref(dom_node, 'osc')
1147 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1151 if is_prepared(self.uuid):
1153 self.osc.prepare() # XXX This is so cheating. -p
1154 self.info(self.lov_uuid)
1156 lctl.newdev(attach="echo_client %s %s" % (self.name, self.uuid),
1157 setup = self.lov_uuid)
1160 if not is_prepared(self.uuid):
1164 def load_module(self):
1165 self.osc.load_module()
1166 Module.load_module(self)
1167 def cleanup_module(self):
1168 Module.cleanup_module(self)
1169 self.osc.cleanup_module()
# Mountpoint: a client lustre_lite filesystem mount, wired to an MDS
# (via an on-the-fly MDC) and a LOV/OSC.
1172 class Mountpoint(Module):
1173     def __init__(self,dom_node):
1174         Module.__init__(self, 'MTPT', dom_node)
1175         self.path = get_text(dom_node, 'path')
1176         self.mds_uuid = get_first_ref(dom_node, 'mds')
1177         self.lov_uuid = get_first_ref(dom_node, 'osc')
1178         self.add_module('lustre/mdc', 'mdc')
1179         self.add_module('lustre/llite', 'llite')
# NOTE(review): lines 1181-1184 are missing; `l` is presumably used to
# build self.osc (a VOSC), which load/cleanup_module below rely on.
1180         l = lookup(self.dom_node.parentNode, self.lov_uuid)
# prepare: create the MDC for our MDS, then mount the client fs.
1185         mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
1186         self.info(self.path, self.mds_uuid, self.lov_uuid)
1187         cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
1188               (self.lov_uuid, mdc_uuid, self.path)
1189         run("mkdir", self.path)
1192             panic("mount failed:", self.path)
# cleanup: forced umount first, plain umount as fallback; a failure
# here is logged because later teardown will likely be blocked.
1195         self.info(self.path, self.mds_uuid,self.lov_uuid)
1197             (rc, out) = run("umount -f", self.path)
1199             (rc, out) = run("umount", self.path)
1201             log("umount failed, cleanup will most likely not work.")
1202         l = lookup(self.dom_node.parentNode, self.lov_uuid)
1204         cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
# Module load/unload delegates to the underlying osc as well.
1206     def load_module(self):
1207         self.osc.load_module()
1208         Module.load_module(self)
1209     def cleanup_module(self):
1210         Module.cleanup_module(self)
1211         self.osc.cleanup_module()
1214 # ============================================================
1215 # XML processing and query
1216 # TODO: Change query funcs to use XPath, which is much cleaner
1218 def get_device(obd):
1219 list = obd.getElementsByTagName('device')
1223 size = get_attr_int(dev, 'size', 0)
1224 return dev.firstChild.data, size
1227 # Get the text content from the first matching child
1228 # If there is no content (or it is all whitespace), return
1230 def get_text(dom_node, tag, default=""):
1231 list = dom_node.getElementsByTagName(tag)
1234 dom_node.normalize()
1235 if dom_node.firstChild:
1236 txt = string.strip(dom_node.firstChild.data)
1241 def get_text_int(dom_node, tag, default=0):
1242 list = dom_node.getElementsByTagName(tag)
1246 dom_node.normalize()
1247 if dom_node.firstChild:
1248 txt = string.strip(dom_node.firstChild.data)
1253 panic("text value is not integer:", txt)
1256 def get_attr(dom_node, attr, default=""):
1257 v = dom_node.getAttribute(attr)
1262 def get_attr_int(dom_node, attr, default=0):
1264 v = dom_node.getAttribute(attr)
1269 panic("attr value is not integer", v)
1272 def get_first_ref(dom_node, tag):
1273 """ Get the first uuidref of the type TAG. Used one only
1274 one is expected. Returns the uuid."""
1276 refname = '%s_ref' % tag
1277 list = dom_node.getElementsByTagName(refname)
1279 uuid = getRef(list[0])
1282 def get_all_refs(dom_node, tag):
1283 """ Get all the refs of type TAG. Returns list of uuids. """
1285 refname = '%s_ref' % tag
1286 list = dom_node.getElementsByTagName(refname)
1289 uuids.append(getRef(i))
1292 def get_ost_net(dom_node, uuid):
1293 ost = lookup(dom_node, uuid)
1294 uuid = get_first_ref(ost, 'network')
1297 return lookup(dom_node, uuid)
1299 def nid2server(dom_node, nid):
1300 netlist = dom_node.getElementsByTagName('network')
1301 for net_node in netlist:
1302 if get_text(net_node, 'server') == nid:
1303 return Network(net_node)
1306 def lookup(dom_node, uuid):
1307 for n in dom_node.childNodes:
1308 if n.nodeType == n.ELEMENT_NODE:
1309 if getUUID(n) == uuid:
1316 # Get name attribute of dom_node
def getName(dom_node):
    """Return the node's 'name' attribute."""
    attr_val = dom_node.getAttribute('name')
    return attr_val
def getRef(dom_node):
    """Return the node's 'uuidref' attribute."""
    ref = dom_node.getAttribute('uuidref')
    return ref
1323 # Get name attribute of dom_node
def getUUID(dom_node):
    """Return the node's 'uuid' attribute."""
    uuid_val = dom_node.getAttribute('uuid')
    return uuid_val
1327 # the tag name is the service type
1328 # fixme: this should do some checks to make sure the dom_node is a service
def getServiceType(dom_node):
    """The service type is simply the element's tag name."""
    tag = dom_node.nodeName
    return tag
1333 # determine what "level" a particular node is at.
1334 # the order of initialization is based on level.
1335 def getServiceLevel(dom_node):
1336 type = getServiceType(dom_node)
1338 if type in ('network',):
1340 elif type in ('device', 'ldlm'):
1342 elif type in ('obd', 'mdd'):
1344 elif type in ('mds','ost'):
1346 elif type in ('mdc','osc'):
1348 elif type in ('lov', 'lovconfig'):
1350 elif type in ('mountpoint', 'echo_client'):
1353 if ret < config.minlevel() or ret > config.maxlevel():
1358 # return list of services in a profile. list is a list of tuples
1359 # [(level, dom_node),]
1360 def getServices(lustreNode, profileNode):
1362 for n in profileNode.childNodes:
1363 if n.nodeType == n.ELEMENT_NODE:
1364 servNode = lookup(lustreNode, getRef(n))
1367 panic('service not found: ' + getRef(n))
1368 level = getServiceLevel(servNode)
1370 list.append((level, servNode))
1374 def getByName(lustreNode, name, tag):
1375 ndList = lustreNode.getElementsByTagName(tag)
1377 if getName(nd) == name:
1382 ############################################################
1384 # FIXME: clean this mess up!
1387 def prepare_mdc(dom_node, mds_uuid):
1389 mds_node = lookup(dom_node, mds_uuid);
1391 panic("no mds:", mds_uuid)
1392 if saved_mdc.has_key(mds_uuid):
1393 return saved_mdc[mds_uuid]
1396 saved_mdc[mds_uuid] = mdc.uuid
1399 def cleanup_mdc(dom_node, mds_uuid):
1401 mds_node = lookup(dom_node, mds_uuid);
1403 panic("no mds:", mds_uuid)
1404 if not saved_mdc.has_key(mds_uuid):
1407 saved_mdc[mds_uuid] = mdc.uuid
1410 ############################################################
1411 # routing ("rooting")
1417 def init_node(dom_node):
1418 global local_node, router_flag
1419 netlist = dom_node.getElementsByTagName('network')
1420 for dom_net in netlist:
1421 type = get_attr(dom_net, 'type')
1422 gw = get_text(dom_net, 'server')
1423 local_node.append((type, gw))
1425 def node_needs_router():
1428 def get_routes(type, gw, dom_net):
1429 """ Return the routes as a list of tuples of the form:
1430 [(type, gw, lo, hi),]"""
# NOTE(review): lines 1431, 1433, 1435 (result-list initialization and the
# two nested "for" headers) plus the final return are missing from this dump.
1432 tbl = dom_net.getElementsByTagName('route_tbl')
1434 routes = t.getElementsByTagName('route')
# Each <route> carries a lo/hi nid range; hi defaults to the empty string.
1436 lo = get_attr(r, 'lo')
1437 hi = get_attr(r, 'hi', '')
1438 res.append((type, gw, lo, hi))
1442 def init_route_config(lustre):
1443 """ Scan the lustre config looking for routers. Build list of
# NOTE(review): gap-ridden excerpt -- lines 1444, 1446, 1448, 1450, 1452,
# 1457-1459 and 1464+ are missing (docstring tail, routes initialization,
# outer loop headers and the gateway-found bookkeeping).
1445 global routes, router_flag
1447 list = lustre.getElementsByTagName('node')
# Only nodes flagged as routers contribute routes.
1449 if get_attr(node, 'router'):
# For each local interface type, find the router's address on that same
# net type; that address is the gateway for routes to all of the
# router's *other* networks.
1451 for (local_type, local_nid) in local_node:
1453 netlist = node.getElementsByTagName('network')
1454 for dom_net in netlist:
1455 if local_type == get_attr(dom_net, 'type'):
1456 gw = get_text(dom_net, 'server')
# Second pass over the router's networks: every network of a different
# type is reachable through the gateway found above.
1460 for dom_net in netlist:
1461 if local_type != get_attr(dom_net, 'type'):
1462 for route in get_routes(local_type, gw, dom_net):
1463 routes.append(route)
# NOTE(review): the enclosing "def" for the next two lines is missing from
# this excerpt (numbering jumps from 1463 to 1468); they appear to be the
# body of a helper that tests whether net.net_type matches one of the
# local interfaces recorded in local_node.
1468 for iface in local_node:
1469 if net.net_type == iface[0]:
1473 def find_route(net):
# Look up a route from one of the local node's interfaces to the given
# network. NOTE(review): line 1477 (where 'to' must be bound) and the
# actual route search/return (1479+) are missing from this dump.
1474 global local_node, routes
# local_node[0] is the first configured local interface; its type is the
# "from" side of the route.
1475 frm_type = local_node[0][0]
1476 to_type = net.net_type
1478 debug ('looking for route to', to_type,to)
1487 ############################################################
1490 def startService(dom_node, module_flag):
# Instantiate the handler class matching the service type of dom_node and
# run its module-load/setup (or cleanup) actions.
# NOTE(review): many branches and the tail of this function are missing from
# this excerpt (lines 1494-1498, 1503-1512, 1517, 1519-1522, 1524-1527,
# 1529, 1531+) -- e.g. the ldlm/obd/mds/ost/mdc/osc/lov cases and the
# prepare-vs-cleanup dispatch.
1491 type = getServiceType(dom_node)
1492 debug('Service:', type, getName(dom_node), getUUID(dom_node))
1493 # there must be a more dynamic way of doing this...
1499 elif type == 'lovconfig':
1500 n = LOVConfig(dom_node)
1501 elif type == 'network':
1502 n = Network(dom_node)
1513 elif type == 'mountpoint':
1514 n = Mountpoint(dom_node)
1515 elif type == 'echo_client':
1516 n = ECHO_CLIENT(dom_node)
# Fallback: an unknown service type is fatal.
1518 panic ("unknown service type:", type)
# Remaining (gappy) lines: phase handling honoring --cleanup and --nosetup.
1523 if config.cleanup():
1528 if config.nosetup():
1530 if config.cleanup():
1536 # Prepare the system to run lustre using a particular profile
1537 # in the configuration.
1538 # * load the modules
1539 # * setup networking for the current node
1540 # * make sure partitions are in place and prepared
1541 # * initialize devices with lctl
1542 # Level ordering is important, and needs to be enforced.
1543 def startProfile(lustreNode, profileNode, module_flag):
# NOTE(review): lines 1544, 1548-1549 and 1551+ are missing from this dump
# (the profileNode guard, the level sort -- reversed for cleanup -- and the
# loop header over services).
1545 panic("profile:", profile, "not found.")
1546 services = getServices(lustreNode, profileNode)
# On cleanup the service list is processed in reverse level order.
1547 if config.cleanup():
# s is a (level, dom_node) tuple from getServices.
1550 startService(s[1], module_flag)
1555 def doHost(lustreNode, hosts):
# Configure (or clean up) lustre on this machine: find the <node> entry
# matching one of the candidate hostnames, set up routing, then walk the
# node's profiles in two phases.
# NOTE(review): this excerpt has gaps (lines 1556-1558, 1560-1563,
# 1565-1566, 1568, 1570-1573, 1580, 1585, 1588-1589, 1593) -- globals, the
# host-candidate loop, exit on missing entry, the router branch and the
# --gdb sleep/pause.
1559 dom_node = getByName(lustreNode, h, 'node')
1564 print 'No host entry found.'
# Non-router nodes initialize their local interface list and route table.
1567 if not get_attr(dom_node, 'router'):
1569 init_route_config(lustreNode)
1574 # Two step process: (1) load modules, (2) setup lustre
1575 # if not cleaning, load modules first.
1576 module_flag = not config.cleanup()
1577 reflist = dom_node.getElementsByTagName('profile')
1578 for profile in reflist:
1579 startProfile(lustreNode, profile, module_flag)
# After the first pass (setup runs only): record the debug path and emit a
# gdb helper script listing the loaded modules.
1581 if not config.cleanup():
1582 sys_set_debug_path()
1583 script = config.gdb_script()
1584 run(lctl.lctl, ' modules >', script)
1586 # dump /tmp/ogdb and sleep/pause here
1587 log ("The GDB module script is in", script)
# Second pass toggles the phase flag: setup after module load, or module
# unload after cleanup.
1590 module_flag = not module_flag
1591 for profile in reflist:
1592 startProfile(lustreNode, profile, module_flag)
1594 ############################################################
1595 # Command line processing
1597 def parse_cmdline(argv):
# Parse lconf's command line (see usage() in the file header) into the
# global config object and return the leftover positional args (the xml
# config file).
# NOTE(review): gap-ridden -- nearly all option-handler bodies (the
# usage()/config setter calls) are on lines missing from this excerpt.
1598 short_opts = "hdnvf"
1599 long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
1600 "portals=", "makeldiff", "cleanup", "noexec",
1601 "help", "node=", "nomod", "nosetup",
1602 "dump=", "force", "minlevel=", "maxlevel="]
1606 opts, args = getopt.getopt(argv, short_opts, long_opts)
# Bad options: the handler lines after this except are missing here.
1607 except getopt.error:
1612 if o in ("-h", "--help"):
1614 if o in ("-d","--cleanup"):
1616 if o in ("-v", "--verbose"):
1618 if o in ("-n", "--noexec"):
1621 if o == "--portals":
1625 if o == "--reformat":
1633 if o == "--nosetup":
1637 if o in ("-f", "--force"):
1639 if o in ("--minlevel",):
1641 if o in ("--maxlevel",):
1650 s = urllib.urlopen(url)
def setupModulePath(cmd):
    # When lconf is executed out of a build tree (a readable Makefile sits
    # next to the command), point config.src_dir at the tree's top level.
    base = os.path.dirname(cmd)
    if not os.access(base + "/Makefile", os.R_OK):
        return
    config.src_dir(base + "/../../")
1661 def sys_set_debug_path():
# Tell the kernel, via /proc, where the portals debug log should go.
# NOTE(review): lines 1663-1665 and 1668+ are missing from this excerpt --
# presumably a noexec guard / try block and the fp.close()/error handling.
1662 debug("debug path: ", config.debug_path())
1666 fp = open('/proc/sys/portals/debug_path', 'w')
1667 fp.write(config.debug_path())
1672 #/proc/sys/net/core/rmem_max
1673 #/proc/sys/net/core/wmem_max
1674 def sys_set_netmem_max(path, max):
# Raise the kernel socket-buffer limit at 'path' to at least 'max' bytes
# (callers pass MAXTCPBUF for rmem_max/wmem_max).
# NOTE(review): lines 1676-1682 and 1685+ are missing here -- presumably
# reading the current value and skipping the write when already >= max,
# plus the fp.close().
1675 debug("setting", path, "to at least", max)
1683 fp = open(path, 'w')
1684 fp.write('%d\n' %(max))
def sys_make_devices():
    # Create the portals and obd character device nodes (misc major 10,
    # minors 240/241) when they are not already present/readable.
    for dev, minor in (('/dev/portals', 240), ('/dev/obd', 241)):
        if not os.access(dev, os.R_OK):
            run('mknod %s c 10 %d' % (dev, minor))
1695 # Add dir to the global PATH, if not already there.
1696 def add_to_path(new_dir):
# Append new_dir to os.environ['PATH'] unless it is already listed.
# NOTE(review): line 1699 is missing from this excerpt -- almost certainly
# the early "return" for the already-present case; without it the append
# would be unconditional. Confirm against the full source.
1697 syspath = string.split(os.environ['PATH'], ':')
1698 if new_dir in syspath:
1700 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
# Directories that must be reachable for the helper programs lconf runs
# (mknod, mount, acceptor, ...).
1703 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
1704 # ensure basic elements are in the system path
1705 def sanitise_path():
# NOTE(review): the loop body (lines 1707-1708) is missing from this
# excerpt; presumably it calls add_to_path(dir) for each entry -- confirm.
1706 for dir in DEFAULT_PATH:
1709 # Initialize or shutdown lustre according to a configuration file
1710 # * prepare the system for lustre
1711 # * configure devices with lctl
1712 # Shutdown does steps in reverse
# NOTE(review): the "def main():" line itself (circa 1713-1714) is missing
# from this numbered dump, as are many body lines -- what follows is the
# gappy interior of main().
1715 global TCP_ACCEPTOR, lctl, MAXTCPBUF
1716 host = socket.gethostname()
1718 # the PRNG is normally seeded with time(), which is not so good for starting
1719 # time-synchronized clusters
1720 input = open('/dev/urandom', 'r')
1722 print 'Unable to open /dev/urandom!'
# 32 bytes of kernel entropy seed the PRNG instead of the clock.
1724 seed = input.read(32)
1730 args = parse_cmdline(sys.argv[1:])
# The config may come from a local file argument or be fetched from a URL.
1732 if not os.access(args[0], os.R_OK):
1733 print 'File not found or readable:', args[0]
1735 dom = xml.dom.minidom.parse(args[0])
1737 xmldata = fetch(config.url())
1738 dom = xml.dom.minidom.parseString(xmldata)
# Candidate hostnames used to locate this machine's <node> entry:
# --node override first, then the real hostname, then 'localhost'.
1744 node_list.append(config.node())
1747 node_list.append(host)
1748 node_list.append('localhost')
1749 debug("configuring for host: ", node_list)
# Per-host suffixes keep debug/gdb output files distinct per machine.
1752 config._debug_path = config._debug_path + '-' + host
1753 config._gdb_script = config._gdb_script + '-' + host
# Locate the tcp acceptor helper; the (missing) surrounding lines suggest
# a missing binary is only a warning in one mode and fatal in the other.
1755 TCP_ACCEPTOR = find_prog('acceptor')
1756 if not TCP_ACCEPTOR:
1758 TCP_ACCEPTOR = 'acceptor'
1759 debug('! acceptor not found')
1761 panic('acceptor not found')
1763 lctl = LCTLInterface('lctl')
1765 setupModulePath(sys.argv[0])
# Bump socket buffer limits before bringing up networking.
1767 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
1768 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
1769 doHost(dom.documentElement, node_list)
1771 if __name__ == "__main__":
# Script entry point: run main() and translate lconf/command errors into a
# nonzero exit status. NOTE(review): lines 1772-1773, 1775 and 1777-1779
# are missing from this dump (the try/main() call and the exception
# handlers' bodies).
1774 except LconfError, e:
1776 except CommandError, e:
# Propagate the first cleanup failure as the process exit code (see
# cleanup_error() in the file header).
1780 if first_cleanup_error:
1781 sys.exit(first_cleanup_error)