3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # This file is part of Lustre, http://www.lustre.org.
7 # Lustre is free software; you can redistribute it and/or
8 # modify it under the terms of version 2 of the GNU General Public
9 # License as published by the Free Software Foundation.
11 # Lustre is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with Lustre; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # lconf - lustre configuration tool
22 # lconf is the main driver script for starting and stopping
23 # lustre filesystem services.
25 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import string, os, stat, popen2, socket, time, random
30 import xml.dom.minidom
35 DEFAULT_TCPBUF = 1048576
37 # Maximum number of devices to search for.
38 # (the /dev/loop* nodes need to be created beforehand)
39 MAX_LOOP_DEVICES = 256
# Remember only the FIRST failing cleanup step, so the original cause of a
# failed shutdown is what ultimately gets reported.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record rc as the overall cleanup status unless one is already set."""
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
49 print """usage: lconf config.xml
51 config.xml Lustre configuration in xml format.
52 --get <url> URL to fetch a config file
53 --node <nodename> Load config for <nodename>
54 -d | --cleanup Cleans up config. (Shutdown)
55 -f | --force Forced unmounting and/or obd detach during cleanup
56 -v | --verbose Print system commands as they are run
57 -h | --help Print this help
58 --gdb Prints message after creating gdb module script
59 and sleeps for 5 seconds.
60 -n | --noexec Prints the commands and steps that will be run for a
61 config without executing them. This can used to check if a
62 config file is doing what it should be doing. (Implies -v)
63 --nomod Skip load/unload module step.
64 --nosetup Skip device setup/cleanup step.
65 --reformat Reformat all devices (without question)
66 --dump <file> Dump the kernel debug log before portals is unloaded
67 --minlevel <num> Specify the minimum level of services to configure/cleanup (default 0)
68 --maxlevel <num> Specify the maximum level of services to configure/cleanup (default 100)
69 Levels are aproximatly like:
79 --ldap server LDAP server with lustre config database
80 --makeldiff Translate xml source to LDIFF
81 This are perhaps not needed:
82 --lustre="src dir" Base directory of lustre sources. Used to search
84 --portals=src Portals source
88 # ============================================================
89 # Config parameters, encapsulated in a class
105 self._gdb_script = '/tmp/ogdb'
106 self._debug_path = '/tmp/lustre-log'
107 self._dump_file = None
112 def verbose(self, flag = None):
113 if flag: self._verbose = flag
116 def noexec(self, flag = None):
117 if flag: self._noexec = flag
def reformat(self, flag = None):
    """Combined getter/setter for the reformat flag.

    A truthy *flag* stores a new value; the current value is returned.
    """
    if flag:
        self._reformat = flag
    return self._reformat
124 def cleanup(self, flag = None):
125 if flag: self._cleanup = flag
128 def gdb(self, flag = None):
129 if flag: self._gdb = flag
132 def nomod(self, flag = None):
133 if flag: self._nomod = flag
136 def nosetup(self, flag = None):
137 if flag: self._nosetup = flag
140 def force(self, flag = None):
141 if flag: self._force = flag
144 def node(self, val = None):
145 if val: self._node = val
148 def url(self, val = None):
149 if val: self._url = val
def gdb_script(self):
    """Return the gdb-script path, preferring the /r prefix directory
    (ramdisk/chroot-style root) when it exists."""
    root = ''
    if os.path.isdir('/r'):
        root = '/r'
    return root + self._gdb_script
def debug_path(self):
    """Return the debug-log path, preferring the /r prefix directory
    (ramdisk/chroot-style root) when it exists."""
    root = ''
    if os.path.isdir('/r'):
        root = '/r'
    return root + self._debug_path
164 def src_dir(self, val = None):
165 if val: self._src_dir = val
def dump_file(self, val = None):
    """Get/set the file used for kernel debug-log dumps (None = no dump)."""
    if val:
        self._dump_file = val
    return self._dump_file
def minlevel(self, val = None):
    """Get/set the minimum service level to configure/cleanup.

    A truthy *val* (e.g. a numeric command-line string) is coerced to int.
    """
    if val:
        self._minlevel = int(val)
    return self._minlevel
def maxlevel(self, val = None):
    """Get/set the maximum service level to configure/cleanup.

    A truthy *val* (e.g. a numeric command-line string) is coerced to int.
    """
    if val:
        self._maxlevel = int(val)
    return self._maxlevel
184 # ============================================================
185 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort with a 'not implemented' LconfError.

    msg names the unimplemented feature.
    Raises: LconfError, always.
    """
    # Call-style raise matches the rest of the file (see LconfError use in
    # panic()); also fixes the typo "implmemented" in the visible message.
    raise LconfError(msg + ' not implemented yet.')
191 msg = string.join(map(str,args))
192 if not config.noexec():
193 raise LconfError(msg)
198 msg = string.join(map(str,args))
203 print string.strip(s)
207 msg = string.join(map(str,args))
210 # ============================================================
211 # locally defined exceptions
212 class CommandError (exceptions.Exception):
213 def __init__(self, cmd_name, cmd_err, rc=None):
214 self.cmd_name = cmd_name
215 self.cmd_err = cmd_err
220 if type(self.cmd_err) == types.StringType:
222 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
224 print "! %s: %s" % (self.cmd_name, self.cmd_err)
225 elif type(self.cmd_err) == types.ListType:
227 print "! %s (error %d):" % (self.cmd_name, self.rc)
229 print "! %s:" % (self.cmd_name)
230 for s in self.cmd_err:
231 print "> %s" %(string.strip(s))
235 class LconfError (exceptions.Exception):
236 def __init__(self, args):
240 # ============================================================
241 # handle lctl interface
244 Manage communication with lctl
247 def __init__(self, cmd):
249 Initialize close by finding the lctl binary.
251 self.lctl = find_prog(cmd)
254 debug('! lctl not found')
257 raise CommandError('lctl', "unable to find lctl binary.")
262 the cmds are written to stdin of lctl
263 lctl doesn't return errors when run in script mode, so
265 should modify command line to accept multiple commands, or
266 create complex command line options
268 debug("+", self.lctl, cmds)
269 if config.noexec(): return (0, [])
270 p = popen2.Popen3(self.lctl, 1)
271 p.tochild.write(cmds + "\n")
273 out = p.fromchild.readlines()
274 err = p.childerr.readlines()
276 if os.WIFEXITED(ret):
277 rc = os.WEXITSTATUS(ret)
281 raise CommandError(self.lctl, err, rc)
284 def runcmd(self, *args):
286 run lctl using the command line
288 cmd = string.join(map(str,args))
289 debug("+", self.lctl, cmd)
290 rc, out = run(self.lctl, cmd)
292 raise CommandError(self.lctl, out, rc)
296 def network(self, net, nid):
297 """ initialized network and add "self" """
298 # Idea: "mynid" could be used for all network types to add "self," and then
299 # this special case would be gone and the "self" hack would be hidden.
300 if net in ('tcp', 'toe'):
305 quit""" % (net, nid, nid)
314 # create a new connection
315 def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
316 if net in ('tcp', 'toe'):
323 quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
329 quit""" % (net, servuuid, nid, nid, port, )
333 # add a route to a range
334 def add_route(self, net, gw, lo, hi):
338 quit """ % (net, gw, lo, hi)
342 def del_route(self, net, gw, lo, hi):
350 # add a route to a host
351 def add_route_host(self, net, uuid, gw, tgt):
356 quit """ % (net, uuid, tgt, gw, tgt)
359 # add a route to a range
360 def del_route_host(self, net, uuid, gw, tgt):
366 quit """ % (net, uuid, tgt)
369 # disconnect one connection
370 def disconnect(self, net, nid, port, servuuid):
376 quit""" % (net, nid, servuuid)
380 def disconnectAll(self, net):
389 # create a new device with lctl
390 def newdev(self, attach, setup = ""):
395 quit""" % (attach, setup)
399 def cleanup(self, name, uuid):
405 quit""" % (name, ('', 'force')[config.force()])
409 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
413 lov_setconfig %s %d %d %d %s %s
414 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
418 def dump(self, dump_file):
421 quit""" % (dump_file)
424 # get list of devices
425 def device_list(self):
426 rc, out = self.runcmd('device_list')
429 # ============================================================
430 # Various system-level functions
431 # (ideally moved to their own module)
433 # Run a command and return the output and status.
434 # stderr is sent to /dev/null, could use popen3 to
435 # save it if necessary
437 cmd = string.join(map(str,args))
439 if config.noexec(): return (0, [])
440 f = os.popen(cmd + ' 2>&1')
449 # Run a command in the background.
450 def run_daemon(*args):
451 cmd = string.join(map(str,args))
453 if config.noexec(): return 0
454 f = os.popen(cmd + ' 2>&1')
462 # Determine full path to use for an external command
463 # searches dirname(argv[0]) first, then PATH
465 syspath = string.split(os.environ['PATH'], ':')
466 cmdpath = os.path.dirname(sys.argv[0])
467 syspath.insert(0, cmdpath);
468 syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
470 prog = os.path.join(d,cmd)
471 if os.access(prog, os.X_OK):
475 # Recursively look for file starting at base dir
476 def do_find_file(base, mod):
477 fullname = os.path.join(base, mod)
478 if os.access(fullname, os.R_OK):
480 for d in os.listdir(base):
481 dir = os.path.join(base,d)
482 if os.path.isdir(dir):
483 module = do_find_file(dir, mod)
487 def find_module(src_dir, dev_dir, modname):
488 mod = '%s.o' % (modname)
489 module = src_dir +'/'+ dev_dir +'/'+ mod
491 if os.access(module, os.R_OK):
497 # is the path a block device?
504 return stat.S_ISBLK(s[stat.ST_MODE])
506 # build fs according to type
508 def mkfs(fstype, dev):
509 if(fstype in ('ext3', 'extN')):
510 mkfs = 'mkfs.ext2 -j -b 4096'
512 print 'unsupported fs type: ', fstype
513 if not is_block(dev):
517 (ret, out) = run (mkfs, force, dev)
519 panic("Unable to build fs:", dev)
520 # enable hash tree indexing on fsswe
521 # FIXME: this check can probably go away on 2.5
523 htree = 'echo "feature FEATURE_C5" | debugfs -w'
524 (ret, out) = run (htree, dev)
526 panic("Unable to enable htree:", dev)
528 # some systems use /dev/loopN, some /dev/loop/N
532 if not os.access(loop + str(0), os.R_OK):
534 if not os.access(loop + str(0), os.R_OK):
535 panic ("can't access loop devices")
538 # find loop device assigned to thefile
541 for n in xrange(0, MAX_LOOP_DEVICES):
543 if os.access(dev, os.R_OK):
544 (stat, out) = run('losetup', dev)
545 if (out and stat == 0):
546 m = re.search(r'\((.*)\)', out[0])
547 if m and file == m.group(1):
553 # create file if necessary and assign the first free loop device
554 def init_loop(file, size, fstype):
555 dev = find_loop(file)
557 print 'WARNING file:', file, 'already mapped to', dev
559 if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
560 run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
562 # find next free loop
563 for n in xrange(0, MAX_LOOP_DEVICES):
565 if os.access(dev, os.R_OK):
566 (stat, out) = run('losetup', dev)
568 run('losetup', dev, file)
571 print "out of loop devices"
573 print "out of loop devices"
576 # undo loop assignment
577 def clean_loop(file):
578 dev = find_loop(file)
580 ret, out = run('losetup -d', dev)
582 log('unable to clean loop device:', dev, 'for file:', file)
585 # determine if dev is formatted as a <fstype> filesystem
586 def need_format(fstype, dev):
587 # FIXME don't know how to implement this
590 # initialize a block device if needed
591 def block_dev(dev, size, fstype, format):
592 if config.noexec(): return dev
593 if not is_block(dev):
594 dev = init_loop(dev, size, fstype)
595 if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
599 # panic("device:", dev,
600 # "not prepared, and autoformat is not set.\n",
601 # "Rerun with --reformat option to format ALL filesystems")
606 """lookup IP address for an interface"""
607 rc, out = run("/sbin/ifconfig", iface)
610 addr = string.split(out[1])[1]
611 ip = string.split(addr, ':')[1]
614 def get_local_address(net_type, wildcard):
615 """Return the local address for the network type."""
617 if net_type in ('tcp', 'toe'):
619 iface, star = string.split(wildcard, ':')
620 local = if2addr(iface)
622 panic ("unable to determine ip for:", wildcard)
624 host = socket.gethostname()
625 local = socket.gethostbyname(host)
626 elif net_type == 'elan':
627 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
629 fp = open('/proc/elan/device0/position', 'r')
630 lines = fp.readlines()
639 elif net_type == 'gm':
640 fixme("automatic local address for GM")
644 def is_prepared(uuid):
645 """Return true if a device exists for the uuid"""
646 # expect this format:
647 # 1 UP ldlm ldlm ldlm_UUID 2
649 out = lctl.device_list()
651 if uuid == string.split(s)[4]:
653 except CommandError, e:
658 # ============================================================
659 # Classes to prepare and cleanup the various objects
662 """ Base class for the rest of the modules. The default cleanup method is
663 defined here, as well as some utilitiy funcs.
665 def __init__(self, module_name, dom_node):
666 self.dom_node = dom_node
667 self.module_name = module_name
668 self.name = get_attr(dom_node, 'name')
669 self.uuid = get_attr(dom_node, 'uuid')
670 self.kmodule_list = []
def info(self, *args):
    """Print a one-line status message identifying this module instance:
    "<MODULE>: <name> <uuid> <args...>"."""
    # Python 2 idiom: stringify every arg and join with spaces.
    msg = string.join(map(str,args))
    print self.module_name + ":", self.name, self.uuid, msg
679 def lookup_server(self, srv_uuid):
680 """ Lookup a server's network information """
681 net = get_ost_net(self.dom_node.parentNode, srv_uuid)
683 panic ("Unable to find a server for:", srv_uuid)
684 self._server = Network(net)
686 def get_server(self):
690 """ default cleanup, used for most modules """
692 srv = self.get_server()
693 if srv and local_net(srv):
695 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
696 except CommandError, e:
697 log(self.module_name, "disconnect failed: ", self.name)
701 lctl.cleanup(self.name, self.uuid)
702 except CommandError, e:
703 log(self.module_name, "cleanup failed: ", self.name)
def add_module(self, dev_dir, modname):
    """Queue a kernel module for load_module()/cleanup_module().

    Modules are loaded in the order added and unloaded in reverse.
    """
    entry = (dev_dir, modname)
    self.kmodule_list.append(entry)
711 def mod_loaded(self, modname):
712 """Check if a module is already loaded. Look in /proc/modules for it."""
713 fp = open('/proc/modules')
714 lines = fp.readlines()
716 # please forgive my tired fingers for this one
717 ret = filter(lambda word, mod=modname: word == mod,
718 map(lambda line: string.split(line)[0], lines))
721 def load_module(self):
722 """Load all the modules in the list in the order they appear."""
723 for dev_dir, mod in self.kmodule_list:
724 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
725 if self.mod_loaded(mod) and not config.noexec():
727 log ('loading module:', mod)
729 module = find_module(config.src_dir(),dev_dir, mod)
731 panic('module not found:', mod)
732 (rc, out) = run('/sbin/insmod', module)
734 raise CommandError('insmod', out, rc)
736 (rc, out) = run('/sbin/modprobe', mod)
738 raise CommandError('modprobe', out, rc)
740 def cleanup_module(self):
741 """Unload the modules in the list in reverse order."""
742 rev = self.kmodule_list
744 for dev_dir, mod in rev:
745 if not self.mod_loaded(mod):
748 if mod == 'portals' and config.dump_file():
749 lctl.dump(config.dump_file())
750 log('unloading module:', mod)
753 (rc, out) = run('/sbin/rmmod', mod)
755 log('! unable to unload module:', mod)
759 class Network(Module):
760 def __init__(self,dom_node):
761 Module.__init__(self, 'NETWORK', dom_node)
762 self.net_type = get_attr(dom_node,'type')
763 self.nid = get_text(dom_node, 'server', '*')
764 self.port = get_text_int(dom_node, 'port', 0)
765 self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
766 self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
768 self.nid = get_local_address(self.net_type, self.nid)
770 panic("unable to set nid for", self.net_type, self.nid)
771 debug("nid:", self.nid)
773 self.add_module('portals/linux/oslib/', 'portals')
774 if node_needs_router():
775 self.add_module('portals/linux/router', 'kptlrouter')
776 if self.net_type == 'tcp':
777 self.add_module('portals/linux/socknal', 'ksocknal')
778 if self.net_type == 'toe':
779 self.add_module('portals/linux/toenal', 'ktoenal')
780 if self.net_type == 'elan':
781 self.add_module('portals/linux/rqswnal', 'kqswnal')
782 if self.net_type == 'gm':
783 self.add_module('portals/linux/gmnal', 'kgmnal')
784 self.add_module('lustre/obdclass', 'obdclass')
785 self.add_module('lustre/ptlrpc', 'ptlrpc')
788 self.info(self.net_type, self.nid, self.port)
789 if self.net_type in ('tcp', 'toe'):
790 nal_id = '' # default is socknal
791 if self.net_type == 'toe':
793 ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, nal_id, self.port)
795 raise CommandError(TCP_ACCEPTOR, out, ret)
796 ret = self.dom_node.getElementsByTagName('route_tbl')
798 for r in a.getElementsByTagName('route'):
799 net_type = get_attr(r, 'type')
800 gw = get_attr(r, 'gw')
801 lo = get_attr(r, 'lo')
802 hi = get_attr(r,'hi', '')
803 lctl.add_route(net_type, gw, lo, hi)
804 if net_type in ('tcp', 'toe') and net_type == self.net_type and hi == '':
805 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
807 panic("no server for nid", lo)
809 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
812 lctl.network(self.net_type, self.nid)
813 lctl.newdev(attach = "ptlrpc RPCDEV RPCDEV_UUID")
816 self.info(self.net_type, self.nid, self.port)
817 ret = self.dom_node.getElementsByTagName('route_tbl')
819 for r in a.getElementsByTagName('route'):
820 lo = get_attr(r, 'lo')
821 hi = get_attr(r,'hi', '')
822 if self.net_type in ('tcp', 'toe') and hi == '':
823 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
825 panic("no server for nid", lo)
828 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
829 except CommandError, e:
830 print "disconnect failed: ", self.name
834 lctl.del_route(self.net_type, self.nid, lo, hi)
835 except CommandError, e:
836 print "del_route failed: ", self.name
841 lctl.cleanup("RPCDEV", "RPCDEV_UUID")
842 except CommandError, e:
843 print "cleanup failed: ", self.name
847 lctl.disconnectAll(self.net_type)
848 except CommandError, e:
849 print "disconnectAll failed: ", self.name
852 if self.net_type in ('tcp', 'toe'):
853 # yikes, this ugly! need to save pid in /var/something
854 run("killall acceptor")
def __init__(self,dom_node):
    # Lock-manager service: generic Module init under the 'LDLM' tag,
    # plus the ldlm kernel module queued for loading.
    Module.__init__(self, 'LDLM', dom_node)
    self.add_module('lustre/ldlm', 'ldlm')
861 if is_prepared(self.uuid):
864 lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
868 def __init__(self,dom_node):
869 Module.__init__(self, 'LOV', dom_node)
870 self.mds_uuid = get_first_ref(dom_node, 'mds')
871 mds= lookup(dom_node.parentNode, self.mds_uuid)
872 self.mds_name = getName(mds)
873 devs = dom_node.getElementsByTagName('devices')
876 self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
877 self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
878 self.pattern = get_attr_int(dev_node, 'pattern', 0)
879 self.devlist = get_all_refs(dev_node, 'osc')
880 self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
881 self.add_module('lustre/mdc', 'mdc')
882 self.add_module('lustre/lov', 'lov')
885 if is_prepared(self.uuid):
887 for osc_uuid in self.devlist:
888 osc = lookup(self.dom_node.parentNode, osc_uuid)
893 panic('osc not found:', osc_uuid)
894 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
895 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
896 self.stripe_off, self.pattern, self.devlist, self.mds_name)
897 lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
898 setup ="%s" % (mdc_uuid))
901 if not is_prepared(self.uuid):
903 for osc_uuid in self.devlist:
904 osc = lookup(self.dom_node.parentNode, osc_uuid)
909 panic('osc not found:', osc_uuid)
911 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
914 def load_module(self):
915 for osc_uuid in self.devlist:
916 osc = lookup(self.dom_node.parentNode, osc_uuid)
922 panic('osc not found:', osc_uuid)
923 Module.load_module(self)
926 def cleanup_module(self):
927 Module.cleanup_module(self)
928 for osc_uuid in self.devlist:
929 osc = lookup(self.dom_node.parentNode, osc_uuid)
935 panic('osc not found:', osc_uuid)
937 class LOVConfig(Module):
938 def __init__(self,dom_node):
939 Module.__init__(self, 'LOVConfig', dom_node)
940 self.lov_uuid = get_first_ref(dom_node, 'lov')
941 l = lookup(dom_node.parentNode, self.lov_uuid)
946 self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
947 lov.pattern, lov.devlist, lov.mds_name)
948 lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
949 lov.stripe_sz, lov.stripe_off, lov.pattern,
950 string.join(lov.devlist))
def __init__(self,dom_node):
    Module.__init__(self, 'MDS', dom_node)
    # Backing device path and size from the service's <device> element.
    self.devname, self.size = get_device(dom_node)
    self.fstype = get_text(dom_node, 'fstype')
    # FIXME: if fstype not set, then determine based on kernel version
    self.format = get_text(dom_node, 'autoformat', "no")
    # extN needs its own kernel module loaded before the mds modules.
    if self.fstype == 'extN':
        self.add_module('lustre/extN', 'extN')
    self.add_module('lustre/mds', 'mds')
    # fstype-specific MDS backend module, e.g. mds_extN.
    self.add_module('lustre/mds', 'mds_%s' % (self.fstype))
970 if is_prepared(self.uuid):
972 self.info(self.devname, self.fstype, self.format)
973 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
974 if not is_prepared('MDT_UUID'):
975 lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
977 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
978 setup ="%s %s" %(blkdev, self.fstype))
980 if is_prepared('MDT_UUID'):
982 lctl.cleanup("MDT", "MDT_UUID")
983 except CommandError, e:
984 print "cleanup failed: ", self.name
987 if not is_prepared(self.uuid):
990 clean_loop(self.devname)
992 # Very unusual case, as there is no MDC element in the XML anymore
993 # Builds itself from an MDS node
995 def __init__(self,dom_node):
996 self.mds = MDS(dom_node)
997 self.dom_node = dom_node
998 self.module_name = 'MDC'
999 self.kmodule_list = []
1003 host = socket.gethostname()
1004 self.name = 'MDC_%s' % (self.mds.name)
1005 self.uuid = '%s_%05x_%05x' % (self.name, int(random.random() * 1048576),
1006 int(random.random() * 1048576))
1008 self.lookup_server(self.mds.uuid)
1009 self.add_module('lustre/mdc', 'mdc')
1012 if is_prepared(self.uuid):
1014 self.info(self.mds.uuid)
1015 srv = self.get_server()
1016 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1017 lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
1018 setup ="%s %s" %(self.mds.uuid, srv.uuid))
def __init__(self, dom_node):
    Module.__init__(self, 'OBD', dom_node)
    # Driver type (e.g. obdecho); also names the kernel module below.
    self.obdtype = get_attr(dom_node, 'type')
    self.devname, self.size = get_device(dom_node)
    self.fstype = get_text(dom_node, 'fstype')
    # FIXME: if fstype not set, then determine based on kernel version
    self.format = get_text(dom_node, 'autoformat', 'yes')
    # extN needs its own kernel module loaded first.
    if self.fstype == 'extN':
        self.add_module('lustre/extN', 'extN')
    self.add_module('lustre/' + self.obdtype, self.obdtype)
1032 # need to check /proc/mounts and /etc/mtab before
1033 # formatting anything.
1034 # FIXME: check if device is already formatted.
1036 if is_prepared(self.uuid):
1038 self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
1039 if self.obdtype == 'obdecho':
1042 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
1043 lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
1044 setup ="%s %s" %(blkdev, self.fstype))
1046 if not is_prepared(self.uuid):
1048 Module.cleanup(self)
1049 if not self.obdtype == 'obdecho':
1050 clean_loop(self.devname)
def __init__(self,dom_node):
    Module.__init__(self, 'OST', dom_node)
    # uuidref of the obd device this OST exports; used at setup time.
    self.obd_uuid = get_first_ref(dom_node, 'obd')
    self.add_module('lustre/ost', 'ost')
1059 if is_prepared(self.uuid):
1061 self.info(self.obd_uuid)
1062 lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
1063 setup ="%s" % (self.obd_uuid))
1066 # virtual interface for OSC and LOV
1068 def __init__(self,dom_node):
1069 Module.__init__(self, 'VOSC', dom_node)
1070 if dom_node.nodeName == 'lov':
1071 self.osc = LOV(dom_node)
1073 self.osc = OSC(dom_node)
def load_module(self):
    # Pure delegation to the wrapped LOV/OSC instance.
    self.osc.load_module()
def cleanup_module(self):
    # Pure delegation to the wrapped LOV/OSC instance.
    self.osc.cleanup_module()
def __init__(self,dom_node):
    Module.__init__(self, 'OSC', dom_node)
    # Target obd and ost service uuids from this element's uuidrefs.
    self.obd_uuid = get_first_ref(dom_node, 'obd')
    self.ost_uuid = get_first_ref(dom_node, 'ost')
    # Resolve and cache the OST server's network information.
    self.lookup_server(self.ost_uuid)
    self.add_module('lustre/osc', 'osc')
1093 if is_prepared(self.uuid):
1095 self.info(self.obd_uuid, self.ost_uuid)
1096 srv = self.get_server()
1098 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1102 lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
1104 panic ("no route to", srv.nid)
1106 lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
1107 setup ="%s %s" %(self.obd_uuid, srv.uuid))
1110 if not is_prepared(self.uuid):
1112 srv = self.get_server()
1114 Module.cleanup(self)
1116 self.info(self.obd_uuid, self.ost_uuid)
1120 lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
1121 except CommandError, e:
1122 print "del_route failed: ", self.name
1125 Module.cleanup(self)
1128 class ECHO_CLIENT(Module):
def __init__(self,dom_node):
    Module.__init__(self, 'ECHO_CLIENT', dom_node)
    # The echo client attaches to an osc, referenced by uuid.
    self.obd_uuid = get_first_ref(dom_node, 'osc')
    # NOTE(review): leftover debugging output — consider removing.
    debug("HERE",self.obd_uuid)
    self.add_module('lustre/obdecho', 'obdecho')
1136 if is_prepared(self.uuid):
1138 self.info(self.obd_uuid)
1140 lctl.newdev(attach="echo_client %s %s" % (self.name, self.uuid),
1141 setup = self.obd_uuid)
1144 if not is_prepared(self.uuid):
1146 Module.cleanup(self)
1149 class Mountpoint(Module):
1150 def __init__(self,dom_node):
1151 Module.__init__(self, 'MTPT', dom_node)
1152 self.path = get_text(dom_node, 'path')
1153 self.mds_uuid = get_first_ref(dom_node, 'mds')
1154 self.lov_uuid = get_first_ref(dom_node, 'osc')
1155 self.add_module('lustre/mdc', 'mdc')
1156 self.add_module('lustre/llite', 'llite')
1157 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1162 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
1163 self.info(self.path, self.mds_uuid, self.lov_uuid)
1164 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
1165 (self.lov_uuid, mdc_uuid, self.path)
1166 run("mkdir", self.path)
1169 panic("mount failed:", self.path)
1172 self.info(self.path, self.mds_uuid,self.lov_uuid)
1174 (rc, out) = run("umount -f", self.path)
1176 (rc, out) = run("umount", self.path)
1178 log("umount failed, cleanup will most likely not work.")
1179 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1181 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
def load_module(self):
    # Load the osc/lov stack first, then this module's own list (llite etc.).
    self.osc.load_module()
    Module.load_module(self)
def cleanup_module(self):
    # Reverse of load_module: unload our own modules, then the osc stack.
    Module.cleanup_module(self)
    self.osc.cleanup_module()
1191 # ============================================================
1192 # XML processing and query
# TODO: Change query funcs to use XPath, which is much cleaner
1195 def get_device(obd):
1196 list = obd.getElementsByTagName('device')
1200 size = get_attr_int(dev, 'size', 0)
1201 return dev.firstChild.data, size
1204 # Get the text content from the first matching child
1205 # If there is no content (or it is all whitespace), return
1207 def get_text(dom_node, tag, default=""):
1208 list = dom_node.getElementsByTagName(tag)
1211 dom_node.normalize()
1212 if dom_node.firstChild:
1213 txt = string.strip(dom_node.firstChild.data)
1218 def get_text_int(dom_node, tag, default=0):
1219 list = dom_node.getElementsByTagName(tag)
1223 dom_node.normalize()
1224 if dom_node.firstChild:
1225 txt = string.strip(dom_node.firstChild.data)
1230 panic("text value is not integer:", txt)
1233 def get_attr(dom_node, attr, default=""):
1234 v = dom_node.getAttribute(attr)
1239 def get_attr_int(dom_node, attr, default=0):
1241 v = dom_node.getAttribute(attr)
1246 panic("attr value is not integer", v)
1249 def get_first_ref(dom_node, tag):
1250 """ Get the first uuidref of the type TAG. Used one only
1251 one is expected. Returns the uuid."""
1253 refname = '%s_ref' % tag
1254 list = dom_node.getElementsByTagName(refname)
1256 uuid = getRef(list[0])
1259 def get_all_refs(dom_node, tag):
1260 """ Get all the refs of type TAG. Returns list of uuids. """
1262 refname = '%s_ref' % tag
1263 list = dom_node.getElementsByTagName(refname)
1266 uuids.append(getRef(i))
1269 def get_ost_net(dom_node, uuid):
1270 ost = lookup(dom_node, uuid)
1271 uuid = get_first_ref(ost, 'network')
1274 return lookup(dom_node, uuid)
1276 def nid2server(dom_node, nid):
1277 netlist = dom_node.getElementsByTagName('network')
1278 for net_node in netlist:
1279 if get_text(net_node, 'server') == nid:
1280 return Network(net_node)
1283 def lookup(dom_node, uuid):
1284 for n in dom_node.childNodes:
1285 if n.nodeType == n.ELEMENT_NODE:
1286 if getUUID(n) == uuid:
1293 # Get name attribute of dom_node
def getName(dom_node):
    """Return the 'name' attribute of a DOM element."""
    name_attr = dom_node.getAttribute('name')
    return name_attr
def getRef(dom_node):
    """Return the 'uuidref' attribute of a DOM reference element."""
    ref = dom_node.getAttribute('uuidref')
    return ref
1300 # Get name attribute of dom_node
def getUUID(dom_node):
    """Return the 'uuid' attribute of a DOM element."""
    uuid_attr = dom_node.getAttribute('uuid')
    return uuid_attr
1304 # the tag name is the service type
1305 # fixme: this should do some checks to make sure the dom_node is a service
def getServiceType(dom_node):
    """The tag name of a service element doubles as its service type."""
    # fixme: this should do some checks to make sure the dom_node is a service
    return dom_node.nodeName
# determine what "level" a particular node is at.
# the order of initialization is based on level.
1312 def getServiceLevel(dom_node):
1313 type = getServiceType(dom_node)
1315 if type in ('network',):
1317 elif type in ('device', 'ldlm'):
1319 elif type in ('obd', 'mdd'):
1321 elif type in ('mds','ost'):
1323 elif type in ('mdc','osc'):
1325 elif type in ('lov', 'lovconfig'):
1327 elif type in ('mountpoint', 'echo_client'):
1330 if ret < config.minlevel() or ret > config.maxlevel():
1335 # return list of services in a profile. list is a list of tuples
1336 # [(level, dom_node),]
1337 def getServices(lustreNode, profileNode):
1339 for n in profileNode.childNodes:
1340 if n.nodeType == n.ELEMENT_NODE:
1341 servNode = lookup(lustreNode, getRef(n))
1344 panic('service not found: ' + getRef(n))
1345 level = getServiceLevel(servNode)
1347 list.append((level, servNode))
1351 def getByName(lustreNode, name, tag):
1352 ndList = lustreNode.getElementsByTagName(tag)
1354 if getName(nd) == name:
1359 ############################################################
1361 # FIXME: clean this mess up!
1364 def prepare_mdc(dom_node, mds_uuid):
1366 mds_node = lookup(dom_node, mds_uuid);
1368 panic("no mds:", mds_uuid)
1369 if saved_mdc.has_key(mds_uuid):
1370 return saved_mdc[mds_uuid]
1373 saved_mdc[mds_uuid] = mdc.uuid
1376 def cleanup_mdc(dom_node, mds_uuid):
1378 mds_node = lookup(dom_node, mds_uuid);
1380 panic("no mds:", mds_uuid)
1381 if not saved_mdc.has_key(mds_uuid):
1384 saved_mdc[mds_uuid] = mdc.uuid
1387 ############################################################
1388 # routing ("rooting")
def init_node(dom_node):
    """Record this node's interfaces in the global local_node list.

    Each <network> child contributes a (type, server) pair.
    """
    global local_node, router_flag
    for net_node in dom_node.getElementsByTagName('network'):
        net_type = get_attr(net_node, 'type')
        server = get_text(net_node, 'server')
        local_node.append((net_type, server))
1402 def node_needs_router():
1405 def get_routes(type, gw, dom_net):
1406 """ Return the routes as a list of tuples of the form:
1407 [(type, gw, lo, hi),]"""
1409 tbl = dom_net.getElementsByTagName('route_tbl')
1411 routes = t.getElementsByTagName('route')
1413 lo = get_attr(r, 'lo')
1414 hi = get_attr(r, 'hi', '')
1415 res.append((type, gw, lo, hi))
1419 def init_route_config(lustre):
1420 """ Scan the lustre config looking for routers. Build list of
# (docstring continues on missing line 1421)
1422 global routes, router_flag
1424 list = lustre.getElementsByTagName('node')
# Only nodes flagged with the 'router' attribute contribute routes.
1426 if get_attr(node, 'router'):
1428 for (local_type, local_nid) in local_node:
1430 netlist = node.getElementsByTagName('network')
1431 for dom_net in netlist:
# The router's interface on our own network type is the gateway ...
1432 if local_type == get_attr(dom_net, 'type'):
1433 gw = get_text(dom_net, 'server')
# ... and each network of a *different* type reachable through that
# router yields route tuples via get_routes().
1437 for dom_net in netlist:
1438 if local_type != get_attr(dom_net, 'type'):
1439 for route in get_routes(local_type, gw, dom_net):
1440 routes.append(route)
# NOTE(review): fragment of a helper whose def line is missing from this
# listing — it scans local interfaces for one matching net.net_type.
1445 for iface in local_node:
1446 if net.net_type == iface[0]:
# Find a configured route from our first local interface type to the
# network type of *net*.  The binding of `to` (orig. 1454) and the
# route-table search/return are missing from this listing.
1450 def find_route(net):
1451 global local_node, routes
1452 frm_type = local_node[0][0]
1453 to_type = net.net_type
1455 debug ('looking for route to', to_type,to)
1464 ############################################################
# Instantiate the wrapper object for a service node and run its
# prepare/cleanup action.  Branches for other service types occupy the
# missing lines 1471-1475, 1480-1487 and 1490-1491 of the original.
1467 def startService(dom_node, module_flag):
1468 type = getServiceType(dom_node)
1469 debug('Service:', type, getName(dom_node), getUUID(dom_node))
1470 # there must be a more dynamic way of doing this...
1476 elif type == 'lovconfig':
1477 n = LOVConfig(dom_node)
1478 elif type == 'network':
1479 n = Network(dom_node)
1488 elif type == 'echo_client':
1489 n = ECHO_CLIENT(dom_node)
1492 elif type == 'mountpoint':
1493 n = Mountpoint(dom_node)
# Fallthrough: no branch matched the service type.
1495 panic ("unknown service type:", type)
# Tail (gappy): dispatch to the object's load/cleanup/prepare methods
# depending on the config flags below — confirm against full source.
1500 if config.cleanup():
1505 if config.nosetup():
1507 if config.cleanup():
1513 # Prepare the system to run lustre using a particular profile
1514 # in the configuration.
1515 # * load the modules
1516 # * set up networking for the current node
1517 # * make sure partitions are in place and prepared
1518 # * initialize devices with lctl
1519 # Levels are important, and need to be enforced.
# Start (or clean up) every service in a profile, ordered by level.
1520 def startProfile(lustreNode, profileNode, module_flag):
# Presumably guarded by a null check on missing line 1521.
1522 panic("profile:", profile, "not found.")
1523 services = getServices(lustreNode, profileNode)
# NOTE(review): for cleanup the list is presumably walked in reverse
# level order (missing lines 1525-1526) — confirm against full source.
1524 if config.cleanup():
1527 startService(s[1], module_flag)
# Locate this host's <node> entry, initialise local networking and the
# route table, then run its profiles in two passes (modules, then setup
# — or the reverse when cleaning up).
1532 def doHost(lustreNode, hosts):
# `h` is bound by a loop over hosts on a missing line.
1536 dom_node = getByName(lustreNode, h, 'node')
1541 print 'No host entry found.'
# Non-router nodes record local interfaces and build the route table.
1544 if not get_attr(dom_node, 'router'):
1546 init_route_config(lustreNode)
1551 # Two step process: (1) load modules, (2) setup lustre
1552 # if not cleaning, load modules first.
1553 module_flag = not config.cleanup()
1554 reflist = dom_node.getElementsByTagName('profile')
1555 for profile in reflist:
1556 startProfile(lustreNode, profile, module_flag)
1558 if not config.cleanup():
1559 sys_set_debug_path()
1560 script = config.gdb_script()
1561 run(lctl.lctl, ' modules >', script)
1563 # dump /tmp/ogdb and sleep/pause here
1564 log ("The GDB module script is in", script)
# Second pass with the flag flipped: setup after load, or module
# teardown after service cleanup.
1567 module_flag = not module_flag
1568 for profile in reflist:
1569 startProfile(lustreNode, profile, module_flag)
1571 ############################################################
1572 # Command line processing
# Parse command-line options into the global config object and return
# the remaining positional arguments (the xml config file).  Most
# option-handler bodies fall on lines missing from this listing.
1574 def parse_cmdline(argv):
1575 short_opts = "hdnvf"
1576 long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
1577 "portals=", "makeldiff", "cleanup", "noexec",
1578 "help", "node=", "nomod", "nosetup",
1579 "dump=", "force", "minlevel=", "maxlevel="]
1583 opts, args = getopt.getopt(argv, short_opts, long_opts)
# Bad options presumably print usage and exit (missing lines 1585-1588).
1584 except getopt.error:
1589 if o in ("-h", "--help"):
1591 if o in ("-d","--cleanup"):
1593 if o in ("-v", "--verbose"):
1595 if o in ("-n", "--noexec"):
1598 if o == "--portals":
1602 if o == "--reformat":
1610 if o == "--nosetup":
1614 if o in ("-f", "--force"):
1616 if o in ("--minlevel",):
1618 if o in ("--maxlevel",):
# NOTE(review): fragment of fetch(url) — its def line and the read/return
# are missing from this listing.  Retrieves a config file via urllib.
1627 s = urllib.urlopen(url)
def setupModulePath(cmd):
    """Detect whether *cmd* is being run out of a source tree.

    If a readable Makefile sits beside the command, record the tree
    root (two directories up) via config.src_dir so modules are loaded
    from the build tree rather than the installed location.
    """
    cmd_dir = os.path.dirname(cmd)
    if os.access(cmd_dir + "/Makefile", os.R_OK):
        config.src_dir(cmd_dir + "/../../")
# Tell the kernel (via /proc) where lustre debug logs should be dumped.
1638 def sys_set_debug_path():
1639 debug("debug path: ", config.debug_path())
# Missing lines 1640-1642 presumably handle noexec mode / open errors.
1643 fp = open('/proc/sys/portals/debug_path', 'w')
# NOTE(review): fp.close() is not visible here — presumably on the
# missing line 1645; confirm against the full source.
1644 fp.write(config.debug_path())
1649 #/proc/sys/net/core/rmem_max
1650 #/proc/sys/net/core/wmem_max
# Raise a kernel socket-buffer limit (rmem_max/wmem_max) to at least
# *max*.  The read of the current value and the comparison occupy the
# missing lines 1653-1659 of the original.
1651 def sys_set_netmem_max(path, max):
1652 debug("setting", path, "to at least", max)
1660 fp = open(path, 'w')
# NOTE(review): fp.close() is not visible here — presumably on a
# missing line; confirm against the full source.
1661 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the portals and obd character device nodes if absent.

    Both are char devices on major 10; minors 240 and 241 per the
    mknod commands below.
    """
    for node, mknod_cmd in (('/dev/portals', 'mknod /dev/portals c 10 240'),
                            ('/dev/obd', 'mknod /dev/obd c 10 241')):
        if not os.access(node, os.R_OK):
            run(mknod_cmd)
1672 # Add dir to the global PATH, if not already there.
1673 def add_to_path(new_dir):
1674 syspath = string.split(os.environ['PATH'], ':')
# NOTE(review): missing line 1676 is presumably a bare `return` for the
# already-present case; otherwise the append below would be
# unconditional — confirm against the full source.
1675 if new_dir in syspath:
1677 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
# Directories every sane PATH should contain.
1680 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
1681 # ensure basic elements are in the system path
1682 def sanitise_path():
# Loop body (orig. 1684-1685) is missing from this listing — presumably
# add_to_path(dir) per entry.
1683 for dir in DEFAULT_PATH:
1686 # Initialize or shutdown lustre according to a configuration file
1687 # * prepare the system for lustre
1688 # * configure devices with lctl
1689 # Shutdown does steps in reverse
# NOTE(review): body of the main driver — its `def main():` line
# precedes this listing fragment, and many interior lines are missing.
1692 global TCP_ACCEPTOR, lctl, MAXTCPBUF
1693 host = socket.gethostname()
1695 # the PRNG is normally seeded with time(), which is not so good for starting
1696 # time-synchronized clusters
1697 input = open('/dev/urandom', 'r')
1699 print 'Unable to open /dev/urandom!'
# Seed the PRNG from 32 bytes of kernel entropy instead of the clock.
1701 seed = input.read(32)
1702 args = parse_cmdline(sys.argv[1:])
# Config comes either from a local file argument ...
1709 if not os.access(args[0], os.R_OK):
1710 print 'File not found or readable:', args[0]
1712 dom = xml.dom.minidom.parse(args[0])
# ... or is fetched from a URL (--get).
1714 xmldata = fetch(config.url())
1715 dom = xml.dom.minidom.parseString(xmldata)
# Candidate node names to match against the config: explicit --node,
# then the hostname, then 'localhost'.
1719 node_list.append(config.node())
1724 node_list.append(host)
1725 node_list.append('localhost')
1726 debug("configuring for host: ", node_list)
# Per-host suffixes keep debug/gdb output files distinct across nodes.
1729 config._debug_path = config._debug_path + '-' + host
1730 config._gdb_script = config._gdb_script + '-' + host
1732 TCP_ACCEPTOR = find_prog('acceptor')
1733 if not TCP_ACCEPTOR:
# Missing lines presumably distinguish noexec (warn only, lines below)
# from a hard failure (panic) — confirm against full source.
1735 TCP_ACCEPTOR = 'acceptor'
1736 debug('! acceptor not found')
1738 panic('acceptor not found')
1740 lctl = LCTLInterface('lctl')
1742 setupModulePath(sys.argv[0])
# Bump kernel socket-buffer ceilings before starting services.
1744 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
1745 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
1746 doHost(dom.documentElement, node_list)
1748 if __name__ == "__main__":
# The try body and handler bodies (orig. 1749-1756) are largely missing
# from this listing.
1751 except LconfError, e:
1753 except CommandError, e:
# Propagate the first cleanup failure recorded by cleanup_error() as
# the process exit status.
1757 if first_cleanup_error:
1758 sys.exit(first_cleanup_error)