3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # This file is part of Lustre, http://www.lustre.org.
7 # Lustre is free software; you can redistribute it and/or
8 # modify it under the terms of version 2 of the GNU General Public
9 # License as published by the Free Software Foundation.
11 # Lustre is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with Lustre; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # lconf - lustre configuration tool
22 # lconf is the main driver script for starting and stopping
23 # lustre filesystem services.
25 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import string, os, stat, popen2, socket, time
30 import xml.dom.minidom
# Default TCP send/receive buffer size (bytes) used when the config
# does not specify send_mem/recv_mem (1 MB).
DEFAULT_TCPBUF = 1048576

# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
# Holds the exit status of the first cleanup step that failed; later
# failures do not overwrite it.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record rc as the overall cleanup status, keeping only the first error."""
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
49 print """usage: lconf config.xml
51 config.xml Lustre configuration in xml format.
52 --get <url> URL to fetch a config file
53 --node <nodename> Load config for <nodename>
54 -d | --cleanup Cleans up config. (Shutdown)
55 -f | --force Forced unmounting and/or obd detach during cleanup
56 -v | --verbose Print system commands as they are run
57 -h | --help Print this help
58 --gdb Prints message after creating gdb module script
59 and sleeps for 5 seconds.
60 -n | --noexec Prints the commands and steps that will be run for a
61 config without executing them. This can used to check if a
62 config file is doing what it should be doing. (Implies -v)
63 --nomod Skip load/unload module step.
64 --nosetup Skip device setup/cleanup step.
65 --reformat Reformat all devices (without question)
66 --dump <file> Dump the kernel debug log before portals is unloaded
67 --startlevel <num> Specify the level of services to start with (default 0)
68 --endlevel <num> Specify the level of services to end with (default 100)
Levels are approximately like:
79 --ldap server LDAP server with lustre config database
80 --makeldiff Translate xml source to LDIFF
These are perhaps not needed:
82 --lustre="src dir" Base directory of lustre sources. Used to search
84 --portals=src Portals source
# ============================================================
# Config parameters, encapsulated in a class
# NOTE(review): fragment of the Config class -- the class statement and the
# start of __init__ are not visible in this excerpt.
        # Paths and service-level bounds used by the driver (defaults shown).
        self._gdb_script = '/tmp/ogdb'
        self._debug_path = '/tmp/lustre-log'
        self._dump_file = None
        self._start_level = 0
        self._end_level = 100
def verbose(self, flag = None):
    """Get/set accessor: a truthy flag turns verbose mode on."""
    if flag: self._verbose = flag
    # NOTE(review): the trailing 'return self._verbose' line is missing
    # from this fragment of the Config class.

def noexec(self, flag = None):
    """Get/set accessor for --noexec (dry-run) mode."""
    if flag: self._noexec = flag
    # NOTE(review): trailing 'return self._noexec' missing here as well.
def reformat(self, flag = None):
    """Get/set accessor for the --reformat flag.

    A truthy argument stores the flag; the current value is returned
    either way.
    """
    if flag:
        self._reformat = flag
    return self._reformat
def cleanup(self, flag = None):
    """Get/set accessor for --cleanup (shutdown) mode."""
    if flag: self._cleanup = flag

def gdb(self, flag = None):
    """Get/set accessor for the --gdb option."""
    if flag: self._gdb = flag

def nomod(self, flag = None):
    """Get/set accessor for --nomod (skip module load/unload)."""
    if flag: self._nomod = flag

def nosetup(self, flag = None):
    """Get/set accessor for --nosetup (skip device setup/cleanup)."""
    if flag: self._nosetup = flag

def force(self, flag = None):
    """Get/set accessor for --force."""
    if flag: self._force = flag

def node(self, val = None):
    """Get/set the node name selected with --node."""
    if val: self._node = val

def url(self, val = None):
    """Get/set the config URL given with --get."""
    if val: self._url = val

# NOTE(review): the trailing 'return self._<attr>' line of each accessor
# above is missing from this fragment of the Config class.
def gdb_script(self):
    """Return the path of the generated gdb module script, relocated
    under /r when that alternate root directory exists."""
    root = ''
    if os.path.isdir('/r'):
        root = '/r'
    return root + self._gdb_script
def debug_path(self):
    """Return the kernel debug-log capture path, relocated under /r when
    that alternate root directory exists."""
    root = ''
    if os.path.isdir('/r'):
        root = '/r'
    return root + self._debug_path
def src_dir(self, val = None):
    """Get/set the Lustre source directory (--lustre)."""
    if val: self._src_dir = val
    # NOTE(review): trailing 'return self._src_dir' is missing from this
    # fragment of the Config class.
def dump_file(self, val = None):
    """Get/set the file used for --dump; the current value is returned."""
    if val:
        self._dump_file = val
    return self._dump_file
def startlevel(self, val = None):
    """Get/set the first service level to act on (--startlevel).

    A truthy argument is coerced to int and stored; the current value
    is returned either way.
    """
    if val:
        self._start_level = int(val)
    return self._start_level
def endlevel(self, val = None):
    """Get/set the last service level to act on (--endlevel).

    A truthy argument is coerced to int and stored; the current value
    is returned either way.
    """
    if val:
        self._end_level = int(val)
    return self._end_level
184 # ============================================================
185 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort for a not-yet-implemented code path.

    Parameters:
        msg -- short description of the missing feature.

    Raises:
        LconfError: always.
    """
    # Use call-style raise (valid in both old and new Pythons, unlike the
    # old 'raise Cls, arg' form) and fix the misspelled 'implmemented' in
    # the user-visible message.
    raise LconfError(msg + ' not implemented yet.')
# NOTE(review): fragments of the error/logging helpers (panic, debug/log and
# the output dumper); the enclosing 'def' lines are missing in this excerpt.
    msg = string.join(map(str,args))
    # panic(): fatal unless running in --noexec mode, in which case the
    # error is only reported and execution continues.
    if not config.noexec():
        raise LconfError(msg)
    msg = string.join(map(str,args))
    # output dumper: print each captured line stripped of whitespace
    print string.strip(s)
    msg = string.join(map(str,args))
# ============================================================
# locally defined exceptions
class CommandError (exceptions.Exception):
    """Raised when an external command (lctl, insmod, mount, ...) fails.

    Carries the command name, its error output (a string or a list of
    lines) and an optional return code.
    """
    def __init__(self, cmd_name, cmd_err, rc=None):
        self.cmd_name = cmd_name
        self.cmd_err = cmd_err
        # NOTE(review): 'self.rc = rc' and the dump() method header are
        # missing from this fragment; self.rc is clearly expected below.

        # Pretty-print the failure; the layout depends on whether the
        # error output is a single string or a list of lines.
        if type(self.cmd_err) == types.StringType:
                print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
                print "! %s: %s" % (self.cmd_name, self.cmd_err)
        elif type(self.cmd_err) == types.ListType:
                print "! %s (error %d):" % (self.cmd_name, self.rc)
                print "! %s:" % (self.cmd_name)
            for s in self.cmd_err:
                print "> %s" %(string.strip(s))

class LconfError (exceptions.Exception):
    """Generic fatal configuration error raised by lconf itself."""
    def __init__(self, args):
        # NOTE(review): body truncated in this fragment (presumably
        # 'self.args = args').
# ============================================================
# handle lctl interface
# NOTE(review): fragment of the lctl wrapper class; the class statement,
# docstring delimiters, 'try:' lines, and the lctl command-script string
# bodies are missing from this excerpt.
    Manage communication with lctl
    def __init__(self, cmd):
        Initialize close by finding the lctl binary.
        self.lctl = find_prog(cmd)
        # a missing lctl binary is fatal unless in --noexec mode
            debug('! lctl not found')
            raise CommandError('lctl', "unable to find lctl binary.")

    # run(): feed a script of commands to lctl on stdin.
        the cmds are written to stdin of lctl
        lctl doesn't return errors when run in script mode, so
        should modify command line to accept multiple commands, or
        create complex command line options
        debug("+", self.lctl, cmds)
        if config.noexec(): return (0, [])
        p = popen2.Popen3(self.lctl, 1)
        p.tochild.write(cmds + "\n")
        out = p.fromchild.readlines()
        err = p.childerr.readlines()
        # translate the child's wait status into a shell-style return code
        if os.WIFEXITED(ret):
            rc = os.WEXITSTATUS(ret)
            raise CommandError(self.lctl, err, rc)

    def runcmd(self, *args):
        run lctl using the command line
        cmd = string.join(map(str,args))
        debug("+", self.lctl, cmd)
        rc, out = run(self.lctl, cmd)
        raise CommandError(self.lctl, out, rc)

    def network(self, net, nid):
        """ initialized network and add "self" """
        # Idea: "mynid" could be used for all network types to add "self," and then
        # this special case would be gone and the "self" hack would be hidden.
        quit""" % (net, nid, nid)

    # create a new connection
    def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
        # two variants: with buffer sizes (tcp/toe) and without
        quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
        quit""" % (net, servuuid, nid, nid, port, )

    # add a route to a range
    def add_route(self, net, gw, lo, hi):
        quit """ % (net, gw, lo, hi)

    def del_route(self, net, gw, lo, hi):

    # add a route to a host
    def add_route_host(self, net, uuid, gw, tgt):
        quit """ % (net, uuid, tgt, gw, tgt)

    # delete a route to a host (the original comment said "add a route to a
    # range" -- a copy/paste slip)
    def del_route_host(self, net, uuid, gw, tgt):
        quit """ % (net, uuid, tgt)

    # disconnect one connection
    def disconnect(self, net, nid, port, servuuid):
        quit""" % (net, nid, servuuid)

    def disconnectAll(self, net):

    # create a new device with lctl
    def newdev(self, attach, setup = ""):
        quit""" % (attach, setup)

    # cleanup a device; appends 'force' when --force was given
    def cleanup(self, name, uuid):
        quit""" % (name, ('', 'force')[config.force()])

    # configure an LOV's striping on the MDS
    # NOTE(review): the format string and argument tuple below come from
    # different (partially missing) lines -- verify their pairing against
    # the full source before relying on it.
    def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
        lov_setconfig %s %d %d %d %s %s
        quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)

    # dump the kernel debug log to dump_file
    def dump(self, dump_file):
        quit""" % (dump_file)

    # get list of devices
    def device_list(self):
        rc, out = self.runcmd('device_list')
# ============================================================
# Various system-level functions
# (ideally moved to their own module)

# Run a command and return the output and status.
# stderr is sent to /dev/null, could use popen3 to
# save it if necessary
# NOTE(review): the 'def run(*args):' line is missing from this fragment.
    cmd = string.join(map(str,args))
    if config.noexec(): return (0, [])
    f = os.popen(cmd + ' 2>&1')

# Run a command in the background.
def run_daemon(*args):
    cmd = string.join(map(str,args))
    if config.noexec(): return 0
    f = os.popen(cmd + ' 2>&1')

# Determine full path to use for an external command
# searches dirname(argv[0]) first, then PATH
# NOTE(review): the 'def find_prog(cmd):' line and the search-loop header
# are missing from this fragment.
    syspath = string.split(os.environ['PATH'], ':')
    cmdpath = os.path.dirname(sys.argv[0])
    syspath.insert(0, cmdpath);
    # also look in the sibling portals utils directory of a source tree
    syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
        prog = os.path.join(d,cmd)
        if os.access(prog, os.X_OK):

# Recursively look for file starting at base dir
def do_find_file(base, mod):
    fullname = os.path.join(base, mod)
    if os.access(fullname, os.R_OK):
    # not here: recurse into subdirectories
    for d in os.listdir(base):
        dir = os.path.join(base,d)
        if os.path.isdir(dir):
            module = do_find_file(dir, mod)

# Locate the kernel module object for modname, preferring the expected
# build directory.
def find_module(src_dir, dev_dir, modname):
    mod = '%s.o' % (modname)
    module = src_dir +'/'+ dev_dir +'/'+ mod
    if os.access(module, os.R_OK):

# is the path a block device?
# NOTE(review): the 'def is_block(dev):' line and the os.stat() call that
# produces 's' are missing from this fragment.
    return stat.S_ISBLK(s[stat.ST_MODE])
# build fs according to type
# (currently only the ext2/ext3 family is handled)
def mkfs(fstype, dev):
    if(fstype in ('ext3', 'extN')):
        mkfs = 'mkfs.ext2 -j -b 4096'
    # NOTE(review): the unsupported-type branch and the '-F' force flag
    # handling are only partially visible here.
        print 'unsupported fs type: ', fstype
    if not is_block(dev):
    (ret, out) = run (mkfs, force, dev)
        panic("Unable to build fs:", dev)
    # enable hash tree indexing on the fs
    # FIXME: this check can probably go away on 2.5
        htree = 'echo "feature FEATURE_C5" | debugfs -w'
        (ret, out) = run (htree, dev)
            panic("Unable to enable htree:", dev)

# some systems use /dev/loopN, some /dev/loop/N
# NOTE(review): the 'def loop_base():' line and the candidate-path
# assignments are missing from this fragment.
    if not os.access(loop + str(0), os.R_OK):
        if not os.access(loop + str(0), os.R_OK):
            panic ("can't access loop devices")

# find loop device assigned to the file
# NOTE(review): the 'def find_loop(file):' line is missing here.
    for n in xrange(0, MAX_LOOP_DEVICES):
        if os.access(dev, os.R_OK):
            # ask losetup which backing file (if any) this loop device is
            # bound to; note 'stat' shadows the stat module in this scope
            (stat, out) = run('losetup', dev)
            if (out and stat == 0):
                # losetup prints the backing file in parentheses
                m = re.search(r'\((.*)\)', out[0])
                if m and file == m.group(1):
# create file if necessary and assign the first free loop device
def init_loop(file, size, fstype):
    dev = find_loop(file)
    # already bound: warn and reuse the existing mapping
        print 'WARNING file:', file, 'already mapped to', dev
    # (re)create a sparse backing file of 'size' KB when reformatting or
    # when the file does not exist yet
    if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
        run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
    # find next free loop
    for n in xrange(0, MAX_LOOP_DEVICES):
        if os.access(dev, os.R_OK):
            # 'stat' here shadows the stat module
            (stat, out) = run('losetup', dev)
            run('losetup', dev, file)
        print "out of loop devices"
    print "out of loop devices"

# undo loop assignment
def clean_loop(file):
    dev = find_loop(file)
        ret, out = run('losetup -d', dev)
            log('unable to clean loop device:', dev, 'for file:', file)

# determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
    # FIXME don't know how to implement this

# initialize a block device if needed
def block_dev(dev, size, fstype, format):
    if config.noexec(): return dev
    # non-block paths are backed by a loop device
    if not is_block(dev):
        dev = init_loop(dev, size, fstype)
    if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
#        panic("device:", dev,
#            "not prepared, and autoformat is not set.\n",
#            "Rerun with --reformat option to format ALL filesystems")
# NOTE(review): the 'def if2addr(iface):' line is missing above this
# docstring.
    """lookup IP address for an interface"""
    rc, out = run("/sbin/ifconfig", iface)
    # second output line looks like 'inet addr:1.2.3.4 Bcast:...'
    addr = string.split(out[1])[1]
    ip = string.split(addr, ':')[1]

def get_local_address(net_type, wildcard):
    """Return the local address for the network type."""
    if net_type == 'tcp':
        # wildcard form 'iface:*': resolve that interface's address
            iface, star = string.split(wildcard, ':')
            local = if2addr(iface)
                panic ("unable to determine ip for:", wildcard)
        # otherwise fall back to this host's primary address
            host = socket.gethostname()
            local = socket.gethostbyname(host)
    elif net_type == 'elan':
        # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
        fp = open('/proc/elan/device0/position', 'r')
        lines = fp.readlines()
    elif net_type == 'gm':
        fixme("automatic local address for GM")

def is_prepared(uuid):
    """Return true if a device exists for the uuid"""
    # expect this format:
    # 1 UP ldlm ldlm ldlm_UUID 2
    out = lctl.device_list()
        # uuid is the 5th whitespace-separated field of each line
        if uuid == string.split(s)[4]:
# ============================================================
# Classes to prepare and cleanup the various objects
# NOTE(review): fragment of the Module base class; the 'class Module:' line,
# several method headers and try/return statements are missing.
    """ Base class for the rest of the modules. The default cleanup method is
    defined here, as well as some utility funcs.
    def __init__(self, module_name, dom_node):
        self.dom_node = dom_node
        self.module_name = module_name
        self.name = get_attr(dom_node, 'name')
        self.uuid = get_attr(dom_node, 'uuid')
        self.kmodule_list = []

    def info(self, *args):
        """Log a message prefixed with this module's type, name and uuid."""
        msg = string.join(map(str,args))
        print self.module_name + ":", self.name, self.uuid, msg

    def lookup_server(self, srv_uuid):
        """ Lookup a server's network information """
        net = get_ost_net(self.dom_node.parentNode, srv_uuid)
            panic ("Unable to find a server for:", srv_uuid)
        self._server = Network(net)

    def get_server(self):

    # default cleanup: disconnect from the server (when local) and detach
        """ default cleanup, used for most modules """
        srv = self.get_server()
        if srv and local_net(srv):
                lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
            except CommandError, e:
                log(self.module_name, "disconnect failed: ", self.name)
            lctl.cleanup(self.name, self.uuid)
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)

    def add_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmodule_list.append((dev_dir, modname))

    def mod_loaded(self, modname):
        """Check if a module is already loaded. Look in /proc/modules for it."""
        fp = open('/proc/modules')
        lines = fp.readlines()
        # please forgive my tired fingers for this one
        ret = filter(lambda word, mod=modname: word == mod,
                     map(lambda line: string.split(line)[0], lines))

    def load_module(self):
        """Load all the modules in the list in the order they appear."""
        for dev_dir, mod in self.kmodule_list:
            # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
            if self.mod_loaded(mod) and not config.noexec():
            log ('loading module:', mod)
            # prefer an insmod of the source-tree object; otherwise modprobe
                module = find_module(config.src_dir(),dev_dir, mod)
                    panic('module not found:', mod)
                (rc, out) = run('/sbin/insmod', module)
                    raise CommandError('insmod', out, rc)
                (rc, out) = run('/sbin/modprobe', mod)
                    raise CommandError('modprobe', out, rc)

    def cleanup_module(self):
        """Unload the modules in the list in reverse order."""
        rev = self.kmodule_list
        # NOTE(review): the list reversal presumably happens on a line
        # missing from this fragment.
        for dev_dir, mod in rev:
            if not self.mod_loaded(mod):
            # debug hack: dump the kernel log just before portals unloads
            if mod == 'portals' and config.dump_file():
                lctl.dump(config.dump_file())
            log('unloading module:', mod)
            (rc, out) = run('/sbin/rmmod', mod)
                log('! unable to unload module:', mod)
class Network(Module):
    # NOTE(review): fragment -- the prepare()/cleanup() method headers,
    # 'try:' lines and some branches are missing from this excerpt.
    def __init__(self,dom_node):
        Module.__init__(self, 'NETWORK', dom_node)
        self.net_type = get_attr(dom_node,'type')
        self.nid = get_text(dom_node, 'server', '*')
        self.port = get_text_int(dom_node, 'port', 0)
        self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
        self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
        # a wildcard nid is resolved from the local host
        self.nid = get_local_address(self.net_type, self.nid)
            panic("unable to set nid for", self.net_type, self.nid)
        debug("nid:", self.nid)

        # kernel modules needed for this network type
        self.add_module('portals/linux/oslib/', 'portals')
        if node_needs_router():
            self.add_module('portals/linux/router', 'kptlrouter')
        if self.net_type == 'tcp':
            self.add_module('portals/linux/socknal', 'ksocknal')
        if self.net_type == 'toe':
            self.add_module('portals/linux/toenal', 'ktoenal')
        if self.net_type == 'elan':
            self.add_module('portals/linux/rqswnal', 'kqswnal')
        if self.net_type == 'gm':
            self.add_module('portals/linux/gmnal', 'kgmnal')
        self.add_module('lustre/obdclass', 'obdclass')
        self.add_module('lustre/ptlrpc', 'ptlrpc')

    # prepare(): start the acceptor, set up routes, init the network
        self.info(self.net_type, self.nid, self.port)
        if self.net_type in ('tcp', 'toe'):
            nal_id = '' # default is socknal
            if self.net_type == 'toe':
            ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, nal_id, self.port)
                raise CommandError(TCP_ACCEPTOR, out, ret)
        ret = self.dom_node.getElementsByTagName('route_tbl')
            for r in a.getElementsByTagName('route'):
                net_type = get_attr(r, 'type')
                gw = get_attr(r, 'gw')
                lo = get_attr(r, 'lo')
                hi = get_attr(r,'hi', '')
                lctl.add_route(net_type, gw, lo, hi)
                # a single-host tcp route also needs a connection to it
                if net_type == 'tcp' and net_type == self.net_type and hi == '':
                    srv = nid2server(self.dom_node.parentNode.parentNode, lo)
                        panic("no server for nid", lo)
                        lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
        lctl.network(self.net_type, self.nid)
        lctl.newdev(attach = "ptlrpc RPCDEV RPCDEV_UUID")

    # cleanup(): undo the routes/connections, then stop the acceptor
        self.info(self.net_type, self.nid, self.port)
        ret = self.dom_node.getElementsByTagName('route_tbl')
            for r in a.getElementsByTagName('route'):
                lo = get_attr(r, 'lo')
                hi = get_attr(r,'hi', '')
                if self.net_type == 'tcp' and hi == '':
                    srv = nid2server(self.dom_node.parentNode.parentNode, lo)
                        panic("no server for nid", lo)
                        lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
                    except CommandError, e:
                        print "disconnect failed: ", self.name
                    lctl.del_route(self.net_type, self.nid, lo, hi)
                except CommandError, e:
                    print "del_route failed: ", self.name
            lctl.cleanup("RPCDEV", "RPCDEV_UUID")
        except CommandError, e:
            print "cleanup failed: ", self.name
            lctl.disconnectAll(self.net_type)
        except CommandError, e:
            print "disconnectAll failed: ", self.name
        if self.net_type == 'tcp':
            # yikes, this is ugly! need to save pid in /var/something
            run("killall acceptor")
# NOTE(review): fragment of the LDLM module class; the class statement and
# the prepare() header are missing.
    def __init__(self,dom_node):
        Module.__init__(self, 'LDLM', dom_node)
        self.add_module('lustre/ldlm', 'ldlm')

    # prepare(): attach the single lock-manager device
        lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
# NOTE(review): fragment of the LOV module class; the class statement,
# prepare()/cleanup() headers and some lookup branches are missing.
    def __init__(self,dom_node):
        Module.__init__(self, 'LOV', dom_node)
        self.mds_uuid = get_first_ref(dom_node, 'mds')
        mds= lookup(dom_node.parentNode, self.mds_uuid)
        self.mds_name = getName(mds)
        devs = dom_node.getElementsByTagName('devices')
        # striping parameters from the <devices> element
        self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
        self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
        self.pattern = get_attr_int(dev_node, 'pattern', 0)
        self.devlist = get_all_refs(dev_node, 'osc')
        self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
        self.add_module('lustre/mdc', 'mdc')
        self.add_module('lustre/lov', 'lov')

    # prepare(): bring up every member OSC, then the MDC, then the LOV
        for osc_uuid in self.devlist:
            osc = lookup(self.dom_node.parentNode, osc_uuid)
                panic('osc not found:', osc_uuid)
        mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
        self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
                  self.stripe_off, self.pattern, self.devlist, self.mds_name)
        lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
                    setup ="%s" % (mdc_uuid))

    # cleanup(): tear down the member OSCs and the MDC
        for osc_uuid in self.devlist:
            osc = lookup(self.dom_node.parentNode, osc_uuid)
                panic('osc not found:', osc_uuid)
        cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)

    def load_module(self):
        for osc_uuid in self.devlist:
            osc = lookup(self.dom_node.parentNode, osc_uuid)
                panic('osc not found:', osc_uuid)
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        for osc_uuid in self.devlist:
            osc = lookup(self.dom_node.parentNode, osc_uuid)
                panic('osc not found:', osc_uuid)
class LOVConfig(Module):
    # NOTE(review): the prepare() header and some lookup/validation lines
    # are missing from this fragment.
    def __init__(self,dom_node):
        Module.__init__(self, 'LOVConfig', dom_node)
        self.lov_uuid = get_first_ref(dom_node, 'lov')
        l = lookup(dom_node.parentNode, self.lov_uuid)

    # prepare(): push the LOV striping description to the MDS
        self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
                  lov.pattern, lov.devlist, lov.mds_name)
        lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
                           lov.stripe_sz, lov.stripe_off, lov.pattern,
                           string.join(lov.devlist))
# NOTE(review): fragment of the MDS module class; the class statement and
# prepare()/cleanup() headers are missing.
    def __init__(self,dom_node):
        Module.__init__(self, 'MDS', dom_node)
        self.devname, self.size = get_device(dom_node)
        self.fstype = get_text(dom_node, 'fstype')
        # FIXME: if fstype not set, then determine based on kernel version
        self.format = get_text(dom_node, 'autoformat', "no")
        if self.fstype == 'extN':
            self.add_module('lustre/extN', 'extN')
        self.add_module('lustre/mds', 'mds')
        self.add_module('lustre/mds', 'mds_%s' % (self.fstype))

    # prepare(): set up the backing device, the shared MDT, then the MDS
        self.info(self.devname, self.fstype, self.format)
        blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
        if not is_prepared('MDT_UUID'):
            lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
        lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
                    setup ="%s %s" %(blkdev, self.fstype))

    # cleanup(): drop the shared MDT device and release the loop device
        if is_prepared('MDT_UUID'):
                lctl.cleanup("MDT", "MDT_UUID")
            except CommandError, e:
                print "cleanup failed: ", self.name
        clean_loop(self.devname)
# Very unusual case, as there is no MDC element in the XML anymore
# Builds itself from an MDS node
# NOTE(review): the 'class MDC(Module):' line is missing; note that this
# __init__ deliberately does NOT call Module.__init__.
    def __init__(self,dom_node):
        self.mds = MDS(dom_node)
        self.dom_node = dom_node
        self.module_name = 'MDC'
        self.kmodule_list = []
        # derive a host-unique name/uuid instead of taking them from the XML
        host = socket.gethostname()
        self.name = 'MDC_%s_%s' % ( host, self.mds.name )
        self.uuid = self.name + '_UUID'
        self.lookup_server(self.mds.uuid)
        self.add_module('lustre/mdc', 'mdc')

    # prepare(): connect to the MDS's network and attach the mdc device
        self.info(self.mds.uuid)
        srv = self.get_server()
        lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
        lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
                    setup ="%s %s" %(self.mds.uuid, srv.uuid))
# NOTE(review): fragment of the OBD module class; the class statement and
# prepare()/cleanup() headers are missing.
    def __init__(self, dom_node):
        Module.__init__(self, 'OBD', dom_node)
        self.obdtype = get_attr(dom_node, 'type')
        self.devname, self.size = get_device(dom_node)
        self.fstype = get_text(dom_node, 'fstype')
        # FIXME: if fstype not set, then determine based on kernel version
        self.format = get_text(dom_node, 'autoformat', 'yes')
        if self.fstype == 'extN':
            self.add_module('lustre/extN', 'extN')
        self.add_module('lustre/' + self.obdtype, self.obdtype)

    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    # prepare(): obdecho needs no backing store; other types get one
        self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
        if self.obdtype == 'obdecho':
        # NOTE(review): the obdecho branch body and the 'else:' line are
        # missing from this fragment.
            blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
        lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
                    setup ="%s %s" %(blkdev, self.fstype))

    # cleanup(): detach, then release the loop device (not for obdecho)
        Module.cleanup(self)
        if not self.obdtype == 'obdecho':
            clean_loop(self.devname)
# NOTE(review): fragment of the OST module class; the class statement and
# the prepare() header are missing.
    def __init__(self,dom_node):
        Module.__init__(self, 'OST', dom_node)
        self.obd_uuid = get_first_ref(dom_node, 'obd')
        self.add_module('lustre/ost', 'ost')

    # prepare(): attach the ost device on top of the referenced obd
        self.info(self.obd_uuid)
        lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
                    setup ="%s" % (self.obd_uuid))
# virtual interface for OSC and LOV
# NOTE(review): the class statement and the prepare()/cleanup() delegation
# methods are missing from this fragment.
    def __init__(self,dom_node):
        Module.__init__(self, 'VOSC', dom_node)
        # wrap an LOV when the element is <lov>, otherwise a plain OSC
        if dom_node.nodeName == 'lov':
            self.osc = LOV(dom_node)
            self.osc = OSC(dom_node)

    def load_module(self):
        self.osc.load_module()

    def cleanup_module(self):
        self.osc.cleanup_module()
# NOTE(review): fragment of the OSC module class; the class statement,
# prepare()/cleanup() headers, 'try:' lines and the local/remote-net
# checks are missing.
    def __init__(self,dom_node):
        Module.__init__(self, 'OSC', dom_node)
        self.obd_uuid = get_first_ref(dom_node, 'obd')
        self.ost_uuid = get_first_ref(dom_node, 'ost')
        self.lookup_server(self.ost_uuid)
        self.add_module('lustre/osc', 'osc')

    # prepare(): connect directly or via a router, then attach the osc
        self.info(self.obd_uuid, self.ost_uuid)
        srv = self.get_server()
            lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
            # remote net: find a gateway and install a host route
                lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
                panic ("no route to", srv.nid)
        lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
                    setup ="%s %s" %(self.obd_uuid, srv.uuid))

    # cleanup(): disconnect or remove the host route, then detach
        srv = self.get_server()
            Module.cleanup(self)
        self.info(self.obd_uuid, self.ost_uuid)
                lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
            except CommandError, e:
                print "del_route failed: ", self.name
        Module.cleanup(self)
class Mountpoint(Module):
    # NOTE(review): the prepare()/cleanup() headers and several branch
    # lines are missing from this fragment.
    def __init__(self,dom_node):
        Module.__init__(self, 'MTPT', dom_node)
        self.path = get_text(dom_node, 'path')
        self.mds_uuid = get_first_ref(dom_node, 'mds')
        self.lov_uuid = get_first_ref(dom_node, 'osc')
        self.add_module('lustre/mdc', 'mdc')
        self.add_module('lustre/llite', 'llite')
        l = lookup(self.dom_node.parentNode, self.lov_uuid)

    # prepare(): set up the MDC and mount the filesystem
        mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
        self.info(self.path, self.mds_uuid, self.lov_uuid)
        cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
              (self.lov_uuid, mdc_uuid, self.path)
        run("mkdir", self.path)
            panic("mount failed:", self.path)

    # cleanup(): umount (forced first when --force), then drop the MDC
        self.info(self.path, self.mds_uuid,self.lov_uuid)
            (rc, out) = run("umount -f", self.path)
            (rc, out) = run("umount", self.path)
            log("umount failed, cleanup will most likely not work.")
        l = lookup(self.dom_node.parentNode, self.lov_uuid)
        cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)

    def load_module(self):
        self.osc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.osc.cleanup_module()
# ============================================================
# XML processing and query
# TODO: Change query funcs to use XPath, which is much cleaner
# NOTE(review): several functions below are missing their length checks and
# default-return lines in this excerpt.

def get_device(obd):
    """Return (device-name, size) from obd's first <device> child."""
    list = obd.getElementsByTagName('device')
        size = get_attr_int(dev, 'size', 0)
        return dev.firstChild.data, size

# Get the text content from the first matching child
# If there is no content (or it is all whitespace), return
# the default
def get_text(dom_node, tag, default=""):
    list = dom_node.getElementsByTagName(tag)
        dom_node.normalize()
        if dom_node.firstChild:
            txt = string.strip(dom_node.firstChild.data)

def get_text_int(dom_node, tag, default=0):
    """Like get_text(), but coerce the value to int (panic on bad input)."""
    list = dom_node.getElementsByTagName(tag)
        dom_node.normalize()
        if dom_node.firstChild:
            txt = string.strip(dom_node.firstChild.data)
            panic("text value is not integer:", txt)

def get_attr(dom_node, attr, default=""):
    """Return the attribute's value, or the default when unset."""
    v = dom_node.getAttribute(attr)

def get_attr_int(dom_node, attr, default=0):
    """Like get_attr(), but coerce the value to int (panic on bad input)."""
    v = dom_node.getAttribute(attr)
        panic("attr value is not integer", v)

def get_first_ref(dom_node, tag):
    """ Get the first uuidref of the type TAG. Use when only
    one is expected. Returns the uuid."""
    refname = '%s_ref' % tag
    list = dom_node.getElementsByTagName(refname)
        uuid = getRef(list[0])

def get_all_refs(dom_node, tag):
    """ Get all the refs of type TAG. Returns list of uuids. """
    refname = '%s_ref' % tag
    list = dom_node.getElementsByTagName(refname)
            uuids.append(getRef(i))

def get_ost_net(dom_node, uuid):
    """Follow the OST's 'network' reference and return that network node."""
    ost = lookup(dom_node, uuid)
    uuid = get_first_ref(ost, 'network')
    return lookup(dom_node, uuid)
def nid2server(dom_node, nid):
    """Return a Network wrapper for the first <network> element whose
    'server' text equals nid, or None when no such element exists."""
    for net_node in dom_node.getElementsByTagName('network'):
        if get_text(net_node, 'server') == nid:
            return Network(net_node)
    return None
def lookup(dom_node, uuid):
    """Find the child element of dom_node carrying the given uuid.

    NOTE(review): the return and recursion lines are missing from this
    fragment.
    """
    for n in dom_node.childNodes:
        if n.nodeType == n.ELEMENT_NODE:
            if getUUID(n) == uuid:
# Fetch the 'name' attribute of dom_node.
def getName(dom_node):
    """Return the node's 'name' attribute (empty string when unset)."""
    return dom_node.getAttribute('name')
def getRef(dom_node):
    """Return the node's 'uuidref' attribute, used to follow profile
    references (empty string when unset)."""
    return dom_node.getAttribute('uuidref')
# Fetch the 'uuid' attribute of dom_node.  (The original comment said
# "name" -- a copy/paste slip.)
def getUUID(dom_node):
    """Return the node's 'uuid' attribute (empty string when unset)."""
    return dom_node.getAttribute('uuid')
# The element's tag name doubles as the service type.
# fixme: this should do some checks to make sure the dom_node is a service
def getServiceType(dom_node):
    """Return the service type of dom_node, i.e. its element tag name."""
    return dom_node.nodeName
# determine what "level" a particular node is at.
# the order of initialization is based on level.
# NOTE(review): the numeric level assignments between the branches below
# are missing from this fragment.
def getServiceLevel(dom_node):
    type = getServiceType(dom_node)
    if type in ('network',):
    elif type in ('device', 'ldlm'):
    elif type in ('obd', 'mdd'):
    elif type in ('mds','ost'):
    elif type in ('mdc','osc'):
    elif type in ('lov', 'lovconfig'):
    elif type in ('mountpoint',):
    # services outside the requested level window are filtered out
    if ret < config.startlevel() or ret > config.endlevel():

# return list of services in a profile. list is a list of tuples
# [(level, dom_node),]
def getServices(lustreNode, profileNode):
    for n in profileNode.childNodes:
        if n.nodeType == n.ELEMENT_NODE:
            servNode = lookup(lustreNode, getRef(n))
                panic('service not found: ' + getRef(n))
            level = getServiceLevel(servNode)
                list.append((level, servNode))

def getByName(lustreNode, name, tag):
    """Return the first <tag> element whose 'name' attribute matches.

    NOTE(review): the loop header and return lines are missing from this
    fragment.
    """
    ndList = lustreNode.getElementsByTagName(tag)
        if getName(nd) == name:

############################################################
# MDC cache helpers (one shared MDC per MDS)
# FIXME: clean this mess up!
def prepare_mdc(dom_node, mds_uuid):
    """Create (at most once) an MDC for the given MDS; return its uuid."""
    mds_node = lookup(dom_node, mds_uuid);
        panic("no mds:", mds_uuid)
    # reuse a previously prepared MDC for this MDS
    if saved_mdc.has_key(mds_uuid):
        return saved_mdc[mds_uuid]
    saved_mdc[mds_uuid] = mdc.uuid

def cleanup_mdc(dom_node, mds_uuid):
    """Tear down the cached MDC for the given MDS."""
    mds_node = lookup(dom_node, mds_uuid);
        panic("no mds:", mds_uuid)
    if not saved_mdc.has_key(mds_uuid):
    saved_mdc[mds_uuid] = mdc.uuid
1340 ############################################################
1341 # routing ("rooting")
1347 def init_node(dom_node):
1348 global local_node, router_flag
1349 netlist = dom_node.getElementsByTagName('network')
1350 for dom_net in netlist:
1351 type = get_attr(dom_net, 'type')
1352 gw = get_text(dom_net, 'server')
1353 local_node.append((type, gw))
1355 def node_needs_router():
1358 def get_routes(type, gw, dom_net):
1359 """ Return the routes as a list of tuples of the form:
1360 [(type, gw, lo, hi),]"""
1362 tbl = dom_net.getElementsByTagName('route_tbl')
1364 routes = t.getElementsByTagName('route')
1366 lo = get_attr(r, 'lo')
1367 hi = get_attr(r, 'hi', '')
1368 res.append((type, gw, lo, hi))
1372 def init_route_config(lustre):
1373 """ Scan the lustre config looking for routers. Build list of
1375 global routes, router_flag
1377 list = lustre.getElementsByTagName('node')
1379 if get_attr(node, 'router'):
1381 for (local_type, local_nid) in local_node:
1383 netlist = node.getElementsByTagName('network')
1384 for dom_net in netlist:
1385 if local_type == get_attr(dom_net, 'type'):
1386 gw = get_text(dom_net, 'server')
1390 for dom_net in netlist:
1391 if local_type != get_attr(dom_net, 'type'):
1392 for route in get_routes(local_type, gw, dom_net):
1393 routes.append(route)
1398 for iface in local_node:
1399 if net.net_type == iface[0]:
1403 def find_route(net):
1404 global local_node, routes
1405 frm_type = local_node[0][0]
1406 to_type = net.net_type
1408 debug ('looking for route to', to_type,to)
1417 ############################################################
1420 def startService(dom_node, module_flag):
1421 type = getServiceType(dom_node)
1422 debug('Service:', type, getName(dom_node), getUUID(dom_node))
1423 # there must be a more dynamic way of doing this...
1429 elif type == 'lovconfig':
1430 n = LOVConfig(dom_node)
1431 elif type == 'network':
1432 n = Network(dom_node)
1443 elif type == 'mountpoint':
1444 n = Mountpoint(dom_node)
1446 panic ("unknown service type:", type)
1451 if config.cleanup():
1456 if config.nosetup():
1458 if config.cleanup():
1464 # Prepare the system to run lustre using a particular profile
1465 # in the configuration.
1466 # * load the modules
1467 # * setup networking for the current node
1468 # * make sure partitions are in place and prepared
1469 # * initialize devices with lctl
1470 # Level ordering is important, and needs to be enforced.
# Start (or, in cleanup mode, stop) every service listed in a profile.
# NOTE(review): interior lines are missing from this excerpt -- the guard
# that triggers panic(), the reversal of the service list for cleanup, and
# the loop header binding 's'.  Visible code shows each service entry's
# second element is the DOM node handed to startService.
1471 def startProfile(lustreNode, profileNode, module_flag):
1473 panic("profile:", profile, "not found.")
1474 services = getServices(lustreNode, profileNode)
1475 if config.cleanup():
1478 startService(s[1], module_flag)
# Configure (or clean up) lustre for the first host name in 'hosts' that
# has a <node> entry in the config: set up routing, then run each profile
# twice -- once to load/unload modules, once to do the lctl setup -- in an
# order that depends on whether we are starting or cleaning up.
# NOTE(review): several lines are missing from this excerpt (the host loop
# header, early-exit handling, recovery/timeout setup); comments cover only
# what is visible.
1483 def doHost(lustreNode, hosts):
1487 dom_node = getByName(lustreNode, h, 'node')
1492 print 'No host entry found.'
# Non-router nodes still need the route table built from the full config.
1495 if not get_attr(dom_node, 'router'):
1497 init_route_config(lustreNode)
1502 # Two step process: (1) load modules, (2) setup lustre
1503 # if not cleaning, load modules first.
# module_flag selects the module-load pass; when cleaning up, the lctl
# teardown pass runs first and module unload second.
1504 module_flag = not config.cleanup()
1505 reflist = dom_node.getElementsByTagName('profile')
1506 for profile in reflist:
1507 startProfile(lustreNode, profile, module_flag)
1509 if not config.cleanup():
1510 sys_set_debug_path()
# Generate the gdb helper script by capturing 'lctl modules' output.
1511 script = config.gdb_script()
1512 run(lctl.lctl, ' modules >', script)
1514 # dump /tmp/ogdb and sleep/pause here
1515 log ("The GDB module script is in", script)
# Second pass: flip the flag and run the profiles again.
1518 module_flag = not module_flag
1519 for profile in reflist:
1520 startProfile(lustreNode, profile, module_flag)
1522 ############################################################
1523 # Command line processing
# Parse lconf's command line with getopt, recording each recognized flag in
# the global 'config' object; returns the leftover positional args (the
# config file path).
# NOTE(review): many interior lines are missing from this excerpt -- the
# try: around getopt, the option-loop header binding (o, a), the config
# setter calls for most flags, and the error/usage handling.  The visible
# tests show which flags exist, not what each one sets.
1525 def parse_cmdline(argv):
1526 short_opts = "hdnvf"
1527 long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
1528 "portals=", "makeldiff", "cleanup", "noexec",
1529 "help", "node=", "nomod", "nosetup",
1530 "dump=", "force", "startlevel=", "endlevel="]
1534 opts, args = getopt.getopt(argv, short_opts, long_opts)
1535 except getopt.error:
1540 if o in ("-h", "--help"):
1542 if o in ("-d","--cleanup"):
1544 if o in ("-v", "--verbose"):
1546 if o in ("-n", "--noexec"):
1549 if o == "--portals":
1553 if o == "--reformat":
1561 if o == "--nosetup":
1565 if o in ("-f", "--force"):
1567 if o in ("--startlevel",):
1568 config.startlevel(a)
1569 if o in ("--endlevel",):
# NOTE(review): fragment of the config-fetching helper (its def header and
# the read/return of the response were lost in extraction).  Opens the
# given URL with urllib; presumably the body is read and returned as the
# XML config text -- confirm against the full file.
1578 s = urllib.urlopen(url)
def setupModulePath(cmd):
    # Infer the Lustre source tree from where the running script lives: a
    # Makefile sitting beside the executable means we are running out of a
    # build tree, so record the tree root (two directories up) as the
    # module source directory.
    script_dir = os.path.dirname(cmd)
    if os.access(script_dir + "/Makefile", os.R_OK):
        config.src_dir(script_dir + "/../../")
# Write the configured debug-log path into the portals proc interface.
# NOTE(review): interior lines are missing from this excerpt -- likely the
# noexec guard and the try/close around the file write; confirm against
# the full file.
1589 def sys_set_debug_path():
1590 debug("debug path: ", config.debug_path())
1594 fp = open('/proc/sys/portals/debug_path', 'w')
1595 fp.write(config.debug_path())
1600 #/proc/sys/net/core/rmem_max
1601 #/proc/sys/net/core/wmem_max
#/proc/sys/net/core/rmem_max
#/proc/sys/net/core/wmem_max
# Raise a kernel network-buffer sysctl (one of the paths above) to at
# least 'max' bytes.
# NOTE(review): the lines reading the current value and comparing it to
# 'max', and the close of 'fp', are missing from this excerpt -- the
# visible write presumably only happens when the current value is lower;
# confirm against the full file.
1602 def sys_set_netmem_max(path, max):
1603 debug("setting", path, "to at least", max)
1611 fp = open(path, 'w')
1612 fp.write('%d\n' %(max))
def sys_make_devices():
    # Lazily create the character device nodes (major 10) that the portals
    # and obd drivers expect; skip any node that already exists/is readable.
    wanted = (('/dev/portals', 'mknod /dev/portals c 10 240'),
              ('/dev/obd', 'mknod /dev/obd c 10 241'))
    for dev_node, mknod_cmd in wanted:
        if not os.access(dev_node, os.R_OK):
            run(mknod_cmd)
1623 # Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    """Append new_dir to os.environ['PATH'] unless it is already listed."""
    # NOTE: the early return restores the guard -- without it the directory
    # would be appended unconditionally, duplicating PATH entries on every
    # call (the surrounding comment says "if not already there").
    syspath = os.environ['PATH'].split(':')
    if new_dir in syspath:
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
# Directories every lustre helper invocation may need on PATH.
1631 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
1632 # ensure basic elements are in the system path
# NOTE(review): the loop body is missing from this excerpt -- presumably it
# calls add_to_path(dir) for each default directory; confirm against the
# full file.
1633 def sanitise_path():
1634 for dir in DEFAULT_PATH:
1637 # Initialize or shutdown lustre according to a configuration file
1638 # * prepare the system for lustre
1639 # * configure devices with lctl
1640 # Shutdown does steps in reverse
# NOTE(review): body of the top-level driver function; its def line and a
# number of interior lines (error exits, node_list init, the noexec/gdb
# branches around the acceptor lookup) were lost in extraction.  Comments
# below cover only the visible statements.
1643 global TCP_ACCEPTOR, lctl, MAXTCPBUF
1644 host = socket.gethostname()
1648 args = parse_cmdline(sys.argv[1:])
# A positional argument is the local XML config file; otherwise the config
# is fetched from a URL and parsed from the downloaded string.
1650 if not os.access(args[0], os.R_OK):
1651 print 'File not found or readable:', args[0]
1653 dom = xml.dom.minidom.parse(args[0])
1655 xmldata = fetch(config.url())
1656 dom = xml.dom.minidom.parseString(xmldata)
# Candidate node names to look up in the config: an explicit --node value,
# else this hostname and 'localhost' as fallbacks.
1662 node_list.append(config.node())
1665 node_list.append(host)
1666 node_list.append('localhost')
1667 debug("configuring for host: ", node_list)
# Per-host suffixes keep debug/gdb artifacts from colliding across nodes.
1670 config._debug_path = config._debug_path + '-' + host
1671 config._gdb_script = config._gdb_script + '-' + host
# Locate the tcp acceptor binary; visible fallbacks either use the bare
# name or abort -- the selecting condition is in missing lines.
1673 TCP_ACCEPTOR = find_prog('acceptor')
1674 if not TCP_ACCEPTOR:
1676 TCP_ACCEPTOR = 'acceptor'
1677 debug('! acceptor not found')
1679 panic('acceptor not found')
1681 lctl = LCTLInterface('lctl')
1683 setupModulePath(sys.argv[0])
# Make sure kernel socket buffers can hold MAXTCPBUF before configuring.
1685 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
1686 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
1687 doHost(dom.documentElement, node_list)
# Script entry point.
# NOTE(review): the try: and main() call, and the bodies of the two except
# clauses, are missing from this excerpt.  Visible code shows lconf exits
# with the first error recorded during cleanup (see cleanup_error above).
1689 if __name__ == "__main__":
1692 except LconfError, e:
1694 except CommandError, e:
# Propagate the first cleanup failure as the process exit status.
1698 if first_cleanup_error:
1699 sys.exit(first_cleanup_error)