3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # This file is part of Lustre, http://www.lustre.org.
7 # Lustre is free software; you can redistribute it and/or
8 # modify it under the terms of version 2 of the GNU General Public
9 # License as published by the Free Software Foundation.
11 # Lustre is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with Lustre; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # lconf - lustre configuration tool
22 # lconf is the main driver script for starting and stopping
23 # lustre filesystem services.
25 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import string, os, stat, popen2, socket, time
30 import xml.dom.minidom
35 DEFAULT_TCPBUF = 1048576
37 # Maximum number of devices to search for.
38 # (the /dev/loop* nodes need to be created beforehand)
39 MAX_LOOP_DEVICES = 256
first_cleanup_error = 0

# Remember only the FIRST non-zero return code seen while cleaning up, so
# the earliest failure is the one reported at exit.
def cleanup_error(rc):
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
49 print """usage: lconf config.xml
51 config.xml Lustre configuration in xml format.
52 --get <url> URL to fetch a config file
53 --node <nodename> Load config for <nodename>
54 -d | --cleanup Cleans up config. (Shutdown)
55 -f | --force Forced unmounting and/or obd detach during cleanup
56 -v | --verbose Print system commands as they are run
57 -h | --help Print this help
58 --gdb Prints message after creating gdb module script
59 and sleeps for 5 seconds.
60 -n | --noexec Prints the commands and steps that will be run for a
61 config without executing them. This can used to check if a
62 config file is doing what it should be doing. (Implies -v)
63 --nomod Skip load/unload module step.
64 --nosetup Skip device setup/cleanup step.
65 --reformat Reformat all devices (without question)
66 --dump <file> Dump the kernel debug log before portals is unloaded
67 --minlevel <num> Specify the minimum level of services to configure/cleanup (default 0)
68 --maxlevel <num> Specify the maximum level of services to configure/cleanup (default 100)
69 Levels are aproximatly like:
79 --ldap server LDAP server with lustre config database
80 --makeldiff Translate xml source to LDIFF
81 This are perhaps not needed:
82 --lustre="src dir" Base directory of lustre sources. Used to search
84 --portals=src Portals source
88 # ============================================================
89 # Config parameters, encapsulated in a class
# NOTE(review): sampled fragment of the Config class interior -- the class
# statement, the __init__ header and several field initializers fall in the
# gaps of this capture.
105 self._gdb_script = '/tmp/ogdb'
106 self._debug_path = '/tmp/lustre-log'
107 self._dump_file = None
# Combined getter/setter accessors: a truthy argument stores a new value.
# The 'return self._xxx' lines are missing from this capture (see the
# complete reformat/minlevel/maxlevel accessors below for the pattern).
112 def verbose(self, flag = None):
113 if flag: self._verbose = flag
116 def noexec(self, flag = None):
117 if flag: self._noexec = flag
def reformat(self, flag = None):
    """Combined getter/setter: a truthy *flag* replaces the stored value."""
    self._reformat = flag if flag else self._reformat
    return self._reformat
# NOTE(review): more sampled Config accessors; each setter's 'return' line
# falls in the gaps of this capture.
124 def cleanup(self, flag = None):
125 if flag: self._cleanup = flag
128 def gdb(self, flag = None):
129 if flag: self._gdb = flag
132 def nomod(self, flag = None):
133 if flag: self._nomod = flag
136 def nosetup(self, flag = None):
137 if flag: self._nosetup = flag
140 def force(self, flag = None):
141 if flag: self._force = flag
144 def node(self, val = None):
145 if val: self._node = val
148 def url(self, val = None):
149 if val: self._url = val
# Path accessors: when a /r root exists (diskless/test setups, presumably --
# verify), the configured path is re-rooted under /r.
152 def gdb_script(self):
153 if os.path.isdir('/r'):
154 return '/r' + self._gdb_script
156 return self._gdb_script
158 def debug_path(self):
159 if os.path.isdir('/r'):
160 return '/r' + self._debug_path
162 return self._debug_path
164 def src_dir(self, val = None):
165 if val: self._src_dir = val
def dump_file(self, val = None):
    """Get, and optionally set, the kernel-debug dump file path."""
    self._dump_file = val if val else self._dump_file
    return self._dump_file
def minlevel(self, val = None):
    """Get/set the minimum service level; *val* is coerced with int()."""
    self._minlevel = int(val) if val else self._minlevel
    return self._minlevel
def maxlevel(self, val = None):
    """Get/set the maximum service level; *val* is coerced with int()."""
    self._maxlevel = int(val) if val else self._maxlevel
    return self._maxlevel
184 # ============================================================
185 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort for an unimplemented code path.

    Raises LconfError so callers' normal error handling applies.
    """
    # Parenthesized raise form (valid in both Python 2 and 3) replaces the
    # old 'raise E, arg' syntax; also fixes the 'implmemented' typo in the
    # user-visible message.
    raise LconfError(msg + ' not implemented yet.')
# NOTE(review): sampled fragments of the debugging/error helpers; the def
# lines fall in the gaps. The first fragment (raises unless --noexec) is
# presumably panic() -- verify against the complete file.
191 msg = string.join(map(str,args))
192 if not config.noexec():
193 raise LconfError(msg)
198 msg = string.join(map(str,args))
203 print string.strip(s)
207 msg = string.join(map(str,args))
210 # ============================================================
211 # locally defined exceptions
# Raised when an external command (lctl, insmod, mount, ...) fails; carries
# the command name, its stderr/stdout, and (optionally) its return code.
212 class CommandError (exceptions.Exception):
213 def __init__(self, cmd_name, cmd_err, rc=None):
214 self.cmd_name = cmd_name
215 self.cmd_err = cmd_err
# NOTE(review): sampled fragment -- the self.rc assignment and the def line
# of this pretty-printing method (dump(), presumably) fall in the gaps.
# cmd_err may be a single string or a list of output lines.
220 if type(self.cmd_err) == types.StringType:
222 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
224 print "! %s: %s" % (self.cmd_name, self.cmd_err)
225 elif type(self.cmd_err) == types.ListType:
227 print "! %s (error %d):" % (self.cmd_name, self.rc)
229 print "! %s:" % (self.cmd_name)
230 for s in self.cmd_err:
231 print "> %s" %(string.strip(s))
# Generic configuration error; body truncated in this capture.
235 class LconfError (exceptions.Exception):
236 def __init__(self, args):
240 # ============================================================
241 # handle lctl interface
244 Manage communication with lctl
# NOTE(review): sampled fragment of the LCTLInterface-style wrapper class;
# the class statement and many interior lines (including the heredoc command
# templates fed to lctl) fall in the gaps of this capture.
247 def __init__(self, cmd):
249 Initialize close by finding the lctl binary.
251 self.lctl = find_prog(cmd)
254 debug('! lctl not found')
257 raise CommandError('lctl', "unable to find lctl binary.")
262 the cmds are written to stdin of lctl
263 lctl doesn't return errors when run in script mode, so
265 should modify command line to accept multiple commands, or
266 create complex command line options
# run(): pipe a script of commands to lctl's stdin via popen2.Popen3 and
# collect stdout/stderr; raises CommandError on a non-zero exit status.
268 debug("+", self.lctl, cmds)
269 if config.noexec(): return (0, [])
270 p = popen2.Popen3(self.lctl, 1)
271 p.tochild.write(cmds + "\n")
273 out = p.fromchild.readlines()
274 err = p.childerr.readlines()
276 if os.WIFEXITED(ret):
277 rc = os.WEXITSTATUS(ret)
281 raise CommandError(self.lctl, err, rc)
# Run lctl with arguments on the command line instead of via stdin.
284 def runcmd(self, *args):
286 run lctl using the command line
288 cmd = string.join(map(str,args))
289 debug("+", self.lctl, cmd)
290 rc, out = run(self.lctl, cmd)
292 raise CommandError(self.lctl, out, rc)
296 def network(self, net, nid):
297 """ initialized network and add "self" """
298 # Idea: "mynid" could be used for all network types to add "self," and then
299 # this special case would be gone and the "self" hack would be hidden.
305 quit""" % (net, nid, nid)
314 # create a new connection
315 def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
323 quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
329 quit""" % (net, servuuid, nid, nid, port, )
333 # add a route to a range
334 def add_route(self, net, gw, lo, hi):
338 quit """ % (net, gw, lo, hi)
342 def del_route(self, net, gw, lo, hi):
350 # add a route to a host
351 def add_route_host(self, net, uuid, gw, tgt):
356 quit """ % (net, uuid, tgt, gw, tgt)
359 # add a route to a range
360 def del_route_host(self, net, uuid, gw, tgt):
366 quit """ % (net, uuid, tgt)
369 # disconnect one connection
370 def disconnect(self, net, nid, port, servuuid):
376 quit""" % (net, nid, servuuid)
380 def disconnectAll(self, net):
389 # create a new device with lctl
390 def newdev(self, attach, setup = ""):
395 quit""" % (attach, setup)
399 def cleanup(self, name, uuid):
405 quit""" % (name, ('', 'force')[config.force()])
409 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
413 lov_setconfig %s %d %d %d %s %s
414 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
# Ask the kernel to dump its debug log to dump_file before module unload.
418 def dump(self, dump_file):
421 quit""" % (dump_file)
424 # get list of devices
425 def device_list(self):
426 rc, out = self.runcmd('device_list')
429 # ============================================================
430 # Various system-level functions
431 # (ideally moved to their own module)
433 # Run a command and return the output and status.
434 # stderr is sent to /dev/null, could use popen3 to
435 # save it if necessary
# NOTE(review): sampled fragments of the generic shell helpers. run():
# executes a command with stderr folded into stdout; the def line, the
# readlines/close and the (rc, out) return fall in the gaps.
437 cmd = string.join(map(str,args))
439 if config.noexec(): return (0, [])
440 f = os.popen(cmd + ' 2>&1')
449 # Run a command in the background.
450 def run_daemon(*args):
451 cmd = string.join(map(str,args))
453 if config.noexec(): return 0
454 f = os.popen(cmd + ' 2>&1')
462 # Determine full path to use for an external command
463 # searches dirname(argv[0]) first, then PATH
# find_prog body fragment: also prepends the sibling portals utils dir so
# freshly-built tools are preferred over installed ones.
465 syspath = string.split(os.environ['PATH'], ':')
466 cmdpath = os.path.dirname(sys.argv[0])
467 syspath.insert(0, cmdpath);
468 syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
470 prog = os.path.join(d,cmd)
471 if os.access(prog, os.X_OK):
475 # Recursively look for file starting at base dir
476 def do_find_file(base, mod):
477 fullname = os.path.join(base, mod)
478 if os.access(fullname, os.R_OK):
480 for d in os.listdir(base):
481 dir = os.path.join(base,d)
482 if os.path.isdir(dir):
483 module = do_find_file(dir, mod)
# Locate a kernel module object (<modname>.o) under src_dir/dev_dir.
487 def find_module(src_dir, dev_dir, modname):
488 mod = '%s.o' % (modname)
489 module = src_dir +'/'+ dev_dir +'/'+ mod
491 if os.access(module, os.R_OK):
497 # is the path a block device?
# is_block() tail: tests the stat mode bits; the os.stat call (wrapped in a
# try, presumably) falls in the gaps of this capture.
504 return stat.S_ISBLK(s[stat.ST_MODE])
506 # build fs according to type
# Build a filesystem of the requested type on dev. NOTE(review): sampled
# fragment -- the 'force' flag setup and several branch lines are missing.
508 def mkfs(fstype, dev):
509 if(fstype in ('ext3', 'extN')):
# extN is served by mkfs.ext2 with journaling (-j) and 4k blocks.
510 mkfs = 'mkfs.ext2 -j -b 4096'
512 print 'unsupported fs type: ', fstype
513 if not is_block(dev):
517 (ret, out) = run (mkfs, force, dev)
519 panic("Unable to build fs:", dev)
520 # enable hash tree indexing on fsswe
521 # FIXME: this check can probably go away on 2.5
523 htree = 'echo "feature FEATURE_C5" | debugfs -w'
524 (ret, out) = run (htree, dev)
526 panic("Unable to enable htree:", dev)
528 # some systems use /dev/loopN, some /dev/loop/N
# NOTE(review): sampled fragments of the loop-device helpers. This first
# piece probes /dev/loop0 vs /dev/loop/0 naming; the assignments to the
# probed prefix fall in the gaps.
532 if not os.access(loop + str(0), os.R_OK):
534 if not os.access(loop + str(0), os.R_OK):
535 panic ("can't access loop devices")
538 # find loop device assigned to thefile
# find_loop(): scan losetup output of every loop node for one whose backing
# file (shown in parentheses) matches 'file'.
541 for n in xrange(0, MAX_LOOP_DEVICES):
543 if os.access(dev, os.R_OK):
544 (stat, out) = run('losetup', dev)
545 if (out and stat == 0):
546 m = re.search(r'\((.*)\)', out[0])
547 if m and file == m.group(1):
553 # create file if necessary and assign the first free loop device
554 def init_loop(file, size, fstype):
555 dev = find_loop(file)
557 print 'WARNING file:', file, 'already mapped to', dev
# A sparse file is created via dd seek (count=0) when reformatting or when
# the backing file is not yet readable/writable.
559 if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
560 run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
562 # find next free loop
563 for n in xrange(0, MAX_LOOP_DEVICES):
565 if os.access(dev, os.R_OK):
566 (stat, out) = run('losetup', dev)
568 run('losetup', dev, file)
571 print "out of loop devices"
573 print "out of loop devices"
576 # undo loop assignment
577 def clean_loop(file):
578 dev = find_loop(file)
580 ret, out = run('losetup -d', dev)
582 log('unable to clean loop device:', dev, 'for file:', file)
585 # determine if dev is formatted as a <fstype> filesystem
586 def need_format(fstype, dev):
587 # FIXME don't know how to implement this
590 # initialize a block device if needed
# block_dev(): map a plain file onto a loop device when needed, then format
# it when --reformat was given or autoformat allows it.
591 def block_dev(dev, size, fstype, format):
592 if config.noexec(): return dev
593 if not is_block(dev):
594 dev = init_loop(dev, size, fstype)
595 if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
599 # panic("device:", dev,
600 # "not prepared, and autoformat is not set.\n",
601 # "Rerun with --reformat option to format ALL filesystems")
# NOTE(review): if2addr fragment -- parses the second line of ifconfig
# output ("inet addr:x.x.x.x ...") to extract the IP after the colon; the
# def line and error-path lines fall in the gaps.
606 """lookup IP address for an interface"""
607 rc, out = run("/sbin/ifconfig", iface)
610 addr = string.split(out[1])[1]
611 ip = string.split(addr, ':')[1]
614 def get_local_address(net_type, wildcard):
615 """Return the local address for the network type."""
617 if net_type == 'tcp':
# A wildcard of the form "iface:*" means: resolve the interface's address.
619 iface, star = string.split(wildcard, ':')
620 local = if2addr(iface)
622 panic ("unable to determine ip for:", wildcard)
624 host = socket.gethostname()
625 local = socket.gethostbyname(host)
626 elif net_type == 'elan':
627 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
629 fp = open('/proc/elan/device0/position', 'r')
630 lines = fp.readlines()
639 elif net_type == 'gm':
640 fixme("automatic local address for GM")
644 def is_prepared(uuid):
645 """Return true if a device exists for the uuid"""
646 # expect this format:
647 # 1 UP ldlm ldlm ldlm_UUID 2
# Column 5 of each lctl device_list line is the device UUID.
649 out = lctl.device_list()
651 if uuid == string.split(s)[4]:
653 except CommandError, e:
658 # ============================================================
659 # Classes to prepare and cleanup the various objects
662 """ Base class for the rest of the modules. The default cleanup method is
663 defined here, as well as some utilitiy funcs.
# NOTE(review): sampled fragment of the Module base class; the class
# statement and several interior lines fall in the gaps of this capture.
665 def __init__(self, module_name, dom_node):
666 self.dom_node = dom_node
667 self.module_name = module_name
668 self.name = get_attr(dom_node, 'name')
669 self.uuid = get_attr(dom_node, 'uuid')
670 self.kmodule_list = []
674 def info(self, *args):
675 msg = string.join(map(str,args))
676 print self.module_name + ":", self.name, self.uuid, msg
679 def lookup_server(self, srv_uuid):
680 """ Lookup a server's network information """
681 net = get_ost_net(self.dom_node.parentNode, srv_uuid)
683 panic ("Unable to find a server for:", srv_uuid)
684 self._server = Network(net)
686 def get_server(self):
# Default cleanup: disconnect from the server (when locally reachable) and
# detach the device via lctl; errors are logged, not fatal.
690 """ default cleanup, used for most modules """
691 if not is_prepared(self.uuid):
694 srv = self.get_server()
695 if srv and local_net(srv):
697 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
698 except CommandError, e:
699 log(self.module_name, "disconnect failed: ", self.name)
703 lctl.cleanup(self.name, self.uuid)
704 except CommandError, e:
705 log(self.module_name, "cleanup failed: ", self.name)
709 def add_module(self, dev_dir, modname):
710 """Append a module to list of modules to load."""
711 self.kmodule_list.append((dev_dir, modname))
713 def mod_loaded(self, modname):
714 """Check if a module is already loaded. Look in /proc/modules for it."""
715 fp = open('/proc/modules')
716 lines = fp.readlines()
718 # please forgive my tired fingers for this one
719 ret = filter(lambda word, mod=modname: word == mod,
720 map(lambda line: string.split(line)[0], lines))
723 def load_module(self):
724 """Load all the modules in the list in the order they appear."""
725 for dev_dir, mod in self.kmodule_list:
726 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
727 if self.mod_loaded(mod) and not config.noexec():
729 log ('loading module:', mod)
# When the module object is found in the source tree insmod it directly,
# otherwise fall back to modprobe (the branch lines are in the gaps).
731 module = find_module(config.src_dir(),dev_dir, mod)
733 panic('module not found:', mod)
734 (rc, out) = run('/sbin/insmod', module)
736 raise CommandError('insmod', out, rc)
738 (rc, out) = run('/sbin/modprobe', mod)
740 raise CommandError('modprobe', out, rc)
742 def cleanup_module(self):
743 """Unload the modules in the list in reverse order."""
744 rev = self.kmodule_list
746 for dev_dir, mod in rev:
747 if not self.mod_loaded(mod):
# Dump the kernel debug log before portals goes away, if requested.
750 if mod == 'portals' and config.dump_file():
751 lctl.dump(config.dump_file())
752 log('unloading module:', mod)
755 (rc, out) = run('/sbin/rmmod', mod)
757 log('! unable to unload module:', mod)
# Per-node network service: loads the NAL modules for the configured net
# type, starts the TCP acceptor, and sets up/tears down routes+connections.
# NOTE(review): sampled fragment -- prepare()/cleanup() def lines and many
# interior lines fall in the gaps of this capture.
761 class Network(Module):
762 def __init__(self,dom_node):
763 Module.__init__(self, 'NETWORK', dom_node)
764 self.net_type = get_attr(dom_node,'type')
765 self.nid = get_text(dom_node, 'server', '*')
766 self.port = get_text_int(dom_node, 'port', 0)
767 self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
768 self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
# A wildcard nid is resolved to this host's address for the net type.
770 self.nid = get_local_address(self.net_type, self.nid)
772 panic("unable to set nid for", self.net_type, self.nid)
773 debug("nid:", self.nid)
775 self.add_module('portals/linux/oslib/', 'portals')
776 if node_needs_router():
777 self.add_module('portals/linux/router', 'kptlrouter')
778 if self.net_type == 'tcp':
779 self.add_module('portals/linux/socknal', 'ksocknal')
780 if self.net_type == 'toe':
781 self.add_module('portals/linux/toenal', 'ktoenal')
782 if self.net_type == 'elan':
783 self.add_module('portals/linux/rqswnal', 'kqswnal')
784 if self.net_type == 'gm':
785 self.add_module('portals/linux/gmnal', 'kgmnal')
786 self.add_module('lustre/obdclass', 'obdclass')
787 self.add_module('lustre/ptlrpc', 'ptlrpc')
# prepare() fragment: start the user-space acceptor for tcp/toe, then add
# routes and connect to tcp peers reachable through this net.
790 self.info(self.net_type, self.nid, self.port)
791 if self.net_type in ('tcp', 'toe'):
792 nal_id = '' # default is socknal
793 if self.net_type == 'toe':
795 ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, nal_id, self.port)
797 raise CommandError(TCP_ACCEPTOR, out, ret)
798 ret = self.dom_node.getElementsByTagName('route_tbl')
800 for r in a.getElementsByTagName('route'):
801 net_type = get_attr(r, 'type')
802 gw = get_attr(r, 'gw')
803 lo = get_attr(r, 'lo')
804 hi = get_attr(r,'hi', '')
805 lctl.add_route(net_type, gw, lo, hi)
806 if net_type == 'tcp' and net_type == self.net_type and hi == '':
807 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
809 panic("no server for nid", lo)
811 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
814 lctl.network(self.net_type, self.nid)
815 lctl.newdev(attach = "ptlrpc RPCDEV RPCDEV_UUID")
# cleanup() fragment: mirror image of prepare -- drop routes/connections,
# detach the RPCDEV, then kill the acceptor.
818 self.info(self.net_type, self.nid, self.port)
819 ret = self.dom_node.getElementsByTagName('route_tbl')
821 for r in a.getElementsByTagName('route'):
822 lo = get_attr(r, 'lo')
823 hi = get_attr(r,'hi', '')
824 if self.net_type == 'tcp' and hi == '':
825 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
827 panic("no server for nid", lo)
830 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
831 except CommandError, e:
832 print "disconnect failed: ", self.name
836 lctl.del_route(self.net_type, self.nid, lo, hi)
837 except CommandError, e:
838 print "del_route failed: ", self.name
843 lctl.cleanup("RPCDEV", "RPCDEV_UUID")
844 except CommandError, e:
845 print "cleanup failed: ", self.name
849 lctl.disconnectAll(self.net_type)
850 except CommandError, e:
851 print "disconnectAll failed: ", self.name
854 if self.net_type == 'tcp':
855 # yikes, this ugly! need to save pid in /var/something
856 run("killall acceptor")
# NOTE(review): sampled fragments of the LDLM, LOV and LOVConfig service
# classes; class statements and several interior lines are in the gaps.
859 def __init__(self,dom_node):
860 Module.__init__(self, 'LDLM', dom_node)
861 self.add_module('lustre/ldlm', 'ldlm')
863 if is_prepared(self.uuid):
866 lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
# LOV: striping parameters come from the first <devices> child; the device
# list is the set of osc uuidrefs and stripecount defaults to its length.
870 def __init__(self,dom_node):
871 Module.__init__(self, 'LOV', dom_node)
872 self.mds_uuid = get_first_ref(dom_node, 'mds')
873 mds= lookup(dom_node.parentNode, self.mds_uuid)
874 self.mds_name = getName(mds)
875 devs = dom_node.getElementsByTagName('devices')
878 self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
879 self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
880 self.pattern = get_attr_int(dev_node, 'pattern', 0)
881 self.devlist = get_all_refs(dev_node, 'osc')
882 self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
883 self.add_module('lustre/mdc', 'mdc')
884 self.add_module('lustre/lov', 'lov')
# prepare() fragment: set up every member OSC, then the MDC, then the lov
# device itself.
887 if is_prepared(self.uuid):
889 for osc_uuid in self.devlist:
890 osc = lookup(self.dom_node.parentNode, osc_uuid)
895 panic('osc not found:', osc_uuid)
896 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
897 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
898 self.stripe_off, self.pattern, self.devlist, self.mds_name)
899 lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
900 setup ="%s" % (mdc_uuid))
903 if not is_prepared(self.uuid):
905 for osc_uuid in self.devlist:
906 osc = lookup(self.dom_node.parentNode, osc_uuid)
911 panic('osc not found:', osc_uuid)
913 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
# Module (un)loading is delegated to each member OSC as well.
916 def load_module(self):
917 for osc_uuid in self.devlist:
918 osc = lookup(self.dom_node.parentNode, osc_uuid)
924 panic('osc not found:', osc_uuid)
925 Module.load_module(self)
928 def cleanup_module(self):
929 Module.cleanup_module(self)
930 for osc_uuid in self.devlist:
931 osc = lookup(self.dom_node.parentNode, osc_uuid)
937 panic('osc not found:', osc_uuid)
# LOVConfig: pushes the LOV striping description to the MDS via lctl.
939 class LOVConfig(Module):
940 def __init__(self,dom_node):
941 Module.__init__(self, 'LOVConfig', dom_node)
942 self.lov_uuid = get_first_ref(dom_node, 'lov')
943 l = lookup(dom_node.parentNode, self.lov_uuid)
948 self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
949 lov.pattern, lov.devlist, lov.mds_name)
950 lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
951 lov.stripe_sz, lov.stripe_off, lov.pattern,
952 string.join(lov.devlist))
# NOTE(review): sampled fragments of the MDS, MDC and OBD service classes;
# class statements and several interior lines are in the gaps.
960 def __init__(self,dom_node):
961 Module.__init__(self, 'MDS', dom_node)
962 self.devname, self.size = get_device(dom_node)
963 self.fstype = get_text(dom_node, 'fstype')
964 # FIXME: if fstype not set, then determine based on kernel version
965 self.format = get_text(dom_node, 'autoformat', "no")
966 if self.fstype == 'extN':
967 self.add_module('lustre/extN', 'extN')
968 self.add_module('lustre/mds', 'mds')
969 self.add_module('lustre/mds', 'mds_%s' % (self.fstype))
# prepare() fragment: format/attach the backing device, attach a single
# shared MDT device if needed, then the mds device itself.
972 if is_prepared(self.uuid):
974 self.info(self.devname, self.fstype, self.format)
975 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
976 if not is_prepared('MDT_UUID'):
977 lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
979 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
980 setup ="%s %s" %(blkdev, self.fstype))
982 if is_prepared('MDT_UUID'):
984 lctl.cleanup("MDT", "MDT_UUID")
985 except CommandError, e:
986 print "cleanup failed: ", self.name
989 if not is_prepared(self.uuid):
992 clean_loop(self.devname)
994 # Very unusual case, as there is no MDC element in the XML anymore
995 # Builds itself from an MDS node
# MDC synthesizes its own name/uuid from the local hostname + MDS name
# rather than reading them from the XML.
997 def __init__(self,dom_node):
998 self.mds = MDS(dom_node)
999 self.dom_node = dom_node
1000 self.module_name = 'MDC'
1001 self.kmodule_list = []
1005 host = socket.gethostname()
1006 self.name = 'MDC_%s_%s' % ( host, self.mds.name )
1007 self.uuid = self.name + '_UUID'
1009 self.lookup_server(self.mds.uuid)
1010 self.add_module('lustre/mdc', 'mdc')
1013 if is_prepared(self.uuid):
1015 self.info(self.mds.uuid)
1016 srv = self.get_server()
1017 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1018 lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
1019 setup ="%s %s" %(self.mds.uuid, srv.uuid))
# OBD: generic storage target (obdfilter/obdecho/...); obdecho needs no
# backing block device.
1022 def __init__(self, dom_node):
1023 Module.__init__(self, 'OBD', dom_node)
1024 self.obdtype = get_attr(dom_node, 'type')
1025 self.devname, self.size = get_device(dom_node)
1026 self.fstype = get_text(dom_node, 'fstype')
1027 # FIXME: if fstype not set, then determine based on kernel version
1028 self.format = get_text(dom_node, 'autoformat', 'yes')
1029 if self.fstype == 'extN':
1030 self.add_module('lustre/extN', 'extN')
1031 self.add_module('lustre/' + self.obdtype, self.obdtype)
1033 # need to check /proc/mounts and /etc/mtab before
1034 # formatting anything.
1035 # FIXME: check if device is already formatted.
1037 if is_prepared(self.uuid):
1039 self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
1040 if self.obdtype == 'obdecho':
1043 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
1044 lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
1045 setup ="%s %s" %(blkdev, self.fstype))
1047 if not is_prepared(self.uuid):
1049 Module.cleanup(self)
1050 if not self.obdtype == 'obdecho':
1051 clean_loop(self.devname)
# NOTE(review): sampled fragments of the OST, VOSC, OSC and Mountpoint
# service classes; class statements and interior lines are in the gaps.
1054 def __init__(self,dom_node):
1055 Module.__init__(self, 'OST', dom_node)
1056 self.obd_uuid = get_first_ref(dom_node, 'obd')
1057 self.add_module('lustre/ost', 'ost')
1060 if is_prepared(self.uuid):
1062 self.info(self.obd_uuid)
1063 lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
1064 setup ="%s" % (self.obd_uuid))
1067 # virtual interface for OSC and LOV
# VOSC wraps either a LOV or a plain OSC depending on the element tag and
# forwards the Module operations to it.
1069 def __init__(self,dom_node):
1070 Module.__init__(self, 'VOSC', dom_node)
1071 if dom_node.nodeName == 'lov':
1072 self.osc = LOV(dom_node)
1074 self.osc = OSC(dom_node)
1079 def load_module(self):
1080 self.osc.load_module()
1081 def cleanup_module(self):
1082 self.osc.cleanup_module()
# OSC: client side of an OST; connects directly or through a router.
1086 def __init__(self,dom_node):
1087 Module.__init__(self, 'OSC', dom_node)
1088 self.obd_uuid = get_first_ref(dom_node, 'obd')
1089 self.ost_uuid = get_first_ref(dom_node, 'ost')
1090 self.lookup_server(self.ost_uuid)
1091 self.add_module('lustre/osc', 'osc')
1094 if is_prepared(self.uuid):
1096 self.info(self.obd_uuid, self.ost_uuid)
1097 srv = self.get_server()
1099 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
# Non-local server: go through a gateway found by find_route, presumably --
# the branch and r = find_route(...) lines are in the gaps; verify.
1103 lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
1105 panic ("no route to", srv.nid)
1107 lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
1108 setup ="%s %s" %(self.obd_uuid, srv.uuid))
1111 if not is_prepared(self.uuid):
1113 srv = self.get_server()
1115 Module.cleanup(self)
1117 self.info(self.obd_uuid, self.ost_uuid)
1121 lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
1122 except CommandError, e:
1123 print "del_route failed: ", self.name
1126 Module.cleanup(self)
# Mountpoint: mounts the client filesystem via the lustre_lite fs type,
# naming the osc and mdc devices in the mount options.
1129 class Mountpoint(Module):
1130 def __init__(self,dom_node):
1131 Module.__init__(self, 'MTPT', dom_node)
1132 self.path = get_text(dom_node, 'path')
1133 self.mds_uuid = get_first_ref(dom_node, 'mds')
1134 self.lov_uuid = get_first_ref(dom_node, 'osc')
1135 self.add_module('lustre/mdc', 'mdc')
1136 self.add_module('lustre/llite', 'llite')
1137 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1142 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
1143 self.info(self.path, self.mds_uuid, self.lov_uuid)
1144 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
1145 (self.lov_uuid, mdc_uuid, self.path)
1146 run("mkdir", self.path)
1149 panic("mount failed:", self.path)
# cleanup fragment: forced umount when --force, plain umount otherwise.
1152 self.info(self.path, self.mds_uuid,self.lov_uuid)
1154 (rc, out) = run("umount -f", self.path)
1156 (rc, out) = run("umount", self.path)
1158 log("umount failed, cleanup will most likely not work.")
1159 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1161 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
1163 def load_module(self):
1164 self.osc.load_module()
1165 Module.load_module(self)
1166 def cleanup_module(self):
1167 Module.cleanup_module(self)
1168 self.osc.cleanup_module()
1171 # ============================================================
1172 # XML processing and query
1173 # TODO: Change query funcs to use XPath, which is muc cleaner
# NOTE(review): sampled fragments of the DOM query helpers; several lines
# (defaults, returns, loop headers) fall in the gaps of this capture.
1175 def get_device(obd):
1176 list = obd.getElementsByTagName('device')
1180 size = get_attr_int(dev, 'size', 0)
1181 return dev.firstChild.data, size
1184 # Get the text content from the first matching child
1185 # If there is no content (or it is all whitespace), return
1187 def get_text(dom_node, tag, default=""):
1188 list = dom_node.getElementsByTagName(tag)
1191 dom_node.normalize()
1192 if dom_node.firstChild:
1193 txt = string.strip(dom_node.firstChild.data)
# Same as get_text but coerces the value to int, panicking when it is not.
1198 def get_text_int(dom_node, tag, default=0):
1199 list = dom_node.getElementsByTagName(tag)
1203 dom_node.normalize()
1204 if dom_node.firstChild:
1205 txt = string.strip(dom_node.firstChild.data)
1210 panic("text value is not integer:", txt)
1213 def get_attr(dom_node, attr, default=""):
1214 v = dom_node.getAttribute(attr)
1219 def get_attr_int(dom_node, attr, default=0):
1221 v = dom_node.getAttribute(attr)
1226 panic("attr value is not integer", v)
1229 def get_first_ref(dom_node, tag):
1230 """ Get the first uuidref of the type TAG. Used one only
1231 one is expected. Returns the uuid."""
1233 refname = '%s_ref' % tag
1234 list = dom_node.getElementsByTagName(refname)
1236 uuid = getRef(list[0])
1239 def get_all_refs(dom_node, tag):
1240 """ Get all the refs of type TAG. Returns list of uuids. """
1242 refname = '%s_ref' % tag
1243 list = dom_node.getElementsByTagName(refname)
1246 uuids.append(getRef(i))
# Resolve a service uuid to the Network node of the server hosting it.
1249 def get_ost_net(dom_node, uuid):
1250 ost = lookup(dom_node, uuid)
1251 uuid = get_first_ref(ost, 'network')
1254 return lookup(dom_node, uuid)
1256 def nid2server(dom_node, nid):
1257 netlist = dom_node.getElementsByTagName('network')
1258 for net_node in netlist:
1259 if get_text(net_node, 'server') == nid:
1260 return Network(net_node)
# lookup(): linear scan of the immediate element children for a matching
# 'uuid' attribute.
1263 def lookup(dom_node, uuid):
1264 for n in dom_node.childNodes:
1265 if n.nodeType == n.ELEMENT_NODE:
1266 if getUUID(n) == uuid:
1273 # Get name attribute of dom_node
def getName(dom_node):
    """Return the element's 'name' attribute ('' when absent)."""
    name_attr = dom_node.getAttribute('name')
    return name_attr
def getRef(dom_node):
    """Return the element's 'uuidref' attribute ('' when absent)."""
    ref = dom_node.getAttribute('uuidref')
    return ref
1280 # Get name attribute of dom_node
def getUUID(dom_node):
    """Return the element's 'uuid' attribute ('' when absent)."""
    uuid_attr = dom_node.getAttribute('uuid')
    return uuid_attr
1284 # the tag name is the service type
1285 # fixme: this should do some checks to make sure the dom_node is a service
def getServiceType(dom_node):
    """The service type of a node is simply its element tag name."""
    tag = dom_node.nodeName
    return tag
1290 # determine what "level" a particular node is at.
1291 # the order of iniitailization is based on level.
# Map a service element to its startup level; the 'ret = N' assignments for
# each branch fall in the gaps of this capture. Services outside the
# --minlevel/--maxlevel window are filtered out here.
1292 def getServiceLevel(dom_node):
1293 type = getServiceType(dom_node)
1295 if type in ('network',):
1297 elif type in ('device', 'ldlm'):
1299 elif type in ('obd', 'mdd'):
1301 elif type in ('mds','ost'):
1303 elif type in ('mdc','osc'):
1305 elif type in ('lov', 'lovconfig'):
1307 elif type in ('mountpoint',):
1310 if ret < config.minlevel() or ret > config.maxlevel():
1315 # return list of services in a profile. list is a list of tuples
1316 # [(level, dom_node),]
1317 def getServices(lustreNode, profileNode):
1319 for n in profileNode.childNodes:
1320 if n.nodeType == n.ELEMENT_NODE:
1321 servNode = lookup(lustreNode, getRef(n))
1324 panic('service not found: ' + getRef(n))
1325 level = getServiceLevel(servNode)
1327 list.append((level, servNode))
# Find an element of the given tag with a matching 'name' attribute.
1331 def getByName(lustreNode, name, tag):
1332 ndList = lustreNode.getElementsByTagName(tag)
1334 if getName(nd) == name:
1339 ############################################################
1341 # FIXME: clean this mess up!
# Set up (at most once) the MDC for an MDS; results are memoized per
# mds_uuid in the saved_mdc dict.
1344 def prepare_mdc(dom_node, mds_uuid):
1346 mds_node = lookup(dom_node, mds_uuid);
1348 panic("no mds:", mds_uuid)
1349 if saved_mdc.has_key(mds_uuid):
1350 return saved_mdc[mds_uuid]
1353 saved_mdc[mds_uuid] = mdc.uuid
1356 def cleanup_mdc(dom_node, mds_uuid):
1358 mds_node = lookup(dom_node, mds_uuid);
1360 panic("no mds:", mds_uuid)
1361 if not saved_mdc.has_key(mds_uuid):
# NOTE(review): re-recording the uuid during *cleanup* looks suspicious --
# one would expect a del here; verify against the complete file.
1364 saved_mdc[mds_uuid] = mdc.uuid
1367 ############################################################
1368 # routing ("rooting")
# Record the (net type, nid) pairs of the local node's interfaces into the
# module-level local_node list.
1374 def init_node(dom_node):
1375 global local_node, router_flag
1376 netlist = dom_node.getElementsByTagName('network')
1377 for dom_net in netlist:
1378 type = get_attr(dom_net, 'type')
1379 gw = get_text(dom_net, 'server')
1380 local_node.append((type, gw))
1382 def node_needs_router():
1385 def get_routes(type, gw, dom_net):
1386 """ Return the routes as a list of tuples of the form:
1387 [(type, gw, lo, hi),]"""
1389 tbl = dom_net.getElementsByTagName('route_tbl')
1391 routes = t.getElementsByTagName('route')
1393 lo = get_attr(r, 'lo')
1394 hi = get_attr(r, 'hi', '')
1395 res.append((type, gw, lo, hi))
1399 def init_route_config(lustre):
1400 """ Scan the lustre config looking for routers. Build list of
1402 global routes, router_flag
1404 list = lustre.getElementsByTagName('node')
1406 if get_attr(node, 'router'):
1408 for (local_type, local_nid) in local_node:
1410 netlist = node.getElementsByTagName('network')
# First pass: find the router's nid on a net we share with it (the gateway).
1411 for dom_net in netlist:
1412 if local_type == get_attr(dom_net, 'type'):
1413 gw = get_text(dom_net, 'server')
# Second pass: collect routes through that gateway for the other nets.
1417 for dom_net in netlist:
1418 if local_type != get_attr(dom_net, 'type'):
1419 for route in get_routes(local_type, gw, dom_net):
1420 routes.append(route)
# local_net() fragment: true when we have a local interface of net's type.
1425 for iface in local_node:
1426 if net.net_type == iface[0]:
1430 def find_route(net):
1431 global local_node, routes
1432 frm_type = local_node[0][0]
1433 to_type = net.net_type
1435 debug ('looking for route to', to_type,to)
1444 ############################################################
# Dispatch one service element to its Module subclass and run the requested
# phase. NOTE(review): sampled fragment -- most constructor branches and
# the load/prepare vs cleanup dispatch lines fall in the gaps.
1447 def startService(dom_node, module_flag):
1448 type = getServiceType(dom_node)
1449 debug('Service:', type, getName(dom_node), getUUID(dom_node))
1450 # there must be a more dynamic way of doing this...
1456 elif type == 'lovconfig':
1457 n = LOVConfig(dom_node)
1458 elif type == 'network':
1459 n = Network(dom_node)
1470 elif type == 'mountpoint':
1471 n = Mountpoint(dom_node)
1473 panic ("unknown service type:", type)
1478 if config.cleanup():
1483 if config.nosetup():
1485 if config.cleanup():
1491 # Prepare the system to run lustre using a particular profile
# in the configuration.
1493 # * load & the modules
1494 # * setup networking for the current node
1495 # * make sure partitions are in place and prepared
1496 # * initialize devices with lctl
# Levels are important, and need to be enforced.
def startProfile(lustreNode, profileNode, module_flag):
    # Start (or clean up) every service listed in a <profile> node.
    # NOTE(review): the guard preceding panic(), any ordering of the
    # service list, and the loop binding `s` are elided in this extract.
    panic("profile:", profile, "not found.")
    services = getServices(lustreNode, profileNode)
    # cleanup presumably processes the services in reverse -- confirm
    if config.cleanup():
    startService(s[1], module_flag)
def doHost(lustreNode, hosts):
    # Configure (or clean up) lustre on this host using the first matching
    # <node> entry from the config.
    # NOTE(review): the loop over `hosts` binding `h` and several guards
    # are elided in this extract; indentation is reconstructed.
    dom_node = getByName(lustreNode, h, 'node')
    print 'No host entry found.'
    # non-router nodes consume the route table built from the config
    if not get_attr(dom_node, 'router'):
        init_route_config(lustreNode)
    # Two step process: (1) load modules, (2) setup lustre
    # if not cleaning, load modules first.
    module_flag = not config.cleanup()
    reflist = dom_node.getElementsByTagName('profile')
    for profile in reflist:
        startProfile(lustreNode, profile, module_flag)
    if not config.cleanup():
        sys_set_debug_path()
        script = config.gdb_script()
        run(lctl.lctl, ' modules >', script)
        # dump /tmp/ogdb and sleep/pause here
        log ("The GDB module script is in", script)
    # second pass with the flag inverted: service setup after module load
    # on startup, module unload after service cleanup on shutdown
    module_flag = not module_flag
    for profile in reflist:
        startProfile(lustreNode, profile, module_flag)
1549 ############################################################
1550 # Command line processing
def parse_cmdline(argv):
    # Parse lconf's command line (options documented in usage()) and
    # return the positional arguments.
    # NOTE(review): the try wrapper, the `for o, a in opts:` loop header
    # and the per-option config assignments are elided in this extract.
    short_opts = "hdnvf"
    long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
                 "portals=", "makeldiff", "cleanup", "noexec",
                 "help", "node=", "nomod", "nosetup",
                 "dump=", "force", "minlevel=", "maxlevel="]
    opts, args = getopt.getopt(argv, short_opts, long_opts)
    # bad option: presumably prints usage and exits (body elided)
    except getopt.error:
    if o in ("-h", "--help"):
    if o in ("-d","--cleanup"):
    if o in ("-v", "--verbose"):
    if o in ("-n", "--noexec"):
    if o == "--portals":
    if o == "--reformat":
    if o == "--nosetup":
    if o in ("-f", "--force"):
    if o in ("--minlevel",):
    if o in ("--maxlevel",):
1605 s = urllib.urlopen(url)
def setupModulePath(cmd):
    """Detect a source-tree invocation and record the tree root.

    If a Makefile is readable next to the lconf executable, lconf is
    running from a build tree; point config at the tree's top directory
    so kernel modules are loaded from there.
    """
    exe_dir = os.path.dirname(cmd)
    if os.access(exe_dir + "/Makefile", os.R_OK):
        config.src_dir(exe_dir + "/../../")
def sys_set_debug_path():
    # Tell the portals layer where to write its debug log.
    # NOTE(review): a guard (and possibly error handling) between the
    # debug() call and the open() is elided in this extract.
    debug("debug path: ", config.debug_path())
    fp = open('/proc/sys/portals/debug_path', 'w')
    fp.write(config.debug_path())
    # NOTE(review): the fp.close() is elided in this extract -- confirm.
#/proc/sys/net/core/rmem_max
#/proc/sys/net/core/wmem_max
def sys_set_netmem_max(path, max):
    # Raise the kernel socket-buffer limit at `path` to at least `max`.
    # NOTE(review): the lines that read the current value and skip the
    # write when it is already large enough are elided in this extract.
    debug("setting", path, "to at least", max)
    fp = open(path, 'w')
    fp.write('%d\n' %(max))
    # NOTE(review): the fp.close() is elided in this extract -- confirm.
def sys_make_devices():
    """Ensure the portals and obd character device nodes exist.

    Each missing node is created with mknod using the character-device
    major/minor pairs (10,240) and (10,241).
    """
    device_nodes = (
        ('/dev/portals', 'mknod /dev/portals c 10 240'),
        ('/dev/obd', 'mknod /dev/obd c 10 241'),
    )
    for dev_path, mknod_cmd in device_nodes:
        if not os.access(dev_path, os.R_OK):
            run(mknod_cmd)
# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    """Append new_dir to os.environ['PATH'] unless it is already listed.

    Mutates the process environment in place; returns nothing.
    """
    # str.split replaces the deprecated string.split helper (same result)
    syspath = os.environ['PATH'].split(':')
    if new_dir in syspath:
        # already on the path -- nothing to do
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
# Directories that must be searchable for the system binaries lconf runs.
DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
def sanitise_path():
    # NOTE(review): the loop body is elided in this extract; presumably it
    # calls add_to_path(dir) for each default directory -- confirm.
    for dir in DEFAULT_PATH:
# Initialize or shutdown lustre according to a configuration file
# * prepare the system for lustre
# * configure devices with lctl
# Shutdown does steps in reverse
# NOTE(review): the enclosing `def main():` line and several statements of
# its body are elided in this extract; indentation is reconstructed.
global TCP_ACCEPTOR, lctl, MAXTCPBUF
host = socket.gethostname()
args = parse_cmdline(sys.argv[1:])
# a config file given on the command line is read directly ...
if not os.access(args[0], os.R_OK):
    print 'File not found or readable:', args[0]
dom = xml.dom.minidom.parse(args[0])
# ... otherwise the XML config is fetched from the --get URL
xmldata = fetch(config.url())
dom = xml.dom.minidom.parseString(xmldata)
# candidate node names to look up in the config: the --node value,
# then the hostname, then 'localhost' as a fallback
node_list.append(config.node())
node_list.append(host)
node_list.append('localhost')
debug("configuring for host: ", node_list)
# make the per-host debug and gdb-script file names unique
config._debug_path = config._debug_path + '-' + host
config._gdb_script = config._gdb_script + '-' + host
TCP_ACCEPTOR = find_prog('acceptor')
if not TCP_ACCEPTOR:
    # NOTE(review): branch structure elided; the original likely warns
    # under --noexec and panics otherwise -- confirm.
    TCP_ACCEPTOR = 'acceptor'
    debug('! acceptor not found')
    panic('acceptor not found')
lctl = LCTLInterface('lctl')
setupModulePath(sys.argv[0])
# raise the kernel socket-buffer ceilings to at least MAXTCPBUF
sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
doHost(dom.documentElement, node_list)
if __name__ == "__main__":
    # Script entry point.
    # NOTE(review): the try block invoking main() and the bodies of the
    # exception handlers (Python 2 `except X, e` syntax) are elided in
    # this extract.
    except LconfError, e:
    except CommandError, e:
    # exit with the first error recorded during cleanup, if any
    if first_cleanup_error:
        sys.exit(first_cleanup_error)