3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # This file is part of Lustre, http://www.lustre.org.
7 # Lustre is free software; you can redistribute it and/or
8 # modify it under the terms of version 2 of the GNU General Public
9 # License as published by the Free Software Foundation.
11 # Lustre is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with Lustre; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # lconf - lustre configuration tool
22 # lconf is the main driver script for starting and stopping
23 # lustre filesystem services.
25 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import string, os, stat, popen2, socket, time, random
30 import xml.dom.minidom
35 DEFAULT_TCPBUF = 1048576
37 # Maximum number of devices to search for.
38 # (the /dev/loop* nodes need to be created beforehand)
39 MAX_LOOP_DEVICES = 256
40 PORTALS_DIR = '@PORTALSLOC@'
# Holds the first non-ignorable cleanup return code seen during shutdown.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record rc as the first cleanup error, unless one is already stored.

    Once a truthy code has been recorded, later codes are ignored, so the
    earliest failure is the one reported at exit.
    """
    global first_cleanup_error
    if first_cleanup_error:
        return
    first_cleanup_error = rc
50 print """usage: lconf config.xml
52 config.xml Lustre configuration in xml format.
53 --get <url> URL to fetch a config file
54 --node <nodename> Load config for <nodename>
55 -d | --cleanup Cleans up config. (Shutdown)
56 -f | --force Forced unmounting and/or obd detach during cleanup
57 -v | --verbose Print system commands as they are run
58 -h | --help Print this help
59 --gdb Prints message after creating gdb module script
60 and sleeps for 5 seconds.
61 -n | --noexec Prints the commands and steps that will be run for a
62 config without executing them. This can used to check if a
63 config file is doing what it should be doing. (Implies -v)
64 --nomod Skip load/unload module step.
65 --nosetup Skip device setup/cleanup step.
66 --reformat Reformat all devices (without question)
67 --dump <file> Dump the kernel debug log before portals is unloaded
68 --minlevel <num> Specify the minimum level of services to configure/cleanup (default 0)
69 --maxlevel <num> Specify the maximum level of services to configure/cleanup (default 100)
70 Levels are aproximatly like:
77 70 - mountpoint, echo_client
80 --ldap server LDAP server with lustre config database
81 --makeldiff Translate xml source to LDIFF
82 This are perhaps not needed:
83 --lustre="src dir" Base directory of lustre sources. Used to search
85 --portals=src Portals source
89 # ============================================================
90 # Config parameters, encapsulated in a class
106 self._gdb_script = '/tmp/ogdb'
107 self._debug_path = '/tmp/lustre-log'
108 self._dump_file = None
113 def verbose(self, flag = None):
114 if flag: self._verbose = flag
117 def noexec(self, flag = None):
118 if flag: self._noexec = flag
121 def reformat(self, flag = None):
122 if flag: self._reformat = flag
123 return self._reformat
125 def cleanup(self, flag = None):
126 if flag: self._cleanup = flag
129 def gdb(self, flag = None):
130 if flag: self._gdb = flag
133 def nomod(self, flag = None):
134 if flag: self._nomod = flag
137 def nosetup(self, flag = None):
138 if flag: self._nosetup = flag
141 def force(self, flag = None):
142 if flag: self._force = flag
145 def node(self, val = None):
146 if val: self._node = val
149 def url(self, val = None):
150 if val: self._url = val
153 def gdb_script(self):
154 if os.path.isdir('/r'):
155 return '/r' + self._gdb_script
157 return self._gdb_script
159 def debug_path(self):
160 if os.path.isdir('/r'):
161 return '/r' + self._debug_path
163 return self._debug_path
165 def src_dir(self, val = None):
166 if val: self._src_dir = val
169 def dump_file(self, val = None):
170 if val: self._dump_file = val
171 return self._dump_file
173 def minlevel(self, val = None):
174 if val: self._minlevel = int(val)
175 return self._minlevel
177 def maxlevel(self, val = None):
178 if val: self._maxlevel = int(val)
179 return self._maxlevel
185 # ============================================================
186 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort by raising LconfError reporting that *msg* is unimplemented."""
    # Fixed typo in the message ("implmemented") and switched the raise to
    # the callable form, which is valid in both Python 2 and 3.
    raise LconfError(msg + ' not implemented yet.')
192 msg = string.join(map(str,args))
193 if not config.noexec():
194 raise LconfError(msg)
199 msg = string.join(map(str,args))
204 print string.strip(s)
208 msg = string.join(map(str,args))
211 # ============================================================
212 # locally defined exceptions
213 class CommandError (exceptions.Exception):
214 def __init__(self, cmd_name, cmd_err, rc=None):
215 self.cmd_name = cmd_name
216 self.cmd_err = cmd_err
221 if type(self.cmd_err) == types.StringType:
223 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
225 print "! %s: %s" % (self.cmd_name, self.cmd_err)
226 elif type(self.cmd_err) == types.ListType:
228 print "! %s (error %d):" % (self.cmd_name, self.rc)
230 print "! %s:" % (self.cmd_name)
231 for s in self.cmd_err:
232 print "> %s" %(string.strip(s))
236 class LconfError (exceptions.Exception):
237 def __init__(self, args):
241 # ============================================================
242 # handle lctl interface
245 Manage communication with lctl
248 def __init__(self, cmd):
250 Initialize close by finding the lctl binary.
252 self.lctl = find_prog(cmd)
255 debug('! lctl not found')
258 raise CommandError('lctl', "unable to find lctl binary.")
263 the cmds are written to stdin of lctl
264 lctl doesn't return errors when run in script mode, so
266 should modify command line to accept multiple commands, or
267 create complex command line options
269 debug("+", self.lctl, cmds)
270 if config.noexec(): return (0, [])
271 p = popen2.Popen3(self.lctl, 1)
272 p.tochild.write(cmds + "\n")
274 out = p.fromchild.readlines()
275 err = p.childerr.readlines()
277 if os.WIFEXITED(ret):
278 rc = os.WEXITSTATUS(ret)
282 raise CommandError(self.lctl, err, rc)
285 def runcmd(self, *args):
287 run lctl using the command line
289 cmd = string.join(map(str,args))
290 debug("+", self.lctl, cmd)
291 rc, out = run(self.lctl, cmd)
293 raise CommandError(self.lctl, out, rc)
297 def network(self, net, nid):
298 """ initialized network and add "self" """
299 # Idea: "mynid" could be used for all network types to add "self," and then
300 # this special case would be gone and the "self" hack would be hidden.
301 if net in ('tcp', 'toe'):
306 quit""" % (net, nid, nid)
315 # create a new connection
316 def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
317 if net in ('tcp', 'toe'):
324 quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
330 quit""" % (net, servuuid, nid, nid, port, )
334 # add a route to a range
335 def add_route(self, net, gw, lo, hi):
339 quit """ % (net, gw, lo, hi)
343 def del_route(self, net, gw, lo, hi):
351 # add a route to a host
352 def add_route_host(self, net, uuid, gw, tgt):
357 quit """ % (net, uuid, tgt, gw, tgt)
360 # add a route to a range
361 def del_route_host(self, net, uuid, gw, tgt):
367 quit """ % (net, uuid, tgt)
370 # disconnect one connection
371 def disconnect(self, net, nid, port, servuuid):
377 quit""" % (net, nid, servuuid)
381 def disconnectAll(self, net):
390 # create a new device with lctl
391 def newdev(self, attach, setup = ""):
396 quit""" % (attach, setup)
400 def cleanup(self, name, uuid):
406 quit""" % (name, ('', 'force')[config.force()])
410 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
414 lov_setconfig %s %d %d %d %s %s
415 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
419 def dump(self, dump_file):
422 quit""" % (dump_file)
425 # get list of devices
426 def device_list(self):
427 rc, out = self.runcmd('device_list')
431 def lustre_version(self):
432 rc, out = self.runcmd('version')
435 # ============================================================
436 # Various system-level functions
437 # (ideally moved to their own module)
439 # Run a command and return the output and status.
440 # stderr is sent to /dev/null, could use popen3 to
441 # save it if necessary
443 cmd = string.join(map(str,args))
445 if config.noexec(): return (0, [])
446 f = os.popen(cmd + ' 2>&1')
455 # Run a command in the background.
456 def run_daemon(*args):
457 cmd = string.join(map(str,args))
459 if config.noexec(): return 0
460 f = os.popen(cmd + ' 2>&1')
468 # Determine full path to use for an external command
469 # searches dirname(argv[0]) first, then PATH
471 syspath = string.split(os.environ['PATH'], ':')
472 cmdpath = os.path.dirname(sys.argv[0])
473 syspath.insert(0, cmdpath);
474 syspath.insert(0, os.path.join(cmdpath, PORTALS_DIR+'/linux/utils/'))
476 prog = os.path.join(d,cmd)
478 if os.access(prog, os.X_OK):
482 # Recursively look for file starting at base dir
483 def do_find_file(base, mod):
484 fullname = os.path.join(base, mod)
485 if os.access(fullname, os.R_OK):
487 for d in os.listdir(base):
488 dir = os.path.join(base,d)
489 if os.path.isdir(dir):
490 module = do_find_file(dir, mod)
494 def find_module(dev_dir, modname):
495 mod = '%s.o' % (modname)
497 module = dev_dir +'/'+ mod
499 if os.access(module, os.R_OK):
505 # is the path a block device?
512 return stat.S_ISBLK(s[stat.ST_MODE])
514 # build fs according to type
516 def mkfs(fstype, dev):
517 if(fstype in ('ext3', 'extN')):
518 mkfs = 'mkfs.ext2 -j -b 4096'
519 elif (fstype == 'reiserfs'):
520 mkfs = 'mkfs.reiserfs -f'
522 print 'unsupported fs type: ', fstype
523 if not is_block(dev):
524 if(fstype in ('ext3', 'extN')):
526 elif (fstype == 'reiserfs'):
529 print 'unsupported fs type: ', fstype
532 (ret, out) = run (mkfs, force, dev)
534 panic("Unable to build fs:", dev)
535 # enable hash tree indexing on fsswe
536 # FIXME: this check can probably go away on 2.5
538 htree = 'echo "feature FEATURE_C5" | debugfs -w'
539 (ret, out) = run (htree, dev)
541 panic("Unable to enable htree:", dev)
543 # some systems use /dev/loopN, some /dev/loop/N
547 if not os.access(loop + str(0), os.R_OK):
549 if not os.access(loop + str(0), os.R_OK):
550 panic ("can't access loop devices")
553 # find loop device assigned to thefile
556 for n in xrange(0, MAX_LOOP_DEVICES):
558 if os.access(dev, os.R_OK):
559 (stat, out) = run('losetup', dev)
560 if (out and stat == 0):
561 m = re.search(r'\((.*)\)', out[0])
562 if m and file == m.group(1):
568 # create file if necessary and assign the first free loop device
569 def init_loop(file, size, fstype):
570 dev = find_loop(file)
572 print 'WARNING file:', file, 'already mapped to', dev
574 if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
575 run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
577 # find next free loop
578 for n in xrange(0, MAX_LOOP_DEVICES):
580 if os.access(dev, os.R_OK):
581 (stat, out) = run('losetup', dev)
583 run('losetup', dev, file)
586 print "out of loop devices"
588 print "out of loop devices"
591 # undo loop assignment
592 def clean_loop(file):
593 dev = find_loop(file)
595 ret, out = run('losetup -d', dev)
597 log('unable to clean loop device:', dev, 'for file:', file)
600 # determine if dev is formatted as a <fstype> filesystem
601 def need_format(fstype, dev):
602 # FIXME don't know how to implement this
605 # initialize a block device if needed
606 def block_dev(dev, size, fstype, format):
607 if config.noexec(): return dev
608 if not is_block(dev):
609 dev = init_loop(dev, size, fstype)
610 if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
614 # panic("device:", dev,
615 # "not prepared, and autoformat is not set.\n",
616 # "Rerun with --reformat option to format ALL filesystems")
621 """lookup IP address for an interface"""
622 rc, out = run("/sbin/ifconfig", iface)
625 addr = string.split(out[1])[1]
626 ip = string.split(addr, ':')[1]
629 def get_local_address(net_type, wildcard):
630 """Return the local address for the network type."""
632 if net_type in ('tcp', 'toe'):
634 iface, star = string.split(wildcard, ':')
635 local = if2addr(iface)
637 panic ("unable to determine ip for:", wildcard)
639 host = socket.gethostname()
640 local = socket.gethostbyname(host)
641 elif net_type == 'elan':
642 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
644 fp = open('/proc/elan/device0/position', 'r')
645 lines = fp.readlines()
654 elif net_type == 'gm':
655 fixme("automatic local address for GM")
659 def is_prepared(uuid):
660 """Return true if a device exists for the uuid"""
661 # expect this format:
662 # 1 UP ldlm ldlm ldlm_UUID 2
664 out = lctl.device_list()
666 if uuid == string.split(s)[4]:
668 except CommandError, e:
673 # ============================================================
674 # Classes to prepare and cleanup the various objects
677 """ Base class for the rest of the modules. The default cleanup method is
678 defined here, as well as some utilitiy funcs.
680 def __init__(self, module_name, dom_node):
681 self.dom_node = dom_node
682 self.module_name = module_name
683 self.name = get_attr(dom_node, 'name')
684 self.uuid = get_attr(dom_node, 'uuid')
685 self.kmodule_list = []
689 def info(self, *args):
690 msg = string.join(map(str,args))
691 print self.module_name + ":", self.name, self.uuid, msg
694 def lookup_server(self, srv_uuid):
695 """ Lookup a server's network information """
696 net = get_ost_net(self.dom_node.parentNode, srv_uuid)
698 panic ("Unable to find a server for:", srv_uuid)
699 self._server = Network(net)
701 def get_server(self):
705 """ default cleanup, used for most modules """
707 srv = self.get_server()
708 if srv and local_net(srv):
710 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
711 except CommandError, e:
712 log(self.module_name, "disconnect failed: ", self.name)
716 lctl.cleanup(self.name, self.uuid)
717 except CommandError, e:
718 log(self.module_name, "cleanup failed: ", self.name)
722 def add_module(self, dev_dir, modname):
723 """Append a module to list of modules to load."""
724 self.kmodule_list.append((dev_dir, modname))
726 def mod_loaded(self, modname):
727 """Check if a module is already loaded. Look in /proc/modules for it."""
728 fp = open('/proc/modules')
729 lines = fp.readlines()
731 # please forgive my tired fingers for this one
732 ret = filter(lambda word, mod=modname: word == mod,
733 map(lambda line: string.split(line)[0], lines))
736 def load_module(self):
737 """Load all the modules in the list in the order they appear."""
738 for dev_dir, mod in self.kmodule_list:
739 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
740 if self.mod_loaded(mod) and not config.noexec():
742 log ('loading module:', mod)
744 module = find_module(dev_dir, mod)
746 panic('module not found:', mod)
747 (rc, out) = run('/sbin/insmod', module)
749 raise CommandError('insmod', out, rc)
751 (rc, out) = run('/sbin/modprobe', mod)
753 raise CommandError('modprobe', out, rc)
755 def cleanup_module(self):
756 """Unload the modules in the list in reverse order."""
757 rev = self.kmodule_list
759 for dev_dir, mod in rev:
760 if not self.mod_loaded(mod):
763 if mod == 'portals' and config.dump_file():
764 lctl.dump(config.dump_file())
765 log('unloading module:', mod)
768 (rc, out) = run('/sbin/rmmod', mod)
770 log('! unable to unload module:', mod)
774 class Network(Module):
775 def __init__(self,dom_node):
776 Module.__init__(self, 'NETWORK', dom_node)
777 self.net_type = get_attr(dom_node,'type')
778 self.nid = get_text(dom_node, 'server', '*')
779 self.port = get_text_int(dom_node, 'port', 0)
780 self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
781 self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
783 self.nid = get_local_address(self.net_type, self.nid)
785 panic("unable to set nid for", self.net_type, self.nid)
786 debug("nid:", self.nid)
788 self.add_module(PORTALS_DIR+"/linux/oslib", 'portals')
789 if node_needs_router():
790 self.add_module(PORTALS_DIR+"/linux/router", 'kptlrouter')
791 if self.net_type == 'tcp':
792 self.add_module(PORTALS_DIR+"/linux/socknal", 'ksocknal')
793 if self.net_type == 'toe':
794 self.add_module(PORTALS_DIR+"/linux/toenal", 'ktoenal')
795 if self.net_type == 'elan':
796 self.add_module(PORTALS_DIR+"/linux/rqswnal", 'kqswnal')
797 if self.net_type == 'gm':
798 self.add_module(PORTALS_DIR+"/linux/gmnal", 'kgmnal')
799 self.add_module(config.src_dir()+'obdclass', 'obdclass')
800 self.add_module(config.src_dir()+'ptlrpc', 'ptlrpc')
803 self.info(self.net_type, self.nid, self.port)
804 if self.net_type in ('tcp', 'toe'):
805 nal_id = '' # default is socknal
806 if self.net_type == 'toe':
808 ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, nal_id, self.port)
810 raise CommandError(TCP_ACCEPTOR, out, ret)
811 ret = self.dom_node.getElementsByTagName('route_tbl')
813 for r in a.getElementsByTagName('route'):
814 net_type = get_attr(r, 'type')
815 gw = get_attr(r, 'gw')
816 lo = get_attr(r, 'lo')
817 hi = get_attr(r,'hi', '')
818 lctl.add_route(net_type, gw, lo, hi)
819 if net_type in ('tcp', 'toe') and net_type == self.net_type and hi == '':
820 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
822 panic("no server for nid", lo)
824 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
827 lctl.network(self.net_type, self.nid)
828 lctl.newdev(attach = "ptlrpc RPCDEV RPCDEV_UUID")
831 self.info(self.net_type, self.nid, self.port)
832 ret = self.dom_node.getElementsByTagName('route_tbl')
834 for r in a.getElementsByTagName('route'):
835 lo = get_attr(r, 'lo')
836 hi = get_attr(r,'hi', '')
837 if self.net_type in ('tcp', 'toe') and hi == '':
838 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
840 panic("no server for nid", lo)
843 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
844 except CommandError, e:
845 print "disconnect failed: ", self.name
849 lctl.del_route(self.net_type, self.nid, lo, hi)
850 except CommandError, e:
851 print "del_route failed: ", self.name
856 lctl.cleanup("RPCDEV", "RPCDEV_UUID")
857 except CommandError, e:
858 print "cleanup failed: ", self.name
862 lctl.disconnectAll(self.net_type)
863 except CommandError, e:
864 print "disconnectAll failed: ", self.name
867 if self.net_type in ('tcp', 'toe'):
868 # yikes, this ugly! need to save pid in /var/something
869 run("killall acceptor")
872 def __init__(self,dom_node):
873 Module.__init__(self, 'LDLM', dom_node)
874 self.add_module(config.src_dir()+'ldlm', 'ldlm')
876 if is_prepared(self.uuid):
879 lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
883 def __init__(self,dom_node):
884 Module.__init__(self, 'LOV', dom_node)
885 self.mds_uuid = get_first_ref(dom_node, 'mds')
886 mds= lookup(dom_node.parentNode, self.mds_uuid)
887 self.mds_name = getName(mds)
888 devs = dom_node.getElementsByTagName('devices')
891 self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
892 self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
893 self.pattern = get_attr_int(dev_node, 'pattern', 0)
894 self.devlist = get_all_refs(dev_node, 'osc')
895 self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
896 self.add_module(config.src_dir()+'mdc', 'mdc')
897 self.add_module(config.src_dir()+'lov', 'lov')
900 if is_prepared(self.uuid):
902 for osc_uuid in self.devlist:
903 osc = lookup(self.dom_node.parentNode, osc_uuid)
907 # Ignore connection failures, because the LOV will DTRT with
908 # an unconnected OSC.
909 n.prepare(ignore_connect_failure=1)
911 print "Error preparing OSC %s (inactive)\n" % osc_uuid
913 panic('osc not found:', osc_uuid)
914 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
915 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
916 self.stripe_off, self.pattern, self.devlist, self.mds_name)
917 lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
918 setup ="%s" % (mdc_uuid))
921 if not is_prepared(self.uuid):
923 for osc_uuid in self.devlist:
924 osc = lookup(self.dom_node.parentNode, osc_uuid)
929 panic('osc not found:', osc_uuid)
931 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
934 def load_module(self):
935 for osc_uuid in self.devlist:
936 osc = lookup(self.dom_node.parentNode, osc_uuid)
942 panic('osc not found:', osc_uuid)
943 Module.load_module(self)
946 def cleanup_module(self):
947 Module.cleanup_module(self)
948 for osc_uuid in self.devlist:
949 osc = lookup(self.dom_node.parentNode, osc_uuid)
955 panic('osc not found:', osc_uuid)
957 class LOVConfig(Module):
958 def __init__(self,dom_node):
959 Module.__init__(self, 'LOVConfig', dom_node)
960 self.lov_uuid = get_first_ref(dom_node, 'lov')
961 l = lookup(dom_node.parentNode, self.lov_uuid)
966 self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
967 lov.pattern, lov.devlist, lov.mds_name)
968 lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
969 lov.stripe_sz, lov.stripe_off, lov.pattern,
970 string.join(lov.devlist))
978 def __init__(self,dom_node):
979 Module.__init__(self, 'MDS', dom_node)
980 self.devname, self.size = get_device(dom_node)
981 self.fstype = get_text(dom_node, 'fstype')
982 # FIXME: if fstype not set, then determine based on kernel version
983 self.format = get_text(dom_node, 'autoformat', "no")
984 if self.fstype == 'extN':
985 self.add_module(config.src_dir()+'extN', 'extN')
986 self.add_module(config.src_dir()+'mds', 'mds')
987 self.add_module(config.src_dir()+'obdclass', 'fsfilt_%s'%(self.fstype))
990 if is_prepared(self.uuid):
992 self.info(self.devname, self.fstype, self.format)
993 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
994 if not is_prepared('MDT_UUID'):
995 lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
997 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
998 setup ="%s %s" %(blkdev, self.fstype))
1000 if is_prepared('MDT_UUID'):
1002 lctl.cleanup("MDT", "MDT_UUID")
1003 except CommandError, e:
1004 print "cleanup failed: ", self.name
1007 if not is_prepared(self.uuid):
1009 Module.cleanup(self)
1010 clean_loop(self.devname)
1012 # Very unusual case, as there is no MDC element in the XML anymore
1013 # Builds itself from an MDS node
1015 def __init__(self,dom_node):
1016 self.mds = MDS(dom_node)
1017 self.dom_node = dom_node
1018 self.module_name = 'MDC'
1019 self.kmodule_list = []
1023 host = socket.gethostname()
1024 self.name = 'MDC_%s' % (self.mds.name)
1025 self.uuid = '%s_%05x_%05x' % (self.name, int(random.random() * 1048576),
1026 int(random.random() * 1048576))
1028 self.lookup_server(self.mds.uuid)
1029 self.add_module(config.src_dir()+'mdc', 'mdc')
1032 if is_prepared(self.uuid):
1034 self.info(self.mds.uuid)
1035 srv = self.get_server()
1036 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1037 lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
1038 setup ="%s %s" %(self.mds.uuid, srv.uuid))
1041 def __init__(self, dom_node):
1042 Module.__init__(self, 'OBD', dom_node)
1043 self.obdtype = get_attr(dom_node, 'type')
1044 self.devname, self.size = get_device(dom_node)
1045 self.fstype = get_text(dom_node, 'fstype')
1046 # FIXME: if fstype not set, then determine based on kernel version
1047 self.format = get_text(dom_node, 'autoformat', 'yes')
1048 if self.fstype == 'extN':
1049 self.add_module(config.src_dir()+'extN', 'extN')
1050 self.add_module(config.src_dir()+'' + self.obdtype, self.obdtype)
1051 self.add_module(config.src_dir()+'obdclass' , 'fsfilt_%s' % (self.fstype))
1053 # need to check /proc/mounts and /etc/mtab before
1054 # formatting anything.
1055 # FIXME: check if device is already formatted.
1057 if is_prepared(self.uuid):
1059 self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
1060 if self.obdtype == 'obdecho':
1063 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
1064 lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
1065 setup ="%s %s" %(blkdev, self.fstype))
1067 if not is_prepared(self.uuid):
1069 Module.cleanup(self)
1070 if not self.obdtype == 'obdecho':
1071 clean_loop(self.devname)
1074 def __init__(self,dom_node):
1075 Module.__init__(self, 'OST', dom_node)
1076 self.obd_uuid = get_first_ref(dom_node, 'obd')
1077 self.add_module(config.src_dir()+'ost', 'ost')
1080 if is_prepared(self.uuid):
1082 self.info(self.obd_uuid)
1083 lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
1084 setup ="%s" % (self.obd_uuid))
1087 # virtual interface for OSC and LOV
1089 def __init__(self,dom_node):
1090 Module.__init__(self, 'VOSC', dom_node)
1091 if dom_node.nodeName == 'lov':
1092 self.osc = LOV(dom_node)
1094 self.osc = OSC(dom_node)
1099 def load_module(self):
1100 self.osc.load_module()
1101 def cleanup_module(self):
1102 self.osc.cleanup_module()
1106 def __init__(self,dom_node):
1107 Module.__init__(self, 'OSC', dom_node)
1108 self.obd_uuid = get_first_ref(dom_node, 'obd')
1109 self.ost_uuid = get_first_ref(dom_node, 'ost')
1110 self.lookup_server(self.ost_uuid)
1111 self.add_module(config.src_dir()+'osc', 'osc')
1113 def prepare(self, ignore_connect_failure = 0):
1114 if is_prepared(self.uuid):
1116 self.info(self.obd_uuid, self.ost_uuid)
1117 srv = self.get_server()
1120 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1124 lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
1126 panic ("no route to", srv.nid)
1127 except CommandError:
1128 if (ignore_connect_failure == 0):
1131 lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
1132 setup ="%s %s" %(self.obd_uuid, srv.uuid))
1135 if not is_prepared(self.uuid):
1137 srv = self.get_server()
1139 Module.cleanup(self)
1141 self.info(self.obd_uuid, self.ost_uuid)
1145 lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
1146 except CommandError, e:
1147 print "del_route failed: ", self.name
1150 Module.cleanup(self)
1153 class ECHO_CLIENT(Module):
1154 def __init__(self,dom_node):
1155 Module.__init__(self, 'ECHO_CLIENT', dom_node)
1156 self.add_module('lustre/obdecho', 'obdecho')
1157 self.lov_uuid = get_first_ref(dom_node, 'osc')
1158 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1162 if is_prepared(self.uuid):
1164 self.osc.prepare() # XXX This is so cheating. -p
1165 self.info(self.lov_uuid)
1167 lctl.newdev(attach="echo_client %s %s" % (self.name, self.uuid),
1168 setup = self.lov_uuid)
1171 if not is_prepared(self.uuid):
1175 def load_module(self):
1176 self.osc.load_module()
1177 Module.load_module(self)
1178 def cleanup_module(self):
1179 Module.cleanup_module(self)
1180 self.osc.cleanup_module()
1183 class Mountpoint(Module):
1184 def __init__(self,dom_node):
1185 Module.__init__(self, 'MTPT', dom_node)
1186 self.path = get_text(dom_node, 'path')
1187 self.mds_uuid = get_first_ref(dom_node, 'mds')
1188 self.lov_uuid = get_first_ref(dom_node, 'osc')
1189 self.add_module(config.src_dir()+'mdc', 'mdc')
1190 self.add_module(config.src_dir()+'llite', 'llite')
1191 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1196 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
1197 self.info(self.path, self.mds_uuid, self.lov_uuid)
1198 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
1199 (self.lov_uuid, mdc_uuid, self.path)
1200 run("mkdir", self.path)
1203 panic("mount failed:", self.path)
1206 self.info(self.path, self.mds_uuid,self.lov_uuid)
1208 (rc, out) = run("umount -f", self.path)
1210 (rc, out) = run("umount", self.path)
1212 log("umount failed, cleanup will most likely not work.")
1213 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1215 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
1217 def load_module(self):
1218 self.osc.load_module()
1219 Module.load_module(self)
1220 def cleanup_module(self):
1221 Module.cleanup_module(self)
1222 self.osc.cleanup_module()
1225 # ============================================================
1226 # XML processing and query
1227 # TODO: Change query funcs to use XPath, which is much cleaner
1229 def get_device(obd):
1230 list = obd.getElementsByTagName('device')
1234 size = get_attr_int(dev, 'size', 0)
1235 return dev.firstChild.data, size
1238 # Get the text content from the first matching child
1239 # If there is no content (or it is all whitespace), return
1241 def get_text(dom_node, tag, default=""):
1242 list = dom_node.getElementsByTagName(tag)
1245 dom_node.normalize()
1246 if dom_node.firstChild:
1247 txt = string.strip(dom_node.firstChild.data)
1252 def get_text_int(dom_node, tag, default=0):
1253 list = dom_node.getElementsByTagName(tag)
1257 dom_node.normalize()
1258 if dom_node.firstChild:
1259 txt = string.strip(dom_node.firstChild.data)
1264 panic("text value is not integer:", txt)
1267 def get_attr(dom_node, attr, default=""):
1268 v = dom_node.getAttribute(attr)
1273 def get_attr_int(dom_node, attr, default=0):
1275 v = dom_node.getAttribute(attr)
1280 panic("attr value is not integer", v)
1283 def get_first_ref(dom_node, tag):
1284 """ Get the first uuidref of the type TAG. Used one only
1285 one is expected. Returns the uuid."""
1287 refname = '%s_ref' % tag
1288 list = dom_node.getElementsByTagName(refname)
1290 uuid = getRef(list[0])
1293 def get_all_refs(dom_node, tag):
1294 """ Get all the refs of type TAG. Returns list of uuids. """
1296 refname = '%s_ref' % tag
1297 list = dom_node.getElementsByTagName(refname)
1300 uuids.append(getRef(i))
1303 def get_ost_net(dom_node, uuid):
1304 ost = lookup(dom_node, uuid)
1305 uuid = get_first_ref(ost, 'network')
1308 return lookup(dom_node, uuid)
1310 def nid2server(dom_node, nid):
1311 netlist = dom_node.getElementsByTagName('network')
1312 for net_node in netlist:
1313 if get_text(net_node, 'server') == nid:
1314 return Network(net_node)
1317 def lookup(dom_node, uuid):
1318 for n in dom_node.childNodes:
1319 if n.nodeType == n.ELEMENT_NODE:
1320 if getUUID(n) == uuid:
def getName(dom_node):
    """Return the 'name' attribute of dom_node ('' when absent)."""
    return dom_node.getAttribute('name')
def getRef(dom_node):
    """Return the 'uuidref' attribute of dom_node ('' when absent)."""
    return dom_node.getAttribute('uuidref')
# Get uuid attribute of dom_node (the original comment here wrongly said
# "name").
def getUUID(dom_node):
    """Return the 'uuid' attribute of dom_node ('' when absent)."""
    return dom_node.getAttribute('uuid')
def getServiceType(dom_node):
    """Return the service type of dom_node, which is simply its tag name.

    FIXME: should verify that dom_node really is a service element.
    """
    return dom_node.nodeName
1344 # determine what "level" a particular node is at.
1345 # the order of initialization is based on level.
1346 def getServiceLevel(dom_node):
1347 type = getServiceType(dom_node)
1349 if type in ('network',):
1351 elif type in ('device', 'ldlm'):
1353 elif type in ('obd', 'mdd'):
1355 elif type in ('mds','ost'):
1357 elif type in ('mdc','osc'):
1359 elif type in ('lov', 'lovconfig'):
1361 elif type in ('mountpoint', 'echo_client'):
1364 if ret < config.minlevel() or ret > config.maxlevel():
1369 # return list of services in a profile. list is a list of tuples
1370 # [(level, dom_node),]
1371 def getServices(lustreNode, profileNode):
1373 for n in profileNode.childNodes:
1374 if n.nodeType == n.ELEMENT_NODE:
1375 servNode = lookup(lustreNode, getRef(n))
1378 panic('service not found: ' + getRef(n))
1379 level = getServiceLevel(servNode)
1381 list.append((level, servNode))
1385 def getByName(lustreNode, name, tag):
1386 ndList = lustreNode.getElementsByTagName(tag)
1388 if getName(nd) == name:
1393 ############################################################
1395 # FIXME: clean this mess up!
1398 def prepare_mdc(dom_node, mds_uuid):
1400 mds_node = lookup(dom_node, mds_uuid);
1402 panic("no mds:", mds_uuid)
1403 if saved_mdc.has_key(mds_uuid):
1404 return saved_mdc[mds_uuid]
1407 saved_mdc[mds_uuid] = mdc.uuid
1410 def cleanup_mdc(dom_node, mds_uuid):
1412 mds_node = lookup(dom_node, mds_uuid);
1414 panic("no mds:", mds_uuid)
1415 if not saved_mdc.has_key(mds_uuid):
1418 saved_mdc[mds_uuid] = mdc.uuid
1421 ############################################################
1422 # routing ("rooting")
# Record this node's network interfaces.
#
# Appends one (type, gateway) tuple per <network> child of dom_node to
# the module-global `local_node` list.  `router_flag` is declared
# global here exactly as in the original; it is not assigned in the
# visible body.  (Reconstructed from a garbled chunk whose embedded
# numbering, 1428-1434, was contiguous — all tokens were visible.)
def init_node(dom_node):
    global local_node, router_flag
    netlist = dom_node.getElementsByTagName('network')
    for dom_net in netlist:
        type = get_attr(dom_net, 'type')
        gw = get_text(dom_net, 'server')
        local_node.append((type, gw))
1436 def node_needs_router():
1439 def get_routes(type, gw, dom_net):
1440 """ Return the routes as a list of tuples of the form:
1441 [(type, gw, lo, hi),]"""
# NOTE(review): garbled chunk — the `res = []` initialization, the two
# loop headers (over the route tables and over their routes), and the
# final return are missing (gaps 1441->1443, 1443->1445, 1445->1447,
# and the chunk ends at 1449).
1443 tbl = dom_net.getElementsByTagName('route_tbl')
1445 routes = t.getElementsByTagName('route')
# each <route> element carries lo/hi nid-range attributes; hi defaults to ''
1447 lo = get_attr(r, 'lo')
1448 hi = get_attr(r, 'hi', '')
1449 res.append((type, gw, lo, hi))
1453 def init_route_config(lustre):
1454 """ Scan the lustre config looking for routers. Build list of
# NOTE(review): garbled chunk — the rest of the docstring, the
# `routes = []` initialization, the node-loop header, and any
# router_flag handling are missing (gaps 1454->1456, 1456->1458,
# 1458->1460, 1460->1462, 1462->1464, 1467->1471).
1456 global routes, router_flag
1458 list = lustre.getElementsByTagName('node')
1460 if get_attr(node, 'router'):
# for each network type this host shares with the router, find the gateway
1462 for (local_type, local_nid) in local_node:
1464 netlist = node.getElementsByTagName('network')
1465 for dom_net in netlist:
1466 if local_type == get_attr(dom_net, 'type'):
1467 gw = get_text(dom_net, 'server')
# collect routes through this gateway to nets of a different type
1471 for dom_net in netlist:
1472 if local_type != get_attr(dom_net, 'type'):
1473 for route in get_routes(local_type, gw, dom_net):
1474 routes.append(route)
# NOTE(review): orphaned fragment — the enclosing def (lines 1477-1478,
# apparently a helper that matches net.net_type against the local
# interface list) is missing from this garbled chunk.
1479 for iface in local_node:
1480 if net.net_type == iface[0]:
# Find a route from this node's first local interface type to the
# target net.  NOTE(review): garbled chunk — the assignment of `to`,
# the scan of the global `routes` table, and the return are missing
# (gaps 1487->1489, 1489->1498).
1484 def find_route(net):
1485 global local_node, routes
1486 frm_type = local_node[0][0]
1487 to_type = net.net_type
1489 debug ('looking for route to', to_type,to)
1498 ############################################################
# Instantiate the service class matching dom_node's tag and run its
# module-load/setup (or cleanup) step according to module_flag.
# NOTE(review): garbled chunk — most elif branches (ldlm, lov, mds,
# ost, mdc, osc, obd, ...) and the module_flag dispatch bodies are
# missing (gaps 1504->1510, 1513->1524, 1529->1534, 1534->1539,
# 1539->1541).
1501 def startService(dom_node, module_flag):
1502 type = getServiceType(dom_node)
1503 debug('Service:', type, getName(dom_node), getUUID(dom_node))
1504 # there must be a more dynamic way of doing this...
1510 elif type == 'lovconfig':
1511 n = LOVConfig(dom_node)
1512 elif type == 'network':
1513 n = Network(dom_node)
1524 elif type == 'mountpoint':
1525 n = Mountpoint(dom_node)
1526 elif type == 'echo_client':
1527 n = ECHO_CLIENT(dom_node)
# unknown tag: abort rather than silently skip
1529 panic ("unknown service type:", type)
1534 if config.cleanup():
1539 if config.nosetup():
1541 if config.cleanup():
1547 # Prepare the system to run lustre using a particular profile
1548 # in a the configuration.
1549 # * load & the modules
1550 # * setup networking for the current node
1551 # * make sure partitions are in place and prepared
1552 # * initialize devices with lctl
1553 # Levels is important, and needs to be enforced.
# NOTE(review): garbled chunk — the `if not profileNode:` guard, the
# level-sort (reversed for cleanup), and the loop header over services
# are missing (gaps 1554->1556, 1558->1561).
1554 def startProfile(lustreNode, profileNode, module_flag):
1556 panic("profile:", profile, "not found.")
1557 services = getServices(lustreNode, profileNode)
# on cleanup the service list is presumably walked in reverse order — confirm
1558 if config.cleanup():
1561 startService(s[1], module_flag)
# Find this host's <node> entry among the candidate names and run
# (or clean up) every profile it references, in two passes.
# NOTE(review): garbled chunk — the loop over candidate hostnames, the
# not-found exit path, the routing branches, and the gdb pause are only
# partially visible (numbering gaps throughout 1566-1602).
1566 def doHost(lustreNode, hosts):
1570 dom_node = getByName(lustreNode, h, 'node')
1574 print 'lconf: No host entry found in '+sys.argv[1]
1577 if not get_attr(dom_node, 'router'):
1579 init_route_config(lustreNode)
1584 # Two step process: (1) load modules, (2) setup lustre
1585 # if not cleaning, load modules first.
1586 module_flag = not config.cleanup()
1587 reflist = dom_node.getElementsByTagName('profile')
1588 for profile in reflist:
1589 startProfile(lustreNode, profile, module_flag)
1591 if not config.cleanup():
1592 sys_set_debug_path()
1593 script = config.gdb_script()
1594 run(lctl.lctl, ' modules >', script)
1596 # dump /tmp/ogdb and sleep/pause here
1597 log ("The GDB module script is in", script)
# second pass with module_flag flipped: setup after load, or
# module unload after service cleanup
1600 module_flag = not module_flag
1601 for profile in reflist:
1602 startProfile(lustreNode, profile, module_flag)
1604 ############################################################
1605 # Command line processing
# Parse argv with getopt, set the matching flags on the global config
# object, and return the remaining positional arguments.
# NOTE(review): garbled chunk — the usage() calls, most config-setter
# bodies, several option branches, and the final return are missing
# (numbering gaps throughout 1607-1656).
1607 def parse_cmdline(argv):
1608 short_opts = "hdnvf"
1609 long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
1610 "portals=", "makeldiff", "cleanup", "noexec",
1611 "help", "node=", "nomod", "nosetup",
1612 "dump=", "force", "minlevel=", "maxlevel="]
1616 opts, args = getopt.getopt(argv, short_opts, long_opts)
1617 except getopt.error:
1622 if o in ("-h", "--help"):
1624 if o in ("-d","--cleanup"):
1626 if o in ("-v", "--verbose"):
1628 if o in ("-n", "--noexec"):
1631 if o == "--portals":
1635 if o == "--reformat":
1643 if o == "--nosetup":
1647 if o in ("-f", "--force"):
1649 if o in ("--minlevel",):
1651 if o in ("--maxlevel",):
1660 s = urllib.urlopen(url)
# Derive config.src_dir from the location of the running script (a
# Makefile next to it means we run from a source tree) and make
# PORTALS_DIR absolute relative to it.
# NOTE(review): garbled chunk — line 1667 (likely `global PORTALS_DIR`)
# is missing; without it the assignment below would only bind a local.
1666 def setupModulePath(cmd):
1668 base = os.path.dirname(cmd)
1669 if os.access(base+"/Makefile", os.R_OK):
1670 config.src_dir(base + "/../")
1671 if PORTALS_DIR[0] != '/':
1672 PORTALS_DIR= config.src_dir()+PORTALS_DIR
# Tell the portals layer where to write debug logs via /proc.
# NOTE(review): garbled chunk — the config.noexec() early return and
# the try/except around the /proc write (plus fp.close()) are missing
# (gap 1675->1679; chunk ends at 1680).
1674 def sys_set_debug_path():
1675 debug("debug path: ", config.debug_path())
1679 fp = open('/proc/sys/portals/debug_path', 'w')
1680 fp.write(config.debug_path())
1685 #/proc/sys/net/core/rmem_max
1686 #/proc/sys/net/core/wmem_max
# Raise a kernel socket-buffer limit (one of the /proc paths above) to
# at least `max`.  NOTE(review): garbled chunk — the read of the
# current value and the "already large enough" guard are missing
# (gap 1688->1696), as is fp.close() after the write.
1687 def sys_set_netmem_max(path, max):
1688 debug("setting", path, "to at least", max)
1696 fp = open(path, 'w')
1697 fp.write('%d\n' %(max))
# Create the lustre character device nodes if they do not already
# exist.  Major 10 is the Linux misc-device major; minors 240/241 are
# portals and obd respectively, matching the values in the visible
# original.  (Reconstructed from a garbled chunk whose embedded
# numbering, 1701-1705, was contiguous — all tokens were visible.)
def sys_make_devices():
    if not os.access('/dev/portals', os.R_OK):
        run('mknod /dev/portals c 10 240')
    if not os.access('/dev/obd', os.R_OK):
        run('mknod /dev/obd c 10 241')
# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    """Append new_dir to os.environ['PATH'] unless already present."""
    # Reconstructed from a garbled chunk: the early `return` under the
    # membership test (original line 1712) was missing and is restored.
    # `PATH.split(':')` replaces the Python-2-only string.split(...)
    # call; both forms exist in Python 2, so behavior is unchanged.
    syspath = os.environ['PATH'].split(':')
    if new_dir in syspath:
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
1716 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
1717 # ensure basic elements are in the system path
# NOTE(review): garbled chunk — the loop body (presumably
# add_to_path(dir), original lines 1720-1721) is missing.
1718 def sanitise_path():
1719 for dir in DEFAULT_PATH:
1722 # Initialize or shutdown lustre according to a configuration file
1723 # * prepare the system for lustre
1724 # * configure devices with lctl
1725 # Shutdown does steps in reverse
# NOTE(review): the `def main():` line itself (~1726-1727) and many body
# lines are missing from this garbled chunk (numbering gaps throughout).
# Visible flow: seed the PRNG from /dev/urandom, parse the command
# line, load the XML config (local file or fetched URL), build the
# candidate node-name list, locate the tcp acceptor and lctl, raise the
# kernel socket-buffer limits, then hand off to doHost().
1728 global TCP_ACCEPTOR, lctl, MAXTCPBUF
1729 setupModulePath(sys.argv[0])
1731 host = socket.gethostname()
1733 # the PRNG is normally seeded with time(), which is not so good for starting
1734 # time-synchronized clusters
1735 input = open('/dev/urandom', 'r')
1737 print 'Unable to open /dev/urandom!'
1739 seed = input.read(32)
1745 args = parse_cmdline(sys.argv[1:])
1747 if not os.access(args[0], os.R_OK):
1748 print 'File not found or readable:', args[0]
1750 dom = xml.dom.minidom.parse(args[0])
1752 xmldata = fetch(config.url())
1753 dom = xml.dom.minidom.parseString(xmldata)
1759 node_list.append(config.node())
1762 node_list.append(host)
1763 node_list.append('localhost')
1764 debug("configuring for host: ", node_list)
# per-host suffix keeps debug/gdb files distinct across cluster nodes
1767 config._debug_path = config._debug_path + '-' + host
1768 config._gdb_script = config._gdb_script + '-' + host
1770 TCP_ACCEPTOR = find_prog('acceptor')
1771 if not TCP_ACCEPTOR:
# missing branch logic here (gaps 1771->1773, 1774->1776): one path
# tolerates a missing acceptor, the other panics — confirm conditions
1773 TCP_ACCEPTOR = 'acceptor'
1774 debug('! acceptor not found')
1776 panic('acceptor not found')
1778 lctl = LCTLInterface('lctl')
1781 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
1782 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
1783 doHost(dom.documentElement, node_list)
# Script entry point: run main() and translate LconfError/CommandError
# into an exit status.  NOTE(review): garbled chunk — the try:/main()
# lines and the exception-handler bodies are missing (gaps 1785->1788,
# 1788->1790, 1790->1794).
1785 if __name__ == "__main__":
1788 except LconfError, e:
1790 except CommandError, e:
# propagate the first error recorded during cleanup (see cleanup_error)
1794 if first_cleanup_error:
1795 sys.exit(first_cleanup_error)