3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
# Return true when lconf is running out of a source checkout (detected by
# a Makefile.am next to the script) rather than an installed location.
# NOTE(review): sampled fragment — the function's return statements
# (orig. lines 43-45) are missing from this view.
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile.am", os.R_OK):
46 if not development_mode():
47 sys.path.append(PYMOD_DIR)
53 DEFAULT_TCPBUF = 8388608
56 # Maximum number of devices to search for.
57 # (the /dev/loop* nodes need to be created beforehand)
58 MAX_LOOP_DEVICES = 256
59 PORTALS_DIR = 'portals'
61 # Needed to call lconf --record
64 # Please keep these in sync with the values in portals/kp30.h
76 "warning" : (1 << 10),
80 "portals" : (1 << 14),
82 "dlmtrace" : (1 << 16),
86 "rpctrace" : (1 << 20),
87 "vfstrace" : (1 << 21),
92 "undefined" : (1 << 0),
102 "portals" : (1 << 10),
103 "socknal" : (1 << 11),
104 "qswnal" : (1 << 12),
105 "pinger" : (1 << 13),
106 "filter" : (1 << 14),
112 "ptlrouter" : (1 << 20),
# Holds the first failure code seen while tearing things down; zero means
# no cleanup error has been recorded yet.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record *rc* as the first cleanup failure; later failures are ignored."""
    global first_cleanup_error
    if first_cleanup_error:
        return
    first_cleanup_error = rc
124 # ============================================================
125 # debugging and error funcs
# Abort with an LconfError naming the unimplemented feature.
# NOTE(review): Python 2 raise syntax; the misspelling 'implmemented' is
# part of the runtime message string and is deliberately left untouched.
127 def fixme(msg = "this feature"):
128 raise Lustre.LconfError, msg + ' not implmemented yet.'
131 msg = string.join(map(str,args))
132 if not config.noexec:
133 raise Lustre.LconfError(msg)
138 msg = string.join(map(str,args))
143 print string.strip(s)
147 msg = string.join(map(str,args))
150 # ack, python's builtin int() does not support '0x123' syntax.
151 # eval can do it, although what a hack!
155 return eval(s, {}, {})
158 except SyntaxError, e:
159 raise ValueError("not a number")
161 raise ValueError("not a number")
163 # ============================================================
164 # locally defined exceptions
# Exception raised when an external command (lctl, insmod, mkfs, ...) fails.
# Carries the command name, its error output (string or list of lines),
# and an optional numeric return code.
165 class CommandError (exceptions.Exception):
166 def __init__(self, cmd_name, cmd_err, rc=None):
167 self.cmd_name = cmd_name
168 self.cmd_err = cmd_err
# Pretty-printer: format depends on whether cmd_err is a single string or
# a list of output lines, and on whether a return code was supplied.
# NOTE(review): sampled fragment — the method's def line and several
# if/else lines (orig. 169-182) are missing from this view.
173 if type(self.cmd_err) == types.StringType:
175 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
177 print "! %s: %s" % (self.cmd_name, self.cmd_err)
178 elif type(self.cmd_err) == types.ListType:
180 print "! %s (error %d):" % (self.cmd_name, self.rc)
182 print "! %s:" % (self.cmd_name)
183 for s in self.cmd_err:
184 print "> %s" %(string.strip(s))
189 # ============================================================
190 # handle daemons, like the acceptor
192 """ Manage starting and stopping a daemon. Assumes daemon manages
193 it's own pid file. """
195 def __init__(self, cmd):
201 log(self.command, "already running.")
203 self.path = find_prog(self.command)
205 panic(self.command, "not found.")
206 ret, out = runcmd(self.path +' '+ self.command_line())
208 raise CommandError(self.path, out, ret)
212 pid = self.read_pidfile()
214 log ("killing process", pid)
216 #time.sleep(1) # let daemon die
218 log("unable to kill", self.command, e)
220 log("unable to kill", self.command)
223 pid = self.read_pidfile()
233 def read_pidfile(self):
235 fp = open(self.pidfile(), 'r')
242 def clean_pidfile(self):
243 """ Remove a stale pidfile """
244 log("removing stale pidfile:", self.pidfile())
246 os.unlink(self.pidfile())
248 log(self.pidfile(), e)
# Daemon wrapper for the portals TCP 'acceptor' program; builds its
# command line from the per-network send/recv buffer sizes and port.
250 class AcceptorHandler(DaemonHandler):
251 def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
252 DaemonHandler.__init__(self, "acceptor")
255 self.send_mem = send_mem
256 self.recv_mem = recv_mem
# '-i' presumably enables irq affinity — the guarding if (orig. 257-258)
# is missing from this sampled view; confirm against full source.
259 self.flags = self.flags + ' -i'
# Per-port pidfile so several acceptors can coexist.
# NOTE(review): the enclosing pidfile() def line is missing here.
262 return "/var/run/%s-%d.pid" % (self.command, self.port)
264 def command_line(self):
265 return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
269 # start the acceptors
# Start every registered acceptor daemon that is not already running.
# Skipped entirely in dump/record modes (no real daemons are wanted then).
# NOTE(review): the def line (orig. 270) and the daemon.start() calls are
# missing from this sampled view.
271 if config.lctl_dump or config.record:
273 for port in acceptors.keys():
274 daemon = acceptors[port]
275 if not daemon.running():
# Start the single acceptor registered for *port*, or panic if none is.
278 def run_one_acceptor(port):
279 if config.lctl_dump or config.record:
281 if acceptors.has_key(port):
282 daemon = acceptors[port]
283 if not daemon.running():
286 panic("run_one_acceptor: No acceptor defined for port:", port)
# Stop the acceptor daemon registered for *port*, if any.
# NOTE(review): sampled fragment — the running() check and stop call
# (orig. 291-293) are missing from this view.
288 def stop_acceptor(port):
289 if acceptors.has_key(port):
290 daemon = acceptors[port]
295 # ============================================================
296 # handle lctl interface
299 Manage communication with lctl
302 def __init__(self, cmd):
304 Initialize close by finding the lctl binary.
306 self.lctl = find_prog(cmd)
308 self.record_device = ''
311 debug('! lctl not found')
314 raise CommandError('lctl', "unable to find lctl binary.")
316 def use_save_file(self, file):
317 self.save_file = file
319 def record(self, dev_name, logname):
320 log("Recording log", logname, "on", dev_name)
321 self.record_device = dev_name
322 self.record_log = logname
324 def end_record(self):
325 log("End recording log", self.record_log, "on", self.record_device)
326 self.record_device = None
327 self.record_log = None
def set_nonblock(self, fd):
    """Switch file descriptor *fd* into non-blocking mode.

    Reads the current status flags with F_GETFL and writes them back
    with O_NDELAY added, so reads on lctl's pipes never block.
    """
    flags = fcntl.fcntl(fd, F_GETFL)
    fcntl.fcntl(fd, F_SETFL, flags | os.O_NDELAY)
336 the cmds are written to stdin of lctl
337 lctl doesn't return errors when run in script mode, so
339 should modify command line to accept multiple commands, or
340 create complex command line options
344 cmds = '\n dump ' + self.save_file + '\n' + cmds
345 elif self.record_device:
349 %s""" % (self.record_device, self.record_log, cmds)
351 debug("+", cmd_line, cmds)
352 if config.noexec: return (0, [])
354 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
355 child.tochild.write(cmds + "\n")
356 child.tochild.close()
358 # From "Python Cookbook" from O'Reilly
359 outfile = child.fromchild
360 outfd = outfile.fileno()
361 self.set_nonblock(outfd)
362 errfile = child.childerr
363 errfd = errfile.fileno()
364 self.set_nonblock(errfd)
366 outdata = errdata = ''
369 ready = select.select([outfd,errfd],[],[]) # Wait for input
370 if outfd in ready[0]:
371 outchunk = outfile.read()
372 if outchunk == '': outeof = 1
373 outdata = outdata + outchunk
374 if errfd in ready[0]:
375 errchunk = errfile.read()
376 if errchunk == '': erreof = 1
377 errdata = errdata + errchunk
378 if outeof and erreof: break
379 # end of "borrowed" code
382 if os.WIFEXITED(ret):
383 rc = os.WEXITSTATUS(ret)
386 if rc or len(errdata):
387 raise CommandError(self.lctl, errdata, rc)
# Invoke lctl with the joined *args* as command-line arguments (as opposed
# to feeding commands on stdin) and raise CommandError on failure.
# NOTE(review): sampled fragment — the docstring quotes, the `if rc:`
# guard and the return statement are missing from this view.
390 def runcmd(self, *args):
392 run lctl using the command line
394 cmd = string.join(map(str,args))
395 debug("+", self.lctl, cmd)
396 rc, out = run(self.lctl, cmd)
398 raise CommandError(self.lctl, out, rc)
402 def network(self, net, nid):
407 quit """ % (net, nid)
410 # create a new connection
411 def add_uuid(self, net_type, uuid, nid):
412 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
415 def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
417 if net_type in ('tcp',) and not config.lctl_dump:
422 add_autoconn %s %s %d %s
426 nid, hostaddr, port, flags )
429 def connect(self, srv):
430 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
431 if srv.net_type in ('tcp',) and not config.lctl_dump:
435 self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
436 srv.nid, srv.hostaddr, srv.port, flags)
439 def recover(self, dev_name, new_conn):
442 recover %s""" %(dev_name, new_conn)
445 # add a route to a range
446 def add_route(self, net, gw, lo, hi):
454 except CommandError, e:
458 def del_route(self, net, gw, lo, hi):
463 quit """ % (net, gw, lo, hi)
466 # add a route to a host
467 def add_route_host(self, net, uuid, gw, tgt):
468 self.add_uuid(net, uuid, tgt)
476 except CommandError, e:
480 # add a route to a range
481 def del_route_host(self, net, uuid, gw, tgt):
487 quit """ % (net, gw, tgt)
491 def del_autoconn(self, net_type, nid, hostaddr):
492 if net_type in ('tcp',) and not config.lctl_dump:
501 # disconnect one connection
502 def disconnect(self, srv):
503 self.del_uuid(srv.nid_uuid)
504 if srv.net_type in ('tcp',) and not config.lctl_dump:
505 self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
507 def del_uuid(self, uuid):
515 def disconnectAll(self, net):
523 def attach(self, type, name, uuid):
526 quit""" % (type, name, uuid)
529 def setup(self, name, setup = ""):
533 quit""" % (name, setup)
537 # create a new device with lctl
538 def newdev(self, type, name, uuid, setup = ""):
539 self.attach(type, name, uuid);
541 self.setup(name, setup)
542 except CommandError, e:
543 self.cleanup(name, uuid, 0)
548 def cleanup(self, name, uuid, force, failover = 0):
549 if failover: force = 1
555 quit""" % (name, ('', 'force')[force],
556 ('', 'failover')[failover])
560 def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
561 stripe_sz, stripe_off,
565 lov_setup %s %d %d %d %s %s
566 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
571 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
575 lov_setconfig %s %d %d %d %s %s
576 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
580 def dump(self, dump_file):
583 quit""" % (dump_file)
586 # get list of devices
587 def device_list(self):
588 devices = '/proc/fs/lustre/devices'
590 if os.access(devices, os.R_OK):
592 fp = open(devices, 'r')
600 def lustre_version(self):
601 rc, out = self.runcmd('version')
605 def mount_option(self, profile, osc, mdc):
607 mount_option %s %s %s
608 quit""" % (profile, osc, mdc)
611 # delete mount options
612 def del_mount_option(self, profile):
618 def set_timeout(self, timeout):
624 # delete mount options
625 def set_lustre_upcall(self, upcall):
630 # ============================================================
631 # Various system-level functions
632 # (ideally moved to their own module)
634 # Run a command and return the output and status.
635 # stderr is sent to /dev/null, could use popen3 to
636 # save it if necessary
639 if config.noexec: return (0, [])
640 f = os.popen(cmd + ' 2>&1')
650 cmd = string.join(map(str,args))
653 # Run a command in the background.
654 def run_daemon(*args):
655 cmd = string.join(map(str,args))
657 if config.noexec: return 0
658 f = os.popen(cmd + ' 2>&1')
666 # Determine full path to use for an external command
667 # searches dirname(argv[0]) first, then PATH
669 syspath = string.split(os.environ['PATH'], ':')
670 cmdpath = os.path.dirname(sys.argv[0])
671 syspath.insert(0, cmdpath);
673 syspath.insert(0, os.path.join(config.portals, 'utils/'))
675 prog = os.path.join(d,cmd)
676 if os.access(prog, os.X_OK):
680 # Recursively look for file starting at base dir
# Depth-first search for *mod* under *base*; returns the full path when an
# entry is readable. NOTE(review): the return statements (orig. 684,
# 689-690) are missing from this sampled view.
681 def do_find_file(base, mod):
682 fullname = os.path.join(base, mod)
683 if os.access(fullname, os.R_OK):
685 for d in os.listdir(base):
686 dir = os.path.join(base,d)
687 if os.path.isdir(dir):
688 module = do_find_file(dir, mod)
# Locate kernel module object '<modname>.o': try the expected
# src_dir/dev_dir location first, then (presumably) fall back to a
# recursive search — the fallback lines are missing from this view.
692 def find_module(src_dir, dev_dir, modname):
693 mod = '%s.o' % (modname)
694 module = src_dir +'/'+ dev_dir +'/'+ mod
696 if os.access(module, os.R_OK):
702 # is the path a block device?
709 return stat.S_ISBLK(s[stat.ST_MODE])
711 # build fs according to type
# Build a filesystem of *fstype* on *dev*. devsize is in 1k blocks; jsize
# (journal, MB for ext3 / blocks for reiserfs), isize (inode size) and
# mkfsoptions are passed through to the mkfs program. isblock=0 means dev
# is a regular file being used as loopback backing store.
713 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
# Refuse tiny devices — several intermediate lines (orig. 714-718) are
# missing from this sampled view.
719 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
721 # devsize is in 1k, and fs block count is in 4k
722 block_cnt = devsize/4
724 if fstype in ('ext3', 'extN'):
725 # ext3 journal size is in megabytes
726 if jsize: jopt = "-J size=%d" %(jsize,)
727 if isize: iopt = "-I %d" %(isize,)
728 mkfs = 'mkfs.ext2 -j -b 4096 '
# Force mkfs on non-block files (or with --force); the appended flag line
# (orig. 730) is missing from this view.
729 if not isblock or config.force:
731 elif fstype == 'reiserfs':
732 # reiserfs journal size is in blocks
733 if jsize: jopt = "--journal_size %d" %(jsize,)
734 mkfs = 'mkreiserfs -ff'
736 panic('unsupported fs type: ', fstype)
# Global --mkfsoptions and the per-device options are both appended.
738 if config.mkfsoptions != None:
739 mkfs = mkfs + ' ' + config.mkfsoptions
740 if mkfsoptions != None:
741 mkfs = mkfs + ' ' + mkfsoptions
742 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
744 panic("Unable to build fs:", dev, string.join(out))
745 # enable hash tree indexing on the filesystem (ext3/extN only)
746 if fstype in ('ext3', 'extN'):
747 htree = 'echo "feature FEATURE_C5" | debugfs -w'
748 (ret, out) = run (htree, dev)
750 panic("Unable to enable htree:", dev)
752 # some systems use /dev/loopN, some /dev/loop/N
756 if not os.access(loop + str(0), os.R_OK):
758 if not os.access(loop + str(0), os.R_OK):
759 panic ("can't access loop devices")
762 # find loop device assigned to thefile
# Scan /dev/loop0..MAX_LOOP_DEVICES asking losetup which backing file each
# is bound to; returns the device bound to *file*. NOTE(review): the def
# line (orig. 763-764), the dev assignment and the return statements are
# missing from this sampled view.
765 for n in xrange(0, MAX_LOOP_DEVICES):
767 if os.access(dev, os.R_OK):
768 (stat, out) = run('losetup', dev)
769 if out and stat == 0:
# losetup prints the backing file in parentheses; match it against *file*.
770 m = re.search(r'\((.*)\)', out[0])
771 if m and file == m.group(1):
777 # create file if necessary and assign the first free loop device
# Returns the loop device now backing *file*. If the file is missing (or
# reformat is requested) it is created sparsely with dd and formatted via
# mkfs(). NOTE(review): sampled fragment — several guard/return lines are
# missing from this view.
778 def init_loop(file, size, fstype, journal_size, inode_size, mkfsoptions, reformat):
779 dev = find_loop(file)
781 print 'WARNING file:', file, 'already mapped to', dev
783 if reformat or not os.access(file, os.R_OK | os.W_OK):
785 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
# 'count=0 seek=size' creates a sparse file of the requested length.
786 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
789 panic("Unable to create backing store:", file)
790 mkfs(file, size, fstype, journal_size, inode_size, mkfsoptions, isblock=0)
793 # find next free loop
794 for n in xrange(0, MAX_LOOP_DEVICES):
796 if os.access(dev, os.R_OK):
# A loop device with no current binding: losetup succeeds binding it.
797 (stat, out) = run('losetup', dev)
799 run('losetup', dev, file)
802 print "out of loop devices"
804 print "out of loop devices"
807 # undo loop assignment
# Detach the loop device bound to *file*, logging (not raising) on failure.
# NOTE(review): the `if dev:` guard and `if ret:` check (orig. 810, 812)
# are missing from this sampled view.
808 def clean_loop(file):
809 dev = find_loop(file)
811 ret, out = run('losetup -d', dev)
813 log('unable to clean loop device:', dev, 'for file:', file)
816 # determine if dev is formatted as a <fstype> filesystem
# Always answers as if formatting is not needed — see the FIXME; the
# return line (orig. 819) is missing from this sampled view.
817 def need_format(fstype, dev):
818 # FIXME don't know how to implement this
821 # initialize a block device if needed
# Returns the device to mount: the original block device, or the loop
# device created for a regular-file backing store. Formats it when
# reformat is set, or when unformatted and autoformat == 'yes'.
822 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
823 inode_size, mkfsoptions):
# In --noexec (dry-run) mode skip all device work.
824 if config.noexec: return dev
825 if not is_block(dev):
826 dev = init_loop(dev, size, fstype, journal_size, inode_size,
827 mkfsoptions, reformat)
828 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
829 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
# The strict behavior (panic when unformatted and autoformat is off) was
# deliberately disabled; kept as commented-out code by the original authors.
832 # panic("device:", dev,
833 # "not prepared, and autoformat is not set.\n",
834 # "Rerun with --reformat option to format ALL filesystems")
839 """lookup IP address for an interface"""
840 rc, out = run("/sbin/ifconfig", iface)
843 addr = string.split(out[1])[1]
844 ip = string.split(addr, ':')[1]
847 def sys_get_elan_position_file():
848 procfiles = ["/proc/elan/device0/position",
849 "/proc/qsnet/elan4/device0/position",
850 "/proc/qsnet/elan3/device0/position"]
852 if os.access(p, os.R_OK):
856 def sys_get_local_nid(net_type, wildcard, cluster_id):
857 """Return the local nid."""
859 if sys_get_elan_position_file():
860 local = sys_get_local_address('elan', '*', cluster_id)
862 local = sys_get_local_address(net_type, wildcard, cluster_id)
# Resolve this node's network address for the given net type:
#   tcp    - from the wildcard's interface via ifconfig, else hostname lookup
#   elan   - node id parsed from the qsnet 'position' proc file
#   gm     - unimplemented
#   scimac - queried from the Scali 'scinode' utility
# NOTE(review): sampled fragment — several branch/return lines are missing.
865 def sys_get_local_address(net_type, wildcard, cluster_id):
866 """Return the local address for the network type."""
868 if net_type in ('tcp',):
# wildcard of the form "iface:*": take the address of that interface.
870 iface, star = string.split(wildcard, ':')
871 local = if2addr(iface)
873 panic ("unable to determine ip for:", wildcard)
875 host = socket.gethostname()
876 local = socket.gethostbyname(host)
877 elif net_type == 'elan':
878 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
879 f = sys_get_elan_position_file()
881 panic ("unable to determine local Elan ID")
884 lines = fp.readlines()
# Combine cluster id and elan node id into the nid; parsing lines
# (orig. 885-891) are missing from this sampled view.
892 nid = my_int(cluster_id) + my_int(elan_id)
894 except ValueError, e:
898 elif net_type == 'gm':
899 fixme("automatic local address for GM")
900 elif net_type == 'scimac':
901 scinode="/opt/scali/sbin/scinode"
902 if os.path.exists(scinode):
903 (rc,local) = run(scinode)
905 panic (scinode, " not found on node with scimac networking")
907 panic (scinode, " failed")
# scinode's first output line, stripped of the trailing newline.
908 local=string.rstrip(local[0])
# True when *modname* appears as the first word of a /proc/modules line.
# NOTE(review): the try/except around the open and the return line are
# missing from this sampled view.
912 def mod_loaded(modname):
913 """Check if a module is already loaded. Look in /proc/modules for it."""
915 fp = open('/proc/modules')
916 lines = fp.readlines()
918 # please forgive my tired fingers for this one
# filter+map: collect first words of all lines, keep exact matches.
919 ret = filter(lambda word, mod=modname: word == mod,
920 map(lambda line: string.split(line)[0], lines))
925 # XXX: instead of device_list, ask for $name and see what we get
# True when lctl's device list already contains a device named *name*.
# NOTE(review): sampled fragment — the loop header over the output lines
# and the return statements are missing from this view.
926 def is_prepared(name):
927 """Return true if a device exists for the name"""
# During a dry-run/record cleanup pretend the device exists.
930 if (config.noexec or config.record) and config.cleanup:
933 # expect this format:
934 # 1 UP ldlm ldlm ldlm_UUID 2
935 out = lctl.device_list()
# Field 3 of each line is the device name.
937 if name == string.split(s)[3]:
939 except CommandError, e:
943 def is_network_prepared():
944 """If any device exists, then assume that all networking
945 has been configured"""
946 out = lctl.device_list()
949 def fs_is_mounted(path):
950 """Return true if path is a mounted lustre filesystem"""
952 fp = open('/proc/mounts')
953 lines = fp.readlines()
# /proc/mounts fields: device, mountpoint, fstype, ...
957 if a[1] == path and a[2] == 'lustre_lite':
965 """Manage kernel modules"""
966 def __init__(self, lustre_dir, portals_dir):
967 self.lustre_dir = lustre_dir
968 self.portals_dir = portals_dir
969 self.kmodule_list = []
971 def add_portals_module(self, dev_dir, modname):
972 """Append a module to list of modules to load."""
973 self.kmodule_list.append((self.portals_dir, dev_dir, modname))
975 def add_lustre_module(self, dev_dir, modname):
976 """Append a module to list of modules to load."""
977 self.kmodule_list.append((self.lustre_dir, dev_dir, modname))
979 def load_module(self):
980 """Load all the modules in the list in the order they appear."""
981 for src_dir, dev_dir, mod in self.kmodule_list:
982 if mod_loaded(mod) and not config.noexec:
984 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
986 module = find_module(src_dir, dev_dir, mod)
988 panic('module not found:', mod)
989 (rc, out) = run('/sbin/insmod', module)
991 raise CommandError('insmod', out, rc)
993 (rc, out) = run('/sbin/modprobe', mod)
995 raise CommandError('modprobe', out, rc)
997 def cleanup_module(self):
998 """Unload the modules in the list in reverse order."""
999 rev = self.kmodule_list
1001 for src_dir, dev_dir, mod in rev:
1002 if not mod_loaded(mod) and not config.noexec:
1005 if mod == 'portals' and config.dump:
1006 lctl.dump(config.dump)
1007 log('unloading module:', mod)
1008 (rc, out) = run('/sbin/rmmod', mod)
1010 log('! unable to unload module:', mod)
1013 # ============================================================
1014 # Classes to prepare and cleanup the various objects
1017 """ Base class for the rest of the modules. The default cleanup method is
1018 defined here, as well as some utilitiy funcs.
1020 def __init__(self, module_name, db):
1022 self.module_name = module_name
1023 self.name = self.db.getName()
1024 self.uuid = self.db.getUUID()
1027 self.kmod = kmod(config.lustre, config.portals)
1029 def info(self, *args):
1030 msg = string.join(map(str,args))
1031 print self.module_name + ":", self.name, self.uuid, msg
1034 """ default cleanup, used for most modules """
1037 lctl.cleanup(self.name, self.uuid, config.force)
1038 except CommandError, e:
1039 log(self.module_name, "cleanup failed: ", self.name)
1043 def add_portals_module(self, dev_dir, modname):
1044 """Append a module to list of modules to load."""
1045 self.kmod.add_portals_module(dev_dir, modname)
1047 def add_lustre_module(self, dev_dir, modname):
1048 """Append a module to list of modules to load."""
1049 self.kmod.add_lustre_module(dev_dir, modname)
1051 def load_module(self):
1052 """Load all the modules in the list in the order they appear."""
1053 self.kmod.load_module()
1055 def cleanup_module(self):
1056 """Unload the modules in the list in reverse order."""
1057 if self.safe_to_clean():
1058 self.kmod.cleanup_module()
1060 def safe_to_clean(self):
1063 def safe_to_clean_modules(self):
1064 return self.safe_to_clean()
1066 class Network(Module):
1067 def __init__(self,db):
1068 Module.__init__(self, 'NETWORK', db)
1069 self.net_type = self.db.get_val('nettype')
1070 self.nid = self.db.get_val('nid', '*')
1071 self.cluster_id = self.db.get_val('clusterid', "0")
1072 self.port = self.db.get_val_int('port', 0)
1073 self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
1074 self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
1075 self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
1078 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1080 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1081 self.generic_nid = 1
1082 debug("nid:", self.nid)
1084 self.generic_nid = 0
1086 self.nid_uuid = self.nid_to_uuid(self.nid)
1088 self.hostaddr = self.db.get_val('hostaddr', self.nid)
1089 if '*' in self.hostaddr:
1090 self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
1091 if not self.hostaddr:
1092 panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
1093 debug("hostaddr:", self.hostaddr)
1095 self.add_portals_module("libcfs", 'portals')
1096 if node_needs_router():
1097 self.add_portals_module("router", 'kptlrouter')
1098 if self.net_type == 'tcp':
1099 self.add_portals_module("knals/socknal", 'ksocknal')
1100 if self.net_type == 'elan':
1101 self.add_portals_module("knals/qswnal", 'kqswnal')
1102 if self.net_type == 'gm':
1103 self.add_portals_module("knals/gmnal", 'kgmnal')
1104 if self.net_type == 'scimac':
1105 self.add_portals_module("knals/scimacnal", 'kscimacnal')
1107 def nid_to_uuid(self, nid):
1108 return "NID_%s_UUID" %(nid,)
1111 if is_network_prepared():
1113 self.info(self.net_type, self.nid, self.port)
1114 if not (config.record and self.generic_nid):
1115 lctl.network(self.net_type, self.nid)
1116 if self.net_type == 'tcp':
1118 if self.net_type == 'elan':
1120 if self.port and node_is_router():
1121 run_one_acceptor(self.port)
1122 self.connect_peer_gateways()
1124 def connect_peer_gateways(self):
1125 for router in self.db.lookup_class('node'):
1126 if router.get_val_int('router', 0):
1127 for netuuid in router.get_networks():
1128 net = self.db.lookup(netuuid)
1130 if (gw.cluster_id == self.cluster_id and
1131 gw.net_type == self.net_type):
1132 if gw.nid != self.nid:
1135 def disconnect_peer_gateways(self):
1136 for router in self.db.lookup_class('node'):
1137 if router.get_val_int('router', 0):
1138 for netuuid in router.get_networks():
1139 net = self.db.lookup(netuuid)
1141 if (gw.cluster_id == self.cluster_id and
1142 gw.net_type == self.net_type):
1143 if gw.nid != self.nid:
1146 except CommandError, e:
1147 print "disconnect failed: ", self.name
1151 def safe_to_clean(self):
1152 return not is_network_prepared()
1155 self.info(self.net_type, self.nid, self.port)
1157 stop_acceptor(self.port)
1158 if node_is_router():
1159 self.disconnect_peer_gateways()
1161 class RouteTable(Module):
1162 def __init__(self,db):
1163 Module.__init__(self, 'ROUTES', db)
1165 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1167 # only setup connections for tcp NALs
1169 if not net_type in ('tcp',):
1172 # connect to target if route is to single node and this node is the gw
1173 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1174 if not local_cluster(net_type, tgt_cluster_id):
1175 panic("target", lo, " not on the local cluster")
1176 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1177 # connect to gateway if this node is not the gw
1178 elif (local_cluster(net_type, gw_cluster_id)
1179 and not local_interface(net_type, gw_cluster_id, gw)):
1180 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1185 panic("no server for nid", lo)
1188 return Network(srvdb)
1191 if is_network_prepared():
1194 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1195 lctl.add_route(net_type, gw, lo, hi)
1196 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1200 def safe_to_clean(self):
1201 return not is_network_prepared()
1204 if is_network_prepared():
1205 # the network is still being used, don't clean it up
1207 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1208 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1211 lctl.disconnect(srv)
1212 except CommandError, e:
1213 print "disconnect failed: ", self.name
1218 lctl.del_route(net_type, gw, lo, hi)
1219 except CommandError, e:
1220 print "del_route failed: ", self.name
1224 class Management(Module):
1225 def __init__(self, db):
1226 Module.__init__(self, 'MGMT', db)
1227 self.add_lustre_module('lvfs', 'lvfs')
1228 self.add_lustre_module('obdclass', 'obdclass')
1229 self.add_lustre_module('ptlrpc', 'ptlrpc')
1230 self.add_lustre_module('mgmt', 'mgmt_svc')
1233 if is_prepared(self.name):
1236 lctl.newdev("mgmt", self.name, self.uuid)
1238 def safe_to_clean(self):
1242 if is_prepared(self.name):
1243 Module.cleanup(self)
1245 # This is only needed to load the modules; the LDLM device
1246 # is now created automatically.
1248 def __init__(self,db):
1249 Module.__init__(self, 'LDLM', db)
1250 self.add_lustre_module('lvfs', 'lvfs')
1251 self.add_lustre_module('obdclass', 'obdclass')
1252 self.add_lustre_module('ptlrpc', 'ptlrpc')
1261 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1262 Module.__init__(self, 'LOV', db)
1263 if name_override != None:
1264 self.name = "lov_%s" % name_override
1265 self.add_lustre_module('lov', 'lov')
1266 self.mds_uuid = self.db.get_first_ref('mds')
1267 self.stripe_sz = self.db.get_val_int('stripesize', 65536)
1268 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1269 self.pattern = self.db.get_val_int('stripepattern', 0)
1270 self.devlist = self.db.get_refs('obd')
1271 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1273 self.desc_uuid = self.uuid
1274 self.uuid = generate_client_uuid(self.name)
1275 self.fs_name = fs_name
1277 self.config_only = 1
1279 self.config_only = None
1280 mds= self.db.lookup(self.mds_uuid)
1281 self.mds_name = mds.getName()
1282 for obd_uuid in self.devlist:
1283 obd = self.db.lookup(obd_uuid)
1284 osc = get_osc(obd, self.uuid, fs_name)
1286 self.osclist.append(osc)
1288 panic('osc not found:', obd_uuid)
1291 if is_prepared(self.name):
1293 if self.config_only:
1294 panic("Can't prepare config_only LOV ", self.name)
1296 for osc in self.osclist:
1298 # Only ignore connect failures with --force, which
1299 # isn't implemented here yet.
1300 osc.prepare(ignore_connect_failure=0)
1301 except CommandError, e:
1302 print "Error preparing OSC %s\n" % osc.uuid
1304 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1305 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1306 lctl.lov_setup(self.name, self.uuid,
1307 self.desc_uuid, self.mds_name, self.stripe_cnt,
1308 self.stripe_sz, self.stripe_off, self.pattern,
1309 string.join(self.devlist))
1312 if is_prepared(self.name):
1313 Module.cleanup(self)
1314 if self.config_only:
1315 panic("Can't clean up config_only LOV ", self.name)
1316 for osc in self.osclist:
1319 def load_module(self):
1320 if self.config_only:
1321 panic("Can't load modules for config_only LOV ", self.name)
1322 for osc in self.osclist:
1325 Module.load_module(self)
1327 def cleanup_module(self):
1328 if self.config_only:
1329 panic("Can't cleanup modules for config_only LOV ", self.name)
1330 Module.cleanup_module(self)
1331 for osc in self.osclist:
1332 osc.cleanup_module()
1335 class MDSDEV(Module):
1336 def __init__(self,db):
1337 Module.__init__(self, 'MDSDEV', db)
1338 self.devpath = self.db.get_val('devpath','')
1339 self.size = self.db.get_val_int('devsize', 0)
1340 self.journal_size = self.db.get_val_int('journalsize', 0)
1341 self.fstype = self.db.get_val('fstype', '')
1342 self.nspath = self.db.get_val('nspath', '')
1343 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1344 # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
1345 target_uuid = self.db.get_first_ref('target')
1346 mds = self.db.lookup(target_uuid)
1347 self.name = mds.getName()
1348 self.filesystem_uuids = mds.get_refs('filesystem')
1349 # FIXME: if fstype not set, then determine based on kernel version
1350 self.format = self.db.get_val('autoformat', "no")
1351 if mds.get_val('failover', 0):
1352 self.failover_mds = 'f'
1354 self.failover_mds = 'n'
1355 active_uuid = get_active_target(mds)
1357 panic("No target device found:", target_uuid)
1358 if active_uuid == self.uuid:
1362 if self.active and config.group and config.group != ost.get_val('group'):
1365 self.inode_size = self.db.get_val_int('inodesize', 0)
1366 if self.inode_size == 0:
1367 # find the LOV for this MDS
1368 lovconfig_uuid = mds.get_first_ref('lovconfig')
1369 if not lovconfig_uuid:
1370 panic("No LOV config found for MDS ", mds.name)
1371 lovconfig = mds.lookup(lovconfig_uuid)
1372 lov_uuid = lovconfig.get_first_ref('lov')
1374 panic("No LOV found for lovconfig ", lovconfig.name)
1375 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
1377 # default stripe count controls default inode_size
1378 stripe_count = lov.stripe_cnt
1379 if stripe_count > 77:
1380 self.inode_size = 4096
1381 elif stripe_count > 35:
1382 self.inode_size = 2048
1383 elif stripe_count > 13:
1384 self.inode_size = 1024
1385 elif stripe_count > 3:
1386 self.inode_size = 512
1388 self.inode_size = 256
1390 self.target_dev_uuid = self.uuid
1391 self.uuid = target_uuid
1393 self.add_lustre_module('mdc', 'mdc')
1394 self.add_lustre_module('osc', 'osc')
1395 self.add_lustre_module('lov', 'lov')
1396 self.add_lustre_module('mds', 'mds')
1398 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
# NOTE(review): residue of the MDSDEV class body. The leading "14xx" numbers are
# display artifacts and gaps in them show dropped lines (try:/else:/return and
# the `def prepare` header are missing). Comments below annotate intent only.
# load_module: pull in the fsfilt module for the configured fstype, then the
# generic module-load machinery.
1400 def load_module(self):
1402 Module.load_module(self)
# prepare (header missing in this residue): format/attach the MDS device and
# create the MDT upcall device, then the mds device itself.
1405 if is_prepared(self.name):
1408 debug(self.uuid, "not active")
1411 # run write_conf automatically, if --reformat used
1413 self.info(self.devpath, self.fstype, self.size, self.format)
1415 # never reformat here
1416 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1417 self.format, self.journal_size, self.inode_size,
1419 if not is_prepared('MDT'):
1420 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1422 lctl.newdev("mds", self.name, self.uuid,
1423 setup ="%s %s %s" %(blkdev, self.fstype, self.name))
1424 except CommandError, e:
# If setup fails the MDS likely has no config log yet; user must re-run
# with --write_conf.
1426 panic("MDS is missing the config log. Need to run " +
1427 "lconf --write_conf.")
# write_conf: set up the MDS device, then record client-mount config logs
# (one per filesystem, plus a "-clean" log per client) on it.
1431 def write_conf(self):
1432 if is_prepared(self.name):
1434 self.info(self.devpath, self.fstype, self.format)
1435 blkdev = block_dev(self.devpath, self.size, self.fstype,
1436 config.reformat, self.format, self.journal_size,
1437 self.inode_size, self.mkfsoptions)
1438 lctl.newdev("mds", self.name, self.uuid,
1439 setup ="%s %s" %(blkdev, self.fstype))
1441 # record logs for the MDS lov
1442 for uuid in self.filesystem_uuids:
1443 log("recording clients for filesystem:", uuid)
1444 fs = self.db.lookup(uuid)
1445 obd_uuid = fs.get_first_ref('obd')
1446 client_uuid = generate_client_uuid(self.name)
1447 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
# Record the setup log and the matching cleanup ("-clean") log.
1450 lctl.record(self.name, self.name)
1452 lctl.mount_option(self.name, client.get_name(), "")
1456 lctl.record(self.name, self.name + '-clean')
1458 lctl.del_mount_option(self.name)
1463 # record logs for each client
# Re-invoke lconf itself (sys.argv[0]) in --record mode for every client
# node profile that has a mountpoint or echoclient.
1465 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1467 config_options = CONFIG_FILE
1469 for node_db in self.db.lookup_class('node'):
1470 client_name = node_db.getName()
1471 for prof_uuid in node_db.get_refs('profile'):
1472 prof_db = node_db.lookup(prof_uuid)
1473 # refactor this into a function to test "clientness"
1475 for ref_class, ref_uuid in prof_db.get_all_refs():
1476 if ref_class in ('mountpoint','echoclient'):
1477 debug("recording", client_name)
1478 old_noexec = config.noexec
# Propagate -n (noexec) to the child lconf when set on the parent.
1480 noexec_opt = ('', '-n')
1481 ret, out = run (sys.argv[0],
1482 noexec_opt[old_noexec == 1],
1483 " -v --record --nomod",
1484 "--record_log", client_name,
1485 "--record_device", self.name,
1486 "--node", client_name,
1489 for s in out: log("record> ", string.strip(s))
1490 ret, out = run (sys.argv[0],
1491 noexec_opt[old_noexec == 1],
1492 "--cleanup -v --record --nomod",
1493 "--record_log", client_name + "-clean",
1494 "--record_device", self.name,
1495 "--node", client_name,
1498 for s in out: log("record> ", string.strip(s))
1499 config.noexec = old_noexec
# Tear the temporary MDS device back down after recording.
1501 lctl.cleanup(self.name, self.uuid, 0, 0)
1502 except CommandError, e:
1503 log(self.module_name, "cleanup failed: ", self.name)
1506 Module.cleanup(self)
1507 clean_loop(self.devpath)
# msd_remaining: true while any 'mds' device is still listed by lctl
# (missing lines presumably iterate `out` and return 1/0 — TODO confirm).
1509 def msd_remaining(self):
1510 out = lctl.device_list()
1512 if string.split(s)[2] in ('mds',):
1515 def safe_to_clean(self):
# Module unload is only safe once no mds devices remain.
1518 def safe_to_clean_modules(self):
1519 return not self.msd_remaining()
# cleanup (header missing): detach this MDS, and the shared MDT device once
# the last MDS is gone.
1523 debug(self.uuid, "not active")
1526 if is_prepared(self.name):
1528 lctl.cleanup(self.name, self.uuid, config.force,
1530 except CommandError, e:
1531 log(self.module_name, "cleanup failed: ", self.name)
1534 Module.cleanup(self)
1535 if not self.msd_remaining() and is_prepared('MDT'):
1537 lctl.cleanup("MDT", "MDT_UUID", config.force,
1539 except CommandError, e:
# NOTE(review): this message reports self.name although it is the shared
# MDT device being cleaned — misleading on failure.
1540 print "cleanup failed: ", self.name
1543 clean_loop(self.devpath)
# NOTE(review): residue of `class OSD(Module)` — the class header and several
# lines (else:/try:/return, `def prepare`/`def cleanup` headers) are missing.
# __init__: read the OST target configuration (device, sizes, fstype) and
# queue the kernel modules the OSD will need.
1546 def __init__(self, db):
1547 Module.__init__(self, 'OSD', db)
1548 self.osdtype = self.db.get_val('osdtype')
1549 self.devpath = self.db.get_val('devpath', '')
1550 self.size = self.db.get_val_int('devsize', 0)
1551 self.journal_size = self.db.get_val_int('journalsize', 0)
1552 self.inode_size = self.db.get_val_int('inodesize', 0)
# NOTE(review): mkfsoptions is a string option but is fetched with
# get_val_int and a '' default — looks like a bug; compare the MDS path
# which passes mkfsoptions straight to block_dev as a string. TODO confirm.
1553 self.mkfsoptions = self.db.get_val_int('mkfsoptions', '')
1554 self.fstype = self.db.get_val('fstype', '')
1555 self.nspath = self.db.get_val('nspath', '')
1556 target_uuid = self.db.get_first_ref('target')
1557 ost = self.db.lookup(target_uuid)
1558 self.name = ost.getName()
1559 self.format = self.db.get_val('autoformat', 'yes')
# 'f'/'n' failover flag is later passed to lctl device setup.
1560 if ost.get_val('failover', 0):
1561 self.failover_ost = 'f'
1563 self.failover_ost = 'n'
# Active-target selection: only the active device instance is set up.
1565 active_uuid = get_active_target(ost)
1567 panic("No target device found:", target_uuid)
1568 if active_uuid == self.uuid:
1572 if self.active and config.group and config.group != ost.get_val('group'):
1575 self.target_dev_uuid = self.uuid
1576 self.uuid = target_uuid
1578 self.add_lustre_module('ost', 'ost')
1579 # FIXME: should we default to ext3 here?
1581 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1582 self.add_lustre_module(self.osdtype, self.osdtype)
1584 def load_module(self):
1586 Module.load_module(self)
# prepare (header missing): format/attach the OST block device and create
# the shared OSS service device on first use.
1588 # need to check /proc/mounts and /etc/mtab before
1589 # formatting anything.
1590 # FIXME: check if device is already formatted.
1592 if is_prepared(self.name):
1595 debug(self.uuid, "not active")
1597 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1598 self.format, self.journal_size, self.inode_size)
# obdecho has no backing device; otherwise set up the block device.
1600 if self.osdtype == 'obdecho':
1603 blkdev = block_dev(self.devpath, self.size, self.fstype,
1604 config.reformat, self.format, self.journal_size,
1605 self.inode_size, self.mkfsoptions)
1606 lctl.newdev(self.osdtype, self.name, self.uuid,
1607 setup ="%s %s %s" %(blkdev, self.fstype,
1609 if not is_prepared('OSS'):
1610 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
# osd_remaining: true while any obdfilter/obdecho device is still listed.
1612 def osd_remaining(self):
1613 out = lctl.device_list()
1615 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1618 def safe_to_clean(self):
1621 def safe_to_clean_modules(self):
1622 return not self.osd_remaining()
# cleanup (header missing): detach this OSD, and the shared OSS device once
# the last OSD is gone; finally release any loop device backing it.
1626 debug(self.uuid, "not active")
1628 if is_prepared(self.name):
1631 lctl.cleanup(self.name, self.uuid, config.force,
1633 except CommandError, e:
1634 log(self.module_name, "cleanup failed: ", self.name)
1637 if not self.osd_remaining() and is_prepared('OSS'):
1639 lctl.cleanup("OSS", "OSS_UUID", config.force,
1641 except CommandError, e:
# NOTE(review): reports self.name though the OSS device failed — misleading.
1642 print "cleanup failed: ", self.name
1645 if not self.osdtype == 'obdecho':
1646 clean_loop(self.devpath)
# Map a mountpoint name to the mgmt-service UUID of its filesystem via the
# global `toplevel` db. NOTE(review): guard lines are missing from this
# residue — presumably early returns for a falsy mtpt_name and a failed fs
# lookup; TODO confirm against the full source.
1648 def mgmt_uuid_for_fs(mtpt_name):
1651 mtpt_db = toplevel.lookup_name(mtpt_name)
1652 fs_uuid = mtpt_db.get_first_ref('filesystem')
1653 fs = toplevel.lookup(fs_uuid)
1656 return fs.get_first_ref('mgmt')
1658 # Generic client module, used by OSC and MDC
# NOTE(review): residue — try:/else:/return lines are missing throughout.
1659 class Client(Module):
# __init__: resolve the active target device, derive a unique device name
# (MODULE_host_target_fs unless self_name overrides), and find the target's
# server networks.
1660 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
1662 self.target_name = tgtdb.getName()
1663 self.target_uuid = tgtdb.getUUID()
1666 self.tgt_dev_uuid = get_active_target(tgtdb)
1667 if not self.tgt_dev_uuid:
1668 panic("No target device found for target:", self.target_name)
1670 self.kmod = kmod(config.lustre, config.portals)
1674 self.module = module
1675 self.module_name = string.upper(module)
1677 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
1678 self.target_name, fs_name)
1680 self.name = self_name
1682 self.lookup_server(self.tgt_dev_uuid)
# Remember the management client name for this fs, if any.
1683 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
1685 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
1688 self.fs_name = fs_name
1691 self.add_lustre_module(module_dir, module)
1693 def lookup_server(self, srv_uuid):
1694 """ Lookup a server's network information """
1695 self._server_nets = get_ost_net(self.db, srv_uuid)
1696 if len(self._server_nets) == 0:
1697 panic ("Unable to find a server for:", srv_uuid)
1699 def get_servers(self):
1700 return self._server_nets
# prepare: connect to a local server if possible, otherwise add routes to
# reach it; then create the client device (marked inactive if listed in
# config.inactive and the subclass permits it).
1702 def prepare(self, ignore_connect_failure = 0):
1703 self.info(self.target_uuid)
1704 if is_prepared(self.name):
1707 srv = choose_local_server(self.get_servers())
1711 routes = find_route(self.get_servers())
1712 if len(routes) == 0:
1713 panic ("no route to", self.target_uuid)
1714 for (srv, r) in routes:
1715 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
1716 except CommandError, e:
1717 if not ignore_connect_failure:
1720 if self.target_uuid in config.inactive and self.permits_inactive():
1721 debug("%s inactive" % self.target_uuid)
1722 inactive_p = "inactive"
1724 debug("%s active" % self.target_uuid)
1726 lctl.newdev(self.module, self.name, self.uuid,
1727 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
1728 inactive_p, self.mgmt_name))
# cleanup (header missing): tear down the device, then disconnect or remove
# the routes added in prepare().
1731 if is_prepared(self.name):
1732 Module.cleanup(self)
1734 srv = choose_local_server(self.get_servers())
1736 lctl.disconnect(srv)
1738 for (srv, r) in find_route(self.get_servers()):
1739 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
1740 except CommandError, e:
1741 log(self.module_name, "cleanup failed: ", self.name)
# NOTE(review): residue of `class MDC(Client)` and `class OSC(Client)` —
# the class headers and the permits_inactive() return bodies are missing.
# MDC: metadata client for an MDS.
1747 def __init__(self, db, uuid, fs_name):
1748 Client.__init__(self, db, uuid, 'mdc', fs_name)
# Presumably returns 0 for MDC (an inactive MDS is not mountable) — TODO confirm.
1750 def permits_inactive(self):
# OSC: object storage client for an OST.
1754 def __init__(self, db, uuid, fs_name):
1755 Client.__init__(self, db, uuid, 'osc', fs_name)
# Presumably returns 1 for OSC (OSTs may be marked inactive) — TODO confirm.
1757 def permits_inactive(self):
def mgmtcli_name_for_uuid(uuid):
    """Return the canonical device name for the management client that
    attaches to the management service with the given UUID.

    The fixed 'MGMTCLI_<uuid>' scheme lets Client.__init__ and
    ManagementClient agree on the same device name.
    """
    return 'MGMTCLI_%s' % uuid
class ManagementClient(Client):
    """Client connection to a management service (the mgmt_cli module).

    The device name is derived from the management service's UUID via
    mgmtcli_name_for_uuid() so other code (e.g. Client.__init__) can
    locate it without holding a reference.
    """
    def __init__(self, db, uuid):
        # Empty fs_name: a management client is not tied to one filesystem.
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = mgmtcli_name_for_uuid(db.getUUID()),
                        module_dir = 'mgmt')
# NOTE(review): residue of `class COBD(Module)` — the class header, the
# `def prepare` header and return lines are missing.
# __init__: remember the UUIDs of the real and the caching OBD devices.
1770 def __init__(self, db):
1771 Module.__init__(self, 'COBD', db)
1772 self.real_uuid = self.db.get_first_ref('realobd')
1773 self.cache_uuid = self.db.get_first_ref('cacheobd')
1774 self.add_lustre_module('cobd' , 'cobd')
# prepare (header missing): create the cobd device layered over the real
# and cache OBDs.
1776 # need to check /proc/mounts and /etc/mtab before
1777 # formatting anything.
1778 # FIXME: check if device is already formatted.
1780 if is_prepared(self.name):
1782 self.info(self.real_uuid, self.cache_uuid)
1783 lctl.newdev("cobd", self.name, self.uuid,
1784 setup ="%s %s" %(self.real_uuid, self.cache_uuid))
1787 # virtual interface for OSC and LOV
# NOTE(review): residue of `class VOSC(Module)` — the class header, the
# get_uuid/get_name def headers and several delegating methods are missing.
# VOSC wraps either a LOV (striped) or a plain OSC and forwards calls to it.
1789 def __init__(self, db, uuid, fs_name, name_override = None):
1790 Module.__init__(self, 'VOSC', db)
1791 if db.get_class() == 'lov':
1792 self.osc = LOV(db, uuid, fs_name, name_override)
1794 self.osc = get_osc(db, uuid, fs_name)
# Accessors delegating to the wrapped object (def headers missing here).
1796 return self.osc.uuid
1798 return self.osc.name
1803 def load_module(self):
1804 self.osc.load_module()
1805 def cleanup_module(self):
1806 self.osc.cleanup_module()
# NOTE(review): residue — prepare/cleanup def headers and return lines are
# missing from this chunk.
1809 class ECHO_CLIENT(Module):
# __init__: build the underlying VOSC client the echo_client will drive.
1810 def __init__(self,db):
1811 Module.__init__(self, 'ECHO_CLIENT', db)
1812 self.add_lustre_module('obdecho', 'obdecho')
1813 self.obd_uuid = self.db.get_first_ref('obd')
1814 obd = self.db.lookup(self.obd_uuid)
1815 self.uuid = generate_client_uuid(self.name)
1816 self.osc = VOSC(obd, self.uuid, self.name)
# prepare (header missing): set up the wrapped OSC first, then the
# echo_client device on top of it.
1819 if is_prepared(self.name):
1822 self.osc.prepare() # XXX This is so cheating. -p
1823 self.info(self.obd_uuid)
1825 lctl.newdev("echo_client", self.name, self.uuid,
1826 setup = self.osc.get_name())
# cleanup (header missing): tear down in reverse order.
1829 if is_prepared(self.name):
1830 Module.cleanup(self)
1833 def load_module(self):
1834 self.osc.load_module()
1835 Module.load_module(self)
1837 def cleanup_module(self):
1838 Module.cleanup_module(self)
1839 self.osc.cleanup_module()
def generate_client_uuid(name):
    """Create a pseudo-random client UUID that embeds *name*.

    Layout: 5 hex digits, '_', name truncated to 19 chars, '_', 10 hex
    digits; the result is clamped to the 36-character UUID limit.
    Randomness comes from random.random() (seeded from /dev/urandom in
    main), so two runs produce different names for the same target.
    """
    # The format string has four conversions; the residue had dropped the
    # `name` argument line, which would raise "not enough arguments for
    # format string" at runtime — restored here.
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
                                           name,
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    return client_uuid[:36]
# NOTE(review): residue — prepare/cleanup def headers, if/else and return
# lines are missing from this chunk.
1850 class Mountpoint(Module):
# __init__: resolve the filesystem's MDS/OBD/mgmt references and build the
# client-side stack (VOSC + MDC, optional management client).
1851 def __init__(self,db):
1852 Module.__init__(self, 'MTPT', db)
1853 self.path = self.db.get_val('path')
1854 self.fs_uuid = self.db.get_first_ref('filesystem')
1855 fs = self.db.lookup(self.fs_uuid)
1856 self.mds_uuid = fs.get_first_ref('mds')
1857 self.obd_uuid = fs.get_first_ref('obd')
1858 self.mgmt_uuid = fs.get_first_ref('mgmt')
1859 obd = self.db.lookup(self.obd_uuid)
1860 client_uuid = generate_client_uuid(self.name)
1861 self.vosc = VOSC(obd, client_uuid, self.name)
1862 self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
1864 self.add_lustre_module('mdc', 'mdc')
1865 self.add_lustre_module('llite', 'llite')
1867 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
# prepare (header missing): set up osc/mdc clients then run the actual
# mount(8) command, or only record the mount option when in record/dump mode.
1873 if fs_is_mounted(self.path):
1874 log(self.path, "already mounted.")
1878 self.mgmtcli.prepare()
1881 mdc_name = self.mdc.name
1883 self.info(self.path, self.mds_uuid, self.obd_uuid)
1884 if config.record or config.lctl_dump:
1885 lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
1887 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
1888 (self.vosc.get_name(), mdc_name, config.config, self.path)
1889 run("mkdir", self.path)
1894 panic("mount failed:", self.path, ":", string.join(val))
# cleanup (header missing): umount (force first when config.force), verify
# it is gone, then tear the client stack down.
1897 self.info(self.path, self.mds_uuid,self.obd_uuid)
1899 if config.record or config.lctl_dump:
1900 lctl.del_mount_option(local_node_name)
1902 if fs_is_mounted(self.path):
1904 (rc, out) = run("umount", "-f", self.path)
1906 (rc, out) = run("umount", self.path)
1908 raise CommandError('umount', out, rc)
1910 if fs_is_mounted(self.path):
1911 panic("fs is still mounted:", self.path)
1916 self.mgmtcli.cleanup()
1918 def load_module(self):
1920 self.mgmtcli.load_module()
1921 self.vosc.load_module()
1922 Module.load_module(self)
1924 def cleanup_module(self):
1925 Module.cleanup_module(self)
1926 self.vosc.cleanup_module()
1928 self.mgmtcli.cleanup_module()
1931 # ============================================================
1932 # misc query functions
# NOTE(review): residue — docstrings, accumulator initialisations, return
# statements and level constants are missing from these functions.
# get_ost_net: collect Network objects for every network interface of the
# node hosting the given OSD. `self` is the lustre db.
1934 def get_ost_net(self, osd_uuid):
1938 osd = self.lookup(osd_uuid)
1939 node_uuid = osd.get_first_ref('node')
1940 node = self.lookup(node_uuid)
1942 panic("unable to find node for osd_uuid:", osd_uuid,
1943 " node_ref:", node_uuid)
1944 for net_uuid in node.get_networks():
1945 db = node.lookup(net_uuid)
1946 srv_list.append(Network(db))
1950 # the order of initialization is based on level.
# getServiceLevel: map a service class to its startup level (the numeric
# level lines are missing here); levels outside [minlevel, maxlevel] are
# filtered out.
1951 def getServiceLevel(self):
1952 type = self.get_class()
1954 if type in ('network',):
1956 elif type in ('routetbl',):
1958 elif type in ('ldlm',):
1960 elif type in ('mgmt',):
1962 elif type in ('osd', 'cobd'):
1964 elif type in ('mdsdev',):
1966 elif type in ('mountpoint', 'echoclient'):
1969 panic("Unknown type: ", type)
1971 if ret < config.minlevel or ret > config.maxlevel:
1976 # return list of services in a profile. list is a list of tuples
1977 # [(level, db_object),]
1978 def getServices(self):
1980 for ref_class, ref_uuid in self.get_all_refs():
1981 servdb = self.lookup(ref_uuid)
1983 level = getServiceLevel(servdb)
1985 list.append((level, servdb))
1987 panic('service not found: ' + ref_uuid)
1993 ############################################################
1995 # FIXME: clean this mess up!
1997 # OSC is no longer in the xml, so we have to fake it.
1998 # this is getting ugly and begging for another refactoring
# NOTE(review): the `return osc` / `return mdc` lines are missing from
# this residue.
1999 def get_osc(ost_db, uuid, fs_name):
2000 osc = OSC(ost_db, uuid, fs_name)
# get_mdc: build an MDC for the given MDS UUID, panicking if it is absent.
2003 def get_mdc(db, uuid, fs_name, mds_uuid):
2004 mds_db = db.lookup(mds_uuid);
2006 panic("no mds:", mds_uuid)
2007 mdc = MDC(mds_db, uuid, fs_name)
2010 ############################################################
2011 # routing ("rooting")
# NOTE(review): residue — global list initialisations, try:/else:/return
# lines are missing throughout this routing section.
2013 # list of (nettype, cluster_id, nid)
# find_local_clusters: record this node's (nettype, cluster, nid) triples
# and register a TCP acceptor per distinct port.
2016 def find_local_clusters(node_db):
2017 global local_clusters
2018 for netuuid in node_db.get_networks():
2019 net = node_db.lookup(netuuid)
2021 debug("add_local", netuuid)
2022 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
2024 if acceptors.has_key(srv.port):
2025 panic("duplicate port:", srv.port)
2026 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
2027 srv.send_mem, srv.recv_mem,
2030 # This node is a gateway.
2032 def node_is_router():
2035 # If there are any routers found in the config, then this will be true
2036 # and all nodes will load kptlrouter.
2038 def node_needs_router():
2039 return needs_router or is_router
2041 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2042 # Currently, these local routes are only added to kptlrouter route
2043 # table if they are needed to connect to a specific server. This
2044 # should be changed so all available routes are loaded, and the
2045 # ptlrouter can make all the decisions.
2048 def find_local_routes(lustre):
2049 """ Scan the lustre config looking for routers . Build list of
2051 global local_routes, needs_router
2053 list = lustre.lookup_class('node')
2055 if router.get_val_int('router', 0):
2057 for (local_type, local_cluster_id, local_nid) in local_clusters:
2059 for netuuid in router.get_networks():
2060 db = router.lookup(netuuid)
# A router reachable from one of our local clusters becomes a gateway.
2061 if (local_type == db.get_val('nettype') and
2062 local_cluster_id == db.get_val('clusterid')):
2063 gw = db.get_val('nid')
2066 debug("find_local_routes: gw is", gw)
2067 for route in router.get_local_routes(local_type, gw):
2068 local_routes.append(route)
2069 debug("find_local_routes:", local_routes)
# choose_local_server: first server on a cluster we are directly on
# (the `return srv` line is missing here).
2072 def choose_local_server(srv_list):
2073 for srv in srv_list:
2074 if local_cluster(srv.net_type, srv.cluster_id):
2077 def local_cluster(net_type, cluster_id):
2078 for cluster in local_clusters:
2079 if net_type == cluster[0] and cluster_id == cluster[1]:
2083 def local_interface(net_type, cluster_id, nid):
2084 for cluster in local_clusters:
2085 if (net_type == cluster[0] and cluster_id == cluster[1]
2086 and nid == cluster[2]):
# find_route: match each server against the local route table entries
# (r = (nettype, gw, tgt_cluster_id, lo, hi)).
2090 def find_route(srv_list):
2092 frm_type = local_clusters[0][0]
2093 for srv in srv_list:
2094 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
2095 to_type = srv.net_type
2097 cluster_id = srv.cluster_id
2098 debug ('looking for route to', to_type, to)
2099 for r in local_routes:
2100 debug("find_route: ", r)
2101 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
2102 result.append((srv, r))
# get_active_target: honour --select overrides, else the 'active' ref.
2105 def get_active_target(db):
2106 target_uuid = db.getUUID()
2107 target_name = db.getName()
2108 node_name = get_select(target_name)
2110 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
2112 tgt_dev_uuid = db.get_first_ref('active')
2115 def get_server_by_nid_uuid(db, nid_uuid):
2116 for n in db.lookup_class("network"):
2118 if net.nid_uuid == nid_uuid:
2122 ############################################################
# newService (the `def newService(db)` header and most constructor calls
# are missing from this residue): factory mapping a config-db class name
# to the matching Module subclass instance.
2126 type = db.get_class()
2127 debug('Service:', type, db.getName(), db.getUUID())
2132 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2133 elif type == 'network':
2135 elif type == 'routetbl':
2139 elif type == 'cobd':
2141 elif type == 'mdsdev':
2143 elif type == 'mountpoint':
2145 elif type == 'echoclient':
2147 elif type == 'mgmt':
2150 panic ("unknown service type:", type)
2154 # Prepare the system to run lustre using a particular profile
2155 # in the configuration.
2156 # * load the modules
2157 # * setup networking for the current node
2158 # * make sure partitions are in place and prepared
2159 # * initialize devices with lctl
2160 # Levels is important, and needs to be enforced.
# NOTE(review): residue — the operation(services) calls, sorting and loop
# bodies are missing from the do* helpers below.
2161 def for_each_profile(db, prof_list, operation):
2162 for prof_uuid in prof_list:
2163 prof_db = db.lookup(prof_uuid)
2165 panic("profile:", profile, "not found.")
2166 services = getServices(prof_db)
# doWriteconf: run write_conf() only on mdsdev services.
2169 def doWriteconf(services):
2173 if s[1].get_class() == 'mdsdev':
2174 n = newService(s[1])
# doSetup: prepare() each service in level order.
2177 def doSetup(services):
2181 n = newService(s[1])
# doModules: load kernel modules for each service.
2184 def doModules(services):
2188 n = newService(s[1])
# doCleanup: cleanup() each service (reverse level order, presumably),
# guarded by safe_to_clean().
2191 def doCleanup(services):
2196 n = newService(s[1])
2197 if n.safe_to_clean():
# doUnloadModules: unload modules once the service says it is safe.
2200 def doUnloadModules(services):
2205 n = newService(s[1])
2206 if n.safe_to_clean_modules():
# doHost: top-level driver for one node — find the matching host entry,
# read node-level tunables, then run the requested phase (write_conf,
# recover, cleanup, or the default two-step module-load + setup).
# NOTE(review): residue — for/if/else/try lines are missing throughout.
2211 def doHost(lustreDB, hosts):
2212 global is_router, local_node_name
2215 node_db = lustreDB.lookup_name(h, 'node')
2219 print 'No host entry found.'
2222 local_node_name = node_db.get_val('name', 0)
2223 is_router = node_db.get_val_int('router', 0)
2224 lustre_upcall = node_db.get_val('lustreUpcall', '')
2225 portals_upcall = node_db.get_val('portalsUpcall', '')
2226 timeout = node_db.get_val_int('timeout', 0)
2227 ptldebug = node_db.get_val('ptldebug', '')
2228 subsystem = node_db.get_val('subsystem', '')
2230 find_local_clusters(node_db)
2232 find_local_routes(lustreDB)
2234 # Two step process: (1) load modules, (2) setup lustre
2235 # if not cleaning, load modules first.
2236 prof_list = node_db.get_refs('profile')
2238 if config.write_conf:
2239 for_each_profile(node_db, prof_list, doModules)
2241 for_each_profile(node_db, prof_list, doWriteconf)
2242 for_each_profile(node_db, prof_list, doUnloadModules)
2244 elif config.recover:
2245 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
2246 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
2247 "--client_uuid <UUID> --conn_uuid <UUID>")
2248 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
2250 elif config.cleanup:
2252 # the command line can override this value
2254 # ugly hack, only need to run lctl commands for --dump
2255 if config.lctl_dump or config.record:
2256 for_each_profile(node_db, prof_list, doCleanup)
# Normal cleanup: push tunables first so force-cleanup behaves.
2259 sys_set_timeout(timeout)
2260 sys_set_ptldebug(ptldebug)
2261 sys_set_subsystem(subsystem)
2262 sys_set_lustre_upcall(lustre_upcall)
2263 sys_set_portals_upcall(portals_upcall)
2265 for_each_profile(node_db, prof_list, doCleanup)
2266 for_each_profile(node_db, prof_list, doUnloadModules)
2269 # ugly hack, only need to run lctl commands for --dump
2270 if config.lctl_dump or config.record:
2271 sys_set_timeout(timeout)
2272 sys_set_lustre_upcall(lustre_upcall)
2273 for_each_profile(node_db, prof_list, doSetup)
# Raise TCP buffer limits before loading modules.
2277 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2278 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2280 for_each_profile(node_db, prof_list, doModules)
2282 sys_set_debug_path()
2283 sys_set_ptldebug(ptldebug)
2284 sys_set_subsystem(subsystem)
2285 script = config.gdb_script
2286 run(lctl.lctl, ' modules >', script)
2288 log ("The GDB module script is in", script)
2289 # pause, so user has time to break and
2292 sys_set_timeout(timeout)
2293 sys_set_lustre_upcall(lustre_upcall)
2294 sys_set_portals_upcall(portals_upcall)
2296 for_each_profile(node_db, prof_list, doSetup)
# doRecovery: fail a client over to the currently-active target — find the
# new server, best-effort disconnect the old connection, reconnect, then
# tell lctl to recover the client. NOTE(review): if/try lines are missing
# from this residue.
2298 def doRecovery(db, lctl, tgt_uuid, client_uuid, nid_uuid):
2299 tgt = db.lookup(tgt_uuid)
2301 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
2302 new_uuid = get_active_target(tgt)
2304 raise Lustre.LconfError("doRecovery: no active target found for: " +
2306 net = choose_local_server(get_ost_net(db, new_uuid))
2308 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
2310 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
2312 oldnet = get_server_by_nid_uuid(db, nid_uuid)
# Disconnect of the stale connection is best-effort: log and continue.
2314 lctl.disconnect(oldnet)
2315 except CommandError, e:
2316 log("recover: disconnect", nid_uuid, "failed: ")
2321 except CommandError, e:
2322 log("recover: connect failed")
2325 lctl.recover(client_uuid, net.nid_uuid)
# setupModulePath: in development mode derive config.lustre/config.portals
# from the lconf binary's location; otherwise just normalize a relative
# --portals against --lustre. NOTE(review): an `if config.portals:` guard
# line is missing before 2335.
2328 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2329 base = os.path.dirname(cmd)
2330 if development_mode():
2331 if not config.lustre:
2332 config.lustre = (os.path.join(base, ".."))
2333 # normalize the portals dir, using command line arg if set
2335 portals_dir = config.portals
2336 dir = os.path.join(config.lustre, portals_dir)
2337 config.portals = dir
2338 debug('config.portals', config.portals)
2339 elif config.lustre and config.portals:
2341 # if --lustre and --portals, normalize portals
2342 # can ignore PORTALS_DIR here, since it is probably useless here
2343 config.portals = os.path.join(config.lustre, config.portals)
2344 debug('config.portals B', config.portals)
# sysctl: write `val` to /proc/sys/<path>. NOTE(review): the noexec guard,
# the fp.write and fp.close lines are missing from this residue.
2346 def sysctl(path, val):
2347 debug("+ sysctl", path, val)
2351 fp = open(os.path.join('/proc/sys', path), 'w')
def sys_set_debug_path():
    """Point the portals kernel debug-dump path at the configured
    location (--debug_path, defaulting to default_debug_path())."""
    sysctl('portals/debug_path', config.debug_path)
# Upcall setters: command-line --lustre_upcall/--portals_upcall override
# the node-config value; NOTE(review): the `elif config.upcall:` lines
# (the shared --upcall fallback) are missing from this residue.
2361 def sys_set_lustre_upcall(upcall):
2362 # the command overrides the value in the node config
2363 if config.lustre_upcall:
2364 upcall = config.lustre_upcall
2366 upcall = config.upcall
2368 lctl.set_lustre_upcall(upcall)
2370 def sys_set_portals_upcall(upcall):
2371 # the command overrides the value in the node config
2372 if config.portals_upcall:
2373 upcall = config.portals_upcall
2375 upcall = config.upcall
2377 sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Set the lustre recovery timeout.

    The --timeout command-line value overrides the node-config value;
    only positive timeouts are pushed to lctl.
    """
    # the command overrides the value in the node config
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    # `is not None` instead of `!= None`: identity test is the correct
    # idiom for None.
    if timeout is not None and timeout > 0:
        lctl.set_timeout(timeout)
def sys_tweak_socknal():
    """With --single_socket, tell socknal to use a single untyped socket
    per peer instead of separate typed sockets."""
    if config.single_socket:
        sysctl("socknal/typed", 0)
def sys_optimize_elan():
    """Set the event-interrupt punt-loops tunable to 0 on Quadrics Elan
    adapters, for every generation that exposes it in /proc.

    The residue had lost the loop header over the candidate proc files;
    restored — each existing file gets the same treatment.
    """
    procfiles = ["/proc/elan/config/eventint_punt_loops",
                 "/proc/qsnet/elan3/config/eventint_punt_loops",
                 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
    for p in procfiles:
        if os.access(p, os.R_OK):
            run("echo 0 > " + p)
# Translate a symbolic debug mask expression into a hex value and push it
# to /proc. NOTE(review): the `if config.ptldebug:` override, try: lines
# and the except-branch panic bodies are missing from this residue.
# NOTE(review): eval() on the config string is deliberate here (masks are
# expressions over ptldebug_names) but will execute arbitrary Python from
# the config — acceptable only because configs are trusted.
2398 def sys_set_ptldebug(ptldebug):
2400 ptldebug = config.ptldebug
2403 val = eval(ptldebug, ptldebug_names)
2404 val = "0x%x" % (val)
2405 sysctl('portals/debug', val)
2406 except NameError, e:
# Same pattern for the subsystem mask.
2409 def sys_set_subsystem(subsystem):
2410 if config.subsystem:
2411 subsystem = config.subsystem
2414 val = eval(subsystem, subsystem_names)
2415 val = "0x%x" % (val)
2416 sysctl('portals/subsystem_debug', val)
2417 except NameError, e:
# Raise a /proc/sys/net/core buffer limit to at least `max`.
# NOTE(review): the lines reading the current value and comparing it to
# `max` are missing from this residue; presumably the write only happens
# when the current value is smaller — TODO confirm.
2420 def sys_set_netmem_max(path, max):
2421 debug("setting", path, "to at least", max)
2429 fp = open(path, 'w')
2430 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the portals (10,240) and obd (10,241) control device nodes
    if they are not already present and readable."""
    for node, mknod_cmd in (('/dev/portals', 'mknod /dev/portals c 10 240'),
                            ('/dev/obd', 'mknod /dev/obd c 10 241')):
        if not os.access(node, os.R_OK):
            run(mknod_cmd)
# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    """Append *new_dir* to os.environ['PATH'] unless it is already one of
    the PATH components.

    The residue had lost the early-return line after the membership test;
    restored. Also uses the str.split method instead of the long-
    deprecated `string.split` function (behavior identical).
    """
    syspath = os.environ['PATH'].split(':')
    if new_dir in syspath:
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
def default_debug_path():
    """Default path for kernel debug dumps.

    Prefixes '/r' when that directory exists — presumably the UML/chroot
    test-environment convention; TODO confirm. The residue had lost the
    return lines; restored to mirror default_gdb_script().
    """
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
        return '/r' + path
    else:
        return path
def default_gdb_script():
    """Default path for the generated gdb module script, with the same
    '/r' preference as default_debug_path(). The residue had lost the
    else-branch return; restored."""
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    else:
        return script
DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
def sanitise_path():
    """Append each DEFAULT_PATH directory to PATH (add_to_path skips
    ones already present). The residue had lost the loop-body call;
    restored."""
    for dir in DEFAULT_PATH:
        add_to_path(dir)
2469 # global hack for the --select handling
# NOTE(review): residue — the tgt_select dict initialisation, the outer
# `for arg in args:`/`for entry in list:` headers and the get_select
# fallback return are missing.
2471 def init_select(args):
2472 # args = [service=nodeA,service2=nodeB service3=nodeC]
2475 list = string.split(arg, ',')
2477 srv, node = string.split(entry, '=')
2478 tgt_select[srv] = node
# get_select: node chosen for a service via --select, else a default
# (missing fallback line presumably returns None) — TODO confirm.
2480 def get_select(srv):
2481 if tgt_select.has_key(srv):
2482 return tgt_select[srv]
# Option-table residue: the `lconf_options = [` opening line, several FLAG
# defaults and the closing bracket are missing. Each entry is
# (name[,short], help[, type[, default]]) consumed by Lustre.Options.
# NOTE(review): help-string typos ("aproximatly", "can used") are runtime
# strings and deliberately left untouched here.
2486 FLAG = Lustre.Options.FLAG
2487 PARAM = Lustre.Options.PARAM
2488 INTPARAM = Lustre.Options.INTPARAM
2489 PARAMLIST = Lustre.Options.PARAMLIST
2491 ('verbose,v', "Print system commands as they are run"),
2492 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
2493 ('config', "Cluster config name used for LDAP query", PARAM),
2494 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
2495 ('node', "Load config for <nodename>", PARAM),
2496 ('cleanup,d', "Cleans up config. (Shutdown)"),
2497 ('force,f', "Forced unmounting and/or obd detach during cleanup",
2499 ('single_socket', "socknal option: only use one socket instead of bundle",
2501 ('failover',"""Used to shut down without saving state.
2502 This will allow this node to "give up" a service to a
2503 another node for failover purposes. This will not
2504 be a clean shutdown.""",
2506 ('gdb', """Prints message after creating gdb module script
2507 and sleeps for 5 seconds."""),
2508 ('noexec,n', """Prints the commands and steps that will be run for a
2509 config without executing them. This can used to check if a
2510 config file is doing what it should be doing"""),
2511 ('nomod', "Skip load/unload module step."),
2512 ('nosetup', "Skip device setup/cleanup step."),
2513 ('reformat', "Reformat all devices (without question)"),
2514 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
2515 ('dump', "Dump the kernel debug log to file before portals is unloaded",
2517 ('write_conf', "Save all the client config information on mds."),
2518 ('record', "Write config information on mds."),
2519 ('record_log', "Name of config record log.", PARAM),
2520 ('record_device', "MDS device name that will record the config commands",
2522 ('minlevel', "Minimum level of services to configure/cleanup",
2524 ('maxlevel', """Maximum level of services to configure/cleanup
2525 Levels are aproximatly like:
2530 70 - mountpoint, echo_client, osc, mdc, lov""",
2532 ('lustre', """Base directory of lustre sources. This parameter will
2533 cause lconf to load modules from a source tree.""", PARAM),
2534 ('portals', """Portals source directory. If this is a relative path,
2535 then it is assumed to be relative to lustre. """, PARAM),
2536 ('timeout', "Set recovery timeout", INTPARAM),
2537 ('upcall', "Set both portals and lustre upcall script", PARAM),
2538 ('lustre_upcall', "Set lustre upcall script", PARAM),
2539 ('portals_upcall', "Set portals upcall script", PARAM),
2540 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
2541 ('ptldebug', "Set the portals debug level", PARAM),
2542 ('subsystem', "Set the portals debug subsystem", PARAM),
2543 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
2544 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
2545 # Client recovery options
2546 ('recover', "Recover a device"),
2547 ('group', "The group of devices to configure or cleanup", PARAM),
2548 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
2549 ('client_uuid', "The failed client (required for recovery)", PARAM),
2550 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
2552 ('inactive', """The name of an inactive service, to be ignored during
2553 mounting (currently OST-only). Can be repeated.""",
# main() body residue (the `def main():` header, try:/except bodies and
# several sys.exit/else lines are missing): parse options, seed the PRNG
# from /dev/urandom, load the config from XML or LDAP, sanity-check its
# version, then drive doHost() for this node.
2558 global lctl, config, toplevel, CONFIG_FILE
2560 # in the upcall this is set to SIG_IGN
2561 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
2563 cl = Lustre.Options("lconf", "config.xml", lconf_options)
2565 config, args = cl.parse(sys.argv[1:])
2566 except Lustre.OptionError, e:
2570 setupModulePath(sys.argv[0])
2572 host = socket.gethostname()
2574 # the PRNG is normally seeded with time(), which is not so good for starting
2575 # time-synchronized clusters
2576 input = open('/dev/urandom', 'r')
2578 print 'Unable to open /dev/urandom!'
2580 seed = input.read(32)
2586 init_select(config.select)
# XML config file path given on the command line.
2589 if not os.access(args[0], os.R_OK):
2590 print 'File not found or readable:', args[0]
2593 dom = xml.dom.minidom.parse(args[0])
2595 panic("%s does not appear to be a config file." % (args[0]))
2596 sys.exit(1) # make sure to die here, even in debug mode.
2597 CONFIG_FILE = args[0]
2598 db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
2599 if not config.config:
2600 config.config = os.path.basename(args[0])# use full path?
2601 if config.config[-4:] == '.xml':
2602 config.config = config.config[:-4]
# Alternatively, fetch the config from an LDAP server.
2603 elif config.ldapurl:
2604 if not config.config:
2605 panic("--ldapurl requires --config name")
2606 dn = "config=%s,fs=lustre" % (config.config)
2607 db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
# With only --ptldebug/--subsystem, just set the masks and exit.
2608 elif config.ptldebug or config.subsystem:
2609 sys_set_ptldebug(None)
2610 sys_set_subsystem(None)
2613 print 'Missing config file or ldap URL.'
2614 print 'see lconf --help for command summary'
2619 ver = db.get_version()
2621 panic("No version found in config data, please recreate.")
2622 if ver != Lustre.CONFIG_VERSION:
2623 panic("Config version", ver, "does not match lconf version",
2624 Lustre.CONFIG_VERSION)
# Node selection: explicit --node, else hostname then 'localhost'.
2628 node_list.append(config.node)
2631 node_list.append(host)
2632 node_list.append('localhost')
2634 debug("configuring for host: ", node_list)
# Per-host suffix keeps parallel nodes from clobbering shared files.
2637 config.debug_path = config.debug_path + '-' + host
2638 config.gdb_script = config.gdb_script + '-' + host
2640 lctl = LCTLInterface('lctl')
2642 if config.lctl_dump:
2643 lctl.use_save_file(config.lctl_dump)
2646 if not (config.record_device and config.record_log):
2647 panic("When recording, both --record_log and --record_device must be specified.")
2648 lctl.record(config.record_device, config.record_log)
2650 doHost(db, node_list)
# Script entry point residue: the try:, main() call and per-exception
# handling bodies are missing; LconfError/CommandError are caught and
# reported, and the first cleanup error (if any) becomes the exit status.
2655 if __name__ == "__main__":
2658 except Lustre.LconfError, e:
2660 # traceback.print_exc(file=sys.stdout)
2662 except CommandError, e:
2666 if first_cleanup_error:
2667 sys.exit(first_cleanup_error)