3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile.am", os.R_OK):
46 if not development_mode():
47 sys.path.append(PYMOD_DIR)
53 DEFAULT_TCPBUF = 8388608
56 # Maximum number of devices to search for.
57 # (the /dev/loop* nodes need to be created beforehand)
58 MAX_LOOP_DEVICES = 256
59 PORTALS_DIR = 'portals'
61 # Needed to call lconf --record
64 # Please keep these in sync with the values in portals/kp30.h
76 "warning" : (1 << 10),
80 "portals" : (1 << 14),
82 "dlmtrace" : (1 << 16),
86 "rpctrace" : (1 << 20),
87 "vfstrace" : (1 << 21),
92 "undefined" : (1 << 0),
102 "portals" : (1 << 10),
103 "socknal" : (1 << 11),
104 "qswnal" : (1 << 12),
105 "pinger" : (1 << 13),
106 "filter" : (1 << 14),
112 "ptlrouter" : (1 << 20),
# Remember only the first failure seen during cleanup, so the exit
# status reflects the earliest problem rather than the last one.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record *rc* as the overall cleanup status if none is set yet."""
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
124 # ============================================================
125 # debugging and error funcs
def fixme(msg = "this feature"):
    """Raise an LconfError reporting that *msg* is not implemented.

    Fixes the misspelled error message ("implmemented") and uses the
    exception-call raise form, which is valid in both Python 2 and 3,
    instead of the Python-2-only ``raise E, arg`` statement.
    """
    raise Lustre.LconfError(msg + ' not implemented yet.')
131 msg = string.join(map(str,args))
132 if not config.noexec:
133 raise Lustre.LconfError(msg)
138 msg = string.join(map(str,args))
143 print string.strip(s)
147 msg = string.join(map(str,args))
150 # ack, python's builtin int() does not support '0x123' syntax.
151 # eval can do it, although what a hack!
155 return eval(s, {}, {})
158 except SyntaxError, e:
159 raise ValueError("not a number")
161 raise ValueError("not a number")
163 # ============================================================
164 # locally defined exceptions
165 class CommandError (exceptions.Exception):
166 def __init__(self, cmd_name, cmd_err, rc=None):
167 self.cmd_name = cmd_name
168 self.cmd_err = cmd_err
173 if type(self.cmd_err) == types.StringType:
175 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
177 print "! %s: %s" % (self.cmd_name, self.cmd_err)
178 elif type(self.cmd_err) == types.ListType:
180 print "! %s (error %d):" % (self.cmd_name, self.rc)
182 print "! %s:" % (self.cmd_name)
183 for s in self.cmd_err:
184 print "> %s" %(string.strip(s))
189 # ============================================================
190 # handle daemons, like the acceptor
192 """ Manage starting and stopping a daemon. Assumes daemon manages
193 it's own pid file. """
195 def __init__(self, cmd):
201 log(self.command, "already running.")
203 self.path = find_prog(self.command)
205 panic(self.command, "not found.")
206 ret, out = runcmd(self.path +' '+ self.command_line())
208 raise CommandError(self.path, out, ret)
212 pid = self.read_pidfile()
214 log ("killing process", pid)
216 #time.sleep(1) # let daemon die
218 log("unable to kill", self.command, e)
220 log("unable to kill", self.command)
223 pid = self.read_pidfile()
233 def read_pidfile(self):
235 fp = open(self.pidfile(), 'r')
242 def clean_pidfile(self):
243 """ Remove a stale pidfile """
244 log("removing stale pidfile:", self.pidfile())
246 os.unlink(self.pidfile())
248 log(self.pidfile(), e)
250 class AcceptorHandler(DaemonHandler):
251 def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
252 DaemonHandler.__init__(self, "acceptor")
255 self.send_mem = send_mem
256 self.recv_mem = recv_mem
259 self.flags = self.flags + ' -i'
262 return "/var/run/%s-%d.pid" % (self.command, self.port)
def command_line(self):
    """Build the acceptor's command-line argument string.

    Joins send/receive buffer sizes, extra flags, and the port into a
    single space-separated string.
    """
    parts = ('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)
    return ' '.join([str(p) for p in parts])
269 # start the acceptors
271 if config.lctl_dump or config.record:
273 for port in acceptors.keys():
274 daemon = acceptors[port]
275 if not daemon.running():
278 def run_one_acceptor(port):
279 if config.lctl_dump or config.record:
281 if acceptors.has_key(port):
282 daemon = acceptors[port]
283 if not daemon.running():
286 panic("run_one_acceptor: No acceptor defined for port:", port)
288 def stop_acceptor(port):
289 if acceptors.has_key(port):
290 daemon = acceptors[port]
295 # ============================================================
296 # handle lctl interface
299 Manage communication with lctl
302 def __init__(self, cmd):
304 Initialize close by finding the lctl binary.
306 self.lctl = find_prog(cmd)
308 self.record_device = ''
311 debug('! lctl not found')
314 raise CommandError('lctl', "unable to find lctl binary.")
def use_save_file(self, file):
    """Remember the file that lctl commands should be dumped into."""
    self.save_file = file
def record(self, dev_name, logname):
    """Start recording subsequent commands to log *logname* on *dev_name*."""
    log("Recording log", logname, "on", dev_name)
    self.record_device = dev_name
    self.record_log = logname
def end_record(self):
    """Stop recording: announce it and clear the device/log bookkeeping."""
    log("End recording log", self.record_log, "on", self.record_device)
    self.record_device = None
    self.record_log = None
def set_nonblock(self, fd):
    """Put file descriptor *fd* into non-blocking mode."""
    current = fcntl.fcntl(fd, F_GETFL)
    fcntl.fcntl(fd, F_SETFL, current | os.O_NDELAY)
336 the cmds are written to stdin of lctl
337 lctl doesn't return errors when run in script mode, so
339 should modify command line to accept multiple commands, or
340 create complex command line options
344 cmds = '\n dump ' + self.save_file + '\n' + cmds
345 elif self.record_device:
349 %s""" % (self.record_device, self.record_log, cmds)
351 debug("+", cmd_line, cmds)
352 if config.noexec: return (0, [])
354 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
355 child.tochild.write(cmds + "\n")
356 child.tochild.close()
358 # From "Python Cookbook" from O'Reilly
359 outfile = child.fromchild
360 outfd = outfile.fileno()
361 self.set_nonblock(outfd)
362 errfile = child.childerr
363 errfd = errfile.fileno()
364 self.set_nonblock(errfd)
366 outdata = errdata = ''
369 ready = select.select([outfd,errfd],[],[]) # Wait for input
370 if outfd in ready[0]:
371 outchunk = outfile.read()
372 if outchunk == '': outeof = 1
373 outdata = outdata + outchunk
374 if errfd in ready[0]:
375 errchunk = errfile.read()
376 if errchunk == '': erreof = 1
377 errdata = errdata + errchunk
378 if outeof and erreof: break
379 # end of "borrowed" code
382 if os.WIFEXITED(ret):
383 rc = os.WEXITSTATUS(ret)
386 if rc or len(errdata):
387 raise CommandError(self.lctl, errdata, rc)
390 def runcmd(self, *args):
392 run lctl using the command line
394 cmd = string.join(map(str,args))
395 debug("+", self.lctl, cmd)
396 rc, out = run(self.lctl, cmd)
398 raise CommandError(self.lctl, out, rc)
402 def network(self, net, nid):
407 quit """ % (net, nid)
410 # create a new connection
411 def add_uuid(self, net_type, uuid, nid):
412 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
415 def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
417 if net_type in ('tcp',) and not config.lctl_dump:
422 add_autoconn %s %s %d %s
426 nid, hostaddr, port, flags )
429 def connect(self, srv):
430 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
431 if srv.net_type in ('tcp',) and not config.lctl_dump:
435 self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
436 srv.nid, srv.hostaddr, srv.port, flags)
439 def recover(self, dev_name, new_conn):
442 recover %s""" %(dev_name, new_conn)
445 # add a route to a range
446 def add_route(self, net, gw, lo, hi):
454 except CommandError, e:
458 def del_route(self, net, gw, lo, hi):
463 quit """ % (net, gw, lo, hi)
466 # add a route to a host
467 def add_route_host(self, net, uuid, gw, tgt):
468 self.add_uuid(net, uuid, tgt)
476 except CommandError, e:
480 # add a route to a range
481 def del_route_host(self, net, uuid, gw, tgt):
487 quit """ % (net, gw, tgt)
491 def del_autoconn(self, net_type, nid, hostaddr):
492 if net_type in ('tcp',) and not config.lctl_dump:
501 # disconnect one connection
502 def disconnect(self, srv):
503 self.del_uuid(srv.nid_uuid)
504 if srv.net_type in ('tcp',) and not config.lctl_dump:
505 self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
507 def del_uuid(self, uuid):
515 def disconnectAll(self, net):
523 def attach(self, type, name, uuid):
526 quit""" % (type, name, uuid)
529 def setup(self, name, setup = ""):
533 quit""" % (name, setup)
537 # create a new device with lctl
538 def newdev(self, type, name, uuid, setup = ""):
539 self.attach(type, name, uuid);
541 self.setup(name, setup)
542 except CommandError, e:
543 self.cleanup(name, uuid, 0)
548 def cleanup(self, name, uuid, force, failover = 0):
549 if failover: force = 1
555 quit""" % (name, ('', 'force')[force],
556 ('', 'failover')[failover])
560 def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
561 stripe_sz, stripe_off,
565 lov_setup %s %d %d %d %s %s
566 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
571 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
575 lov_setconfig %s %d %d %d %s %s
576 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
580 def dump(self, dump_file):
583 quit""" % (dump_file)
586 # get list of devices
587 def device_list(self):
588 devices = '/proc/fs/lustre/devices'
590 if os.access(devices, os.R_OK):
592 fp = open(devices, 'r')
600 def lustre_version(self):
601 rc, out = self.runcmd('version')
605 def mount_option(self, profile, osc, mdc):
607 mount_option %s %s %s
608 quit""" % (profile, osc, mdc)
611 # delete mount options
612 def del_mount_option(self, profile):
618 def set_timeout(self, timeout):
624 # delete mount options
625 def set_lustre_upcall(self, upcall):
630 # ============================================================
631 # Various system-level functions
632 # (ideally moved to their own module)
634 # Run a command and return the output and status.
635 # stderr is sent to /dev/null, could use popen3 to
636 # save it if necessary
639 if config.noexec: return (0, [])
640 f = os.popen(cmd + ' 2>&1')
650 cmd = string.join(map(str,args))
653 # Run a command in the background.
654 def run_daemon(*args):
655 cmd = string.join(map(str,args))
657 if config.noexec: return 0
658 f = os.popen(cmd + ' 2>&1')
666 # Determine full path to use for an external command
667 # searches dirname(argv[0]) first, then PATH
669 syspath = string.split(os.environ['PATH'], ':')
670 cmdpath = os.path.dirname(sys.argv[0])
671 syspath.insert(0, cmdpath);
673 syspath.insert(0, os.path.join(config.portals, 'utils/'))
675 prog = os.path.join(d,cmd)
676 if os.access(prog, os.X_OK):
680 # Recursively look for file starting at base dir
681 def do_find_file(base, mod):
682 fullname = os.path.join(base, mod)
683 if os.access(fullname, os.R_OK):
685 for d in os.listdir(base):
686 dir = os.path.join(base,d)
687 if os.path.isdir(dir):
688 module = do_find_file(dir, mod)
692 def find_module(src_dir, dev_dir, modname):
693 mod = '%s.o' % (modname)
694 module = src_dir +'/'+ dev_dir +'/'+ mod
696 if os.access(module, os.R_OK):
702 # is the path a block device?
709 return stat.S_ISBLK(s[stat.ST_MODE])
711 # build fs according to type
713 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
719 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
721 # devsize is in 1k, and fs block count is in 4k
722 block_cnt = devsize/4
724 if fstype in ('ext3', 'extN'):
725 # ext3 journal size is in megabytes
728 if not is_block(dev):
729 ret, out = runcmd("ls -l %s" %dev)
730 devsize = int(string.split(out[0])[4]) / 1024
732 ret, out = runcmd("sfdisk -s %s" %dev)
733 devsize = int(out[0])
734 if devsize > 1024 * 1024:
735 jsize = ((devsize / 102400) * 4)
738 if jsize: jopt = "-J size=%d" %(jsize,)
739 if isize: iopt = "-I %d" %(isize,)
740 mkfs = 'mkfs.ext2 -j -b 4096 '
741 if not isblock or config.force:
743 elif fstype == 'reiserfs':
744 # reiserfs journal size is in blocks
745 if jsize: jopt = "--journal_size %d" %(jsize,)
746 mkfs = 'mkreiserfs -ff'
748 panic('unsupported fs type: ', fstype)
750 if config.mkfsoptions != None:
751 mkfs = mkfs + ' ' + config.mkfsoptions
752 if mkfsoptions != None:
753 mkfs = mkfs + ' ' + mkfsoptions
754 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
756 panic("Unable to build fs:", dev, string.join(out))
757 # enable hash tree indexing on fsswe
758 if fstype in ('ext3', 'extN'):
759 htree = 'echo "feature FEATURE_C5" | debugfs -w'
760 (ret, out) = run (htree, dev)
762 panic("Unable to enable htree:", dev)
764 # some systems use /dev/loopN, some /dev/loop/N
768 if not os.access(loop + str(0), os.R_OK):
770 if not os.access(loop + str(0), os.R_OK):
771 panic ("can't access loop devices")
774 # find loop device assigned to thefile
777 for n in xrange(0, MAX_LOOP_DEVICES):
779 if os.access(dev, os.R_OK):
780 (stat, out) = run('losetup', dev)
781 if out and stat == 0:
782 m = re.search(r'\((.*)\)', out[0])
783 if m and file == m.group(1):
789 # create file if necessary and assign the first free loop device
790 def init_loop(file, size, fstype, journal_size, inode_size, mkfsoptions, reformat):
791 dev = find_loop(file)
793 print 'WARNING file:', file, 'already mapped to', dev
795 if reformat or not os.access(file, os.R_OK | os.W_OK):
797 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
798 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
801 panic("Unable to create backing store:", file)
802 mkfs(file, size, fstype, journal_size, inode_size, mkfsoptions, isblock=0)
805 # find next free loop
806 for n in xrange(0, MAX_LOOP_DEVICES):
808 if os.access(dev, os.R_OK):
809 (stat, out) = run('losetup', dev)
811 run('losetup', dev, file)
814 print "out of loop devices"
816 print "out of loop devices"
819 # undo loop assignment
820 def clean_loop(file):
821 dev = find_loop(file)
823 ret, out = run('losetup -d', dev)
825 log('unable to clean loop device:', dev, 'for file:', file)
828 # determine if dev is formatted as a <fstype> filesystem
829 def need_format(fstype, dev):
830 # FIXME don't know how to implement this
833 # initialize a block device if needed
834 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
835 inode_size, mkfsoptions):
836 if config.noexec: return dev
837 if not is_block(dev):
838 dev = init_loop(dev, size, fstype, journal_size, inode_size,
839 mkfsoptions, reformat)
840 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
841 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
844 # panic("device:", dev,
845 # "not prepared, and autoformat is not set.\n",
846 # "Rerun with --reformat option to format ALL filesystems")
851 """lookup IP address for an interface"""
852 rc, out = run("/sbin/ifconfig", iface)
855 addr = string.split(out[1])[1]
856 ip = string.split(addr, ':')[1]
859 def sys_get_elan_position_file():
860 procfiles = ["/proc/elan/device0/position",
861 "/proc/qsnet/elan4/device0/position",
862 "/proc/qsnet/elan3/device0/position"]
864 if os.access(p, os.R_OK):
868 def sys_get_local_nid(net_type, wildcard, cluster_id):
869 """Return the local nid."""
871 if sys_get_elan_position_file():
872 local = sys_get_local_address('elan', '*', cluster_id)
874 local = sys_get_local_address(net_type, wildcard, cluster_id)
877 def sys_get_local_address(net_type, wildcard, cluster_id):
878 """Return the local address for the network type."""
880 if net_type in ('tcp',):
882 iface, star = string.split(wildcard, ':')
883 local = if2addr(iface)
885 panic ("unable to determine ip for:", wildcard)
887 host = socket.gethostname()
888 local = socket.gethostbyname(host)
889 elif net_type == 'elan':
890 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
891 f = sys_get_elan_position_file()
893 panic ("unable to determine local Elan ID")
896 lines = fp.readlines()
904 nid = my_int(cluster_id) + my_int(elan_id)
906 except ValueError, e:
910 elif net_type == 'gm':
911 fixme("automatic local address for GM")
912 elif net_type == 'scimac':
913 scinode="/opt/scali/sbin/scinode"
914 if os.path.exists(scinode):
915 (rc,local) = run(scinode)
917 panic (scinode, " not found on node with scimac networking")
919 panic (scinode, " failed")
920 local=string.rstrip(local[0])
924 def mod_loaded(modname):
925 """Check if a module is already loaded. Look in /proc/modules for it."""
927 fp = open('/proc/modules')
928 lines = fp.readlines()
930 # please forgive my tired fingers for this one
931 ret = filter(lambda word, mod=modname: word == mod,
932 map(lambda line: string.split(line)[0], lines))
937 # XXX: instead of device_list, ask for $name and see what we get
938 def is_prepared(name):
939 """Return true if a device exists for the name"""
942 if (config.noexec or config.record) and config.cleanup:
945 # expect this format:
946 # 1 UP ldlm ldlm ldlm_UUID 2
947 out = lctl.device_list()
949 if name == string.split(s)[3]:
951 except CommandError, e:
955 def is_network_prepared():
956 """If the any device exists, then assume that all networking
957 has been configured"""
958 out = lctl.device_list()
961 def fs_is_mounted(path):
962 """Return true if path is a mounted lustre filesystem"""
964 fp = open('/proc/mounts')
965 lines = fp.readlines()
969 if a[1] == path and a[2] == 'lustre_lite':
977 """Manage kernel modules"""
def __init__(self, lustre_dir, portals_dir):
    """Remember the two module source trees and start an empty load list."""
    self.kmodule_list = []
    self.lustre_dir = lustre_dir
    self.portals_dir = portals_dir
def add_portals_module(self, dev_dir, modname):
    """Queue a portals-tree module for loading in order."""
    entry = (self.portals_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
def add_lustre_module(self, dev_dir, modname):
    """Queue a lustre-tree module for loading in order."""
    entry = (self.lustre_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
991 def load_module(self):
992 """Load all the modules in the list in the order they appear."""
993 for src_dir, dev_dir, mod in self.kmodule_list:
994 if mod_loaded(mod) and not config.noexec:
996 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
998 module = find_module(src_dir, dev_dir, mod)
1000 panic('module not found:', mod)
1001 (rc, out) = run('/sbin/insmod', module)
1003 raise CommandError('insmod', out, rc)
1005 (rc, out) = run('/sbin/modprobe', mod)
1007 raise CommandError('modprobe', out, rc)
1009 def cleanup_module(self):
1010 """Unload the modules in the list in reverse order."""
1011 rev = self.kmodule_list
1013 for src_dir, dev_dir, mod in rev:
1014 if not mod_loaded(mod) and not config.noexec:
1017 if mod == 'portals' and config.dump:
1018 lctl.dump(config.dump)
1019 log('unloading module:', mod)
1020 (rc, out) = run('/sbin/rmmod', mod)
1022 log('! unable to unload module:', mod)
1025 # ============================================================
1026 # Classes to prepare and cleanup the various objects
1029 """ Base class for the rest of the modules. The default cleanup method is
1030 defined here, as well as some utilitiy funcs.
1032 def __init__(self, module_name, db):
1034 self.module_name = module_name
1035 self.name = self.db.getName()
1036 self.uuid = self.db.getUUID()
1039 self.kmod = kmod(config.lustre, config.portals)
def info(self, *args):
    """Print an identifying line for this module followed by *args*."""
    detail = ' '.join([str(a) for a in args])
    print("%s: %s %s %s" % (self.module_name, self.name, self.uuid, detail))
1046 """ default cleanup, used for most modules """
1049 lctl.cleanup(self.name, self.uuid, config.force)
1050 except CommandError, e:
1051 log(self.module_name, "cleanup failed: ", self.name)
def add_portals_module(self, dev_dir, modname):
    """Delegate queuing of a portals module to this module's kmod list."""
    self.kmod.add_portals_module(dev_dir, modname)
def add_lustre_module(self, dev_dir, modname):
    """Delegate queuing of a lustre module to this module's kmod list."""
    self.kmod.add_lustre_module(dev_dir, modname)
def load_module(self):
    """Load every queued kernel module, in the order they were added."""
    self.kmod.load_module()
def cleanup_module(self):
    """Unload the queued modules (reverse order), but only when safe."""
    if not self.safe_to_clean():
        return
    self.kmod.cleanup_module()
1072 def safe_to_clean(self):
def safe_to_clean_modules(self):
    """Module unload safety defaults to the device-cleanup answer."""
    return self.safe_to_clean()
1078 class Network(Module):
1079 def __init__(self,db):
1080 Module.__init__(self, 'NETWORK', db)
1081 self.net_type = self.db.get_val('nettype')
1082 self.nid = self.db.get_val('nid', '*')
1083 self.cluster_id = self.db.get_val('clusterid', "0")
1084 self.port = self.db.get_val_int('port', 0)
1085 self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
1086 self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
1087 self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
1090 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1092 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1093 self.generic_nid = 1
1094 debug("nid:", self.nid)
1096 self.generic_nid = 0
1098 self.nid_uuid = self.nid_to_uuid(self.nid)
1100 self.hostaddr = self.db.get_val('hostaddr', self.nid)
1101 if '*' in self.hostaddr:
1102 self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
1103 if not self.hostaddr:
1104 panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
1105 debug("hostaddr:", self.hostaddr)
1107 self.add_portals_module("libcfs", 'portals')
1108 if node_needs_router():
1109 self.add_portals_module("router", 'kptlrouter')
1110 if self.net_type == 'tcp':
1111 self.add_portals_module("knals/socknal", 'ksocknal')
1112 if self.net_type == 'elan':
1113 self.add_portals_module("knals/qswnal", 'kqswnal')
1114 if self.net_type == 'gm':
1115 self.add_portals_module("knals/gmnal", 'kgmnal')
1116 if self.net_type == 'scimac':
1117 self.add_portals_module("knals/scimacnal", 'kscimacnal')
def nid_to_uuid(self, nid):
    """Return the canonical UUID string used to identify *nid*."""
    return "NID_%s_UUID" % (nid,)
1123 if is_network_prepared():
1125 self.info(self.net_type, self.nid, self.port)
1126 if not (config.record and self.generic_nid):
1127 lctl.network(self.net_type, self.nid)
1128 if self.net_type == 'tcp':
1130 if self.net_type == 'elan':
1132 if self.port and node_is_router():
1133 run_one_acceptor(self.port)
1134 self.connect_peer_gateways()
1136 def connect_peer_gateways(self):
1137 for router in self.db.lookup_class('node'):
1138 if router.get_val_int('router', 0):
1139 for netuuid in router.get_networks():
1140 net = self.db.lookup(netuuid)
1142 if (gw.cluster_id == self.cluster_id and
1143 gw.net_type == self.net_type):
1144 if gw.nid != self.nid:
1147 def disconnect_peer_gateways(self):
1148 for router in self.db.lookup_class('node'):
1149 if router.get_val_int('router', 0):
1150 for netuuid in router.get_networks():
1151 net = self.db.lookup(netuuid)
1153 if (gw.cluster_id == self.cluster_id and
1154 gw.net_type == self.net_type):
1155 if gw.nid != self.nid:
1158 except CommandError, e:
1159 print "disconnect failed: ", self.name
def safe_to_clean(self):
    """The network may be cleaned only when no devices are configured."""
    return not is_network_prepared()
1167 self.info(self.net_type, self.nid, self.port)
1169 stop_acceptor(self.port)
1170 if node_is_router():
1171 self.disconnect_peer_gateways()
1173 class RouteTable(Module):
1174 def __init__(self,db):
1175 Module.__init__(self, 'ROUTES', db)
1177 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1179 # only setup connections for tcp NALs
1181 if not net_type in ('tcp',):
1184 # connect to target if route is to single node and this node is the gw
1185 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1186 if not local_cluster(net_type, tgt_cluster_id):
1187 panic("target", lo, " not on the local cluster")
1188 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1189 # connect to gateway if this node is not the gw
1190 elif (local_cluster(net_type, gw_cluster_id)
1191 and not local_interface(net_type, gw_cluster_id, gw)):
1192 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1197 panic("no server for nid", lo)
1200 return Network(srvdb)
1203 if is_network_prepared():
1206 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1207 lctl.add_route(net_type, gw, lo, hi)
1208 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
def safe_to_clean(self):
    """Routes may be cleaned only when no devices are configured."""
    return not is_network_prepared()
1216 if is_network_prepared():
1217 # the network is still being used, don't clean it up
1219 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1220 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1223 lctl.disconnect(srv)
1224 except CommandError, e:
1225 print "disconnect failed: ", self.name
1230 lctl.del_route(net_type, gw, lo, hi)
1231 except CommandError, e:
1232 print "del_route failed: ", self.name
1236 class Management(Module):
def __init__(self, db):
    """Register the kernel modules the management service depends on."""
    Module.__init__(self, 'MGMT', db)
    for dev_dir, mod in (('lvfs', 'lvfs'),
                         ('obdclass', 'obdclass'),
                         ('ptlrpc', 'ptlrpc'),
                         ('mgmt', 'mgmt_svc')):
        self.add_lustre_module(dev_dir, mod)
1245 if is_prepared(self.name):
1248 lctl.newdev("mgmt", self.name, self.uuid)
1250 def safe_to_clean(self):
1254 if is_prepared(self.name):
1255 Module.cleanup(self)
1257 # This is only needed to load the modules; the LDLM device
1258 # is now created automatically.
def __init__(self, db):
    """Queue the base modules; the LDLM device itself is auto-created."""
    Module.__init__(self, 'LDLM', db)
    for mod in ('lvfs', 'obdclass', 'ptlrpc'):
        self.add_lustre_module(mod, mod)
1273 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1274 Module.__init__(self, 'LOV', db)
1275 if name_override != None:
1276 self.name = "lov_%s" % name_override
1277 self.add_lustre_module('lov', 'lov')
1278 self.mds_uuid = self.db.get_first_ref('mds')
1279 self.stripe_sz = self.db.get_val_int('stripesize', 65536)
1280 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1281 self.pattern = self.db.get_val_int('stripepattern', 0)
1282 self.devlist = self.db.get_refs('obd')
1283 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1285 self.desc_uuid = self.uuid
1286 self.uuid = generate_client_uuid(self.name)
1287 self.fs_name = fs_name
1289 self.config_only = 1
1291 self.config_only = None
1292 mds= self.db.lookup(self.mds_uuid)
1293 self.mds_name = mds.getName()
1294 for obd_uuid in self.devlist:
1295 obd = self.db.lookup(obd_uuid)
1296 osc = get_osc(obd, self.uuid, fs_name)
1298 self.osclist.append(osc)
1300 panic('osc not found:', obd_uuid)
1303 if is_prepared(self.name):
1305 if self.config_only:
1306 panic("Can't prepare config_only LOV ", self.name)
1308 for osc in self.osclist:
1310 # Only ignore connect failures with --force, which
1311 # isn't implemented here yet.
1312 osc.prepare(ignore_connect_failure=0)
1313 except CommandError, e:
1314 print "Error preparing OSC %s\n" % osc.uuid
1316 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1317 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1318 lctl.lov_setup(self.name, self.uuid,
1319 self.desc_uuid, self.mds_name, self.stripe_cnt,
1320 self.stripe_sz, self.stripe_off, self.pattern,
1321 string.join(self.devlist))
1324 if is_prepared(self.name):
1325 Module.cleanup(self)
1326 if self.config_only:
1327 panic("Can't clean up config_only LOV ", self.name)
1328 for osc in self.osclist:
1331 def load_module(self):
1332 if self.config_only:
1333 panic("Can't load modules for config_only LOV ", self.name)
1334 for osc in self.osclist:
1337 Module.load_module(self)
1339 def cleanup_module(self):
1340 if self.config_only:
1341 panic("Can't cleanup modules for config_only LOV ", self.name)
1342 Module.cleanup_module(self)
1343 for osc in self.osclist:
1344 osc.cleanup_module()
1347 class MDSDEV(Module):
1348 def __init__(self,db):
1349 Module.__init__(self, 'MDSDEV', db)
1350 self.devpath = self.db.get_val('devpath','')
1351 self.size = self.db.get_val_int('devsize', 0)
1352 self.journal_size = self.db.get_val_int('journalsize', 0)
1353 self.fstype = self.db.get_val('fstype', '')
1354 self.nspath = self.db.get_val('nspath', '')
1355 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1356 # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
1357 target_uuid = self.db.get_first_ref('target')
1358 mds = self.db.lookup(target_uuid)
1359 self.name = mds.getName()
1360 self.filesystem_uuids = mds.get_refs('filesystem')
1361 # FIXME: if fstype not set, then determine based on kernel version
1362 self.format = self.db.get_val('autoformat', "no")
1363 if mds.get_val('failover', 0):
1364 self.failover_mds = 'f'
1366 self.failover_mds = 'n'
1367 active_uuid = get_active_target(mds)
1369 panic("No target device found:", target_uuid)
1370 if active_uuid == self.uuid:
1374 if self.active and config.group and config.group != ost.get_val('group'):
1377 self.inode_size = self.db.get_val_int('inodesize', 0)
1378 if self.inode_size == 0:
1379 # find the LOV for this MDS
1380 lovconfig_uuid = mds.get_first_ref('lovconfig')
1381 if not lovconfig_uuid:
1382 panic("No LOV config found for MDS ", mds.name)
1383 lovconfig = mds.lookup(lovconfig_uuid)
1384 lov_uuid = lovconfig.get_first_ref('lov')
1386 panic("No LOV found for lovconfig ", lovconfig.name)
1387 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
1389 # default stripe count controls default inode_size
1390 stripe_count = lov.stripe_cnt
1391 if stripe_count > 77:
1392 self.inode_size = 4096
1393 elif stripe_count > 35:
1394 self.inode_size = 2048
1395 elif stripe_count > 13:
1396 self.inode_size = 1024
1397 elif stripe_count > 3:
1398 self.inode_size = 512
1400 self.inode_size = 256
1402 self.target_dev_uuid = self.uuid
1403 self.uuid = target_uuid
1405 self.add_lustre_module('mdc', 'mdc')
1406 self.add_lustre_module('osc', 'osc')
1407 self.add_lustre_module('lov', 'lov')
1408 self.add_lustre_module('mds', 'mds')
1410 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
1412 def load_module(self):
1414 Module.load_module(self)
1417 if is_prepared(self.name):
1420 debug(self.uuid, "not active")
1423 # run write_conf automatically, if --reformat used
1425 self.info(self.devpath, self.fstype, self.size, self.format)
1427 # never reformat here
1428 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1429 self.format, self.journal_size, self.inode_size,
1431 if not is_prepared('MDT'):
1432 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1434 lctl.newdev("mds", self.name, self.uuid,
1435 setup ="%s %s %s" %(blkdev, self.fstype, self.name))
1436 except CommandError, e:
1438 panic("MDS is missing the config log. Need to run " +
1439 "lconf --write_conf.")
1443 def write_conf(self):
1444 if is_prepared(self.name):
1446 self.info(self.devpath, self.fstype, self.format)
1447 blkdev = block_dev(self.devpath, self.size, self.fstype,
1448 config.reformat, self.format, self.journal_size,
1449 self.inode_size, self.mkfsoptions)
1450 lctl.newdev("mds", self.name, self.uuid,
1451 setup ="%s %s" %(blkdev, self.fstype))
1453 # record logs for the MDS lov
1454 for uuid in self.filesystem_uuids:
1455 log("recording clients for filesystem:", uuid)
1456 fs = self.db.lookup(uuid)
1457 obd_uuid = fs.get_first_ref('obd')
1458 client_uuid = generate_client_uuid(self.name)
1459 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
1462 lctl.record(self.name, self.name)
1464 lctl.mount_option(self.name, client.get_name(), "")
1468 lctl.record(self.name, self.name + '-clean')
1470 lctl.del_mount_option(self.name)
1475 # record logs for each client
1477 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1479 config_options = CONFIG_FILE
1481 for node_db in self.db.lookup_class('node'):
1482 client_name = node_db.getName()
1483 for prof_uuid in node_db.get_refs('profile'):
1484 prof_db = node_db.lookup(prof_uuid)
1485 # refactor this into a function to test "clientness"
1487 for ref_class, ref_uuid in prof_db.get_all_refs():
1488 if ref_class in ('mountpoint','echoclient'):
1489 debug("recording", client_name)
1490 old_noexec = config.noexec
1492 noexec_opt = ('', '-n')
1493 ret, out = run (sys.argv[0],
1494 noexec_opt[old_noexec == 1],
1495 " -v --record --nomod",
1496 "--record_log", client_name,
1497 "--record_device", self.name,
1498 "--node", client_name,
1501 for s in out: log("record> ", string.strip(s))
1502 ret, out = run (sys.argv[0],
1503 noexec_opt[old_noexec == 1],
1504 "--cleanup -v --record --nomod",
1505 "--record_log", client_name + "-clean",
1506 "--record_device", self.name,
1507 "--node", client_name,
1510 for s in out: log("record> ", string.strip(s))
1511 config.noexec = old_noexec
1513 lctl.cleanup(self.name, self.uuid, 0, 0)
1514 except CommandError, e:
1515 log(self.module_name, "cleanup failed: ", self.name)
1518 Module.cleanup(self)
1519 clean_loop(self.devpath)
1521 def msd_remaining(self):
1522 out = lctl.device_list()
1524 if string.split(s)[2] in ('mds',):
1527 def safe_to_clean(self):
1530 def safe_to_clean_modules(self):
1531 return not self.msd_remaining()
1535 debug(self.uuid, "not active")
1538 if is_prepared(self.name):
1540 lctl.cleanup(self.name, self.uuid, config.force,
1542 except CommandError, e:
1543 log(self.module_name, "cleanup failed: ", self.name)
1546 Module.cleanup(self)
1547 if not self.msd_remaining() and is_prepared('MDT'):
1549 lctl.cleanup("MDT", "MDT_UUID", config.force,
1551 except CommandError, e:
1552 print "cleanup failed: ", self.name
1555 clean_loop(self.devpath)
1558 def __init__(self, db):
1559 Module.__init__(self, 'OSD', db)
1560 self.osdtype = self.db.get_val('osdtype')
1561 self.devpath = self.db.get_val('devpath', '')
1562 self.size = self.db.get_val_int('devsize', 0)
1563 self.journal_size = self.db.get_val_int('journalsize', 0)
1564 self.inode_size = self.db.get_val_int('inodesize', 0)
1565 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1566 self.fstype = self.db.get_val('fstype', '')
1567 self.nspath = self.db.get_val('nspath', '')
1568 target_uuid = self.db.get_first_ref('target')
1569 ost = self.db.lookup(target_uuid)
1570 self.name = ost.getName()
1571 self.format = self.db.get_val('autoformat', 'yes')
1572 if ost.get_val('failover', 0):
1573 self.failover_ost = 'f'
1575 self.failover_ost = 'n'
1577 active_uuid = get_active_target(ost)
1579 panic("No target device found:", target_uuid)
1580 if active_uuid == self.uuid:
1584 if self.active and config.group and config.group != ost.get_val('group'):
1587 self.target_dev_uuid = self.uuid
1588 self.uuid = target_uuid
1590 self.add_lustre_module('ost', 'ost')
1591 # FIXME: should we default to ext3 here?
1593 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1594 self.add_lustre_module(self.osdtype, self.osdtype)
1596 def load_module(self):
1598 Module.load_module(self)
1600 # need to check /proc/mounts and /etc/mtab before
1601 # formatting anything.
1602 # FIXME: check if device is already formatted.
1604 if is_prepared(self.name):
1607 debug(self.uuid, "not active")
1609 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1610 self.format, self.journal_size, self.inode_size)
1612 if self.osdtype == 'obdecho':
1615 blkdev = block_dev(self.devpath, self.size, self.fstype,
1616 config.reformat, self.format, self.journal_size,
1617 self.inode_size, self.mkfsoptions)
1618 lctl.newdev(self.osdtype, self.name, self.uuid,
1619 setup ="%s %s %s" %(blkdev, self.fstype,
1621 if not is_prepared('OSS'):
1622 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
1624 def osd_remaining(self):
1625 out = lctl.device_list()
1627 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1630 def safe_to_clean(self):
1633 def safe_to_clean_modules(self):
1634 return not self.osd_remaining()
1638 debug(self.uuid, "not active")
1640 if is_prepared(self.name):
1643 lctl.cleanup(self.name, self.uuid, config.force,
1645 except CommandError, e:
1646 log(self.module_name, "cleanup failed: ", self.name)
1649 if not self.osd_remaining() and is_prepared('OSS'):
1651 lctl.cleanup("OSS", "OSS_UUID", config.force,
1653 except CommandError, e:
1654 print "cleanup failed: ", self.name
1657 if not self.osdtype == 'obdecho':
1658 clean_loop(self.devpath)
1660 def mgmt_uuid_for_fs(mtpt_name):
1663 mtpt_db = toplevel.lookup_name(mtpt_name)
1664 fs_uuid = mtpt_db.get_first_ref('filesystem')
1665 fs = toplevel.lookup(fs_uuid)
1668 return fs.get_first_ref('mgmt')
1670 # Generic client module, used by OSC and MDC
1671 class Client(Module):
1672 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
1674 self.target_name = tgtdb.getName()
1675 self.target_uuid = tgtdb.getUUID()
1678 self.tgt_dev_uuid = get_active_target(tgtdb)
1679 if not self.tgt_dev_uuid:
1680 panic("No target device found for target:", self.target_name)
1682 self.kmod = kmod(config.lustre, config.portals)
1686 self.module = module
1687 self.module_name = string.upper(module)
1689 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
1690 self.target_name, fs_name)
1692 self.name = self_name
1694 self.lookup_server(self.tgt_dev_uuid)
1695 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
1697 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
1700 self.fs_name = fs_name
1703 self.add_lustre_module(module_dir, module)
def lookup_server(self, srv_uuid):
    """ Lookup a server's network information """
    self._server_nets = get_ost_net(self.db, srv_uuid)
    # An empty list means the config contains no usable network entry
    # for this server, which is fatal for a client.
    if not self._server_nets:
        panic("Unable to find a server for:", srv_uuid)
# Return the server network information gathered by lookup_server().
1711 def get_servers(self):
1712 return self._server_nets
1714 def prepare(self, ignore_connect_failure = 0):
1715 self.info(self.target_uuid)
1716 if is_prepared(self.name):
1719 srv = choose_local_server(self.get_servers())
1723 routes = find_route(self.get_servers())
1724 if len(routes) == 0:
1725 panic ("no route to", self.target_uuid)
1726 for (srv, r) in routes:
1727 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
1728 except CommandError, e:
1729 if not ignore_connect_failure:
1732 if self.target_uuid in config.inactive and self.permits_inactive():
1733 debug("%s inactive" % self.target_uuid)
1734 inactive_p = "inactive"
1736 debug("%s active" % self.target_uuid)
1738 lctl.newdev(self.module, self.name, self.uuid,
1739 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
1740 inactive_p, self.mgmt_name))
1743 if is_prepared(self.name):
1744 Module.cleanup(self)
1746 srv = choose_local_server(self.get_servers())
1748 lctl.disconnect(srv)
1750 for (srv, r) in find_route(self.get_servers()):
1751 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
1752 except CommandError, e:
1753 log(self.module_name, "cleanup failed: ", self.name)
1759 def __init__(self, db, uuid, fs_name):
1760 Client.__init__(self, db, uuid, 'mdc', fs_name)
1762 def permits_inactive(self):
1766 def __init__(self, db, uuid, fs_name):
1767 Client.__init__(self, db, uuid, 'osc', fs_name)
1769 def permits_inactive(self):
def mgmtcli_name_for_uuid(uuid):
    """Derive the management-client device name for a mgmt service uuid."""
    prefix = 'MGMTCLI_'
    return prefix + ('%s' % uuid)
class ManagementClient(Client):
    """Client attached to the management service.

    The device name is not host/fs derived like other clients; it is
    computed from the mgmt service UUID via mgmtcli_name_for_uuid().
    """
    def __init__(self, db, uuid):
        self_name = mgmtcli_name_for_uuid(db.getUUID())
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = self_name,
                        module_dir = 'mgmt')
def __init__(self, db):
    """Record the refs needed to set up a cache OBD ('realobd' and
    'cacheobd') and queue the cobd kernel module for loading.
    The UUID refs are consumed later, at prepare() time."""
    Module.__init__(self, 'COBD', db)
    self.add_lustre_module('cobd', 'cobd')
    self.real_uuid = self.db.get_first_ref('realobd')
    self.cache_uuid = self.db.get_first_ref('cacheobd')
1788 # need to check /proc/mounts and /etc/mtab before
1789 # formatting anything.
1790 # FIXME: check if device is already formatted.
1792 if is_prepared(self.name):
1794 self.info(self.real_uuid, self.cache_uuid)
1795 lctl.newdev("cobd", self.name, self.uuid,
1796 setup ="%s %s" %(self.real_uuid, self.cache_uuid))
1799 # virtual interface for OSC and LOV
1801 def __init__(self, db, uuid, fs_name, name_override = None):
1802 Module.__init__(self, 'VOSC', db)
1803 if db.get_class() == 'lov':
1804 self.osc = LOV(db, uuid, fs_name, name_override)
1806 self.osc = get_osc(db, uuid, fs_name)
1808 return self.osc.uuid
1810 return self.osc.name
# Delegate module loading to the wrapped client (LOV or OSC,
# chosen in __init__).
1815 def load_module(self):
1816 self.osc.load_module()
# Delegate module cleanup to the wrapped client (LOV or OSC).
1817 def cleanup_module(self):
1818 self.osc.cleanup_module()
1821 class ECHO_CLIENT(Module):
def __init__(self, db):
    """Echo client: queues the obdecho module and builds a VOSC
    wrapper over the obd referenced by this config node."""
    Module.__init__(self, 'ECHO_CLIENT', db)
    self.add_lustre_module('obdecho', 'obdecho')
    self.obd_uuid = self.db.get_first_ref('obd')
    obd_db = self.db.lookup(self.obd_uuid)
    client_uuid = generate_client_uuid(self.name)
    self.uuid = client_uuid
    self.osc = VOSC(obd_db, client_uuid, self.name)
1831 if is_prepared(self.name):
1834 self.osc.prepare() # XXX This is so cheating. -p
1835 self.info(self.obd_uuid)
1837 lctl.newdev("echo_client", self.name, self.uuid,
1838 setup = self.osc.get_name())
1841 if is_prepared(self.name):
1842 Module.cleanup(self)
# Load the wrapped VOSC's modules first, then this module's own
# (obdecho) modules.
1845 def load_module(self):
1846 self.osc.load_module()
1847 Module.load_module(self)
# Unload in reverse order of load_module(): own modules first,
# then the wrapped VOSC's.
1849 def cleanup_module(self):
1850 Module.cleanup_module(self)
1851 self.osc.cleanup_module()
1854 def generate_client_uuid(name):
1855 client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
1857 int(random.random() * 1048576),
1858 int(random.random() * 1048576))
1859 return client_uuid[:36]
1862 class Mountpoint(Module):
1863 def __init__(self,db):
1864 Module.__init__(self, 'MTPT', db)
1865 self.path = self.db.get_val('path')
1866 self.fs_uuid = self.db.get_first_ref('filesystem')
1867 fs = self.db.lookup(self.fs_uuid)
1868 self.mds_uuid = fs.get_first_ref('mds')
1869 self.obd_uuid = fs.get_first_ref('obd')
1870 self.mgmt_uuid = fs.get_first_ref('mgmt')
1871 obd = self.db.lookup(self.obd_uuid)
1872 client_uuid = generate_client_uuid(self.name)
1873 self.vosc = VOSC(obd, client_uuid, self.name)
1874 self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
1876 self.add_lustre_module('mdc', 'mdc')
1877 self.add_lustre_module('llite', 'llite')
1879 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
1885 if fs_is_mounted(self.path):
1886 log(self.path, "already mounted.")
1890 self.mgmtcli.prepare()
1893 mdc_name = self.mdc.name
1895 self.info(self.path, self.mds_uuid, self.obd_uuid)
1896 if config.record or config.lctl_dump:
1897 lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
1899 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
1900 (self.vosc.get_name(), mdc_name, config.config, self.path)
1901 run("mkdir", self.path)
1906 panic("mount failed:", self.path, ":", string.join(val))
1909 self.info(self.path, self.mds_uuid,self.obd_uuid)
1911 if config.record or config.lctl_dump:
1912 lctl.del_mount_option(local_node_name)
1914 if fs_is_mounted(self.path):
1916 (rc, out) = run("umount", "-f", self.path)
1918 (rc, out) = run("umount", self.path)
1920 raise CommandError('umount', out, rc)
1922 if fs_is_mounted(self.path):
1923 panic("fs is still mounted:", self.path)
1928 self.mgmtcli.cleanup()
1930 def load_module(self):
1932 self.mgmtcli.load_module()
1933 self.vosc.load_module()
1934 Module.load_module(self)
1936 def cleanup_module(self):
1937 Module.cleanup_module(self)
1938 self.vosc.cleanup_module()
1940 self.mgmtcli.cleanup_module()
1943 # ============================================================
1944 # misc query functions
1946 def get_ost_net(self, osd_uuid):
1950 osd = self.lookup(osd_uuid)
1951 node_uuid = osd.get_first_ref('node')
1952 node = self.lookup(node_uuid)
1954 panic("unable to find node for osd_uuid:", osd_uuid,
1955 " node_ref:", node_uuid)
1956 for net_uuid in node.get_networks():
1957 db = node.lookup(net_uuid)
1958 srv_list.append(Network(db))
1962 # the order of initialization is based on level.
1963 def getServiceLevel(self):
1964 type = self.get_class()
1966 if type in ('network',):
1968 elif type in ('routetbl',):
1970 elif type in ('ldlm',):
1972 elif type in ('mgmt',):
1974 elif type in ('osd', 'cobd'):
1976 elif type in ('mdsdev',):
1978 elif type in ('mountpoint', 'echoclient'):
1981 panic("Unknown type: ", type)
1983 if ret < config.minlevel or ret > config.maxlevel:
1988 # return list of services in a profile. list is a list of tuples
1989 # [(level, db_object),]
1990 def getServices(self):
1992 for ref_class, ref_uuid in self.get_all_refs():
1993 servdb = self.lookup(ref_uuid)
1995 level = getServiceLevel(servdb)
1997 list.append((level, servdb))
1999 panic('service not found: ' + ref_uuid)
2005 ############################################################
2007 # FIXME: clean this mess up!
2009 # OSC is no longer in the xml, so we have to fake it.
2010 # this is getting ugly and begging for another refactoring
2011 def get_osc(ost_db, uuid, fs_name):
2012 osc = OSC(ost_db, uuid, fs_name)
2015 def get_mdc(db, uuid, fs_name, mds_uuid):
2016 mds_db = db.lookup(mds_uuid);
2018 panic("no mds:", mds_uuid)
2019 mdc = MDC(mds_db, uuid, fs_name)
2022 ############################################################
2023 # routing ("rooting")
2025 # list of (nettype, cluster_id, nid)
2028 def find_local_clusters(node_db):
2029 global local_clusters
2030 for netuuid in node_db.get_networks():
2031 net = node_db.lookup(netuuid)
2033 debug("add_local", netuuid)
2034 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
2036 if acceptors.has_key(srv.port):
2037 panic("duplicate port:", srv.port)
2038 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
2039 srv.send_mem, srv.recv_mem,
2042 # This node is a gateway.
2044 def node_is_router():
2047 # If there are any routers found in the config, then this will be true
2048 # and all nodes will load kptlrouter.
def node_needs_router():
    """True when any router exists in the config (needs_router) or
    this node is itself a router (is_router)."""
    if needs_router:
        return needs_router
    return is_router
2053 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2054 # Currently, these local routes are only added to kptlrouter route
2055 # table if they are needed to connect to a specific server. This
2056 # should be changed so all available routes are loaded, and the
2057 # ptlrouter can make all the decisions.
2060 def find_local_routes(lustre):
2061 """ Scan the lustre config looking for routers . Build list of
2063 global local_routes, needs_router
2065 list = lustre.lookup_class('node')
2067 if router.get_val_int('router', 0):
2069 for (local_type, local_cluster_id, local_nid) in local_clusters:
2071 for netuuid in router.get_networks():
2072 db = router.lookup(netuuid)
2073 if (local_type == db.get_val('nettype') and
2074 local_cluster_id == db.get_val('clusterid')):
2075 gw = db.get_val('nid')
2078 debug("find_local_routes: gw is", gw)
2079 for route in router.get_local_routes(local_type, gw):
2080 local_routes.append(route)
2081 debug("find_local_routes:", local_routes)
2084 def choose_local_server(srv_list):
2085 for srv in srv_list:
2086 if local_cluster(srv.net_type, srv.cluster_id):
2089 def local_cluster(net_type, cluster_id):
2090 for cluster in local_clusters:
2091 if net_type == cluster[0] and cluster_id == cluster[1]:
2095 def local_interface(net_type, cluster_id, nid):
2096 for cluster in local_clusters:
2097 if (net_type == cluster[0] and cluster_id == cluster[1]
2098 and nid == cluster[2]):
2102 def find_route(srv_list):
2104 frm_type = local_clusters[0][0]
2105 for srv in srv_list:
2106 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
2107 to_type = srv.net_type
2109 cluster_id = srv.cluster_id
2110 debug ('looking for route to', to_type, to)
2111 for r in local_routes:
2112 debug("find_route: ", r)
2113 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
2114 result.append((srv, r))
2117 def get_active_target(db):
2118 target_uuid = db.getUUID()
2119 target_name = db.getName()
2120 node_name = get_select(target_name)
2122 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
2124 tgt_dev_uuid = db.get_first_ref('active')
2127 def get_server_by_nid_uuid(db, nid_uuid):
2128 for n in db.lookup_class("network"):
2130 if net.nid_uuid == nid_uuid:
2134 ############################################################
2138 type = db.get_class()
2139 debug('Service:', type, db.getName(), db.getUUID())
2144 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2145 elif type == 'network':
2147 elif type == 'routetbl':
2151 elif type == 'cobd':
2153 elif type == 'mdsdev':
2155 elif type == 'mountpoint':
2157 elif type == 'echoclient':
2159 elif type == 'mgmt':
2162 panic ("unknown service type:", type)
2166 # Prepare the system to run lustre using a particular profile
2167 # in the configuration.
2168 # * load & the modules
2169 # * setup networking for the current node
2170 # * make sure partitions are in place and prepared
2171 # * initialize devices with lctl
2172 # Levels is important, and needs to be enforced.
2173 def for_each_profile(db, prof_list, operation):
2174 for prof_uuid in prof_list:
2175 prof_db = db.lookup(prof_uuid)
2177 panic("profile:", profile, "not found.")
2178 services = getServices(prof_db)
2181 def doWriteconf(services):
2185 if s[1].get_class() == 'mdsdev':
2186 n = newService(s[1])
2189 def doSetup(services):
2193 n = newService(s[1])
2196 def doModules(services):
2200 n = newService(s[1])
2203 def doCleanup(services):
2208 n = newService(s[1])
2209 if n.safe_to_clean():
2212 def doUnloadModules(services):
2217 n = newService(s[1])
2218 if n.safe_to_clean_modules():
2223 def doHost(lustreDB, hosts):
2224 global is_router, local_node_name
2227 node_db = lustreDB.lookup_name(h, 'node')
2231 print 'No host entry found.'
2234 local_node_name = node_db.get_val('name', 0)
2235 is_router = node_db.get_val_int('router', 0)
2236 lustre_upcall = node_db.get_val('lustreUpcall', '')
2237 portals_upcall = node_db.get_val('portalsUpcall', '')
2238 timeout = node_db.get_val_int('timeout', 0)
2239 ptldebug = node_db.get_val('ptldebug', '')
2240 subsystem = node_db.get_val('subsystem', '')
2242 find_local_clusters(node_db)
2244 find_local_routes(lustreDB)
2246 # Two step process: (1) load modules, (2) setup lustre
2247 # if not cleaning, load modules first.
2248 prof_list = node_db.get_refs('profile')
2250 if config.write_conf:
2251 for_each_profile(node_db, prof_list, doModules)
2253 for_each_profile(node_db, prof_list, doWriteconf)
2254 for_each_profile(node_db, prof_list, doUnloadModules)
2256 elif config.recover:
2257 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
2258 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
2259 "--client_uuid <UUID> --conn_uuid <UUID>")
2260 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
2262 elif config.cleanup:
2264 # the command line can override this value
2266 # ugly hack, only need to run lctl commands for --dump
2267 if config.lctl_dump or config.record:
2268 for_each_profile(node_db, prof_list, doCleanup)
2271 sys_set_timeout(timeout)
2272 sys_set_ptldebug(ptldebug)
2273 sys_set_subsystem(subsystem)
2274 sys_set_lustre_upcall(lustre_upcall)
2275 sys_set_portals_upcall(portals_upcall)
2277 for_each_profile(node_db, prof_list, doCleanup)
2278 for_each_profile(node_db, prof_list, doUnloadModules)
2281 # ugly hack, only need to run lctl commands for --dump
2282 if config.lctl_dump or config.record:
2283 sys_set_timeout(timeout)
2284 sys_set_lustre_upcall(lustre_upcall)
2285 for_each_profile(node_db, prof_list, doSetup)
2289 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2290 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2292 for_each_profile(node_db, prof_list, doModules)
2294 sys_set_debug_path()
2295 sys_set_ptldebug(ptldebug)
2296 sys_set_subsystem(subsystem)
2297 script = config.gdb_script
2298 run(lctl.lctl, ' modules >', script)
2300 log ("The GDB module script is in", script)
2301 # pause, so user has time to break and
2304 sys_set_timeout(timeout)
2305 sys_set_lustre_upcall(lustre_upcall)
2306 sys_set_portals_upcall(portals_upcall)
2308 for_each_profile(node_db, prof_list, doSetup)
2310 def doRecovery(db, lctl, tgt_uuid, client_uuid, nid_uuid):
2311 tgt = db.lookup(tgt_uuid)
2313 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
2314 new_uuid = get_active_target(tgt)
2316 raise Lustre.LconfError("doRecovery: no active target found for: " +
2318 net = choose_local_server(get_ost_net(db, new_uuid))
2320 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
2322 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
2324 oldnet = get_server_by_nid_uuid(db, nid_uuid)
2326 lctl.disconnect(oldnet)
2327 except CommandError, e:
2328 log("recover: disconnect", nid_uuid, "failed: ")
2333 except CommandError, e:
2334 log("recover: connect failed")
2337 lctl.recover(client_uuid, net.nid_uuid)
2340 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2341 base = os.path.dirname(cmd)
2342 if development_mode():
2343 if not config.lustre:
2344 config.lustre = (os.path.join(base, ".."))
2345 # normalize the portals dir, using command line arg if set
2347 portals_dir = config.portals
2348 dir = os.path.join(config.lustre, portals_dir)
2349 config.portals = dir
2350 debug('config.portals', config.portals)
2351 elif config.lustre and config.portals:
2353 # if --lustre and --portals, normalize portals
2354 # can ignore PORTALS_DIR here, since it is probably useless here
2355 config.portals = os.path.join(config.lustre, config.portals)
2356 debug('config.portals B', config.portals)
2358 def sysctl(path, val):
2359 debug("+ sysctl", path, val)
2363 fp = open(os.path.join('/proc/sys', path), 'w')
def sys_set_debug_path():
    """Tell portals where to dump debug logs (see --debug_path)."""
    path = config.debug_path
    sysctl('portals/debug_path', path)
2373 def sys_set_lustre_upcall(upcall):
2374 # the command overrides the value in the node config
2375 if config.lustre_upcall:
2376 upcall = config.lustre_upcall
2378 upcall = config.upcall
2380 lctl.set_lustre_upcall(upcall)
2382 def sys_set_portals_upcall(upcall):
2383 # the command overrides the value in the node config
2384 if config.portals_upcall:
2385 upcall = config.portals_upcall
2387 upcall = config.upcall
2389 sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Set the recovery timeout via lctl.

    A positive --timeout on the command line overrides the value from
    the node config; non-positive or missing values are ignored.
    """
    # the command overrides the value in the node config
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    # PEP 8: compare against None with 'is not', not '!='
    if timeout is not None and timeout > 0:
        lctl.set_timeout(timeout)
def sys_tweak_socknal():
    """With --single_socket, disable socknal typed sockets."""
    if not config.single_socket:
        return
    sysctl("socknal/typed", 0)
2402 def sys_optimize_elan ():
2403 procfiles = ["/proc/elan/config/eventint_punt_loops",
2404 "/proc/qsnet/elan3/config/eventint_punt_loops",
2405 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
2407 if os.access(p, os.R_OK):
2408 run ("echo 0 > " + p)
2410 def sys_set_ptldebug(ptldebug):
2412 ptldebug = config.ptldebug
2415 val = eval(ptldebug, ptldebug_names)
2416 val = "0x%x" % (val)
2417 sysctl('portals/debug', val)
2418 except NameError, e:
2421 def sys_set_subsystem(subsystem):
2422 if config.subsystem:
2423 subsystem = config.subsystem
2426 val = eval(subsystem, subsystem_names)
2427 val = "0x%x" % (val)
2428 sysctl('portals/subsystem_debug', val)
2429 except NameError, e:
2432 def sys_set_netmem_max(path, max):
2433 debug("setting", path, "to at least", max)
2441 fp = open(path, 'w')
2442 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the portals and obd character device nodes if absent."""
    for node, cmd in (('/dev/portals', 'mknod /dev/portals c 10 240'),
                      ('/dev/obd', 'mknod /dev/obd c 10 241')):
        if not os.access(node, os.R_OK):
            run(cmd)
2453 # Add dir to the global PATH, if not already there.
2454 def add_to_path(new_dir):
2455 syspath = string.split(os.environ['PATH'], ':')
2456 if new_dir in syspath:
2458 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
2460 def default_debug_path():
2461 path = '/tmp/lustre-log'
2462 if os.path.isdir('/r'):
2467 def default_gdb_script():
2468 script = '/tmp/ogdb'
2469 if os.path.isdir('/r'):
2470 return '/r' + script
2475 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
2476 # ensure basic elements are in the system path
2477 def sanitise_path():
2478 for dir in DEFAULT_PATH:
2481 # global hack for the --select handling
2483 def init_select(args):
2484 # args = [service=nodeA,service2=nodeB service3=nodeC]
2487 list = string.split(arg, ',')
2489 srv, node = string.split(entry, '=')
2490 tgt_select[srv] = node
2492 def get_select(srv):
2493 if tgt_select.has_key(srv):
2494 return tgt_select[srv]
2498 FLAG = Lustre.Options.FLAG
2499 PARAM = Lustre.Options.PARAM
2500 INTPARAM = Lustre.Options.INTPARAM
2501 PARAMLIST = Lustre.Options.PARAMLIST
2503 ('verbose,v', "Print system commands as they are run"),
2504 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
2505 ('config', "Cluster config name used for LDAP query", PARAM),
2506 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
2507 ('node', "Load config for <nodename>", PARAM),
2508 ('cleanup,d', "Cleans up config. (Shutdown)"),
2509 ('force,f', "Forced unmounting and/or obd detach during cleanup",
2511 ('single_socket', "socknal option: only use one socket instead of bundle",
2513 ('failover',"""Used to shut down without saving state.
2514 This will allow this node to "give up" a service to a
2515 another node for failover purposes. This will not
2516 be a clean shutdown.""",
2518 ('gdb', """Prints message after creating gdb module script
2519 and sleeps for 5 seconds."""),
2520 ('noexec,n', """Prints the commands and steps that will be run for a
2521 config without executing them. This can used to check if a
2522 config file is doing what it should be doing"""),
2523 ('nomod', "Skip load/unload module step."),
2524 ('nosetup', "Skip device setup/cleanup step."),
2525 ('reformat', "Reformat all devices (without question)"),
2526 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
2527 ('dump', "Dump the kernel debug log to file before portals is unloaded",
2529 ('write_conf', "Save all the client config information on mds."),
2530 ('record', "Write config information on mds."),
2531 ('record_log', "Name of config record log.", PARAM),
2532 ('record_device', "MDS device name that will record the config commands",
2534 ('minlevel', "Minimum level of services to configure/cleanup",
2536 ('maxlevel', """Maximum level of services to configure/cleanup
2537 Levels are aproximatly like:
2542 70 - mountpoint, echo_client, osc, mdc, lov""",
2544 ('lustre', """Base directory of lustre sources. This parameter will
2545 cause lconf to load modules from a source tree.""", PARAM),
2546 ('portals', """Portals source directory. If this is a relative path,
2547 then it is assumed to be relative to lustre. """, PARAM),
2548 ('timeout', "Set recovery timeout", INTPARAM),
2549 ('upcall', "Set both portals and lustre upcall script", PARAM),
2550 ('lustre_upcall', "Set lustre upcall script", PARAM),
2551 ('portals_upcall', "Set portals upcall script", PARAM),
2552 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
2553 ('ptldebug', "Set the portals debug level", PARAM),
2554 ('subsystem', "Set the portals debug subsystem", PARAM),
2555 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
2556 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
2557 # Client recovery options
2558 ('recover', "Recover a device"),
2559 ('group', "The group of devices to configure or cleanup", PARAM),
2560 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
2561 ('client_uuid', "The failed client (required for recovery)", PARAM),
2562 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
2564 ('inactive', """The name of an inactive service, to be ignored during
2565 mounting (currently OST-only). Can be repeated.""",
2570 global lctl, config, toplevel, CONFIG_FILE
2572 # in the upcall this is set to SIG_IGN
2573 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
2575 cl = Lustre.Options("lconf", "config.xml", lconf_options)
2577 config, args = cl.parse(sys.argv[1:])
2578 except Lustre.OptionError, e:
2582 setupModulePath(sys.argv[0])
2584 host = socket.gethostname()
2586 # the PRNG is normally seeded with time(), which is not so good for starting
2587 # time-synchronized clusters
2588 input = open('/dev/urandom', 'r')
2590 print 'Unable to open /dev/urandom!'
2592 seed = input.read(32)
2598 init_select(config.select)
2601 if not os.access(args[0], os.R_OK):
2602 print 'File not found or readable:', args[0]
2605 dom = xml.dom.minidom.parse(args[0])
2607 panic("%s does not appear to be a config file." % (args[0]))
2608 sys.exit(1) # make sure to die here, even in debug mode.
2609 CONFIG_FILE = args[0]
2610 db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
2611 if not config.config:
2612 config.config = os.path.basename(args[0])# use full path?
2613 if config.config[-4:] == '.xml':
2614 config.config = config.config[:-4]
2615 elif config.ldapurl:
2616 if not config.config:
2617 panic("--ldapurl requires --config name")
2618 dn = "config=%s,fs=lustre" % (config.config)
2619 db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
2620 elif config.ptldebug or config.subsystem:
2621 sys_set_ptldebug(None)
2622 sys_set_subsystem(None)
2625 print 'Missing config file or ldap URL.'
2626 print 'see lconf --help for command summary'
2631 ver = db.get_version()
2633 panic("No version found in config data, please recreate.")
2634 if ver != Lustre.CONFIG_VERSION:
2635 panic("Config version", ver, "does not match lconf version",
2636 Lustre.CONFIG_VERSION)
2640 node_list.append(config.node)
2643 node_list.append(host)
2644 node_list.append('localhost')
2646 debug("configuring for host: ", node_list)
2649 config.debug_path = config.debug_path + '-' + host
2650 config.gdb_script = config.gdb_script + '-' + host
2652 lctl = LCTLInterface('lctl')
2654 if config.lctl_dump:
2655 lctl.use_save_file(config.lctl_dump)
2658 if not (config.record_device and config.record_log):
2659 panic("When recording, both --record_log and --record_device must be specified.")
2660 lctl.record(config.record_device, config.record_log)
2662 doHost(db, node_list)
2667 if __name__ == "__main__":
2670 except Lustre.LconfError, e:
2672 # traceback.print_exc(file=sys.stdout)
2674 except CommandError, e:
2678 if first_cleanup_error:
2679 sys.exit(first_cleanup_error)