3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
# Decide where to find the Lustre python modules: if the script sits in a
# source tree (Makefile.am readable beside it) use in-tree modules, otherwise
# append the installed PYMOD_DIR to sys.path.
# NOTE(review): this listing has dropped the return statements of
# development_mode() -- verify against the full source.
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile.am", os.R_OK):
46 if not development_mode():
47 sys.path.append(PYMOD_DIR)
# Module-level tunables and the debug/subsystem bitmask tables used to drive
# portals logging. NOTE(review): the listing has dropped many table entries
# and the enclosing dict literals themselves -- only scattered key lines are
# visible here; verify against the full source.
53 DEFAULT_TCPBUF = 8388608
56 # Maximum number of devices to search for.
57 # (the /dev/loop* nodes need to be created beforehand)
58 MAX_LOOP_DEVICES = 256
59 PORTALS_DIR = 'portals'
61 # Needed to call lconf --record
64 # Please keep these in sync with the values in portals/kp30.h
76 "warning" : (1 << 10),
80 "portals" : (1 << 14),
82 "dlmtrace" : (1 << 16),
86 "rpctrace" : (1 << 20),
87 "vfstrace" : (1 << 21),
92 "undefined" : (1 << 0),
102 "portals" : (1 << 10),
103 "socknal" : (1 << 11),
104 "qswnal" : (1 << 12),
105 "pinger" : (1 << 13),
106 "filter" : (1 << 14),
112 "ptlrouter" : (1 << 20),
first_cleanup_error = 0

def cleanup_error(rc):
    """Latch the first non-zero cleanup return code.

    Later cleanup failures are ignored so that the very first error
    observed is the one eventually reported.
    """
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
124 # ============================================================
125 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort with a LconfError reporting that *msg* is not implemented.

    Raises:
        Lustre.LconfError: always.
    """
    # Call-style raise for consistency with the other LconfError raise in
    # this file; also fixes the "implmemented" typo in the message.
    raise Lustre.LconfError(msg + ' not implemented yet.')
# Bodies of the error/logging helpers (panic/logall/log) and of my_int().
# NOTE(review): the def lines and several interior lines were dropped by this
# listing -- the fragments below show: panic() joins its args and raises
# LconfError unless --noexec; my_int() evals the string (to accept '0x123')
# and maps SyntaxError onto ValueError.
131 msg = string.join(map(str,args))
132 if not config.noexec:
133 raise Lustre.LconfError(msg)
138 msg = string.join(map(str,args))
143 print string.strip(s)
147 msg = string.join(map(str,args))
150 # ack, python's builtin int() does not support '0x123' syntax.
151 # eval can do it, although what a hack!
155 return eval(s, {}, {})
158 except SyntaxError, e:
159 raise ValueError("not a number")
161 raise ValueError("not a number")
163 # ============================================================
164 # locally defined exceptions
# Exception carrying a failed command's name, its output (string or list of
# lines) and optional return code; the visible print logic formats these for
# the user. NOTE(review): the self.rc assignment and the dump() method header
# appear to have been dropped by this listing.
165 class CommandError (exceptions.Exception):
166 def __init__(self, cmd_name, cmd_err, rc=None):
167 self.cmd_name = cmd_name
168 self.cmd_err = cmd_err
173 if type(self.cmd_err) == types.StringType:
175 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
177 print "! %s: %s" % (self.cmd_name, self.cmd_err)
178 elif type(self.cmd_err) == types.ListType:
180 print "! %s (error %d):" % (self.cmd_name, self.rc)
182 print "! %s:" % (self.cmd_name)
183 for s in self.cmd_err:
184 print "> %s" %(string.strip(s))
189 # ============================================================
190 # handle daemons, like the acceptor
# Base class for external daemons (subclassed by AcceptorHandler below).
# Visible responsibilities: locate the binary with find_prog(), start it via
# runcmd(), kill it by the pid read from its pidfile, and clean stale
# pidfiles. NOTE(review): the class statement, start/stop method headers and
# several try/except lines were dropped by this listing.
192 """ Manage starting and stopping a daemon. Assumes daemon manages
193 it's own pid file. """
195 def __init__(self, cmd):
201 log(self.command, "already running.")
203 self.path = find_prog(self.command)
205 panic(self.command, "not found.")
206 ret, out = runcmd(self.path +' '+ self.command_line())
208 raise CommandError(self.path, out, ret)
212 pid = self.read_pidfile()
214 log ("killing process", pid)
216 #time.sleep(1) # let daemon die
218 log("unable to kill", self.command, e)
220 log("unable to kill", self.command)
223 pid = self.read_pidfile()
233 def read_pidfile(self):
235 fp = open(self.pidfile(), 'r')
242 def clean_pidfile(self):
243 """ Remove a stale pidfile """
244 log("removing stale pidfile:", self.pidfile())
246 os.unlink(self.pidfile())
248 log(self.pidfile(), e)
# Daemon handler for the portals TCP acceptor: one instance per port, with a
# per-port pidfile under /var/run and '-s/-r' buffer sizes on the command
# line. NOTE(review): the assignments of self.port/self.flags and the
# pidfile() method header were dropped by this listing.
250 class AcceptorHandler(DaemonHandler):
251 def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
252 DaemonHandler.__init__(self, "acceptor")
255 self.send_mem = send_mem
256 self.recv_mem = recv_mem
259 self.flags = self.flags + ' -i'
262 return "/var/run/%s-%d.pid" % (self.command, self.port)
264 def command_line(self):
265 return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
269 # start the acceptors
# Module-level helpers over the global 'acceptors' port->daemon map: start
# all, start one by port (panic if unknown), stop one. All are no-ops under
# --lctl_dump / --record. NOTE(review): the def line of the start-all
# function and the daemon.start()/stop() call lines were dropped here.
271 if config.lctl_dump or config.record:
273 for port in acceptors.keys():
274 daemon = acceptors[port]
275 if not daemon.running():
278 def run_one_acceptor(port):
279 if config.lctl_dump or config.record:
281 if acceptors.has_key(port):
282 daemon = acceptors[port]
283 if not daemon.running():
286 panic("run_one_acceptor: No acceptor defined for port:", port)
288 def stop_acceptor(port):
289 if acceptors.has_key(port):
290 daemon = acceptors[port]
295 # ============================================================
296 # handle lctl interface
# Wrapper class around the external 'lctl' binary. __init__ locates the
# binary (CommandError if absent, debug-only under --noexec per the visible
# fragments); record()/end_record() toggle config-log recording state used
# by run() below. NOTE(review): the class statement and several lines of
# __init__ were dropped by this listing.
299 Manage communication with lctl
302 def __init__(self, cmd):
304 Initialize close by finding the lctl binary.
306 self.lctl = find_prog(cmd)
308 self.record_device = ''
311 debug('! lctl not found')
314 raise CommandError('lctl', "unable to find lctl binary.")
316 def use_save_file(self, file):
317 self.save_file = file
319 def record(self, dev_name, logname):
320 log("Recording log", logname, "on", dev_name)
321 self.record_device = dev_name
322 self.record_log = logname
324 def end_record(self):
325 log("End recording log", self.record_log, "on", self.record_device)
326 self.record_device = None
327 self.record_log = None
def set_nonblock(self, fd):
    """Switch the file descriptor *fd* into non-blocking mode."""
    flags = fcntl.fcntl(fd, F_GETFL)
    flags = flags | os.O_NDELAY
    fcntl.fcntl(fd, F_SETFL, flags)
# run(): feed a command script to lctl over stdin via popen2.Popen3, draining
# stdout and stderr concurrently with select() on non-blocking fds (the
# "Python Cookbook" pattern) to avoid pipe deadlock; raises CommandError when
# the exit status or any stderr output indicates failure. Under --record the
# commands are wrapped in a record preamble for self.record_device.
# NOTE(review): the def line, the cmd_line construction and the wait() call
# were dropped by this listing.
336 the cmds are written to stdin of lctl
337 lctl doesn't return errors when run in script mode, so
339 should modify command line to accept multiple commands, or
340 create complex command line options
344 cmds = '\n dump ' + self.save_file + '\n' + cmds
345 elif self.record_device:
349 %s""" % (self.record_device, self.record_log, cmds)
351 debug("+", cmd_line, cmds)
352 if config.noexec: return (0, [])
354 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
355 child.tochild.write(cmds + "\n")
356 child.tochild.close()
358 # From "Python Cookbook" from O'Reilly
359 outfile = child.fromchild
360 outfd = outfile.fileno()
361 self.set_nonblock(outfd)
362 errfile = child.childerr
363 errfd = errfile.fileno()
364 self.set_nonblock(errfd)
366 outdata = errdata = ''
369 ready = select.select([outfd,errfd],[],[]) # Wait for input
370 if outfd in ready[0]:
371 outchunk = outfile.read()
372 if outchunk == '': outeof = 1
373 outdata = outdata + outchunk
374 if errfd in ready[0]:
375 errchunk = errfile.read()
376 if errchunk == '': erreof = 1
377 errdata = errdata + errchunk
378 if outeof and erreof: break
379 # end of "borrowed" code
382 if os.WIFEXITED(ret):
383 rc = os.WEXITSTATUS(ret)
386 if rc or len(errdata):
387 raise CommandError(self.lctl, errdata, rc)
# runcmd(): invoke lctl with the given args on the command line (rather than
# through stdin as run() does); raises CommandError on failure.
# NOTE(review): the rc test and return line were dropped by this listing.
390 def runcmd(self, *args):
392 run lctl using the command line
394 cmd = string.join(map(str,args))
395 debug("+", self.lctl, cmd)
396 rc, out = run(self.lctl, cmd)
398 raise CommandError(self.lctl, out, rc)
# Thin wrappers around lctl command scripts: network/route/connection setup
# and teardown, device attach/setup/cleanup, LOV configuration, log dump and
# /proc device listing. Each builds a here-document of lctl commands ending
# in "quit" and passes it to self.run(). NOTE(review): most of the cmds
# string bodies and several self.run(cmds) call lines were dropped by this
# listing -- only the fragments below survive.
402 def clear_log(self, dev, log):
403 """ clear an existing log """
408 quit """ % (dev, log)
411 def network(self, net, nid):
416 quit """ % (net, nid)
419 # create a new connection
420 def add_uuid(self, net_type, uuid, nid):
421 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
424 def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
426 if net_type in ('tcp',) and not config.lctl_dump:
431 add_autoconn %s %s %d %s
435 nid, hostaddr, port, flags )
438 def connect(self, srv):
439 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
440 if srv.net_type in ('tcp',) and not config.lctl_dump:
444 self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
445 srv.nid, srv.hostaddr, srv.port, flags)
448 def recover(self, dev_name, new_conn):
451 recover %s""" %(dev_name, new_conn)
454 # add a route to a range
455 def add_route(self, net, gw, lo, hi):
463 except CommandError, e:
467 def del_route(self, net, gw, lo, hi):
472 quit """ % (net, gw, lo, hi)
475 # add a route to a host
476 def add_route_host(self, net, uuid, gw, tgt):
477 self.add_uuid(net, uuid, tgt)
485 except CommandError, e:
489 # add a route to a range
490 def del_route_host(self, net, uuid, gw, tgt):
496 quit """ % (net, gw, tgt)
500 def del_autoconn(self, net_type, nid, hostaddr):
501 if net_type in ('tcp',) and not config.lctl_dump:
510 # disconnect one connection
511 def disconnect(self, srv):
512 self.del_uuid(srv.nid_uuid)
513 if srv.net_type in ('tcp',) and not config.lctl_dump:
514 self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
516 def del_uuid(self, uuid):
524 def disconnectAll(self, net):
532 def attach(self, type, name, uuid):
535 quit""" % (type, name, uuid)
538 def setup(self, name, setup = ""):
542 quit""" % (name, setup)
546 # create a new device with lctl
547 def newdev(self, type, name, uuid, setup = ""):
548 self.attach(type, name, uuid);
550 self.setup(name, setup)
551 except CommandError, e:
552 self.cleanup(name, uuid, 0)
557 def cleanup(self, name, uuid, force, failover = 0):
558 if failover: force = 1
564 quit""" % (name, ('', 'force')[force],
565 ('', 'failover')[failover])
569 def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
570 stripe_sz, stripe_off,
574 lov_setup %s %d %d %d %s %s
575 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
580 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
584 lov_setconfig %s %d %d %d %s %s
585 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
589 def dump(self, dump_file):
592 quit""" % (dump_file)
595 # get list of devices
596 def device_list(self):
597 devices = '/proc/fs/lustre/devices'
599 if os.access(devices, os.R_OK):
601 fp = open(devices, 'r')
609 def lustre_version(self):
610 rc, out = self.runcmd('version')
614 def mount_option(self, profile, osc, mdc):
616 mount_option %s %s %s
617 quit""" % (profile, osc, mdc)
620 # delete mount options
621 def del_mount_option(self, profile):
627 def set_timeout(self, timeout):
633 # delete mount options
634 def set_lustre_upcall(self, upcall):
639 # ============================================================
640 # Various system-level functions
641 # (ideally moved to their own module)
# run(): execute a shell command (stderr folded into stdout via '2>&1') and
# return (status, output-lines); run_daemon(): fire-and-forget variant;
# find_prog(): resolve a command name against dirname(argv[0]),
# config.portals/utils and $PATH. All honour --noexec.
# NOTE(review): the def lines of run()/find_prog() and their return paths
# were dropped by this listing.
643 # Run a command and return the output and status.
644 # stderr is sent to /dev/null, could use popen3 to
645 # save it if necessary
648 if config.noexec: return (0, [])
649 f = os.popen(cmd + ' 2>&1')
659 cmd = string.join(map(str,args))
662 # Run a command in the background.
663 def run_daemon(*args):
664 cmd = string.join(map(str,args))
666 if config.noexec: return 0
667 f = os.popen(cmd + ' 2>&1')
675 # Determine full path to use for an external command
676 # searches dirname(argv[0]) first, then PATH
678 syspath = string.split(os.environ['PATH'], ':')
679 cmdpath = os.path.dirname(sys.argv[0])
680 syspath.insert(0, cmdpath);
682 syspath.insert(0, os.path.join(config.portals, 'utils/'))
684 prog = os.path.join(d,cmd)
685 if os.access(prog, os.X_OK):
689 # Recursively look for file starting at base dir
# do_find_file(): depth-first search for a readable file named *mod* under
# *base*; find_module(): look for '<modname>.o' first at the expected
# src_dir/dev_dir path, then (per the dropped fallback) elsewhere.
# NOTE(review): the return statements of both functions were dropped by this
# listing.
690 def do_find_file(base, mod):
691 fullname = os.path.join(base, mod)
692 if os.access(fullname, os.R_OK):
694 for d in os.listdir(base):
695 dir = os.path.join(base,d)
696 if os.path.isdir(dir):
697 module = do_find_file(dir, mod)
701 def find_module(src_dir, dev_dir, modname):
702 mod = '%s.o' % (modname)
703 module = src_dir +'/'+ dev_dir +'/'+ mod
705 if os.access(module, os.R_OK):
711 # is the path a block device?
# is_block(): stat the path and test S_ISBLK. mkfs(): build an ext3/extN or
# reiserfs filesystem on *dev*; sizes the ext3 journal from the device size
# (devsize in 1k units, 4k fs blocks), appends config/per-device mkfs
# options, and afterwards enables htree indexing via debugfs for ext3.
# NOTE(review): the def line of is_block(), the stat try/except, and several
# branch/assignment lines of mkfs() were dropped by this listing.
718 return stat.S_ISBLK(s[stat.ST_MODE])
720 # build fs according to type
722 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
728 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
730 # devsize is in 1k, and fs block count is in 4k
731 block_cnt = devsize/4
733 if fstype in ('ext3', 'extN'):
734 # ext3 journal size is in megabytes
737 if not is_block(dev):
738 ret, out = runcmd("ls -l %s" %dev)
739 devsize = int(string.split(out[0])[4]) / 1024
741 ret, out = runcmd("sfdisk -s %s" %dev)
742 devsize = int(out[0])
743 if devsize > 1024 * 1024:
744 jsize = ((devsize / 102400) * 4)
747 if jsize: jopt = "-J size=%d" %(jsize,)
748 if isize: iopt = "-I %d" %(isize,)
749 mkfs = 'mkfs.ext2 -j -b 4096 '
750 if not isblock or config.force:
752 elif fstype == 'reiserfs':
753 # reiserfs journal size is in blocks
754 if jsize: jopt = "--journal_size %d" %(jsize,)
755 mkfs = 'mkreiserfs -ff'
757 panic('unsupported fs type: ', fstype)
759 if config.mkfsoptions != None:
760 mkfs = mkfs + ' ' + config.mkfsoptions
761 if mkfsoptions != None:
762 mkfs = mkfs + ' ' + mkfsoptions
763 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
765 panic("Unable to build fs:", dev, string.join(out))
766 # enable hash tree indexing on fsswe
767 if fstype in ('ext3', 'extN'):
768 htree = 'echo "feature FEATURE_C5" | debugfs -w'
769 (ret, out) = run (htree, dev)
771 panic("Unable to enable htree:", dev)
773 # some systems use /dev/loopN, some /dev/loop/N
# Loopback-device helpers: probe whether /dev/loopN or /dev/loop/N exists;
# find_loop() scans losetup output for the device already bound to *file*;
# init_loop() creates a sparse backing file with dd, formats it via mkfs()
# and binds the first free loop device; clean_loop() detaches it;
# need_format() is a stub (see FIXME). NOTE(review): the def lines of the
# loop-base and find_loop functions, several dev assignments and return
# statements were dropped by this listing.
777 if not os.access(loop + str(0), os.R_OK):
779 if not os.access(loop + str(0), os.R_OK):
780 panic ("can't access loop devices")
783 # find loop device assigned to thefile
786 for n in xrange(0, MAX_LOOP_DEVICES):
788 if os.access(dev, os.R_OK):
789 (stat, out) = run('losetup', dev)
790 if out and stat == 0:
791 m = re.search(r'\((.*)\)', out[0])
792 if m and file == m.group(1):
798 # create file if necessary and assign the first free loop device
799 def init_loop(file, size, fstype, journal_size, inode_size, mkfsoptions, reformat):
800 dev = find_loop(file)
802 print 'WARNING file:', file, 'already mapped to', dev
804 if reformat or not os.access(file, os.R_OK | os.W_OK):
806 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
807 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
810 panic("Unable to create backing store:", file)
811 mkfs(file, size, fstype, journal_size, inode_size, mkfsoptions, isblock=0)
814 # find next free loop
815 for n in xrange(0, MAX_LOOP_DEVICES):
817 if os.access(dev, os.R_OK):
818 (stat, out) = run('losetup', dev)
820 run('losetup', dev, file)
823 print "out of loop devices"
825 print "out of loop devices"
828 # undo loop assignment
829 def clean_loop(file):
830 dev = find_loop(file)
832 ret, out = run('losetup -d', dev)
834 log('unable to clean loop device:', dev, 'for file:', file)
842 # initialize a block device if needed
# If *dev* is not a real block device, back it with a loop device via
# init_loop(); otherwise (re)format per the reformat/autoformat policy.
# Returns the device path to use; short-circuits under --noexec.
# NOTE(review): trailing lines of the mkfs() call and the return statement
# were dropped by this listing.
843 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
844 inode_size, mkfsoptions):
845 if config.noexec: return dev
846 if not is_block(dev):
847 dev = init_loop(dev, size, fstype, journal_size, inode_size,
848 mkfsoptions, reformat)
849 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
850 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
853 # panic("device:", dev,
854 # "not prepared, and autoformat is not set.\n",
855 # "Rerun with --reformat option to format ALL filesystems")
# if2addr(): parse the 'inet addr:' field from ifconfig output for an
# interface; sys_get_elan_position_file(): return the first readable Elan
# position file from the known /proc candidates.
# NOTE(review): both def lines, the failure branches and the return
# statements were dropped by this listing.
860 """lookup IP address for an interface"""
861 rc, out = run("/sbin/ifconfig", iface)
864 addr = string.split(out[1])[1]
865 ip = string.split(addr, ':')[1]
868 def sys_get_elan_position_file():
869 procfiles = ["/proc/elan/device0/position",
870 "/proc/qsnet/elan4/device0/position",
871 "/proc/qsnet/elan3/device0/position"]
873 if os.access(p, os.R_OK):
# sys_get_local_nid(): prefer an Elan-derived nid when a position file
# exists, else delegate to sys_get_local_address(). sys_get_local_address():
# per-net_type local address -- tcp resolves an interface wildcard or the
# hostname; elan combines cluster_id with the NodeId parsed from the position
# file; gm is unimplemented; scimac shells out to scinode.
# NOTE(review): several branch lines (the tcp else, the NodeId parsing loop,
# return statements) were dropped by this listing.
877 def sys_get_local_nid(net_type, wildcard, cluster_id):
878 """Return the local nid."""
880 if sys_get_elan_position_file():
881 local = sys_get_local_address('elan', '*', cluster_id)
883 local = sys_get_local_address(net_type, wildcard, cluster_id)
886 def sys_get_local_address(net_type, wildcard, cluster_id):
887 """Return the local address for the network type."""
889 if net_type in ('tcp',):
891 iface, star = string.split(wildcard, ':')
892 local = if2addr(iface)
894 panic ("unable to determine ip for:", wildcard)
896 host = socket.gethostname()
897 local = socket.gethostbyname(host)
898 elif net_type == 'elan':
899 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
900 f = sys_get_elan_position_file()
902 panic ("unable to determine local Elan ID")
905 lines = fp.readlines()
913 nid = my_int(cluster_id) + my_int(elan_id)
915 except ValueError, e:
919 elif net_type == 'gm':
920 fixme("automatic local address for GM")
921 elif net_type == 'scimac':
922 scinode="/opt/scali/sbin/scinode"
923 if os.path.exists(scinode):
924 (rc,local) = run(scinode)
926 panic (scinode, " not found on node with scimac networking")
928 panic (scinode, " failed")
929 local=string.rstrip(local[0])
# /proc query helpers: mod_loaded() greps /proc/modules for a module name;
# is_prepared() matches *name* against column 4 of 'lctl device_list' output
# (with a short-circuit for --noexec/--record cleanup runs);
# is_network_prepared() treats any existing device as "network configured";
# fs_is_mounted() scans /proc/mounts for a lustre_lite mount at *path*.
# NOTE(review): the try/except scaffolding and return statements of these
# functions were dropped by this listing.
933 def mod_loaded(modname):
934 """Check if a module is already loaded. Look in /proc/modules for it."""
936 fp = open('/proc/modules')
937 lines = fp.readlines()
939 # please forgive my tired fingers for this one
940 ret = filter(lambda word, mod=modname: word == mod,
941 map(lambda line: string.split(line)[0], lines))
946 # XXX: instead of device_list, ask for $name and see what we get
947 def is_prepared(name):
948 """Return true if a device exists for the name"""
951 if (config.noexec or config.record) and config.cleanup:
954 # expect this format:
955 # 1 UP ldlm ldlm ldlm_UUID 2
956 out = lctl.device_list()
958 if name == string.split(s)[3]:
960 except CommandError, e:
964 def is_network_prepared():
965 """If the any device exists, then assume that all networking
966 has been configured"""
967 out = lctl.device_list()
970 def fs_is_mounted(path):
971 """Return true if path is a mounted lustre filesystem"""
973 fp = open('/proc/mounts')
974 lines = fp.readlines()
978 if a[1] == path and a[2] == 'lustre_lite':
# kmod: ordered list of (src_dir, dev_dir, modname) kernel modules; loads via
# insmod on the module file found by find_module() (modprobe fragment visible
# as an alternative path), unloads in reverse via rmmod, dumping the portals
# debug log first when requested. NOTE(review): the class statement, the
# rev.reverse() call and several branch/continue lines were dropped by this
# listing.
986 """Manage kernel modules"""
987 def __init__(self, lustre_dir, portals_dir):
988 self.lustre_dir = lustre_dir
989 self.portals_dir = portals_dir
990 self.kmodule_list = []
992 def add_portals_module(self, dev_dir, modname):
993 """Append a module to list of modules to load."""
994 self.kmodule_list.append((self.portals_dir, dev_dir, modname))
996 def add_lustre_module(self, dev_dir, modname):
997 """Append a module to list of modules to load."""
998 self.kmodule_list.append((self.lustre_dir, dev_dir, modname))
1000 def load_module(self):
1001 """Load all the modules in the list in the order they appear."""
1002 for src_dir, dev_dir, mod in self.kmodule_list:
1003 if mod_loaded(mod) and not config.noexec:
1005 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
1007 module = find_module(src_dir, dev_dir, mod)
1009 panic('module not found:', mod)
1010 (rc, out) = run('/sbin/insmod', module)
1012 raise CommandError('insmod', out, rc)
1014 (rc, out) = run('/sbin/modprobe', mod)
1016 raise CommandError('modprobe', out, rc)
1018 def cleanup_module(self):
1019 """Unload the modules in the list in reverse order."""
1020 rev = self.kmodule_list
1022 for src_dir, dev_dir, mod in rev:
1023 if not mod_loaded(mod) and not config.noexec:
1026 if mod == 'portals' and config.dump:
1027 lctl.dump(config.dump)
1028 log('unloading module:', mod)
1029 (rc, out) = run('/sbin/rmmod', mod)
1031 log('! unable to unload module:', mod)
1034 # ============================================================
1035 # Classes to prepare and cleanup the various objects
# Module: base class for all configuration objects. Holds the db record,
# name/uuid, and a kmod instance; provides default cleanup via lctl.cleanup,
# module load/unload delegation, and the safe_to_clean hooks overridden by
# subclasses. NOTE(review): the class statement, self.db assignment and the
# cleanup() try line were dropped by this listing.
1038 """ Base class for the rest of the modules. The default cleanup method is
1039 defined here, as well as some utilitiy funcs.
1041 def __init__(self, module_name, db):
1043 self.module_name = module_name
1044 self.name = self.db.getName()
1045 self.uuid = self.db.getUUID()
1048 self.kmod = kmod(config.lustre, config.portals)
1050 def info(self, *args):
1051 msg = string.join(map(str,args))
1052 print self.module_name + ":", self.name, self.uuid, msg
1055 """ default cleanup, used for most modules """
1058 lctl.cleanup(self.name, self.uuid, config.force)
1059 except CommandError, e:
1060 log(self.module_name, "cleanup failed: ", self.name)
1064 def add_portals_module(self, dev_dir, modname):
1065 """Append a module to list of modules to load."""
1066 self.kmod.add_portals_module(dev_dir, modname)
1068 def add_lustre_module(self, dev_dir, modname):
1069 """Append a module to list of modules to load."""
1070 self.kmod.add_lustre_module(dev_dir, modname)
1072 def load_module(self):
1073 """Load all the modules in the list in the order they appear."""
1074 self.kmod.load_module()
1076 def cleanup_module(self):
1077 """Unload the modules in the list in reverse order."""
1078 if self.safe_to_clean():
1079 self.kmod.cleanup_module()
1081 def safe_to_clean(self):
1084 def safe_to_clean_modules(self):
1085 return self.safe_to_clean()
# Network: reads net type/nid/cluster/port/buffer settings from the db,
# resolves wildcard nids and hostaddrs to local addresses, and queues the
# portals kernel modules appropriate for the net type.
# NOTE(review): several guard lines were dropped by this listing; also the
# panic at the line numbered 1101 references bare 'cluster_id' where every
# neighbouring line uses self.cluster_id -- looks like a NameError on that
# error path; verify against the full source before fixing.
1087 class Network(Module):
1088 def __init__(self,db):
1089 Module.__init__(self, 'NETWORK', db)
1090 self.net_type = self.db.get_val('nettype')
1091 self.nid = self.db.get_val('nid', '*')
1092 self.cluster_id = self.db.get_val('clusterid', "0")
1093 self.port = self.db.get_val_int('port', 0)
1094 self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
1095 self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
1096 self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
1099 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1101 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1102 self.generic_nid = 1
1103 debug("nid:", self.nid)
1105 self.generic_nid = 0
1107 self.nid_uuid = self.nid_to_uuid(self.nid)
1109 self.hostaddr = self.db.get_val('hostaddr', self.nid)
1110 if '*' in self.hostaddr:
1111 self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
1112 if not self.hostaddr:
1113 panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
1114 debug("hostaddr:", self.hostaddr)
1116 self.add_portals_module("libcfs", 'libcfs')
1117 self.add_portals_module("portals", 'portals')
1118 if node_needs_router():
1119 self.add_portals_module("router", 'kptlrouter')
1120 if self.net_type == 'tcp':
1121 self.add_portals_module("knals/socknal", 'ksocknal')
1122 if self.net_type == 'elan':
1123 self.add_portals_module("knals/qswnal", 'kqswnal')
1124 if self.net_type == 'gm':
1125 self.add_portals_module("knals/gmnal", 'kgmnal')
1126 if self.net_type == 'scimac':
1127 self.add_portals_module("knals/scimacnal", 'kscimacnal')
def nid_to_uuid(self, nid):
    """Build the canonical UUID string for a network id."""
    return "NID_" + str(nid) + "_UUID"
# Network prepare/cleanup: configure the nid with lctl, start an acceptor on
# router nodes, and connect/disconnect peer gateway networks found by walking
# router nodes in the db. safe_to_clean() refuses teardown while any lustre
# device still exists. NOTE(review): the prepare()/cleanup() def lines, the
# gateway iteration line and the connect/disconnect calls were dropped by
# this listing.
1133 if is_network_prepared():
1135 self.info(self.net_type, self.nid, self.port)
1136 if not (config.record and self.generic_nid):
1137 lctl.network(self.net_type, self.nid)
1138 if self.net_type == 'tcp':
1140 if self.net_type == 'elan':
1142 if self.port and node_is_router():
1143 run_one_acceptor(self.port)
1144 self.connect_peer_gateways()
1146 def connect_peer_gateways(self):
1147 for router in self.db.lookup_class('node'):
1148 if router.get_val_int('router', 0):
1149 for netuuid in router.get_networks():
1150 net = self.db.lookup(netuuid)
1152 if (gw.cluster_id == self.cluster_id and
1153 gw.net_type == self.net_type):
1154 if gw.nid != self.nid:
1157 def disconnect_peer_gateways(self):
1158 for router in self.db.lookup_class('node'):
1159 if router.get_val_int('router', 0):
1160 for netuuid in router.get_networks():
1161 net = self.db.lookup(netuuid)
1163 if (gw.cluster_id == self.cluster_id and
1164 gw.net_type == self.net_type):
1165 if gw.nid != self.nid:
1168 except CommandError, e:
1169 print "disconnect failed: ", self.name
1173 def safe_to_clean(self):
1174 return not is_network_prepared()
1177 self.info(self.net_type, self.nid, self.port)
1179 stop_acceptor(self.port)
1180 if node_is_router():
1181 self.disconnect_peer_gateways()
# RouteTable: installs/removes portals routes from the db route table and
# opens the tcp connection to either the target (when this node is the
# gateway for a single-node route) or the gateway itself.
# NOTE(review): the prepare()/cleanup() def lines, the srv.connect call and
# several try lines were dropped by this listing.
1183 class RouteTable(Module):
1184 def __init__(self,db):
1185 Module.__init__(self, 'ROUTES', db)
1187 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1189 # only setup connections for tcp NALs
1191 if not net_type in ('tcp',):
1194 # connect to target if route is to single node and this node is the gw
1195 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1196 if not local_cluster(net_type, tgt_cluster_id):
1197 panic("target", lo, " not on the local cluster")
1198 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1199 # connect to gateway if this node is not the gw
1200 elif (local_cluster(net_type, gw_cluster_id)
1201 and not local_interface(net_type, gw_cluster_id, gw)):
1202 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1207 panic("no server for nid", lo)
1210 return Network(srvdb)
1213 if is_network_prepared():
1216 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1217 lctl.add_route(net_type, gw, lo, hi)
1218 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1222 def safe_to_clean(self):
1223 return not is_network_prepared()
1226 if is_network_prepared():
1227 # the network is still being used, don't clean it up
1229 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1230 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1233 lctl.disconnect(srv)
1234 except CommandError, e:
1235 print "disconnect failed: ", self.name
1240 lctl.del_route(net_type, gw, lo, hi)
1241 except CommandError, e:
1242 print "del_route failed: ", self.name
# Management: the mgmt_svc device -- queues its modules, creates the device
# with lctl.newdev unless already prepared, and cleans it up in the default
# way. The LDLM fragment below exists only to load modules; the device
# itself is created automatically (per its own comment).
# NOTE(review): the prepare()/cleanup() def lines, safe_to_clean's return and
# the LDLM class statement were dropped by this listing.
1246 class Management(Module):
1247 def __init__(self, db):
1248 Module.__init__(self, 'MGMT', db)
1249 self.add_lustre_module('lvfs', 'lvfs')
1250 self.add_lustre_module('obdclass', 'obdclass')
1251 self.add_lustre_module('ptlrpc', 'ptlrpc')
1252 self.add_lustre_module('mgmt', 'mgmt_svc')
1255 if is_prepared(self.name):
1258 lctl.newdev("mgmt", self.name, self.uuid)
1260 def safe_to_clean(self):
1264 if is_prepared(self.name):
1265 Module.cleanup(self)
1267 # This is only needed to load the modules; the LDLM device
1268 # is now created automatically.
1270 def __init__(self,db):
1271 Module.__init__(self, 'LDLM', db)
1272 self.add_lustre_module('lvfs', 'lvfs')
1273 self.add_lustre_module('obdclass', 'obdclass')
1274 self.add_lustre_module('ptlrpc', 'ptlrpc')
# LOV: logical object volume. Reads striping parameters from the db, builds
# the list of member OSCs via get_osc(), and (unless config_only) prepares
# each OSC then issues lctl.lov_setup. config_only instances are used purely
# to read parameters (see MDSDEV inode-size sizing) and panic on any
# prepare/cleanup/module operation. NOTE(review): the class statement, the
# prepare()/cleanup() def lines and the osc.cleanup/load_module calls were
# dropped by this listing.
1283 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1284 Module.__init__(self, 'LOV', db)
1285 if name_override != None:
1286 self.name = "lov_%s" % name_override
1287 self.add_lustre_module('lov', 'lov')
1288 self.mds_uuid = self.db.get_first_ref('mds')
1289 self.stripe_sz = self.db.get_val_int('stripesize', 65536)
1290 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1291 self.pattern = self.db.get_val_int('stripepattern', 0)
1292 self.devlist = self.db.get_refs('obd')
1293 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1295 self.desc_uuid = self.uuid
1296 self.uuid = generate_client_uuid(self.name)
1297 self.fs_name = fs_name
1299 self.config_only = 1
1301 self.config_only = None
1302 mds= self.db.lookup(self.mds_uuid)
1303 self.mds_name = mds.getName()
1304 for obd_uuid in self.devlist:
1305 obd = self.db.lookup(obd_uuid)
1306 osc = get_osc(obd, self.uuid, fs_name)
1308 self.osclist.append(osc)
1310 panic('osc not found:', obd_uuid)
1313 if is_prepared(self.name):
1315 if self.config_only:
1316 panic("Can't prepare config_only LOV ", self.name)
1318 for osc in self.osclist:
1320 # Only ignore connect failures with --force, which
1321 # isn't implemented here yet.
1322 osc.prepare(ignore_connect_failure=0)
1323 except CommandError, e:
1324 print "Error preparing OSC %s\n" % osc.uuid
1326 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1327 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1328 lctl.lov_setup(self.name, self.uuid,
1329 self.desc_uuid, self.mds_name, self.stripe_cnt,
1330 self.stripe_sz, self.stripe_off, self.pattern,
1331 string.join(self.devlist))
1334 if is_prepared(self.name):
1335 Module.cleanup(self)
1336 if self.config_only:
1337 panic("Can't clean up config_only LOV ", self.name)
1338 for osc in self.osclist:
1341 def load_module(self):
1342 if self.config_only:
1343 panic("Can't load modules for config_only LOV ", self.name)
1344 for osc in self.osclist:
1347 Module.load_module(self)
1349 def cleanup_module(self):
1350 if self.config_only:
1351 panic("Can't cleanup modules for config_only LOV ", self.name)
1352 Module.cleanup_module(self)
1353 for osc in self.osclist:
1354 osc.cleanup_module()
# MDSDEV: the metadata server device. __init__ reads device/fs parameters,
# adopts the MDS target's name and uuid, determines active/failover status,
# and -- when inodesize is not configured -- derives it from the default
# stripe count of the MDS's LOV (larger stripe counts need larger inodes to
# hold the striping EA). Queues mdc/osc/lov/mds plus the fsfilt module for
# the chosen fstype. NOTE(review): several else branches and guard lines
# (active flag assignment, lov_uuid check) were dropped by this listing.
1357 class MDSDEV(Module):
1358 def __init__(self,db):
1359 Module.__init__(self, 'MDSDEV', db)
1360 self.devpath = self.db.get_val('devpath','')
1361 self.size = self.db.get_val_int('devsize', 0)
1362 self.journal_size = self.db.get_val_int('journalsize', 0)
1363 self.fstype = self.db.get_val('fstype', '')
1364 self.nspath = self.db.get_val('nspath', '')
1365 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1366 # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
1367 target_uuid = self.db.get_first_ref('target')
1368 mds = self.db.lookup(target_uuid)
1369 self.name = mds.getName()
1370 self.filesystem_uuids = mds.get_refs('filesystem')
1371 # FIXME: if fstype not set, then determine based on kernel version
1372 self.format = self.db.get_val('autoformat', "no")
1373 if mds.get_val('failover', 0):
1374 self.failover_mds = 'f'
1376 self.failover_mds = 'n'
1377 active_uuid = get_active_target(mds)
1379 panic("No target device found:", target_uuid)
1380 if active_uuid == self.uuid:
1384 if self.active and config.group and config.group != mds.get_val('group'):
1387 self.inode_size = self.db.get_val_int('inodesize', 0)
1388 if self.inode_size == 0:
1389 # find the LOV for this MDS
1390 lovconfig_uuid = mds.get_first_ref('lovconfig')
1391 if not lovconfig_uuid:
1392 panic("No LOV config found for MDS ", mds.name)
1393 lovconfig = mds.lookup(lovconfig_uuid)
1394 lov_uuid = lovconfig.get_first_ref('lov')
1396 panic("No LOV found for lovconfig ", lovconfig.name)
1397 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
1399 # default stripe count controls default inode_size
1400 stripe_count = lov.stripe_cnt
1401 if stripe_count > 77:
1402 self.inode_size = 4096
1403 elif stripe_count > 35:
1404 self.inode_size = 2048
1405 elif stripe_count > 13:
1406 self.inode_size = 1024
1407 elif stripe_count > 3:
1408 self.inode_size = 512
1410 self.inode_size = 256
1412 self.target_dev_uuid = self.uuid
1413 self.uuid = target_uuid
1415 self.add_lustre_module('mdc', 'mdc')
1416 self.add_lustre_module('osc', 'osc')
1417 self.add_lustre_module('lov', 'lov')
1418 self.add_lustre_module('mds', 'mds')
1420 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
# load_module()/prepare(): modules are only loaded on the active node;
# prepare() sets up the backing device (never reformatting here -- reformat
# happens in write_conf), ensures the singleton MDT device exists, then
# creates the mds device; a CommandError here is taken to mean the config
# log is missing and the user must run 'lconf --write_conf'.
# NOTE(review): the prepare() def line, active guards and the try line were
# dropped by this listing.
1422 def load_module(self):
1424 Module.load_module(self)
1427 if is_prepared(self.name):
1430 debug(self.uuid, "not active")
1433 # run write_conf automatically, if --reformat used
1435 self.info(self.devpath, self.fstype, self.size, self.format)
1437 # never reformat here
1438 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1439 self.format, self.journal_size, self.inode_size,
1441 if not is_prepared('MDT'):
1442 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1444 lctl.newdev("mds", self.name, self.uuid,
1445 setup ="%s %s %s" %(blkdev, self.fstype, self.name))
1446 except CommandError, e:
1448 panic("MDS is missing the config log. Need to run " +
1449 "lconf --write_conf.")
# Record client configuration logs on the MDS device so clients can mount
# using the on-disk config.  NOTE(review): sampled listing — original lines
# 1455, 1462, 1470-1471, 1474, 1476-1478, 1481, 1483-1486, etc. are missing
# (early returns, 'try:' lines, lctl.end_record calls); incomplete as shown.
1453 def write_conf(self):
1454 if is_prepared(self.name):
1456 self.info(self.devpath, self.fstype, self.format)
1457 blkdev = block_dev(self.devpath, self.size, self.fstype,
1458 config.reformat, self.format, self.journal_size,
1459 self.inode_size, self.mkfsoptions)
1460 lctl.newdev("mds", self.name, self.uuid,
1461 setup ="%s %s" %(blkdev, self.fstype))
1463 # record logs for the MDS lov
1464 for uuid in self.filesystem_uuids:
1465 log("recording clients for filesystem:", uuid)
1466 fs = self.db.lookup(uuid)
1467 obd_uuid = fs.get_first_ref('obd')
1468 client_uuid = generate_client_uuid(self.name)
1469 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
1472 lctl.clear_log(self.name, self.name)
1473 lctl.record(self.name, self.name)
1475 lctl.mount_option(self.name, client.get_name(), "")
1479 lctl.clear_log(self.name, self.name + '-clean')
1480 lctl.record(self.name, self.name + '-clean')
1482 lctl.del_mount_option(self.name)
1487 # record logs for each client
1489 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1491 config_options = CONFIG_FILE
1493 for node_db in self.db.lookup_class('node'):
1494 client_name = node_db.getName()
1495 for prof_uuid in node_db.get_refs('profile'):
1496 prof_db = node_db.lookup(prof_uuid)
1497 # refactor this into a function to test "clientness"
1499 for ref_class, ref_uuid in prof_db.get_all_refs():
1500 if ref_class in ('mountpoint','echoclient'):
1501 debug("recording", client_name)
1502 old_noexec = config.noexec
# re-invoke lconf itself (sys.argv[0]) in --record mode to generate
# the per-client setup and cleanup logs on this MDS device
1504 noexec_opt = ('', '-n')
1505 ret, out = run (sys.argv[0],
1506 noexec_opt[old_noexec == 1],
1507 " -v --record --nomod",
1508 "--record_log", client_name,
1509 "--record_device", self.name,
1510 "--node", client_name,
1513 for s in out: log("record> ", string.strip(s))
1514 ret, out = run (sys.argv[0],
1515 noexec_opt[old_noexec == 1],
1516 "--cleanup -v --record --nomod",
1517 "--record_log", client_name + "-clean",
1518 "--record_device", self.name,
1519 "--node", client_name,
1522 for s in out: log("record> ", string.strip(s))
1523 config.noexec = old_noexec
# tear the MDS device back down after recording (tail of write_conf)
1525 lctl.cleanup(self.name, self.uuid, 0, 0)
1526 except CommandError, e:
1527 log(self.module_name, "cleanup failed: ", self.name)
1530 Module.cleanup(self)
1531 clean_loop(self.devpath)
# NOTE(review): sampled listing — the 'for s in out:' line (1535) and the
# return statements (1537-1538, 1540) are missing.  Presumably scans the
# lctl device list for any remaining 'mds' devices — confirm against upstream.
1533 def msd_remaining(self):
1534 out = lctl.device_list()
1536 if string.split(s)[2] in ('mds',):
1539 def safe_to_clean(self):
def safe_to_clean_modules(self):
    """Module unload is only safe once no mds devices remain configured."""
    remaining = self.msd_remaining()
    return not remaining
# NOTE(review): fragment of MDS cleanup — the 'def cleanup(self):' header
# (original 1545-1546) and several 'try:'/guard lines are missing from this
# sampled listing; restore from the full lconf source before editing.
1547 debug(self.uuid, "not active")
1550 if is_prepared(self.name):
1552 lctl.cleanup(self.name, self.uuid, config.force,
1554 except CommandError, e:
1555 log(self.module_name, "cleanup failed: ", self.name)
1558 Module.cleanup(self)
# when the last mds is gone, also clean the shared MDT device
1559 if not self.msd_remaining() and is_prepared('MDT'):
1561 lctl.cleanup("MDT", "MDT_UUID", config.force,
1563 except CommandError, e:
1564 print "cleanup failed: ", self.name
1567 clean_loop(self.devpath)
# OSD (object storage device) module: reads device/format parameters from the
# config db and resolves the active target.  NOTE(review): sampled listing —
# 'else:'/'if not active_uuid:' guards (1586, 1588, 1590, 1593-1595, 1597-1598)
# are missing; incomplete as shown.
1570 def __init__(self, db):
1571 Module.__init__(self, 'OSD', db)
1572 self.osdtype = self.db.get_val('osdtype')
1573 self.devpath = self.db.get_val('devpath', '')
1574 self.size = self.db.get_val_int('devsize', 0)
1575 self.journal_size = self.db.get_val_int('journalsize', 0)
1576 self.inode_size = self.db.get_val_int('inodesize', 0)
1577 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1578 self.fstype = self.db.get_val('fstype', '')
1579 self.nspath = self.db.get_val('nspath', '')
1580 target_uuid = self.db.get_first_ref('target')
1581 ost = self.db.lookup(target_uuid)
1582 self.name = ost.getName()
1583 self.format = self.db.get_val('autoformat', 'yes')
# single-char failover flag passed through to the obdfilter setup
1584 if ost.get_val('failover', 0):
1585 self.failover_ost = 'f'
1587 self.failover_ost = 'n'
1589 active_uuid = get_active_target(ost)
1591 panic("No target device found:", target_uuid)
1592 if active_uuid == self.uuid:
1596 if self.active and config.group and config.group != ost.get_val('group'):
# swap uuids so the device is addressed by its target uuid
1599 self.target_dev_uuid = self.uuid
1600 self.uuid = target_uuid
1602 self.add_lustre_module('ost', 'ost')
1603 # FIXME: should we default to ext3 here?
1605 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1606 self.add_lustre_module(self.osdtype, self.osdtype)
# NOTE(review): sampled listing — the guard after load_module (1609,
# presumably 'if self.active:'), the 'def prepare(self):' header, and the
# return lines of osd_remaining/safe_to_clean are missing.
1608 def load_module(self):
1610 Module.load_module(self)
# -- below here is the body of OSD.prepare (its def line is missing) --
1612 # need to check /proc/mounts and /etc/mtab before
1613 # formatting anything.
1614 # FIXME: check if device is already formatted.
1616 if is_prepared(self.name):
1619 debug(self.uuid, "not active")
1621 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1622 self.format, self.journal_size, self.inode_size)
# obdecho needs no backing block device; everything else gets one
1624 if self.osdtype == 'obdecho':
1627 blkdev = block_dev(self.devpath, self.size, self.fstype,
1628 config.reformat, self.format, self.journal_size,
1629 self.inode_size, self.mkfsoptions)
1630 lctl.newdev(self.osdtype, self.name, self.uuid,
1631 setup ="%s %s %s" %(blkdev, self.fstype,
1633 if not is_prepared('OSS'):
1634 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
1636 def osd_remaining(self):
1637 out = lctl.device_list()
1639 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1642 def safe_to_clean(self):
def safe_to_clean_modules(self):
    """Module unload is only safe once no osd devices remain configured."""
    remaining = self.osd_remaining()
    return not remaining
# NOTE(review): fragment of OSD cleanup — its 'def cleanup(self):' header and
# 'try:'/guard lines are missing from this sampled listing.
1650 debug(self.uuid, "not active")
1652 if is_prepared(self.name):
1655 lctl.cleanup(self.name, self.uuid, config.force,
1657 except CommandError, e:
1658 log(self.module_name, "cleanup failed: ", self.name)
# when the last osd is gone, also clean the shared OSS device
1661 if not self.osd_remaining() and is_prepared('OSS'):
1663 lctl.cleanup("OSS", "OSS_UUID", config.force,
1665 except CommandError, e:
1666 print "cleanup failed: ", self.name
# obdecho has no loop device to release
1669 if not self.osdtype == 'obdecho':
1670 clean_loop(self.devpath)
# Resolve mountpoint name -> filesystem -> management service uuid.
# NOTE(review): sampled listing — original lines 1673-1674 and 1678-1679
# (guards/early returns for missing mountpoint or filesystem) are absent.
1672 def mgmt_uuid_for_fs(mtpt_name):
1675 mtpt_db = toplevel.lookup_name(mtpt_name)
1676 fs_uuid = mtpt_db.get_first_ref('filesystem')
1677 fs = toplevel.lookup(fs_uuid)
1680 return fs.get_first_ref('mgmt')
1682 # Generic client module, used by OSC and MDC
# NOTE(review): sampled listing — many interior lines of this class are
# missing (continuation of the __init__ signature at 1685, 'else:'/'try:'
# guards, the 'def cleanup' header, returns).  Incomplete as shown.
1683 class Client(Module):
1684 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
1686 self.target_name = tgtdb.getName()
1687 self.target_uuid = tgtdb.getUUID()
1690 self.tgt_dev_uuid = get_active_target(tgtdb)
1691 if not self.tgt_dev_uuid:
1692 panic("No target device found for target:", self.target_name)
1694 self.kmod = kmod(config.lustre, config.portals)
1698 self.module = module
1699 self.module_name = string.upper(module)
# default device name encodes module, host, target and fs; overridable
# via the self_name keyword argument
1701 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
1702 self.target_name, fs_name)
1704 self.name = self_name
1706 self.lookup_server(self.tgt_dev_uuid)
1707 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
1709 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
1712 self.fs_name = fs_name
1715 self.add_lustre_module(module_dir, module)
1717 def lookup_server(self, srv_uuid):
1718 """ Lookup a server's network information """
1719 self._server_nets = get_ost_net(self.db, srv_uuid)
1720 if len(self._server_nets) == 0:
1721 panic ("Unable to find a server for:", srv_uuid)
1723 def get_servers(self):
1724 return self._server_nets
1726 def prepare(self, ignore_connect_failure = 0):
1727 self.info(self.target_uuid)
1728 if is_prepared(self.name):
1731 srv = choose_local_server(self.get_servers())
# no local server: reach the target through routed connections
1735 routes = find_route(self.get_servers())
1736 if len(routes) == 0:
1737 panic ("no route to", self.target_uuid)
1738 for (srv, r) in routes:
1739 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
1740 except CommandError, e:
1741 if not ignore_connect_failure:
# targets listed via --inactive are set up in 'inactive' state
1744 if self.target_uuid in config.inactive and self.permits_inactive():
1745 debug("%s inactive" % self.target_uuid)
1746 inactive_p = "inactive"
1748 debug("%s active" % self.target_uuid)
1750 lctl.newdev(self.module, self.name, self.uuid,
1751 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
1752 inactive_p, self.mgmt_name))
# -- below here is the body of Client.cleanup (its def line is missing) --
1755 if is_prepared(self.name):
1756 Module.cleanup(self)
1758 srv = choose_local_server(self.get_servers())
1760 lctl.disconnect(srv)
1762 for (srv, r) in find_route(self.get_servers()):
1763 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
1764 except CommandError, e:
1765 log(self.module_name, "cleanup failed: ", self.name)
def __init__(self, db, uuid, fs_name):
    """Metadata client device attached to the MDS described by *db*."""
    module = 'mdc'
    Client.__init__(self, db, uuid, module, fs_name)
# NOTE(review): the return line (original 1775) is missing from this sampled
# listing — MDC's permits_inactive policy is not visible here; confirm upstream.
1774 def permits_inactive(self):
def __init__(self, db, uuid, fs_name):
    """Object storage client device attached to the OST described by *db*."""
    module = 'osc'
    Client.__init__(self, db, uuid, module, fs_name)
# NOTE(review): the return line (original 1782) is missing from this sampled
# listing — OSC's permits_inactive policy is not visible here; confirm upstream.
1781 def permits_inactive(self):
def mgmtcli_name_for_uuid(uuid):
    """Build the device name used for a management client instance.

    The name is the fixed prefix 'MGMTCLI_' followed by the uuid.
    """
    prefix = 'MGMTCLI_'
    return prefix + '%s' % (uuid,)
class ManagementClient(Client):
    """Client device for the management service ('mgmt_cli' module)."""
    def __init__(self, db, uuid):
        # the device name is derived from the service uuid, not the host
        name = mgmtcli_name_for_uuid(db.getUUID())
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = name, module_dir = 'mgmt')
def __init__(self, db):
    """Caching OBD: composes a real OBD with a cache OBD."""
    Module.__init__(self, 'COBD', db)
    # uuids of the two devices this cobd stacks together
    self.cache_uuid = self.db.get_first_ref('cacheobd')
    self.real_uuid = self.db.get_first_ref('realobd')
    self.add_lustre_module('cobd', 'cobd')
# -- body of COBD.prepare (its def line, original 1799, is missing) --
1800 # need to check /proc/mounts and /etc/mtab before
1801 # formatting anything.
1802 # FIXME: check if device is already formatted.
1804 if is_prepared(self.name):
1806 self.info(self.real_uuid, self.cache_uuid)
1807 lctl.newdev("cobd", self.name, self.uuid,
1808 setup ="%s %s" %(self.real_uuid, self.cache_uuid))
1811 # virtual interface for OSC and LOV
# NOTE(review): the 'class VOSC(Module):' header (1812) and several
# accessor def lines (1819, 1821) are missing from this sampled listing.
1813 def __init__(self, db, uuid, fs_name, name_override = None):
1814 Module.__init__(self, 'VOSC', db)
# wrap either a LOV (striped) or a plain OSC, depending on the db class
1815 if db.get_class() == 'lov':
1816 self.osc = LOV(db, uuid, fs_name, name_override)
1818 self.osc = get_osc(db, uuid, fs_name)
1820 return self.osc.uuid
1822 return self.osc.name
def load_module(self):
    """Delegate module loading to the wrapped client (LOV or OSC)."""
    self.osc.load_module()
def cleanup_module(self):
    """Delegate module unloading to the wrapped client (LOV or OSC)."""
    self.osc.cleanup_module()
# Echo client: test device that exercises an obd through the echo_client driver.
1833 class ECHO_CLIENT(Module):
def __init__(self, db):
    """Echo client device; wraps the referenced obd in a VOSC."""
    Module.__init__(self, 'ECHO_CLIENT', db)
    self.add_lustre_module('obdecho', 'obdecho')
    self.obd_uuid = self.db.get_first_ref('obd')
    # fresh client uuid plus a virtual OSC over the referenced obd
    self.uuid = generate_client_uuid(self.name)
    target = self.db.lookup(self.obd_uuid)
    self.osc = VOSC(target, self.uuid, self.name)
# -- body of ECHO_CLIENT.prepare (its def line, original 1842, is missing) --
1843 if is_prepared(self.name):
1846 self.osc.prepare() # XXX This is so cheating. -p
1847 self.info(self.obd_uuid)
1849 lctl.newdev("echo_client", self.name, self.uuid,
1850 setup = self.osc.get_name())
# -- body of ECHO_CLIENT.cleanup (its def line is missing) --
1853 if is_prepared(self.name):
1854 Module.cleanup(self)
def load_module(self):
    """Load the wrapped osc's modules first, then our own."""
    self.osc.load_module()
    Module.load_module(self)
def cleanup_module(self):
    """Unload in the reverse order of load_module: ourselves, then the osc."""
    Module.cleanup_module(self)
    self.osc.cleanup_module()
# Build a pseudo-unique client uuid from three random 20-bit values and the
# (truncated) name, clipped to the 36-character uuid limit.
# NOTE(review): original line 1868 (presumably the 'name' argument of the
# format) is missing from this sampled listing.
1866 def generate_client_uuid(name):
1867 client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
1869 int(random.random() * 1048576),
1870 int(random.random() * 1048576))
1871 return client_uuid[:36]
# Client mountpoint: mounts a lustre_lite filesystem built from a VOSC (data)
# and an MDC (metadata).  NOTE(review): sampled listing — guard lines
# (1887, 1890, 1892-1896, 1899-1901, 1903-1904, 1906, 1910, 1914-1917, ...)
# are missing; incomplete as shown.
1874 class Mountpoint(Module):
1875 def __init__(self,db):
1876 Module.__init__(self, 'MTPT', db)
1877 self.path = self.db.get_val('path')
1878 self.fs_uuid = self.db.get_first_ref('filesystem')
1879 fs = self.db.lookup(self.fs_uuid)
1880 self.mds_uuid = fs.get_first_ref('mds')
1881 self.obd_uuid = fs.get_first_ref('obd')
1882 self.mgmt_uuid = fs.get_first_ref('mgmt')
1883 obd = self.db.lookup(self.obd_uuid)
1884 client_uuid = generate_client_uuid(self.name)
1885 self.vosc = VOSC(obd, client_uuid, self.name)
1886 self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
1888 self.add_lustre_module('mdc', 'mdc')
1889 self.add_lustre_module('llite', 'llite')
# management client is optional — presumably guarded by 'if self.mgmt_uuid:'
# on the missing line 1890; confirm against upstream
1891 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
# -- body of Mountpoint.prepare (its def line is missing) --
1897 if fs_is_mounted(self.path):
1898 log(self.path, "already mounted.")
1902 self.mgmtcli.prepare()
1905 mdc_name = self.mdc.name
1907 self.info(self.path, self.mds_uuid, self.obd_uuid)
# when recording or dumping, emit the mount option instead of mounting
1908 if config.record or config.lctl_dump:
1909 lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
1911 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
1912 (self.vosc.get_name(), mdc_name, config.config, self.path)
1913 run("mkdir", self.path)
1918 panic("mount failed:", self.path, ":", string.join(val))
# -- body of Mountpoint.cleanup (its def line is missing) --
1921 self.info(self.path, self.mds_uuid,self.obd_uuid)
1923 if config.record or config.lctl_dump:
1924 lctl.del_mount_option(local_node_name)
1926 if fs_is_mounted(self.path):
1928 (rc, out) = run("umount", "-f", self.path)
1930 (rc, out) = run("umount", self.path)
1932 raise CommandError('umount', out, rc)
1934 if fs_is_mounted(self.path):
1935 panic("fs is still mounted:", self.path)
1940 self.mgmtcli.cleanup()
1942 def load_module(self):
1944 self.mgmtcli.load_module()
1945 self.vosc.load_module()
1946 Module.load_module(self)
1948 def cleanup_module(self):
1949 Module.cleanup_module(self)
1950 self.vosc.cleanup_module()
1952 self.mgmtcli.cleanup_module()
1955 # ============================================================
1956 # misc query functions
# Collect the Network objects for the node hosting the given osd.
# NOTE(review): sampled listing — the srv_list initialization, guards and
# return statement (1959-1961, 1965, 1971-1972) are missing.
1958 def get_ost_net(self, osd_uuid):
1962 osd = self.lookup(osd_uuid)
1963 node_uuid = osd.get_first_ref('node')
1964 node = self.lookup(node_uuid)
1966 panic("unable to find node for osd_uuid:", osd_uuid,
1967 " node_ref:", node_uuid)
1968 for net_uuid in node.get_networks():
1969 db = node.lookup(net_uuid)
1970 srv_list.append(Network(db))
1974 # the order of initialization is based on level.
# Map a service class to its startup level.  NOTE(review): sampled listing —
# every 'ret = N' assignment (1977, 1979, 1981, ...) and the final returns
# are missing, so the actual level numbers are not visible here.
1975 def getServiceLevel(self):
1976 type = self.get_class()
1978 if type in ('network',):
1980 elif type in ('routetbl',):
1982 elif type in ('ldlm',):
1984 elif type in ('mgmt',):
1986 elif type in ('osd', 'cobd'):
1988 elif type in ('mdsdev',):
1990 elif type in ('mountpoint', 'echoclient'):
1993 panic("Unknown type: ", type)
# services outside the configured [minlevel, maxlevel] window are skipped
1995 if ret < config.minlevel or ret > config.maxlevel:
2000 # return list of services in a profile. list is a list of tuples
2001 # [(level, db_object),]
# NOTE(review): list init, guards and sort/return (2003, 2006, 2008, 2010,
# 2012-2016) are missing from this sampled listing.
2002 def getServices(self):
2004 for ref_class, ref_uuid in self.get_all_refs():
2005 servdb = self.lookup(ref_uuid)
2007 level = getServiceLevel(servdb)
2009 list.append((level, servdb))
2011 panic('service not found: ' + ref_uuid)
2017 ############################################################
2019 # FIXME: clean this mess up!
2021 # OSC is no longer in the xml, so we have to fake it.
2022 # this is getting ugly and begging for another refactoring
# NOTE(review): the return statements of both factories (2025-2026,
# 2032-2033) and the 'if not mds_db:' guard (2029) are missing here.
2023 def get_osc(ost_db, uuid, fs_name):
2024 osc = OSC(ost_db, uuid, fs_name)
2027 def get_mdc(db, uuid, fs_name, mds_uuid):
2028 mds_db = db.lookup(mds_uuid);
2030 panic("no mds:", mds_uuid)
2031 mdc = MDC(mds_db, uuid, fs_name)
2034 ############################################################
2035 # routing ("rooting")
2037 # list of (nettype, cluster_id, nid)
# Record this node's own network interfaces in the global local_clusters
# list, and register tcp acceptors by port.  NOTE(review): sampled listing —
# lines 2044, 2047, 2052-2053 (srv construction, tcp guard, call tail) missing.
2040 def find_local_clusters(node_db):
2041 global local_clusters
2042 for netuuid in node_db.get_networks():
2043 net = node_db.lookup(netuuid)
2045 debug("add_local", netuuid)
2046 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
2048 if acceptors.has_key(srv.port):
2049 panic("duplicate port:", srv.port)
2050 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
2051 srv.send_mem, srv.recv_mem,
2054 # This node is a gateway.
# NOTE(review): node_is_router's body (2057) is missing from this listing.
2056 def node_is_router():
2059 # If there are any routers found in the config, then this will be true
2060 # and all nodes will load kptlrouter.
def node_needs_router():
    """Whether kptlrouter must be loaded on this node.

    True if the config contains any routers (needs_router) or this node
    itself is a router (is_router).
    """
    if needs_router:
        return needs_router
    return is_router
2065 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2066 # Currently, these local routes are only added to kptlrouter route
2067 # table if they are needed to connect to a specific server.  This
2068 # should be changed so all available routes are loaded, and the
2069 # ptlrouter can make all the decisions.
# NOTE(review): sampled listing — local_routes init (2074), the loop header
# over the node list (2078), 'needs_router = 1' (2080), and break/guard lines
# (2082, 2088-2089) are missing; incomplete as shown.
2072 def find_local_routes(lustre):
2073 """ Scan the lustre config looking for routers .  Build list of
2075 global local_routes, needs_router
2077 list = lustre.lookup_class('node')
2079 if router.get_val_int('router', 0):
2081 for (local_type, local_cluster_id, local_nid) in local_clusters:
2083 for netuuid in router.get_networks():
2084 db = router.lookup(netuuid)
# a router qualifies as our gateway when it shares a nettype and cluster
2085 if (local_type == db.get_val('nettype') and
2086 local_cluster_id == db.get_val('clusterid')):
2087 gw = db.get_val('nid')
2090 debug("find_local_routes: gw is", gw)
2091 for route in router.get_local_routes(local_type, gw):
2092 local_routes.append(route)
2093 debug("find_local_routes:", local_routes)
# NOTE(review): sampled listing — the return statements of every helper in
# this region (2099, 2104-2105, 2111-2112, 2115, 2120, 2127, 2133, 2135,
# 2137-2138, 2141, 2143-2144) are missing; incomplete as shown.
2096 def choose_local_server(srv_list):
2097 for srv in srv_list:
2098 if local_cluster(srv.net_type, srv.cluster_id):
# membership test against this node's own (nettype, cluster_id) pairs
2101 def local_cluster(net_type, cluster_id):
2102 for cluster in local_clusters:
2103 if net_type == cluster[0] and cluster_id == cluster[1]:
2107 def local_interface(net_type, cluster_id, nid):
2108 for cluster in local_clusters:
2109 if (net_type == cluster[0] and cluster_id == cluster[1]
2110 and nid == cluster[2]):
# match each server against local_routes by nid range [r[3], r[4]] and
# target cluster id r[2]
2114 def find_route(srv_list):
2116 frm_type = local_clusters[0][0]
2117 for srv in srv_list:
2118 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
2119 to_type = srv.net_type
2121 cluster_id = srv.cluster_id
2122 debug ('looking for route to', to_type, to)
2123 for r in local_routes:
2124 debug("find_route: ", r)
2125 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
2126 result.append((srv, r))
# resolve the active device for a target, honoring --select overrides
2129 def get_active_target(db):
2130 target_uuid = db.getUUID()
2131 target_name = db.getName()
2132 node_name = get_select(target_name)
2134 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
2136 tgt_dev_uuid = db.get_first_ref('active')
2139 def get_server_by_nid_uuid(db, nid_uuid):
2140 for n in db.lookup_class("network"):
2142 if net.nid_uuid == nid_uuid:
2146 ############################################################
# Factory mapping a config db class to its Module subclass.
# NOTE(review): the 'def newService(db):' header (2149) and every
# constructor/return line between the elif branches are missing from this
# sampled listing; incomplete as shown.
2150 type = db.get_class()
2151 debug('Service:', type, db.getName(), db.getUUID())
2156 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2157 elif type == 'network':
2159 elif type == 'routetbl':
2163 elif type == 'cobd':
2165 elif type == 'mdsdev':
2167 elif type == 'mountpoint':
2169 elif type == 'echoclient':
2171 elif type == 'mgmt':
2174 panic ("unknown service type:", type)
2178 # Prepare the system to run lustre using a particular profile
2179 # in the configuration.
2180 #  * load the modules
2181 #  * setup networking for the current node
2182 #  * make sure partitions are in place and prepared
2183 #  * initialize devices with lctl
2184 # Levels is important, and needs to be enforced.
# NOTE(review): sampled listing — guard/loop/call lines in all of the do*
# helpers below (2188, 2191, 2194-2196, 2199-2200, 2202-2207, 2209-2214,
# 2216-2223, 2225-2232) are missing; each helper is incomplete as shown.
2185 def for_each_profile(db, prof_list, operation):
2186 for prof_uuid in prof_list:
2187 prof_db = db.lookup(prof_uuid)
2189 panic("profile:", profile, "not found.")
2190 services = getServices(prof_db)
2193 def doWriteconf(services):
2197 if s[1].get_class() == 'mdsdev':
2198 n = newService(s[1])
2201 def doSetup(services):
2205 n = newService(s[1])
2208 def doModules(services):
2212 n = newService(s[1])
2215 def doCleanup(services):
2220 n = newService(s[1])
2221 if n.safe_to_clean():
2224 def doUnloadModules(services):
2229 n = newService(s[1])
2230 if n.safe_to_clean_modules():
# Top-level per-host driver: finds this host's node entry, then dispatches to
# write_conf / recover / cleanup / setup flows.  NOTE(review): sampled
# listing — loop/guard/else lines (2237-2238, 2240-2242, 2244, 2252, 2254,
# 2260, 2263, 2266, 2272, 2274, 2276, 2280-2281, 2287, 2290-2291, 2297-2299,
# 2302, 2304, 2310, 2313-2314, 2318) are missing; incomplete as shown.
2235 def doHost(lustreDB, hosts):
2236 global is_router, local_node_name
2239 node_db = lustreDB.lookup_name(h, 'node')
2243 panic('No host entry found.')
2245 local_node_name = node_db.get_val('name', 0)
2246 is_router = node_db.get_val_int('router', 0)
2247 lustre_upcall = node_db.get_val('lustreUpcall', '')
2248 portals_upcall = node_db.get_val('portalsUpcall', '')
2249 timeout = node_db.get_val_int('timeout', 0)
2250 ptldebug = node_db.get_val('ptldebug', '')
2251 subsystem = node_db.get_val('subsystem', '')
2253 find_local_clusters(node_db)
2255 find_local_routes(lustreDB)
2257 # Two step process: (1) load modules, (2) setup lustre
2258 # if not cleaning, load modules first.
2259 prof_list = node_db.get_refs('profile')
2261 if config.write_conf:
2262 for_each_profile(node_db, prof_list, doModules)
2264 for_each_profile(node_db, prof_list, doWriteconf)
2265 for_each_profile(node_db, prof_list, doUnloadModules)
2267 elif config.recover:
2268 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
2269 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
2270 "--client_uuid <UUID> --conn_uuid <UUID>")
2271 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
2273 elif config.cleanup:
2275 # the command line can override this value
2277 # ugly hack, only need to run lctl commands for --dump
2278 if config.lctl_dump or config.record:
2279 for_each_profile(node_db, prof_list, doCleanup)
# set system tunables before tearing devices down
2282 sys_set_timeout(timeout)
2283 sys_set_ptldebug(ptldebug)
2284 sys_set_subsystem(subsystem)
2285 sys_set_lustre_upcall(lustre_upcall)
2286 sys_set_portals_upcall(portals_upcall)
2288 for_each_profile(node_db, prof_list, doCleanup)
2289 for_each_profile(node_db, prof_list, doUnloadModules)
2292 # ugly hack, only need to run lctl commands for --dump
2293 if config.lctl_dump or config.record:
2294 sys_set_timeout(timeout)
2295 sys_set_lustre_upcall(lustre_upcall)
2296 for_each_profile(node_db, prof_list, doSetup)
2300 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2301 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2303 for_each_profile(node_db, prof_list, doModules)
2305 sys_set_debug_path()
2306 sys_set_ptldebug(ptldebug)
2307 sys_set_subsystem(subsystem)
2308 script = config.gdb_script
2309 run(lctl.lctl, ' modules >', script)
2311 log ("The GDB module script is in", script)
2312 # pause, so user has time to break and
2315 sys_set_timeout(timeout)
2316 sys_set_lustre_upcall(lustre_upcall)
2317 sys_set_portals_upcall(portals_upcall)
2319 for_each_profile(node_db, prof_list, doSetup)
# Reconnect a failed client connection to the currently active target.
# NOTE(review): sampled listing — guard lines (2323, 2326, 2328, 2330, 2332,
# 2334, 2336) and the connect attempt around 2340-2343 are missing.
2321 def doRecovery(db, lctl, tgt_uuid, client_uuid, nid_uuid):
2322 tgt = db.lookup(tgt_uuid)
2324 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
2325 new_uuid = get_active_target(tgt)
2327 raise Lustre.LconfError("doRecovery: no active target found for: " +
2329 net = choose_local_server(get_ost_net(db, new_uuid))
2331 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
2333 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
2335 oldnet = get_server_by_nid_uuid(db, nid_uuid)
# best-effort: the old connection may already be gone
2337 lctl.disconnect(oldnet)
2338 except CommandError, e:
2339 log("recover: disconnect", nid_uuid, "failed: ")
2344 except CommandError, e:
2345 log("recover: connect failed")
2348 lctl.recover(client_uuid, net.nid_uuid)
# Derive config.lustre / config.portals module search paths from the lconf
# command location (development mode) or the command-line arguments.
# NOTE(review): sampled listing — lines 2357 (presumably 'if config.portals:'),
# 2363, 2368 and the tail of sysctl (2371-2373, 2375-2380, including the
# noexec guard and file close) are missing.
2351 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2352 base = os.path.dirname(cmd)
2353 if development_mode():
2354 if not config.lustre:
2355 config.lustre = (os.path.join(base, ".."))
2356 # normalize the portals dir, using command line arg if set
2358 portals_dir = config.portals
2359 dir = os.path.join(config.lustre, portals_dir)
2360 config.portals = dir
2361 debug('config.portals', config.portals)
2362 elif config.lustre and config.portals:
2364 # if --lustre and --portals, normalize portals
2365 # can ignore PORTALS_DIR here, since it is probably useless here
2366 config.portals = os.path.join(config.lustre, config.portals)
2367 debug('config.portals B', config.portals)
# write a value into /proc/sys/<path>
2369 def sysctl(path, val):
2370 debug("+ sysctl", path, val)
2374 fp = open(os.path.join('/proc/sys', path), 'w')
def sys_set_debug_path():
    """Point the portals debug dump file at the configured debug path."""
    path = config.debug_path
    sysctl('portals/debug_path', path)
# NOTE(review): sampled listing — the 'elif config.upcall:' lines (2388,
# 2397) and the 'if upcall:' guards (2390, 2399) are missing from both
# functions below; incomplete as shown.
2384 def sys_set_lustre_upcall(upcall):
2385 # the command overrides the value in the node config
2386 if config.lustre_upcall:
2387 upcall = config.lustre_upcall
2389 upcall = config.upcall
2391 lctl.set_lustre_upcall(upcall)
2393 def sys_set_portals_upcall(upcall):
2394 # the command overrides the value in the node config
2395 if config.portals_upcall:
2396 upcall = config.portals_upcall
2398 upcall = config.upcall
2400 sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Set the lustre recovery timeout; --timeout overrides the node config."""
    cmdline = config.timeout
    if cmdline and cmdline > 0:
        timeout = cmdline
    if timeout is not None and timeout > 0:
        lctl.set_timeout(timeout)
def sys_tweak_socknal():
    """Apply socknal tuning requested on the command line."""
    if not config.single_socket:
        return
    # --single_socket: use one socket instead of a typed bundle
    sysctl("socknal/typed", 0)
# Disable elan event-interrupt punt loops wherever the proc file exists.
# NOTE(review): sampled listing — the 'for p in procfiles:' line (2417) is
# missing, as are the guards/try lines and fallthroughs of the three
# functions below (2422, 2424-2425, 2430-2431, 2435-2436, 2441-2442,
# 2445-2451, 2454-2456).
2413 def sys_optimize_elan ():
2414 procfiles = ["/proc/elan/config/eventint_punt_loops",
2415 "/proc/qsnet/elan3/config/eventint_punt_loops",
2416 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
2418 if os.access(p, os.R_OK):
2419 run ("echo 0 > " + p)
# evaluate a symbolic debug-mask expression against ptldebug_names and
# write the resulting hex mask to portals/debug
2421 def sys_set_ptldebug(ptldebug):
2423 ptldebug = config.ptldebug
2426 val = eval(ptldebug, ptldebug_names)
2427 val = "0x%x" % (val)
2428 sysctl('portals/debug', val)
2429 except NameError, e:
2432 def sys_set_subsystem(subsystem):
2433 if config.subsystem:
2434 subsystem = config.subsystem
2437 val = eval(subsystem, subsystem_names)
2438 val = "0x%x" % (val)
2439 sysctl('portals/subsystem_debug', val)
2440 except NameError, e:
# raise a net.core memory limit to at least 'max'
2443 def sys_set_netmem_max(path, max):
2444 debug("setting", path, "to at least", max)
2452 fp = open(path, 'w')
2453 fp.write('%d\n' %(max))
def sys_make_devices():
    """Ensure the portals and obd character device nodes exist."""
    if not os.access('/dev/portals', os.R_OK):
        run('mknod /dev/portals c 10 240')
    if not os.access('/dev/obd', os.R_OK):
        run('mknod /dev/obd c 10 241')
2464 # Add dir to the global PATH, if not already there.
# NOTE(review): sampled listing — early returns and else-branches of the
# helpers below (2468, 2474-2477, 2482-2485, 2490-2491, 2496-2497, 2499,
# 2506-2507) are missing; each helper is incomplete as shown.
2465 def add_to_path(new_dir):
2466 syspath = string.split(os.environ['PATH'], ':')
2467 if new_dir in syspath:
2469 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
# prefer a path under /r when that chroot-style prefix exists
2471 def default_debug_path():
2472 path = '/tmp/lustre-log'
2473 if os.path.isdir('/r'):
2478 def default_gdb_script():
2479 script = '/tmp/ogdb'
2480 if os.path.isdir('/r'):
2481 return '/r' + script
2486 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
2487 # ensure basic elements are in the system path
2488 def sanitise_path():
2489 for dir in DEFAULT_PATH:
2492 # global hack for the --select handling
# parse --select arguments of the form service=node[,service2=node2 ...]
# into the global tgt_select mapping
2494 def init_select(args):
2495 # args = [service=nodeA,service2=nodeB service3=nodeC]
2498 list = string.split(arg, ',')
2500 srv, node = string.split(entry, '=')
2501 tgt_select[srv] = node
2503 def get_select(srv):
2504 if tgt_select.has_key(srv):
2505 return tgt_select[srv]
# Shorthand aliases for the option-kind constants from Lustre.Options,
# used by the command-line option table that follows.
2509 FLAG = Lustre.Options.FLAG
2510 PARAM = Lustre.Options.PARAM
2511 INTPARAM = Lustre.Options.INTPARAM
2512 PARAMLIST = Lustre.Options.PARAMLIST
2514 ('verbose,v', "Print system commands as they are run"),
2515 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
2516 ('config', "Cluster config name used for LDAP query", PARAM),
2517 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
2518 ('node', "Load config for <nodename>", PARAM),
2519 ('cleanup,d', "Cleans up config. (Shutdown)"),
2520 ('force,f', "Forced unmounting and/or obd detach during cleanup",
2522 ('single_socket', "socknal option: only use one socket instead of bundle",
2524 ('failover',"""Used to shut down without saving state.
2525 This will allow this node to "give up" a service to a
2526 another node for failover purposes. This will not
2527 be a clean shutdown.""",
2529 ('gdb', """Prints message after creating gdb module script
2530 and sleeps for 5 seconds."""),
2531 ('noexec,n', """Prints the commands and steps that will be run for a
2532 config without executing them. This can used to check if a
2533 config file is doing what it should be doing"""),
2534 ('nomod', "Skip load/unload module step."),
2535 ('nosetup', "Skip device setup/cleanup step."),
2536 ('reformat', "Reformat all devices (without question)"),
2537 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
2538 ('dump', "Dump the kernel debug log to file before portals is unloaded",
2540 ('write_conf', "Save all the client config information on mds."),
2541 ('record', "Write config information on mds."),
2542 ('record_log', "Name of config record log.", PARAM),
2543 ('record_device', "MDS device name that will record the config commands",
2545 ('minlevel', "Minimum level of services to configure/cleanup",
2547 ('maxlevel', """Maximum level of services to configure/cleanup
2548 Levels are aproximatly like:
2553 70 - mountpoint, echo_client, osc, mdc, lov""",
2555 ('lustre', """Base directory of lustre sources. This parameter will
2556 cause lconf to load modules from a source tree.""", PARAM),
2557 ('portals', """Portals source directory. If this is a relative path,
2558 then it is assumed to be relative to lustre. """, PARAM),
2559 ('timeout', "Set recovery timeout", INTPARAM),
2560 ('upcall', "Set both portals and lustre upcall script", PARAM),
2561 ('lustre_upcall', "Set lustre upcall script", PARAM),
2562 ('portals_upcall', "Set portals upcall script", PARAM),
2563 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
2564 ('ptldebug', "Set the portals debug level", PARAM),
2565 ('subsystem', "Set the portals debug subsystem", PARAM),
2566 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
2567 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
2568 # Client recovery options
2569 ('recover', "Recover a device"),
2570 ('group', "The group of devices to configure or cleanup", PARAM),
2571 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
2572 ('client_uuid', "The failed client (required for recovery)", PARAM),
2573 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
2575 ('inactive', """The name of an inactive service, to be ignored during
2576 mounting (currently OST-only). Can be repeated.""",
2581 global lctl, config, toplevel, CONFIG_FILE
2583 # in the upcall this is set to SIG_IGN
2584 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
2586 cl = Lustre.Options("lconf", "config.xml", lconf_options)
2588 config, args = cl.parse(sys.argv[1:])
2589 except Lustre.OptionError, e:
2593 setupModulePath(sys.argv[0])
2595 host = socket.gethostname()
2597 # the PRNG is normally seeded with time(), which is not so good for starting
2598 # time-synchronized clusters
2599 input = open('/dev/urandom', 'r')
2601 print 'Unable to open /dev/urandom!'
2603 seed = input.read(32)
2609 init_select(config.select)
2612 # allow config to be fetched via HTTP, but only with python2
2613 if sys.version[0] != '1' and args[0].startswith('http://'):
2616 config_file = urllib2.urlopen(args[0])
2617 except (urllib2.URLError, socket.error), err:
2618 if hasattr(err, 'args'):
2620 print "Could not access '%s': %s" %(args[0], err)
2622 elif not os.access(args[0], os.R_OK):
2623 print 'File not found or readable:', args[0]
2627 config_file = open(args[0], 'r')
2629 dom = xml.dom.minidom.parse(config_file)
2631 panic("%s does not appear to be a config file." % (args[0]))
2632 sys.exit(1) # make sure to die here, even in debug mode.
2633 CONFIG_FILE = args[0]
2634 db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
2635 if not config.config:
2636 config.config = os.path.basename(args[0])# use full path?
2637 if config.config[-4:] == '.xml':
2638 config.config = config.config[:-4]
2639 elif config.ldapurl:
2640 if not config.config:
2641 panic("--ldapurl requires --config name")
2642 dn = "config=%s,fs=lustre" % (config.config)
2643 db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
2644 elif config.ptldebug or config.subsystem:
2645 sys_set_ptldebug(None)
2646 sys_set_subsystem(None)
2649 print 'Missing config file or ldap URL.'
2650 print 'see lconf --help for command summary'
2655 ver = db.get_version()
2657 panic("No version found in config data, please recreate.")
2658 if ver != Lustre.CONFIG_VERSION:
2659 panic("Config version", ver, "does not match lconf version",
2660 Lustre.CONFIG_VERSION)
2664 node_list.append(config.node)
2667 node_list.append(host)
2668 node_list.append('localhost')
2670 debug("configuring for host: ", node_list)
2673 config.debug_path = config.debug_path + '-' + host
2674 config.gdb_script = config.gdb_script + '-' + host
2676 lctl = LCTLInterface('lctl')
2678 if config.lctl_dump:
2679 lctl.use_save_file(config.lctl_dump)
2682 if not (config.record_device and config.record_log):
2683 panic("When recording, both --record_log and --record_device must be specified.")
2684 lctl.clear_log(config.record_device, config.record_log)
2685 lctl.record(config.record_device, config.record_log)
2687 doHost(db, node_list)
2692 if __name__ == "__main__":
2695 except Lustre.LconfError, e:
2697 # traceback.print_exc(file=sys.stdout)
2699 except CommandError, e:
2703 if first_cleanup_error:
2704 sys.exit(first_cleanup_error)