3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile.am", os.R_OK):
46 if not development_mode():
47 sys.path.append(PYMOD_DIR)
53 DEFAULT_TCPBUF = 8388608
56 # Maximum number of devices to search for.
57 # (the /dev/loop* nodes need to be created beforehand)
58 MAX_LOOP_DEVICES = 256
59 PORTALS_DIR = 'portals'
61 # Needed to call lconf --record
64 # Please keep these in sync with the values in portals/kp30.h
76 "warning" : (1 << 10),
80 "portals" : (1 << 14),
82 "dlmtrace" : (1 << 16),
86 "rpctrace" : (1 << 20),
87 "vfstrace" : (1 << 21),
92 "undefined" : (1 << 0),
102 "portals" : (1 << 10),
103 "socknal" : (1 << 11),
104 "qswnal" : (1 << 12),
105 "pinger" : (1 << 13),
106 "filter" : (1 << 14),
112 "ptlrouter" : (1 << 20),
first_cleanup_error = 0

def cleanup_error(rc):
    """Remember the first non-zero cleanup status seen.

    Later errors are deliberately ignored so that the overall exit code
    reflects the first thing that went wrong during cleanup.
    """
    global first_cleanup_error
    if first_cleanup_error:
        return
    first_cleanup_error = rc
124 # ============================================================
125 # debugging and error funcs
127 def fixme(msg = "this feature"):
128 raise Lustre.LconfError, msg + ' not implmemented yet.'
131 msg = string.join(map(str,args))
132 if not config.noexec:
133 raise Lustre.LconfError(msg)
138 msg = string.join(map(str,args))
143 print string.strip(s)
147 msg = string.join(map(str,args))
150 # ack, python's builtin int() does not support '0x123' syntax.
151 # eval can do it, although what a hack!
155 return eval(s, {}, {})
158 except SyntaxError, e:
159 raise ValueError("not a number")
161 raise ValueError("not a number")
163 # ============================================================
164 # locally defined exceptions
165 class CommandError (exceptions.Exception):
166 def __init__(self, cmd_name, cmd_err, rc=None):
167 self.cmd_name = cmd_name
168 self.cmd_err = cmd_err
173 if type(self.cmd_err) == types.StringType:
175 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
177 print "! %s: %s" % (self.cmd_name, self.cmd_err)
178 elif type(self.cmd_err) == types.ListType:
180 print "! %s (error %d):" % (self.cmd_name, self.rc)
182 print "! %s:" % (self.cmd_name)
183 for s in self.cmd_err:
184 print "> %s" %(string.strip(s))
189 # ============================================================
190 # handle daemons, like the acceptor
192 """ Manage starting and stopping a daemon. Assumes daemon manages
193 it's own pid file. """
195 def __init__(self, cmd):
201 log(self.command, "already running.")
203 self.path = find_prog(self.command)
205 panic(self.command, "not found.")
206 ret, out = runcmd(self.path +' '+ self.command_line())
208 raise CommandError(self.path, out, ret)
212 pid = self.read_pidfile()
214 log ("killing process", pid)
216 #time.sleep(1) # let daemon die
218 log("unable to kill", self.command, e)
220 log("unable to kill", self.command)
223 pid = self.read_pidfile()
233 def read_pidfile(self):
235 fp = open(self.pidfile(), 'r')
242 def clean_pidfile(self):
243 """ Remove a stale pidfile """
244 log("removing stale pidfile:", self.pidfile())
246 os.unlink(self.pidfile())
248 log(self.pidfile(), e)
250 class AcceptorHandler(DaemonHandler):
251 def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
252 DaemonHandler.__init__(self, "acceptor")
255 self.send_mem = send_mem
256 self.recv_mem = recv_mem
259 self.flags = self.flags + ' -i'
262 return "/var/run/%s-%d.pid" % (self.command, self.port)
264 def command_line(self):
265 return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
269 # start the acceptors
271 if config.lctl_dump or config.record:
273 for port in acceptors.keys():
274 daemon = acceptors[port]
275 if not daemon.running():
278 def run_one_acceptor(port):
279 if config.lctl_dump or config.record:
281 if acceptors.has_key(port):
282 daemon = acceptors[port]
283 if not daemon.running():
286 panic("run_one_acceptor: No acceptor defined for port:", port)
288 def stop_acceptor(port):
289 if acceptors.has_key(port):
290 daemon = acceptors[port]
295 # ============================================================
296 # handle lctl interface
299 Manage communication with lctl
302 def __init__(self, cmd):
304 Initialize close by finding the lctl binary.
306 self.lctl = find_prog(cmd)
308 self.record_device = ''
311 debug('! lctl not found')
314 raise CommandError('lctl', "unable to find lctl binary.")
    def use_save_file(self, file):
        # Dump-file path; when set, run() prepends a "dump <save_file>"
        # directive so that generated lctl commands are captured in this file.
        self.save_file = file
319 def record(self, dev_name, logname):
320 log("Recording log", logname, "on", dev_name)
321 self.record_device = dev_name
322 self.record_log = logname
324 def end_record(self):
325 log("End recording log", self.record_log, "on", self.record_device)
326 self.record_device = None
327 self.record_log = None
329 def set_nonblock(self, fd):
330 fl = fcntl.fcntl(fd, F_GETFL)
331 fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
336 the cmds are written to stdin of lctl
337 lctl doesn't return errors when run in script mode, so
339 should modify command line to accept multiple commands, or
340 create complex command line options
344 cmds = '\n dump ' + self.save_file + '\n' + cmds
345 elif self.record_device:
349 %s""" % (self.record_device, self.record_log, cmds)
351 debug("+", cmd_line, cmds)
352 if config.noexec: return (0, [])
354 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
355 child.tochild.write(cmds + "\n")
356 child.tochild.close()
358 # From "Python Cookbook" from O'Reilly
359 outfile = child.fromchild
360 outfd = outfile.fileno()
361 self.set_nonblock(outfd)
362 errfile = child.childerr
363 errfd = errfile.fileno()
364 self.set_nonblock(errfd)
366 outdata = errdata = ''
369 ready = select.select([outfd,errfd],[],[]) # Wait for input
370 if outfd in ready[0]:
371 outchunk = outfile.read()
372 if outchunk == '': outeof = 1
373 outdata = outdata + outchunk
374 if errfd in ready[0]:
375 errchunk = errfile.read()
376 if errchunk == '': erreof = 1
377 errdata = errdata + errchunk
378 if outeof and erreof: break
379 # end of "borrowed" code
382 if os.WIFEXITED(ret):
383 rc = os.WEXITSTATUS(ret)
386 if rc or len(errdata):
387 raise CommandError(self.lctl, errdata, rc)
390 def runcmd(self, *args):
392 run lctl using the command line
394 cmd = string.join(map(str,args))
395 debug("+", self.lctl, cmd)
396 rc, out = run(self.lctl, cmd)
398 raise CommandError(self.lctl, out, rc)
402 def clear_log(self, dev, log):
403 """ clear an existing log """
408 quit """ % (dev, log)
411 def network(self, net, nid):
416 quit """ % (net, nid)
419 # create a new connection
420 def add_uuid(self, net_type, uuid, nid):
421 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
424 def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
426 if net_type in ('tcp',) and not config.lctl_dump:
431 add_autoconn %s %s %d %s
435 nid, hostaddr, port, flags )
438 def connect(self, srv):
439 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
440 if srv.net_type in ('tcp',) and not config.lctl_dump:
444 self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
445 srv.nid, srv.hostaddr, srv.port, flags)
448 def recover(self, dev_name, new_conn):
451 recover %s""" %(dev_name, new_conn)
454 # add a route to a range
455 def add_route(self, net, gw, lo, hi):
463 except CommandError, e:
467 def del_route(self, net, gw, lo, hi):
472 quit """ % (net, gw, lo, hi)
475 # add a route to a host
476 def add_route_host(self, net, uuid, gw, tgt):
477 self.add_uuid(net, uuid, tgt)
485 except CommandError, e:
489 # add a route to a range
490 def del_route_host(self, net, uuid, gw, tgt):
496 quit """ % (net, gw, tgt)
500 def del_autoconn(self, net_type, nid, hostaddr):
501 if net_type in ('tcp',) and not config.lctl_dump:
510 # disconnect one connection
511 def disconnect(self, srv):
512 self.del_uuid(srv.nid_uuid)
513 if srv.net_type in ('tcp',) and not config.lctl_dump:
514 self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
516 def del_uuid(self, uuid):
524 def disconnectAll(self, net):
532 def attach(self, type, name, uuid):
535 quit""" % (type, name, uuid)
538 def setup(self, name, setup = ""):
542 quit""" % (name, setup)
546 # create a new device with lctl
547 def newdev(self, type, name, uuid, setup = ""):
548 self.attach(type, name, uuid);
550 self.setup(name, setup)
551 except CommandError, e:
552 self.cleanup(name, uuid, 0)
557 def cleanup(self, name, uuid, force, failover = 0):
558 if failover: force = 1
564 quit""" % (name, ('', 'force')[force],
565 ('', 'failover')[failover])
569 def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
570 stripe_sz, stripe_off,
574 lov_setup %s %d %d %d %s %s
575 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
580 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
584 lov_setconfig %s %d %d %d %s %s
585 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
589 def dump(self, dump_file):
592 quit""" % (dump_file)
595 # get list of devices
596 def device_list(self):
597 devices = '/proc/fs/lustre/devices'
599 if os.access(devices, os.R_OK):
601 fp = open(devices, 'r')
609 def lustre_version(self):
610 rc, out = self.runcmd('version')
614 def mount_option(self, profile, osc, mdc):
616 mount_option %s %s %s
617 quit""" % (profile, osc, mdc)
620 # delete mount options
621 def del_mount_option(self, profile):
627 def set_timeout(self, timeout):
633 # delete mount options
634 def set_lustre_upcall(self, upcall):
639 # ============================================================
640 # Various system-level functions
641 # (ideally moved to their own module)
643 # Run a command and return the output and status.
644 # stderr is sent to /dev/null, could use popen3 to
645 # save it if necessary
648 if config.noexec: return (0, [])
649 f = os.popen(cmd + ' 2>&1')
659 cmd = string.join(map(str,args))
662 # Run a command in the background.
663 def run_daemon(*args):
664 cmd = string.join(map(str,args))
666 if config.noexec: return 0
667 f = os.popen(cmd + ' 2>&1')
675 # Determine full path to use for an external command
676 # searches dirname(argv[0]) first, then PATH
678 syspath = string.split(os.environ['PATH'], ':')
679 cmdpath = os.path.dirname(sys.argv[0])
680 syspath.insert(0, cmdpath);
682 syspath.insert(0, os.path.join(config.portals, 'utils/'))
684 prog = os.path.join(d,cmd)
685 if os.access(prog, os.X_OK):
689 # Recursively look for file starting at base dir
690 def do_find_file(base, mod):
691 fullname = os.path.join(base, mod)
692 if os.access(fullname, os.R_OK):
694 for d in os.listdir(base):
695 dir = os.path.join(base,d)
696 if os.path.isdir(dir):
697 module = do_find_file(dir, mod)
701 def find_module(src_dir, dev_dir, modname):
702 mod = '%s.o' % (modname)
703 module = src_dir +'/'+ dev_dir +'/'+ mod
705 if os.access(module, os.R_OK):
711 # is the path a block device?
718 return stat.S_ISBLK(s[stat.ST_MODE])
720 # build fs according to type
722 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
728 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
730 # devsize is in 1k, and fs block count is in 4k
731 block_cnt = devsize/4
733 if fstype in ('ext3', 'extN'):
734 # ext3 journal size is in megabytes
737 if not is_block(dev):
738 ret, out = runcmd("ls -l %s" %dev)
739 devsize = int(string.split(out[0])[4]) / 1024
741 ret, out = runcmd("sfdisk -s %s" %dev)
742 devsize = int(out[0])
743 if devsize > 1024 * 1024:
744 jsize = ((devsize / 102400) * 4)
747 if jsize: jopt = "-J size=%d" %(jsize,)
748 if isize: iopt = "-I %d" %(isize,)
749 mkfs = 'mkfs.ext2 -j -b 4096 '
750 if not isblock or config.force:
752 elif fstype == 'reiserfs':
753 # reiserfs journal size is in blocks
754 if jsize: jopt = "--journal_size %d" %(jsize,)
755 mkfs = 'mkreiserfs -ff'
757 panic('unsupported fs type: ', fstype)
759 if config.mkfsoptions != None:
760 mkfs = mkfs + ' ' + config.mkfsoptions
761 if mkfsoptions != None:
762 mkfs = mkfs + ' ' + mkfsoptions
763 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
765 panic("Unable to build fs:", dev, string.join(out))
766 # enable hash tree indexing on fsswe
767 if fstype in ('ext3', 'extN'):
768 htree = 'echo "feature FEATURE_C5" | debugfs -w'
769 (ret, out) = run (htree, dev)
771 panic("Unable to enable htree:", dev)
773 # some systems use /dev/loopN, some /dev/loop/N
777 if not os.access(loop + str(0), os.R_OK):
779 if not os.access(loop + str(0), os.R_OK):
780 panic ("can't access loop devices")
783 # find loop device assigned to thefile
786 for n in xrange(0, MAX_LOOP_DEVICES):
788 if os.access(dev, os.R_OK):
789 (stat, out) = run('losetup', dev)
790 if out and stat == 0:
791 m = re.search(r'\((.*)\)', out[0])
792 if m and file == m.group(1):
798 # create file if necessary and assign the first free loop device
799 def init_loop(file, size, fstype, journal_size, inode_size, mkfsoptions, reformat):
800 dev = find_loop(file)
802 print 'WARNING file:', file, 'already mapped to', dev
804 if reformat or not os.access(file, os.R_OK | os.W_OK):
806 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
807 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
810 panic("Unable to create backing store:", file)
811 mkfs(file, size, fstype, journal_size, inode_size, mkfsoptions, isblock=0)
814 # find next free loop
815 for n in xrange(0, MAX_LOOP_DEVICES):
817 if os.access(dev, os.R_OK):
818 (stat, out) = run('losetup', dev)
820 run('losetup', dev, file)
823 print "out of loop devices"
825 print "out of loop devices"
828 # undo loop assignment
829 def clean_loop(file):
830 dev = find_loop(file)
832 ret, out = run('losetup -d', dev)
834 log('unable to clean loop device:', dev, 'for file:', file)
837 # determine if dev is formatted as a <fstype> filesystem
838 def need_format(fstype, dev):
839 # FIXME don't know how to implement this
842 # initialize a block device if needed
843 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
844 inode_size, mkfsoptions):
845 if config.noexec: return dev
846 if not is_block(dev):
847 dev = init_loop(dev, size, fstype, journal_size, inode_size,
848 mkfsoptions, reformat)
849 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
850 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
853 # panic("device:", dev,
854 # "not prepared, and autoformat is not set.\n",
855 # "Rerun with --reformat option to format ALL filesystems")
860 """lookup IP address for an interface"""
861 rc, out = run("/sbin/ifconfig", iface)
864 addr = string.split(out[1])[1]
865 ip = string.split(addr, ':')[1]
868 def sys_get_elan_position_file():
869 procfiles = ["/proc/elan/device0/position",
870 "/proc/qsnet/elan4/device0/position",
871 "/proc/qsnet/elan3/device0/position"]
873 if os.access(p, os.R_OK):
877 def sys_get_local_nid(net_type, wildcard, cluster_id):
878 """Return the local nid."""
880 if sys_get_elan_position_file():
881 local = sys_get_local_address('elan', '*', cluster_id)
883 local = sys_get_local_address(net_type, wildcard, cluster_id)
886 def sys_get_local_address(net_type, wildcard, cluster_id):
887 """Return the local address for the network type."""
889 if net_type in ('tcp',):
891 iface, star = string.split(wildcard, ':')
892 local = if2addr(iface)
894 panic ("unable to determine ip for:", wildcard)
896 host = socket.gethostname()
897 local = socket.gethostbyname(host)
898 elif net_type == 'elan':
899 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
900 f = sys_get_elan_position_file()
902 panic ("unable to determine local Elan ID")
905 lines = fp.readlines()
913 nid = my_int(cluster_id) + my_int(elan_id)
915 except ValueError, e:
919 elif net_type == 'gm':
920 fixme("automatic local address for GM")
921 elif net_type == 'scimac':
922 scinode="/opt/scali/sbin/scinode"
923 if os.path.exists(scinode):
924 (rc,local) = run(scinode)
926 panic (scinode, " not found on node with scimac networking")
928 panic (scinode, " failed")
929 local=string.rstrip(local[0])
933 def mod_loaded(modname):
934 """Check if a module is already loaded. Look in /proc/modules for it."""
936 fp = open('/proc/modules')
937 lines = fp.readlines()
939 # please forgive my tired fingers for this one
940 ret = filter(lambda word, mod=modname: word == mod,
941 map(lambda line: string.split(line)[0], lines))
946 # XXX: instead of device_list, ask for $name and see what we get
947 def is_prepared(name):
948 """Return true if a device exists for the name"""
951 if (config.noexec or config.record) and config.cleanup:
954 # expect this format:
955 # 1 UP ldlm ldlm ldlm_UUID 2
956 out = lctl.device_list()
958 if name == string.split(s)[3]:
960 except CommandError, e:
964 def is_network_prepared():
965 """If the any device exists, then assume that all networking
966 has been configured"""
967 out = lctl.device_list()
970 def fs_is_mounted(path):
971 """Return true if path is a mounted lustre filesystem"""
973 fp = open('/proc/mounts')
974 lines = fp.readlines()
978 if a[1] == path and a[2] == 'lustre_lite':
986 """Manage kernel modules"""
987 def __init__(self, lustre_dir, portals_dir):
988 self.lustre_dir = lustre_dir
989 self.portals_dir = portals_dir
990 self.kmodule_list = []
992 def add_portals_module(self, dev_dir, modname):
993 """Append a module to list of modules to load."""
994 self.kmodule_list.append((self.portals_dir, dev_dir, modname))
996 def add_lustre_module(self, dev_dir, modname):
997 """Append a module to list of modules to load."""
998 self.kmodule_list.append((self.lustre_dir, dev_dir, modname))
1000 def load_module(self):
1001 """Load all the modules in the list in the order they appear."""
1002 for src_dir, dev_dir, mod in self.kmodule_list:
1003 if mod_loaded(mod) and not config.noexec:
1005 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
1007 module = find_module(src_dir, dev_dir, mod)
1009 panic('module not found:', mod)
1010 (rc, out) = run('/sbin/insmod', module)
1012 raise CommandError('insmod', out, rc)
1014 (rc, out) = run('/sbin/modprobe', mod)
1016 raise CommandError('modprobe', out, rc)
1018 def cleanup_module(self):
1019 """Unload the modules in the list in reverse order."""
1020 rev = self.kmodule_list
1022 for src_dir, dev_dir, mod in rev:
1023 if not mod_loaded(mod) and not config.noexec:
1026 if mod == 'portals' and config.dump:
1027 lctl.dump(config.dump)
1028 log('unloading module:', mod)
1029 (rc, out) = run('/sbin/rmmod', mod)
1031 log('! unable to unload module:', mod)
1034 # ============================================================
1035 # Classes to prepare and cleanup the various objects
1038 """ Base class for the rest of the modules. The default cleanup method is
1039 defined here, as well as some utilitiy funcs.
1041 def __init__(self, module_name, db):
1043 self.module_name = module_name
1044 self.name = self.db.getName()
1045 self.uuid = self.db.getUUID()
1048 self.kmod = kmod(config.lustre, config.portals)
    def info(self, *args):
        # Announce a configuration step as "<MODULE>: <name> <uuid> <details>".
        # Python 2 string-module join / print statement, matching the file style.
        msg = string.join(map(str,args))
        print self.module_name + ":", self.name, self.uuid, msg
1055 """ default cleanup, used for most modules """
1058 lctl.cleanup(self.name, self.uuid, config.force)
1059 except CommandError, e:
1060 log(self.module_name, "cleanup failed: ", self.name)
    def add_portals_module(self, dev_dir, modname):
        """Queue a portals-tree kernel module (delegates to this module's kmod)."""
        self.kmod.add_portals_module(dev_dir, modname)
    def add_lustre_module(self, dev_dir, modname):
        """Queue a lustre-tree kernel module (delegates to this module's kmod)."""
        self.kmod.add_lustre_module(dev_dir, modname)
    def load_module(self):
        """Load all queued kernel modules, in the order they were added."""
        self.kmod.load_module()
1076 def cleanup_module(self):
1077 """Unload the modules in the list in reverse order."""
1078 if self.safe_to_clean():
1079 self.kmod.cleanup_module()
1081 def safe_to_clean(self):
    def safe_to_clean_modules(self):
        # By default, unloading modules is safe exactly when device cleanup is.
        return self.safe_to_clean()
1087 class Network(Module):
1088 def __init__(self,db):
1089 Module.__init__(self, 'NETWORK', db)
1090 self.net_type = self.db.get_val('nettype')
1091 self.nid = self.db.get_val('nid', '*')
1092 self.cluster_id = self.db.get_val('clusterid', "0")
1093 self.port = self.db.get_val_int('port', 0)
1094 self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
1095 self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
1096 self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
1099 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1101 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1102 self.generic_nid = 1
1103 debug("nid:", self.nid)
1105 self.generic_nid = 0
1107 self.nid_uuid = self.nid_to_uuid(self.nid)
1109 self.hostaddr = self.db.get_val('hostaddr', self.nid)
1110 if '*' in self.hostaddr:
1111 self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
1112 if not self.hostaddr:
1113 panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
1114 debug("hostaddr:", self.hostaddr)
1116 self.add_portals_module("libcfs", 'portals')
1117 if node_needs_router():
1118 self.add_portals_module("router", 'kptlrouter')
1119 if self.net_type == 'tcp':
1120 self.add_portals_module("knals/socknal", 'ksocknal')
1121 if self.net_type == 'elan':
1122 self.add_portals_module("knals/qswnal", 'kqswnal')
1123 if self.net_type == 'gm':
1124 self.add_portals_module("knals/gmnal", 'kgmnal')
1125 if self.net_type == 'scimac':
1126 self.add_portals_module("knals/scimacnal", 'kscimacnal')
1128 def nid_to_uuid(self, nid):
1129 return "NID_%s_UUID" %(nid,)
1132 if is_network_prepared():
1134 self.info(self.net_type, self.nid, self.port)
1135 if not (config.record and self.generic_nid):
1136 lctl.network(self.net_type, self.nid)
1137 if self.net_type == 'tcp':
1139 if self.net_type == 'elan':
1141 if self.port and node_is_router():
1142 run_one_acceptor(self.port)
1143 self.connect_peer_gateways()
1145 def connect_peer_gateways(self):
1146 for router in self.db.lookup_class('node'):
1147 if router.get_val_int('router', 0):
1148 for netuuid in router.get_networks():
1149 net = self.db.lookup(netuuid)
1151 if (gw.cluster_id == self.cluster_id and
1152 gw.net_type == self.net_type):
1153 if gw.nid != self.nid:
1156 def disconnect_peer_gateways(self):
1157 for router in self.db.lookup_class('node'):
1158 if router.get_val_int('router', 0):
1159 for netuuid in router.get_networks():
1160 net = self.db.lookup(netuuid)
1162 if (gw.cluster_id == self.cluster_id and
1163 gw.net_type == self.net_type):
1164 if gw.nid != self.nid:
1167 except CommandError, e:
1168 print "disconnect failed: ", self.name
1172 def safe_to_clean(self):
1173 return not is_network_prepared()
1176 self.info(self.net_type, self.nid, self.port)
1178 stop_acceptor(self.port)
1179 if node_is_router():
1180 self.disconnect_peer_gateways()
1182 class RouteTable(Module):
    def __init__(self,db):
        # Routes come straight from the node's route table in the db;
        # no extra per-instance state is needed.
        Module.__init__(self, 'ROUTES', db)
1186 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1188 # only setup connections for tcp NALs
1190 if not net_type in ('tcp',):
1193 # connect to target if route is to single node and this node is the gw
1194 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1195 if not local_cluster(net_type, tgt_cluster_id):
1196 panic("target", lo, " not on the local cluster")
1197 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1198 # connect to gateway if this node is not the gw
1199 elif (local_cluster(net_type, gw_cluster_id)
1200 and not local_interface(net_type, gw_cluster_id, gw)):
1201 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1206 panic("no server for nid", lo)
1209 return Network(srvdb)
1212 if is_network_prepared():
1215 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1216 lctl.add_route(net_type, gw, lo, hi)
1217 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1221 def safe_to_clean(self):
1222 return not is_network_prepared()
1225 if is_network_prepared():
1226 # the network is still being used, don't clean it up
1228 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1229 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1232 lctl.disconnect(srv)
1233 except CommandError, e:
1234 print "disconnect failed: ", self.name
1239 lctl.del_route(net_type, gw, lo, hi)
1240 except CommandError, e:
1241 print "del_route failed: ", self.name
1245 class Management(Module):
1246 def __init__(self, db):
1247 Module.__init__(self, 'MGMT', db)
1248 self.add_lustre_module('lvfs', 'lvfs')
1249 self.add_lustre_module('obdclass', 'obdclass')
1250 self.add_lustre_module('ptlrpc', 'ptlrpc')
1251 self.add_lustre_module('mgmt', 'mgmt_svc')
1254 if is_prepared(self.name):
1257 lctl.newdev("mgmt", self.name, self.uuid)
1259 def safe_to_clean(self):
1263 if is_prepared(self.name):
1264 Module.cleanup(self)
1266 # This is only needed to load the modules; the LDLM device
1267 # is now created automatically.
    def __init__(self,db):
        # The LDLM device is created automatically by the kernel side; this
        # module exists only so the prerequisite kernel modules get loaded.
        Module.__init__(self, 'LDLM', db)
        self.add_lustre_module('lvfs', 'lvfs')
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')
1282 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1283 Module.__init__(self, 'LOV', db)
1284 if name_override != None:
1285 self.name = "lov_%s" % name_override
1286 self.add_lustre_module('lov', 'lov')
1287 self.mds_uuid = self.db.get_first_ref('mds')
1288 self.stripe_sz = self.db.get_val_int('stripesize', 65536)
1289 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1290 self.pattern = self.db.get_val_int('stripepattern', 0)
1291 self.devlist = self.db.get_refs('obd')
1292 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1294 self.desc_uuid = self.uuid
1295 self.uuid = generate_client_uuid(self.name)
1296 self.fs_name = fs_name
1298 self.config_only = 1
1300 self.config_only = None
1301 mds= self.db.lookup(self.mds_uuid)
1302 self.mds_name = mds.getName()
1303 for obd_uuid in self.devlist:
1304 obd = self.db.lookup(obd_uuid)
1305 osc = get_osc(obd, self.uuid, fs_name)
1307 self.osclist.append(osc)
1309 panic('osc not found:', obd_uuid)
1312 if is_prepared(self.name):
1314 if self.config_only:
1315 panic("Can't prepare config_only LOV ", self.name)
1317 for osc in self.osclist:
1319 # Only ignore connect failures with --force, which
1320 # isn't implemented here yet.
1321 osc.prepare(ignore_connect_failure=0)
1322 except CommandError, e:
1323 print "Error preparing OSC %s\n" % osc.uuid
1325 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1326 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1327 lctl.lov_setup(self.name, self.uuid,
1328 self.desc_uuid, self.mds_name, self.stripe_cnt,
1329 self.stripe_sz, self.stripe_off, self.pattern,
1330 string.join(self.devlist))
1333 if is_prepared(self.name):
1334 Module.cleanup(self)
1335 if self.config_only:
1336 panic("Can't clean up config_only LOV ", self.name)
1337 for osc in self.osclist:
1340 def load_module(self):
1341 if self.config_only:
1342 panic("Can't load modules for config_only LOV ", self.name)
1343 for osc in self.osclist:
1346 Module.load_module(self)
1348 def cleanup_module(self):
1349 if self.config_only:
1350 panic("Can't cleanup modules for config_only LOV ", self.name)
1351 Module.cleanup_module(self)
1352 for osc in self.osclist:
1353 osc.cleanup_module()
1356 class MDSDEV(Module):
1357 def __init__(self,db):
1358 Module.__init__(self, 'MDSDEV', db)
1359 self.devpath = self.db.get_val('devpath','')
1360 self.size = self.db.get_val_int('devsize', 0)
1361 self.journal_size = self.db.get_val_int('journalsize', 0)
1362 self.fstype = self.db.get_val('fstype', '')
1363 self.nspath = self.db.get_val('nspath', '')
1364 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1365 # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
1366 target_uuid = self.db.get_first_ref('target')
1367 mds = self.db.lookup(target_uuid)
1368 self.name = mds.getName()
1369 self.filesystem_uuids = mds.get_refs('filesystem')
1370 # FIXME: if fstype not set, then determine based on kernel version
1371 self.format = self.db.get_val('autoformat', "no")
1372 if mds.get_val('failover', 0):
1373 self.failover_mds = 'f'
1375 self.failover_mds = 'n'
1376 active_uuid = get_active_target(mds)
1378 panic("No target device found:", target_uuid)
1379 if active_uuid == self.uuid:
1383 if self.active and config.group and config.group != mds.get_val('group'):
1386 self.inode_size = self.db.get_val_int('inodesize', 0)
1387 if self.inode_size == 0:
1388 # find the LOV for this MDS
1389 lovconfig_uuid = mds.get_first_ref('lovconfig')
1390 if not lovconfig_uuid:
1391 panic("No LOV config found for MDS ", mds.name)
1392 lovconfig = mds.lookup(lovconfig_uuid)
1393 lov_uuid = lovconfig.get_first_ref('lov')
1395 panic("No LOV found for lovconfig ", lovconfig.name)
1396 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
1398 # default stripe count controls default inode_size
1399 stripe_count = lov.stripe_cnt
1400 if stripe_count > 77:
1401 self.inode_size = 4096
1402 elif stripe_count > 35:
1403 self.inode_size = 2048
1404 elif stripe_count > 13:
1405 self.inode_size = 1024
1406 elif stripe_count > 3:
1407 self.inode_size = 512
1409 self.inode_size = 256
1411 self.target_dev_uuid = self.uuid
1412 self.uuid = target_uuid
1414 self.add_lustre_module('mdc', 'mdc')
1415 self.add_lustre_module('osc', 'osc')
1416 self.add_lustre_module('lov', 'lov')
1417 self.add_lustre_module('mds', 'mds')
1419 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
# NOTE(review): this span is a corrupted source listing -- original file line
# numbers are fused onto each line, indentation is lost, and intermediate
# lines (try:/else:/return and some def lines) are missing. Code kept
# byte-identical; comments only. These appear to be MDS device methods
# (class header is outside this view -- TODO confirm against upstream lconf).
1421 def load_module(self):
# Delegates to the base Module loader; presumably guarded by self.active -- line(s) dropped.
1423 Module.load_module(self)
# prepare(): set up the MDS device (its "def prepare" line was dropped from the listing).
1426 if is_prepared(self.name):
1429 debug(self.uuid, "not active")
1432 # run write_conf automatically, if --reformat used
1434 self.info(self.devpath, self.fstype, self.size, self.format)
1436 # never reformat here
1437 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1438 self.format, self.journal_size, self.inode_size,
# Creates the common MDT device once, then this MDS on top of the block device.
1440 if not is_prepared('MDT'):
1441 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1443 lctl.newdev("mds", self.name, self.uuid,
1444 setup ="%s %s %s" %(blkdev, self.fstype, self.name))
1445 except CommandError, e:
1447 panic("MDS is missing the config log. Need to run " +
1448 "lconf --write_conf.")
# write_conf(): format/attach the MDS and record client config logs on it.
1452 def write_conf(self):
1453 if is_prepared(self.name):
1455 self.info(self.devpath, self.fstype, self.format)
1456 blkdev = block_dev(self.devpath, self.size, self.fstype,
1457 config.reformat, self.format, self.journal_size,
1458 self.inode_size, self.mkfsoptions)
1459 lctl.newdev("mds", self.name, self.uuid,
1460 setup ="%s %s" %(blkdev, self.fstype))
1462 # record logs for the MDS lov
1463 for uuid in self.filesystem_uuids:
1464 log("recording clients for filesystem:", uuid)
1465 fs = self.db.lookup(uuid)
1466 obd_uuid = fs.get_first_ref('obd')
1467 client_uuid = generate_client_uuid(self.name)
1468 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
# Record a setup log and a matching "-clean" teardown log on the MDS device.
1471 lctl.clear_log(self.name, self.name)
1472 lctl.record(self.name, self.name)
1474 lctl.mount_option(self.name, client.get_name(), "")
1478 lctl.clear_log(self.name, self.name + '-clean')
1479 lctl.record(self.name, self.name + '-clean')
1481 lctl.del_mount_option(self.name)
1486 # record logs for each client
1488 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1490 config_options = CONFIG_FILE
1492 for node_db in self.db.lookup_class('node'):
1493 client_name = node_db.getName()
1494 for prof_uuid in node_db.get_refs('profile'):
1495 prof_db = node_db.lookup(prof_uuid)
1496 # refactor this into a function to test "clientness"
1498 for ref_class, ref_uuid in prof_db.get_all_refs():
1499 if ref_class in ('mountpoint','echoclient'):
1500 debug("recording", client_name)
1501 old_noexec = config.noexec
# Re-invokes lconf itself (sys.argv[0]) with --record to generate per-client logs;
# noexec_opt maps config.noexec to the '-n' flag.
1503 noexec_opt = ('', '-n')
1504 ret, out = run (sys.argv[0],
1505 noexec_opt[old_noexec == 1],
1506 " -v --record --nomod",
1507 "--record_log", client_name,
1508 "--record_device", self.name,
1509 "--node", client_name,
1512 for s in out: log("record> ", string.strip(s))
1513 ret, out = run (sys.argv[0],
1514 noexec_opt[old_noexec == 1],
1515 "--cleanup -v --record --nomod",
1516 "--record_log", client_name + "-clean",
1517 "--record_device", self.name,
1518 "--node", client_name,
1521 for s in out: log("record> ", string.strip(s))
1522 config.noexec = old_noexec
# Best-effort teardown of the MDS device used for recording.
1524 lctl.cleanup(self.name, self.uuid, 0, 0)
1525 except CommandError, e:
1526 log(self.module_name, "cleanup failed: ", self.name)
1529 Module.cleanup(self)
1530 clean_loop(self.devpath)
# msd_remaining(): true if any 'mds' device is still listed by lctl.
1532 def msd_remaining(self):
1533 out = lctl.device_list()
1535 if string.split(s)[2] in ('mds',):
1538 def safe_to_clean(self):
1541 def safe_to_clean_modules(self):
1542 return not self.msd_remaining()
# cleanup(): its "def" line was dropped from the listing; tears down this MDS
# and, when it is the last one, the shared MDT device.
1546 debug(self.uuid, "not active")
1549 if is_prepared(self.name):
1551 lctl.cleanup(self.name, self.uuid, config.force,
1553 except CommandError, e:
1554 log(self.module_name, "cleanup failed: ", self.name)
1557 Module.cleanup(self)
1558 if not self.msd_remaining() and is_prepared('MDT'):
1560 lctl.cleanup("MDT", "MDT_UUID", config.force,
1562 except CommandError, e:
1563 print "cleanup failed: ", self.name
1566 clean_loop(self.devpath)
# NOTE(review): corrupted listing (fused line numbers, lost indentation,
# dropped lines). Code kept byte-identical; comments only. OSD device class
# interior; the class header is not visible in this span.
1569 def __init__(self, db):
1570 Module.__init__(self, 'OSD', db)
1571 self.osdtype = self.db.get_val('osdtype')
1572 self.devpath = self.db.get_val('devpath', '')
1573 self.size = self.db.get_val_int('devsize', 0)
1574 self.journal_size = self.db.get_val_int('journalsize', 0)
1575 self.inode_size = self.db.get_val_int('inodesize', 0)
1576 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1577 self.fstype = self.db.get_val('fstype', '')
1578 self.nspath = self.db.get_val('nspath', '')
1579 target_uuid = self.db.get_first_ref('target')
1580 ost = self.db.lookup(target_uuid)
1581 self.name = ost.getName()
1582 self.format = self.db.get_val('autoformat', 'yes')
# failover_ost is passed to the obdfilter setup ('f' or 'n').
1583 if ost.get_val('failover', 0):
1584 self.failover_ost = 'f'
1586 self.failover_ost = 'n'
1588 active_uuid = get_active_target(ost)
1590 panic("No target device found:", target_uuid)
1591 if active_uuid == self.uuid:
# --group filtering: only configure OSTs in the requested group (guard lines dropped).
1595 if self.active and config.group and config.group != ost.get_val('group'):
1598 self.target_dev_uuid = self.uuid
1599 self.uuid = target_uuid
1601 self.add_lustre_module('ost', 'ost')
1602 # FIXME: should we default to ext3 here?
1604 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1605 self.add_lustre_module(self.osdtype, self.osdtype)
1607 def load_module(self):
1609 Module.load_module(self)
# prepare(): its "def" line was dropped; sets up the OSD block device and devices.
1611 # need to check /proc/mounts and /etc/mtab before
1612 # formatting anything.
1613 # FIXME: check if device is already formatted.
1615 if is_prepared(self.name):
1618 debug(self.uuid, "not active")
1620 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1621 self.format, self.journal_size, self.inode_size)
# obdecho needs no backing block device; the else-branch line was dropped.
1623 if self.osdtype == 'obdecho':
1626 blkdev = block_dev(self.devpath, self.size, self.fstype,
1627 config.reformat, self.format, self.journal_size,
1628 self.inode_size, self.mkfsoptions)
1629 lctl.newdev(self.osdtype, self.name, self.uuid,
1630 setup ="%s %s %s" %(blkdev, self.fstype,
1632 if not is_prepared('OSS'):
1633 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
# osd_remaining(): true if any obdfilter/obdecho device is still listed.
1635 def osd_remaining(self):
1636 out = lctl.device_list()
1638 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1641 def safe_to_clean(self):
1644 def safe_to_clean_modules(self):
1645 return not self.osd_remaining()
# cleanup(): "def" line dropped; tears down this OSD and, when last, the shared OSS.
1649 debug(self.uuid, "not active")
1651 if is_prepared(self.name):
1654 lctl.cleanup(self.name, self.uuid, config.force,
1656 except CommandError, e:
1657 log(self.module_name, "cleanup failed: ", self.name)
1660 if not self.osd_remaining() and is_prepared('OSS'):
1662 lctl.cleanup("OSS", "OSS_UUID", config.force,
1664 except CommandError, e:
1665 print "cleanup failed: ", self.name
1668 if not self.osdtype == 'obdecho':
1669 clean_loop(self.devpath)
def mgmt_uuid_for_fs(mtpt_name):
    """Return the mgmt-service uuid for the filesystem behind mountpoint *mtpt_name*.

    Returns None when no mountpoint name or no filesystem entry is found.
    NOTE(review): reconstructed from a corrupted listing (guard/return lines
    were dropped) -- verify against upstream lconf.
    """
    if not mtpt_name:
        return None
    mtpt_db = toplevel.lookup_name(mtpt_name)
    fs_uuid = mtpt_db.get_first_ref('filesystem')
    fs = toplevel.lookup(fs_uuid)
    if not fs:
        return None
    return fs.get_first_ref('mgmt')
1681 # Generic client module, used by OSC and MDC
# NOTE(review): corrupted listing (fused line numbers, lost indentation,
# dropped lines). Code kept byte-identical; comments only.
1682 class Client(Module):
# __init__: resolves the active target device, builds the client name, and
# registers the kernel module. Several lines (Module.__init__ call, else
# branches) were dropped from the listing.
1683 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
1685 self.target_name = tgtdb.getName()
1686 self.target_uuid = tgtdb.getUUID()
1689 self.tgt_dev_uuid = get_active_target(tgtdb)
1690 if not self.tgt_dev_uuid:
1691 panic("No target device found for target:", self.target_name)
1693 self.kmod = kmod(config.lustre, config.portals)
1697 self.module = module
1698 self.module_name = string.upper(module)
# Default client name embeds module, hostname, target and fs; can be overridden.
1700 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
1701 self.target_name, fs_name)
1703 self.name = self_name
1705 self.lookup_server(self.tgt_dev_uuid)
1706 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
1708 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
1711 self.fs_name = fs_name
1714 self.add_lustre_module(module_dir, module)
1716 def lookup_server(self, srv_uuid):
1717 """ Lookup a server's network information """
1718 self._server_nets = get_ost_net(self.db, srv_uuid)
1719 if len(self._server_nets) == 0:
1720 panic ("Unable to find a server for:", srv_uuid)
1722 def get_servers(self):
1723 return self._server_nets
1725 def prepare(self, ignore_connect_failure = 0):
1726 self.info(self.target_uuid)
1727 if is_prepared(self.name):
# Prefer a server on a local cluster; otherwise add routes to reach it.
1730 srv = choose_local_server(self.get_servers())
1734 routes = find_route(self.get_servers())
1735 if len(routes) == 0:
1736 panic ("no route to", self.target_uuid)
1737 for (srv, r) in routes:
1738 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
1739 except CommandError, e:
1740 if not ignore_connect_failure:
# Targets listed in --inactive come up as "inactive" if the subclass permits it.
1743 if self.target_uuid in config.inactive and self.permits_inactive():
1744 debug("%s inactive" % self.target_uuid)
1745 inactive_p = "inactive"
1747 debug("%s active" % self.target_uuid)
1749 lctl.newdev(self.module, self.name, self.uuid,
1750 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
1751 inactive_p, self.mgmt_name))
# cleanup(): "def" line dropped; disconnects and removes routes best-effort.
1754 if is_prepared(self.name):
1755 Module.cleanup(self)
1757 srv = choose_local_server(self.get_servers())
1759 lctl.disconnect(srv)
1761 for (srv, r) in find_route(self.get_servers()):
1762 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
1763 except CommandError, e:
1764 log(self.module_name, "cleanup failed: ", self.name)
# NOTE(review): these are the MDC and OSC subclass bodies; both "class" header
# lines and the permits_inactive() return lines were dropped from the corrupted
# listing (MDC presumably returns 0, OSC 1 -- verify against upstream lconf).
1770 def __init__(self, db, uuid, fs_name):
1771 Client.__init__(self, db, uuid, 'mdc', fs_name)
1773 def permits_inactive(self):
1777 def __init__(self, db, uuid, fs_name):
1778 Client.__init__(self, db, uuid, 'osc', fs_name)
1780 def permits_inactive(self):
def mgmtcli_name_for_uuid(uuid):
    """Return the canonical management-client device name for *uuid*."""
    return 'MGMTCLI_%s' % uuid
class ManagementClient(Client):
    """Client for the management service.

    Uses a fixed name derived from the mgmt db uuid instead of the default
    per-host Client name, and loads its module from the 'mgmt' directory.
    """
    def __init__(self, db, uuid):
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = mgmtcli_name_for_uuid(db.getUUID()),
                        module_dir = 'mgmt')
def __init__(self, db):
    """COBD (caching OBD): pairs a real OBD with a cache OBD.

    Reads the 'realobd' and 'cacheobd' references from the device db and
    registers the cobd kernel module.
    """
    Module.__init__(self, 'COBD', db)
    self.real_uuid = self.db.get_first_ref('realobd')
    self.cache_uuid = self.db.get_first_ref('cacheobd')
    self.add_lustre_module('cobd', 'cobd')
# NOTE(review): prepare() for the COBD device; its "def prepare(self):" line
# was dropped from the corrupted listing. Code kept byte-identical.
1799 # need to check /proc/mounts and /etc/mtab before
1800 # formatting anything.
1801 # FIXME: check if device is already formatted.
1803 if is_prepared(self.name):
1805 self.info(self.real_uuid, self.cache_uuid)
1806 lctl.newdev("cobd", self.name, self.uuid,
1807 setup ="%s %s" %(self.real_uuid, self.cache_uuid))
# virtual interface for OSC and LOV
def __init__(self, db, uuid, fs_name, name_override = None):
    """Wrap either a LOV (striped) or a plain OSC behind one interface.

    NOTE(review): reconstructed from a corrupted listing (the 'else:' line was
    dropped) -- verify against upstream lconf.
    """
    Module.__init__(self, 'VOSC', db)
    if db.get_class() == 'lov':
        self.osc = LOV(db, uuid, fs_name, name_override)
    else:
        self.osc = get_osc(db, uuid, fs_name)
# NOTE(review): VOSC delegation methods. Several "def" lines (get_uuid,
# get_name, prepare, cleanup) were dropped from the corrupted listing; each
# visible body line simply forwards to the wrapped self.osc. Kept byte-identical.
1819 return self.osc.uuid
1821 return self.osc.name
1826 def load_module(self):
1827 self.osc.load_module()
1828 def cleanup_module(self):
1829 self.osc.cleanup_module()
# NOTE(review): corrupted listing (fused line numbers, lost indentation,
# dropped lines). Code kept byte-identical; comments only.
1832 class ECHO_CLIENT(Module):
1833 def __init__(self,db):
1834 Module.__init__(self, 'ECHO_CLIENT', db)
1835 self.add_lustre_module('obdecho', 'obdecho')
1836 self.obd_uuid = self.db.get_first_ref('obd')
1837 obd = self.db.lookup(self.obd_uuid)
1838 self.uuid = generate_client_uuid(self.name)
1839 self.osc = VOSC(obd, self.uuid, self.name)
# prepare(): "def" line dropped; sets up the wrapped OSC then the echo_client device.
1842 if is_prepared(self.name):
1845 self.osc.prepare() # XXX This is so cheating. -p
1846 self.info(self.obd_uuid)
1848 lctl.newdev("echo_client", self.name, self.uuid,
1849 setup = self.osc.get_name())
# cleanup(): "def" line dropped; reverse of prepare.
1852 if is_prepared(self.name):
1853 Module.cleanup(self)
1856 def load_module(self):
1857 self.osc.load_module()
1858 Module.load_module(self)
1860 def cleanup_module(self):
1861 Module.cleanup_module(self)
1862 self.osc.cleanup_module()
def generate_client_uuid(name):
    """Generate a pseudo-random client uuid embedding *name*.

    Format: a 5-hex-digit chunk, the name truncated to 19 chars, then two
    more 5-hex-digit chunks; the result is clamped to the 36-char uuid limit.
    NOTE(review): reconstructed from a corrupted listing (the 'name,' argument
    line was dropped) -- verify against upstream lconf.
    """
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
                                           name,
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    return client_uuid[:36]
# NOTE(review): corrupted listing (fused line numbers, lost indentation,
# dropped lines). Code kept byte-identical; comments only.
1873 class Mountpoint(Module):
1874 def __init__(self,db):
1875 Module.__init__(self, 'MTPT', db)
1876 self.path = self.db.get_val('path')
1877 self.fs_uuid = self.db.get_first_ref('filesystem')
1878 fs = self.db.lookup(self.fs_uuid)
1879 self.mds_uuid = fs.get_first_ref('mds')
1880 self.obd_uuid = fs.get_first_ref('obd')
1881 self.mgmt_uuid = fs.get_first_ref('mgmt')
1882 obd = self.db.lookup(self.obd_uuid)
1883 client_uuid = generate_client_uuid(self.name)
1884 self.vosc = VOSC(obd, client_uuid, self.name)
1885 self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
1887 self.add_lustre_module('mdc', 'mdc')
1888 self.add_lustre_module('llite', 'llite')
# Management client is optional; the else branch (self.mgmtcli = None) was dropped.
1890 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
# prepare(): "def" line dropped; sets up vosc/mdc then mounts lustre_lite.
1896 if fs_is_mounted(self.path):
1897 log(self.path, "already mounted.")
1901 self.mgmtcli.prepare()
1904 mdc_name = self.mdc.name
1906 self.info(self.path, self.mds_uuid, self.obd_uuid)
# When recording/dumping, emit a mount_option instead of actually mounting.
1907 if config.record or config.lctl_dump:
1908 lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
1910 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
1911 (self.vosc.get_name(), mdc_name, config.config, self.path)
1912 run("mkdir", self.path)
1917 panic("mount failed:", self.path, ":", string.join(val))
# cleanup(): "def" line dropped; unmounts (force first when config.force) and
# tears down the client stacks.
1920 self.info(self.path, self.mds_uuid,self.obd_uuid)
1922 if config.record or config.lctl_dump:
1923 lctl.del_mount_option(local_node_name)
1925 if fs_is_mounted(self.path):
1927 (rc, out) = run("umount", "-f", self.path)
1929 (rc, out) = run("umount", self.path)
1931 raise CommandError('umount', out, rc)
1933 if fs_is_mounted(self.path):
1934 panic("fs is still mounted:", self.path)
1939 self.mgmtcli.cleanup()
1941 def load_module(self):
1943 self.mgmtcli.load_module()
1944 self.vosc.load_module()
1945 Module.load_module(self)
1947 def cleanup_module(self):
1948 Module.cleanup_module(self)
1949 self.vosc.cleanup_module()
1951 self.mgmtcli.cleanup_module()
# ============================================================
# misc query functions
def get_ost_net(self, osd_uuid):
    """Return the list of Network objects for the node hosting *osd_uuid*.

    *self* is a Lustre db object (module-level helper, not a method).
    Returns [] when osd_uuid is empty; panics when the node cannot be found.
    NOTE(review): reconstructed from a corrupted listing (init/guard/return
    lines were dropped) -- verify against upstream lconf.
    """
    srv_list = []
    if not osd_uuid:
        return srv_list
    osd = self.lookup(osd_uuid)
    node_uuid = osd.get_first_ref('node')
    node = self.lookup(node_uuid)
    if not node:
        panic("unable to find node for osd_uuid:", osd_uuid,
              " node_ref:", node_uuid)
    for net_uuid in node.get_networks():
        db = node.lookup(net_uuid)
        srv_list.append(Network(db))
    return srv_list
# NOTE(review): corrupted listing -- every "ret = <level>" assignment line was
# dropped, so the actual level numbers are not recoverable from this view.
# Code kept byte-identical; comments only.
1973 # the order of initialization is based on level.
1974 def getServiceLevel(self):
1975 type = self.get_class()
1977 if type in ('network',):
1979 elif type in ('routetbl',):
1981 elif type in ('ldlm',):
1983 elif type in ('mgmt',):
1985 elif type in ('osd', 'cobd'):
1987 elif type in ('mdsdev',):
1989 elif type in ('mountpoint', 'echoclient'):
1992 panic("Unknown type: ", type)
# Services outside [minlevel, maxlevel] are filtered (ret = 0 line dropped).
1994 if ret < config.minlevel or ret > config.maxlevel:
1999 # return list of services in a profile. list is a list of tuples
2000 # [(level, db_object),]
2001 def getServices(self):
2003 for ref_class, ref_uuid in self.get_all_refs():
2004 servdb = self.lookup(ref_uuid)
2006 level = getServiceLevel(servdb)
2008 list.append((level, servdb))
2010 panic('service not found: ' + ref_uuid)
############################################################
# FIXME: clean this mess up!
#
# OSC is no longer in the xml, so we have to fake it.
# this is getting ugly and begging for another refactoring
def get_osc(ost_db, uuid, fs_name):
    """Build an OSC client object for *ost_db* (OSC has no xml entry of its own)."""
    osc = OSC(ost_db, uuid, fs_name)
    return osc
def get_mdc(db, uuid, fs_name, mds_uuid):
    """Build an MDC client for the MDS identified by *mds_uuid*; panic if missing.

    NOTE(review): reconstructed from a corrupted listing (guard/return lines
    were dropped) -- verify against upstream lconf.
    """
    mds_db = db.lookup(mds_uuid)
    if not mds_db:
        panic("no mds:", mds_uuid)
    mdc = MDC(mds_db, uuid, fs_name)
    return mdc
2033 ############################################################
2034 # routing ("rooting")
2036 # list of (nettype, cluster_id, nid)
# NOTE(review): corrupted listing -- the 'srv = Network(net)' line and the
# port-guard line are missing here. Code kept byte-identical; comments only.
# Populates the module-global local_clusters and registers acceptors by port.
2039 def find_local_clusters(node_db):
2040 global local_clusters
2041 for netuuid in node_db.get_networks():
2042 net = node_db.lookup(netuuid)
2044 debug("add_local", netuuid)
2045 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
2047 if acceptors.has_key(srv.port):
2048 panic("duplicate port:", srv.port)
2049 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
2050 srv.send_mem, srv.recv_mem,
# This node is a gateway.
is_router = 0
def node_is_router():
    """True when the current node is configured as a portals router.

    NOTE(review): the body and the is_router initializer were dropped from the
    corrupted listing; restored here -- verify against upstream lconf.
    """
    return is_router
# If there are any routers found in the config, then this will be true
# and all nodes will load kptlrouter.
needs_router = 0
def node_needs_router():
    """True when kptlrouter must be loaded: config has routers, or we are one."""
    return needs_router or is_router
2064 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2065 # Currently, these local routes are only added to kptlrouter route
2066 # table if they are needed to connect to a specific server. This
2067 # should be changed so all available routes are loaded, and the
2068 # ptlrouter can make all the decisions.
# NOTE(review): corrupted listing -- the docstring terminator, local_routes
# initializer, and loop-header lines are missing. Code kept byte-identical.
# Scans config nodes flagged as routers and collects routes reachable through
# a gateway on one of our local clusters into the global local_routes.
2071 def find_local_routes(lustre):
2072 """ Scan the lustre config looking for routers . Build list of
2074 global local_routes, needs_router
2076 list = lustre.lookup_class('node')
2078 if router.get_val_int('router', 0):
2080 for (local_type, local_cluster_id, local_nid) in local_clusters:
2082 for netuuid in router.get_networks():
2083 db = router.lookup(netuuid)
2084 if (local_type == db.get_val('nettype') and
2085 local_cluster_id == db.get_val('clusterid')):
2086 gw = db.get_val('nid')
2089 debug("find_local_routes: gw is", gw)
2090 for route in router.get_local_routes(local_type, gw):
2091 local_routes.append(route)
2092 debug("find_local_routes:", local_routes)
def choose_local_server(srv_list):
    """Return the first server in *srv_list* on one of our local clusters, else None.

    NOTE(review): the 'return srv' line was dropped from the corrupted listing;
    restored here.
    """
    for srv in srv_list:
        if local_cluster(srv.net_type, srv.cluster_id):
            return srv
    return None
def local_cluster(net_type, cluster_id):
    """Return 1 when (net_type, cluster_id) matches one of our local clusters, else 0.

    NOTE(review): return lines reconstructed from a corrupted listing.
    """
    for cluster in local_clusters:
        if net_type == cluster[0] and cluster_id == cluster[1]:
            return 1
    return 0
def local_interface(net_type, cluster_id, nid):
    """Return 1 when (net_type, cluster_id, nid) is one of our local interfaces, else 0.

    NOTE(review): return lines reconstructed from a corrupted listing.
    """
    for cluster in local_clusters:
        if (net_type == cluster[0] and cluster_id == cluster[1]
            and nid == cluster[2]):
            return 1
    return 0
def find_route(srv_list):
    """Return [(srv, route)] pairs for servers reachable via local_routes.

    A route tuple is (nettype, gw, tgt_cluster_id, lo, hi); a server matches
    when its nid falls in [lo, hi] and its cluster_id equals tgt_cluster_id.
    NOTE(review): reconstructed from a corrupted listing (result init,
    'to = srv.nid', and return were dropped) -- verify against upstream lconf.
    """
    result = []
    frm_type = local_clusters[0][0]  # kept: unused in the visible code
    for srv in srv_list:
        debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
        to_type = srv.net_type
        to = srv.nid
        cluster_id = srv.cluster_id
        debug('looking for route to', to_type, to)
        for r in local_routes:
            debug("find_route: ", r)
            if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
                result.append((srv, r))
    return result
def get_active_target(db):
    """Return the active target-device uuid for target *db*.

    A --select override (via get_select) picks the device on a specific node;
    otherwise the db's 'active' reference is used.
    NOTE(review): if/else/return lines reconstructed from a corrupted listing.
    """
    target_uuid = db.getUUID()
    target_name = db.getName()
    node_name = get_select(target_name)
    if node_name:
        tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
    else:
        tgt_dev_uuid = db.get_first_ref('active')
    return tgt_dev_uuid
def get_server_by_nid_uuid(db, nid_uuid):
    """Return the Network object whose nid_uuid matches, or None.

    NOTE(review): 'net = Network(n)' and return lines reconstructed from a
    corrupted listing.
    """
    for n in db.lookup_class("network"):
        net = Network(n)
        if net.nid_uuid == nid_uuid:
            return net
    return None
2145 ############################################################
# NOTE(review): service factory -- maps a db class name to a service object.
# The "def newService(db):" line and every "n = <Class>(db...)" constructor
# line (except the LOV one) were dropped from the corrupted listing, as was
# the final return. Code kept byte-identical; comments only.
2149 type = db.get_class()
2150 debug('Service:', type, db.getName(), db.getUUID())
2155 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2156 elif type == 'network':
2158 elif type == 'routetbl':
2162 elif type == 'cobd':
2164 elif type == 'mdsdev':
2166 elif type == 'mountpoint':
2168 elif type == 'echoclient':
2170 elif type == 'mgmt':
2173 panic ("unknown service type:", type)
# Prepare the system to run lustre using a particular profile
# in the configuration.
#  * load the modules
#  * setup networking for the current node
#  * make sure partitions are in place and prepared
#  * initialize devices with lctl
# Levels is important, and needs to be enforced.
def for_each_profile(db, prof_list, operation):
    """Resolve each profile uuid in *prof_list* and apply *operation* to its services.

    BUG FIX: the original panic() referenced the undefined name 'profile';
    it now reports the uuid that failed to resolve.
    NOTE(review): guard and operation-call lines reconstructed from a
    corrupted listing -- verify against upstream lconf.
    """
    for prof_uuid in prof_list:
        prof_db = db.lookup(prof_uuid)
        if not prof_db:
            panic("profile:", prof_uuid, "not found.")
        services = getServices(prof_db)
        operation(services)
# NOTE(review): per-phase helpers applied to a (level, service_db) list by
# for_each_profile. The corrupted listing dropped each "for s in services:"
# loop header and the n.write_conf()/n.prepare()/n.load_module()/n.cleanup()/
# n.cleanup_module() calls. Code kept byte-identical; comments only.
2192 def doWriteconf(services):
2196 if s[1].get_class() == 'mdsdev':
2197 n = newService(s[1])
2200 def doSetup(services):
2204 n = newService(s[1])
2207 def doModules(services):
2211 n = newService(s[1])
2214 def doCleanup(services):
2219 n = newService(s[1])
2220 if n.safe_to_clean():
2223 def doUnloadModules(services):
2228 n = newService(s[1])
2229 if n.safe_to_clean_modules():
# NOTE(review): corrupted listing (fused line numbers, lost indentation,
# dropped lines). Code kept byte-identical; comments only. Top-level driver:
# finds this host's node entry, then runs cleanup / write_conf / recovery /
# setup phases over its profiles.
2234 def doHost(lustreDB, hosts):
2235 global is_router, local_node_name
2238 node_db = lustreDB.lookup_name(h, 'node')
2242 panic('No host entry found.')
2244 local_node_name = node_db.get_val('name', 0)
2245 is_router = node_db.get_val_int('router', 0)
2246 lustre_upcall = node_db.get_val('lustreUpcall', '')
2247 portals_upcall = node_db.get_val('portalsUpcall', '')
2248 timeout = node_db.get_val_int('timeout', 0)
2249 ptldebug = node_db.get_val('ptldebug', '')
2250 subsystem = node_db.get_val('subsystem', '')
2252 find_local_clusters(node_db)
2254 find_local_routes(lustreDB)
2256 # Two step process: (1) load modules, (2) setup lustre
2257 # if not cleaning, load modules first.
2258 prof_list = node_db.get_refs('profile')
2260 if config.write_conf:
2261 for_each_profile(node_db, prof_list, doModules)
2263 for_each_profile(node_db, prof_list, doWriteconf)
2264 for_each_profile(node_db, prof_list, doUnloadModules)
2266 elif config.recover:
2267 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
2268 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
2269 "--client_uuid <UUID> --conn_uuid <UUID>")
2270 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
2272 elif config.cleanup:
2274 # the command line can override this value
2276 # ugly hack, only need to run lctl commands for --dump
2277 if config.lctl_dump or config.record:
2278 for_each_profile(node_db, prof_list, doCleanup)
# Full cleanup path: push timeouts/debug/upcalls into the kernel, then
# tear down services and unload modules.
2281 sys_set_timeout(timeout)
2282 sys_set_ptldebug(ptldebug)
2283 sys_set_subsystem(subsystem)
2284 sys_set_lustre_upcall(lustre_upcall)
2285 sys_set_portals_upcall(portals_upcall)
2287 for_each_profile(node_db, prof_list, doCleanup)
2288 for_each_profile(node_db, prof_list, doUnloadModules)
2291 # ugly hack, only need to run lctl commands for --dump
2292 if config.lctl_dump or config.record:
2293 sys_set_timeout(timeout)
2294 sys_set_lustre_upcall(lustre_upcall)
2295 for_each_profile(node_db, prof_list, doSetup)
2299 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2300 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2302 for_each_profile(node_db, prof_list, doModules)
2304 sys_set_debug_path()
2305 sys_set_ptldebug(ptldebug)
2306 sys_set_subsystem(subsystem)
2307 script = config.gdb_script
2308 run(lctl.lctl, ' modules >', script)
2310 log ("The GDB module script is in", script)
2311 # pause, so user has time to break and
2314 sys_set_timeout(timeout)
2315 sys_set_lustre_upcall(lustre_upcall)
2316 sys_set_portals_upcall(portals_upcall)
2318 for_each_profile(node_db, prof_list, doSetup)
# NOTE(review): corrupted listing; dropped lines include guards and the
# reconnect call. Code kept byte-identical; comments only. Reconnects a
# failed client connection to the (possibly failed-over) active target.
2320 def doRecovery(db, lctl, tgt_uuid, client_uuid, nid_uuid):
2321 tgt = db.lookup(tgt_uuid)
2323 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
2324 new_uuid = get_active_target(tgt)
2326 raise Lustre.LconfError("doRecovery: no active target found for: " +
2328 net = choose_local_server(get_ost_net(db, new_uuid))
2330 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
2332 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
2334 oldnet = get_server_by_nid_uuid(db, nid_uuid)
# Disconnecting the stale connection is best-effort.
2336 lctl.disconnect(oldnet)
2337 except CommandError, e:
2338 log("recover: disconnect", nid_uuid, "failed: ")
2343 except CommandError, e:
2344 log("recover: connect failed")
2347 lctl.recover(client_uuid, net.nid_uuid)
# NOTE(review): corrupted listing; some guard lines are missing. Code kept
# byte-identical; comments only. Normalizes config.lustre/config.portals so
# modules can be loaded from a source tree in development mode.
2350 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2351 base = os.path.dirname(cmd)
2352 if development_mode():
2353 if not config.lustre:
2354 config.lustre = (os.path.join(base, ".."))
2355 # normalize the portals dir, using command line arg if set
2357 portals_dir = config.portals
2358 dir = os.path.join(config.lustre, portals_dir)
2359 config.portals = dir
2360 debug('config.portals', config.portals)
2361 elif config.lustre and config.portals:
2363 # if --lustre and --portals, normalize portals
2364 # can ignore PORTALS_DIR here, since it is probably useless here
2365 config.portals = os.path.join(config.lustre, config.portals)
2366 debug('config.portals B', config.portals)
# NOTE(review): corrupted listing -- the noexec guard, try/except and
# fp.write/fp.close lines were dropped. Code kept byte-identical.
# Writes *val* to /proc/sys/<path>.
2368 def sysctl(path, val):
2369 debug("+ sysctl", path, val)
2373 fp = open(os.path.join('/proc/sys', path), 'w')
def sys_set_debug_path():
    """Point the portals debug-dump path at config.debug_path."""
    sysctl('portals/debug_path', config.debug_path)
def sys_set_lustre_upcall(upcall):
    """Install the lustre upcall script; command-line options win over node config.

    Precedence: --lustre_upcall, then --upcall, then the *upcall* argument.
    NOTE(review): elif/if lines reconstructed from a corrupted listing.
    """
    # the command line overrides the value in the node config
    if config.lustre_upcall:
        upcall = config.lustre_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        lctl.set_lustre_upcall(upcall)
def sys_set_portals_upcall(upcall):
    """Install the portals upcall script; command-line options win over node config.

    Precedence: --portals_upcall, then --upcall, then the *upcall* argument.
    NOTE(review): elif/if lines reconstructed from a corrupted listing.
    """
    # the command line overrides the value in the node config
    if config.portals_upcall:
        upcall = config.portals_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Set the lustre recovery timeout; --timeout overrides the node config value."""
    # the command line overrides the value in the node config
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    if timeout is not None and timeout > 0:
        lctl.set_timeout(timeout)
def sys_tweak_socknal():
    """With --single_socket, tell socknal to use one socket instead of a bundle."""
    if config.single_socket:
        sysctl("socknal/typed", 0)
def sys_optimize_elan():
    """Zero the Quadrics Elan punt-loop tunables on whichever proc file exists.

    NOTE(review): the 'for p in procfiles:' line was dropped from the
    corrupted listing; restored here.
    """
    procfiles = ["/proc/elan/config/eventint_punt_loops",
                 "/proc/qsnet/elan3/config/eventint_punt_loops",
                 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
    for p in procfiles:
        if os.access(p, os.R_OK):
            run("echo 0 > " + p)
# NOTE(review): corrupted listing -- the override guards, try: lines and the
# panic handlers were dropped. Code kept byte-identical; comments only.
# Both helpers eval a symbolic mask expression against the module-level name
# tables (ptldebug_names / subsystem_names) and write the hex value to /proc.
2420 def sys_set_ptldebug(ptldebug):
2422 ptldebug = config.ptldebug
2425 val = eval(ptldebug, ptldebug_names)
2426 val = "0x%x" % (val)
2427 sysctl('portals/debug', val)
2428 except NameError, e:
2431 def sys_set_subsystem(subsystem):
2432 if config.subsystem:
2433 subsystem = config.subsystem
2436 val = eval(subsystem, subsystem_names)
2437 val = "0x%x" % (val)
2438 sysctl('portals/subsystem_debug', val)
2439 except NameError, e:
# NOTE(review): corrupted listing -- the read of the current value and the
# "only raise, never lower" comparison were dropped. Code kept byte-identical.
# Raises a /proc/sys/net/core/* buffer limit to at least *max*.
2442 def sys_set_netmem_max(path, max):
2443 debug("setting", path, "to at least", max)
2451 fp = open(path, 'w')
2452 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the /dev/portals and /dev/obd character devices if missing."""
    if not os.access('/dev/portals', os.R_OK):
        run('mknod /dev/portals c 10 240')
    if not os.access('/dev/obd', os.R_OK):
        run('mknod /dev/obd c 10 241')
# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    """Append *new_dir* to os.environ['PATH'] unless it is already present.

    NOTE(review): the early-return line was dropped from the corrupted
    listing; restored here.
    """
    syspath = string.split(os.environ['PATH'], ':')
    if new_dir in syspath:
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
def default_debug_path():
    """Return the default debug-dump path, preferring the /r ramdisk when present.

    NOTE(review): return lines reconstructed from a corrupted listing.
    """
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
        return '/r' + path
    else:
        return path
def default_gdb_script():
    """Return the default gdb module-script path, preferring the /r ramdisk.

    NOTE(review): the else-branch return was dropped from the corrupted
    listing; restored here.
    """
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    else:
        return script
DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
def sanitise_path():
    """Make sure every DEFAULT_PATH entry is on os.environ['PATH'].

    NOTE(review): the loop body (add_to_path call) was dropped from the
    corrupted listing; restored here.
    """
    for dir in DEFAULT_PATH:
        add_to_path(dir)
# global hack for the --select handling
tgt_select = {}
def init_select(args):
    """Parse --select arguments ('service=nodeA,service2=nodeB') into tgt_select.

    NOTE(review): the tgt_select initializer and loop-header lines were
    dropped from the corrupted listing; restored here.
    """
    # args = [service=nodeA,service2=nodeB service3=nodeC]
    global tgt_select
    for arg in args:
        list = string.split(arg, ',')
        for entry in list:
            srv, node = string.split(entry, '=')
            tgt_select[srv] = node
def get_select(srv):
    """Return the node chosen for service *srv* via --select, or None."""
    # NOTE(review): the fallthrough return was dropped from the corrupted
    # listing; restored here. has_key kept for the old Pythons this file targets.
    if tgt_select.has_key(srv):
        return tgt_select[srv]
    return None
# NOTE(review): corrupted listing -- the 'lconf_options = [' opener, several
# option tuples and closing brackets were dropped. Code kept byte-identical;
# comments only. Typos inside help strings are runtime data and left untouched.
# Shorthand aliases for the Lustre.Options argument kinds.
2508 FLAG = Lustre.Options.FLAG
2509 PARAM = Lustre.Options.PARAM
2510 INTPARAM = Lustre.Options.INTPARAM
2511 PARAMLIST = Lustre.Options.PARAMLIST
# Command-line option table: (name[,short], help[, kind[, default]]).
2513 ('verbose,v', "Print system commands as they are run"),
2514 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
2515 ('config', "Cluster config name used for LDAP query", PARAM),
2516 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
2517 ('node', "Load config for <nodename>", PARAM),
2518 ('cleanup,d', "Cleans up config. (Shutdown)"),
2519 ('force,f', "Forced unmounting and/or obd detach during cleanup",
2521 ('single_socket', "socknal option: only use one socket instead of bundle",
2523 ('failover',"""Used to shut down without saving state.
2524 This will allow this node to "give up" a service to a
2525 another node for failover purposes. This will not
2526 be a clean shutdown.""",
2528 ('gdb', """Prints message after creating gdb module script
2529 and sleeps for 5 seconds."""),
2530 ('noexec,n', """Prints the commands and steps that will be run for a
2531 config without executing them. This can used to check if a
2532 config file is doing what it should be doing""",
2533 ('nomod', "Skip load/unload module step."),
2534 ('nosetup', "Skip device setup/cleanup step."),
2535 ('reformat', "Reformat all devices (without question)"),
2536 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
2537 ('dump', "Dump the kernel debug log to file before portals is unloaded",
2539 ('write_conf', "Save all the client config information on mds."),
2540 ('record', "Write config information on mds."),
2541 ('record_log', "Name of config record log.", PARAM),
2542 ('record_device', "MDS device name that will record the config commands",
2544 ('minlevel', "Minimum level of services to configure/cleanup",
2546 ('maxlevel', """Maximum level of services to configure/cleanup
2547 Levels are aproximatly like:
2552 70 - mountpoint, echo_client, osc, mdc, lov""",
2554 ('lustre', """Base directory of lustre sources. This parameter will
2555 cause lconf to load modules from a source tree.""", PARAM),
2556 ('portals', """Portals source directory.  If this is a relative path,
2557 then it is assumed to be relative to lustre. """, PARAM),
2558 ('timeout', "Set recovery timeout", INTPARAM),
2559 ('upcall', "Set both portals and lustre upcall script", PARAM),
2560 ('lustre_upcall', "Set lustre upcall script", PARAM),
2561 ('portals_upcall', "Set portals upcall script", PARAM),
2562 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
2563 ('ptldebug', "Set the portals debug level", PARAM),
2564 ('subsystem', "Set the portals debug subsystem", PARAM),
2565 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
2566 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
2567 # Client recovery options
2568 ('recover', "Recover a device"),
2569 ('group', "The group of devices to configure or cleanup", PARAM),
2570 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
2571 ('client_uuid', "The failed client (required for recovery)", PARAM),
2572 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
2574 ('inactive', """The name of an inactive service, to be ignored during
2575 mounting (currently OST-only). Can be repeated.""",
# NOTE(review): main() entry point -- its "def main():" line was dropped from
# the corrupted listing, along with many guards, try: lines and sys.exit
# calls. Code kept byte-identical; comments only.
2580 global lctl, config, toplevel, CONFIG_FILE
2582 # in the upcall this is set to SIG_IGN
2583 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
2585 cl = Lustre.Options("lconf", "config.xml", lconf_options)
2587 config, args = cl.parse(sys.argv[1:])
2588 except Lustre.OptionError, e:
2592 setupModulePath(sys.argv[0])
2594 host = socket.gethostname()
2596 # the PRNG is normally seeded with time(), which is not so good for starting
2597 # time-synchronized clusters
2598 input = open('/dev/urandom', 'r')
2600 print 'Unable to open /dev/urandom!'
2602 seed = input.read(32)
2608 init_select(config.select)
# Config source: positional XML file, --ldapurl, or bare --ptldebug/--subsystem.
2611 if not os.access(args[0], os.R_OK):
2612 print 'File not found or readable:', args[0]
2615 dom = xml.dom.minidom.parse(args[0])
2617 panic("%s does not appear to be a config file." % (args[0]))
2618 sys.exit(1) # make sure to die here, even in debug mode.
2619 CONFIG_FILE = args[0]
2620 db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
2621 if not config.config:
2622 config.config = os.path.basename(args[0])# use full path?
2623 if config.config[-4:] == '.xml':
2624 config.config = config.config[:-4]
2625 elif config.ldapurl:
2626 if not config.config:
2627 panic("--ldapurl requires --config name")
2628 dn = "config=%s,fs=lustre" % (config.config)
2629 db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
2630 elif config.ptldebug or config.subsystem:
2631 sys_set_ptldebug(None)
2632 sys_set_subsystem(None)
2635 print 'Missing config file or ldap URL.'
2636 print 'see lconf --help for command summary'
# Refuse configs written by a different lconf version.
2641 ver = db.get_version()
2643 panic("No version found in config data, please recreate.")
2644 if ver != Lustre.CONFIG_VERSION:
2645 panic("Config version", ver, "does not match lconf version",
2646 Lustre.CONFIG_VERSION)
# Configure either the node named by --node or this host (+ 'localhost').
2650 node_list.append(config.node)
2653 node_list.append(host)
2654 node_list.append('localhost')
2656 debug("configuring for host: ", node_list)
2659 config.debug_path = config.debug_path + '-' + host
2660 config.gdb_script = config.gdb_script + '-' + host
2662 lctl = LCTLInterface('lctl')
2664 if config.lctl_dump:
2665 lctl.use_save_file(config.lctl_dump)
# --record requires both a target log name and a device to record on.
2668 if not (config.record_device and config.record_log):
2669 panic("When recording, both --record_log and --record_device must be specified.")
2670 lctl.clear_log(config.record_device, config.record_log)
2671 lctl.record(config.record_device, config.record_log)
2673 doHost(db, node_list)
# NOTE(review): script entry point -- the try:/main() call and the exception
# reporting bodies were dropped from the corrupted listing; this span may also
# continue past the visible end of the file. Code kept byte-identical.
2678 if __name__ == "__main__":
2681 except Lustre.LconfError, e:
2683 # traceback.print_exc(file=sys.stdout)
2685 except CommandError, e:
2689 if first_cleanup_error:
2690 sys.exit(first_cleanup_error)