3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "@PYMOD_DIR@"
# NOTE(review): the embedded source numbering jumps repeatedly in this region;
# return statements, the 'else:' branch, and the enclosing dict literals for
# the debug/subsystem mask tables are missing.  Code kept verbatim.

def development_mode():
    # Heuristic: a Makefile next to the script means we are running from a
    # source tree rather than an installed location.
    base = os.path.dirname(sys.argv[0])
    if os.access(base+"/Makefile", os.R_OK):
        # NOTE(review): 'return 1' / 'return 0' lines appear to be missing.

if development_mode():
    sys.path.append('@top_srcdir@/utils')
# NOTE(review): an 'else:' branch appears to be missing before this line.
sys.path.append(PYMOD_DIR)

# Default TCP socket buffer size used by the acceptor (8 MB).
DEFAULT_TCPBUF = 8388608

# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
PORTALS_DIR = 'portals'

# Needed to call lconf --record
# Please keep these in sync with the values in portals/kp30.h
# NOTE(review): the following look like entries of the ptldebug and subsystem
# name->bitmask tables; the enclosing dict literals are not visible here.
"warning" : (1 << 10),
"portals" : (1 << 14),
"dlmtrace" : (1 << 16),
"rpctrace" : (1 << 20),
"vfstrace" : (1 << 21),
"undefined" : (1 << 0),
"portals" : (1 << 10),
"socknal" : (1 << 11),
"qswnal" : (1 << 12),
"pinger" : (1 << 13),
"filter" : (1 << 14),
"ptlrouter" : (1 << 20),
# Overall cleanup status: holds the first non-zero rc seen during cleanup.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record rc as the cleanup status, keeping only the first error seen."""
    global first_cleanup_error
    # Later failures never overwrite the first recorded one.
    first_cleanup_error = first_cleanup_error or rc
126 # ============================================================
127 # debugging and error funcs
def fixme(msg = "this feature"):
    """Flag an unimplemented code path by raising Lustre.LconfError.

    msg names the missing feature; it is interpolated into the error text.
    """
    # Fixes: user-visible typo 'implmemented' -> 'implemented', and use the
    # call-style raise already used elsewhere in this file (e.g. the panic
    # path's 'raise Lustre.LconfError(msg)').
    raise Lustre.LconfError(msg + ' not implemented yet.')
# NOTE(review): the 'def' headers for panic()/error()/debug() and my_int()
# are missing from this region (embedded numbering jumps); bodies verbatim.
msg = string.join(map(str,args))
if not config.noexec:
    raise Lustre.LconfError(msg)
msg = string.join(map(str,args))
print string.strip(s)
msg = string.join(map(str,args))
# ack, python's builtin int() does not support '0x123' syntax.
# eval can do it, although what a hack!
return eval(s, {}, {})
except SyntaxError, e:
    raise ValueError("not a number")
raise ValueError("not a number")
# ============================================================
# locally defined exceptions
class CommandError (exceptions.Exception):
    """Raised when an external command (lctl, mkfs, insmod, ...) fails.

    cmd_name is the failed program, cmd_err its stderr/stdout (a string or
    a list of lines), rc the exit status if known.
    """
    def __init__(self, cmd_name, cmd_err, rc=None):
        self.cmd_name = cmd_name
        self.cmd_err = cmd_err
        # NOTE(review): 'self.rc = rc' and the dump() method header appear
        # to be missing here; the prints below read self.rc.
        if type(self.cmd_err) == types.StringType:
            print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
            print "! %s: %s" % (self.cmd_name, self.cmd_err)
        elif type(self.cmd_err) == types.ListType:
            print "! %s (error %d):" % (self.cmd_name, self.rc)
            print "! %s:" % (self.cmd_name)
            for s in self.cmd_err:
                print "> %s" %(string.strip(s))
# ============================================================
# handle daemons, like the acceptor
# NOTE(review): the 'class DaemonHandler:' header and much of its body
# (running()/start()/stop() headers, try/except scaffolding) are missing
# from the visible source; remaining lines verbatim.
""" Manage starting and stopping a daemon. Assumes daemon manages
it's own pid file. """

def __init__(self, cmd):
    log(self.command, "already running.")
    self.path = find_prog(self.command)
    panic(self.command, "not found.")
    ret, out = runcmd(self.path +' '+ self.command_line())
    raise CommandError(self.path, out, ret)
    pid = self.read_pidfile()
    log ("killing process", pid)
    #time.sleep(1) # let daemon die
    log("unable to kill", self.command, e)
    log("unable to kill", self.command)
    pid = self.read_pidfile()

def read_pidfile(self):
    fp = open(self.pidfile(), 'r')

def clean_pidfile(self):
    """ Remove a stale pidfile """
    log("removing stale pidfile:", self.pidfile())
    os.unlink(self.pidfile())
    log(self.pidfile(), e)

class AcceptorHandler(DaemonHandler):
    # Daemon handler for the portals TCP acceptor; one instance per port.
    def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
        DaemonHandler.__init__(self, "acceptor")
        # NOTE(review): assignments of self.port / self.flags are missing.
        self.send_mem = send_mem
        self.recv_mem = recv_mem
        self.flags = self.flags + ' -i'

    # NOTE(review): the pidfile() method header is missing before this line.
        return "/var/run/%s-%d.pid" % (self.command, self.port)

    def command_line(self):
        # -s/-r set socket buffer sizes; flags may include -i (irq affinity).
        return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
# start the acceptors
# NOTE(review): the run_acceptors() def header and the daemon.start() /
# early-return lines are missing from this region; code verbatim.
if config.lctl_dump or config.record:
    for port in acceptors.keys():
        daemon = acceptors[port]
        if not daemon.running():

def run_one_acceptor(port):
    # Start the acceptor registered for a single port, if not already up.
    if config.lctl_dump or config.record:
    if acceptors.has_key(port):
        daemon = acceptors[port]
        if not daemon.running():
    panic("run_one_acceptor: No acceptor defined for port:", port)

def stop_acceptor(port):
    if acceptors.has_key(port):
        daemon = acceptors[port]
# ============================================================
# handle lctl interface
# NOTE(review): the 'class LCTLInterface:' header and several body lines
# are missing from the visible source; code lines verbatim.
"""Manage communication with lctl."""

def __init__(self, cmd):
    """Initialize close by finding the lctl binary."""
    self.lctl = find_prog(cmd)
    self.record_device = ''
    # NOTE(review): the not-found branch scaffolding is missing here.
    debug('! lctl not found')
    raise CommandError('lctl', "unable to find lctl binary.")

def use_save_file(self, file):
    # Route subsequent commands into a dump file instead of executing them.
    self.save_file = file

def record(self, dev_name, logname):
    # Begin recording subsequent lctl commands into a config log.
    log("Recording log", logname, "on", dev_name)
    self.record_device = dev_name
    self.record_log = logname

def end_record(self):
    log("End recording log", self.record_log, "on", self.record_device)
    self.record_device = None
    self.record_log = None
def set_nonblock(self, fd):
    """Switch fd to non-blocking mode so output capture can never stall."""
    current_flags = fcntl.fcntl(fd, F_GETFL)
    nonblocking = current_flags | os.O_NDELAY
    fcntl.fcntl(fd, F_SETFL, nonblocking)
# NOTE(review): this is the interior of LCTLInterface.run(cmds) and
# runcmd(); the 'def run' header, the try/while loop scaffolding, and the
# start of the record-mode heredoc are missing.  Code lines verbatim.
"""
the cmds are written to stdin of lctl
lctl doesn't return errors when run in script mode, so
should modify command line to accept multiple commands, or
create complex command line options
"""
cmds = '\n  dump ' + self.save_file + '\n' + cmds
elif self.record_device:
# NOTE(review): the opening of this triple-quoted template is missing.
%s""" % (self.record_device, self.record_log, cmds)
debug("+", cmd_line, cmds)
if config.noexec: return (0, [])
child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
child.tochild.write(cmds + "\n")
child.tochild.close()
# From "Python Cookbook" from O'Reilly
# Drain stdout and stderr concurrently (non-blocking + select) so the
# child can never deadlock on a full pipe.
outfile = child.fromchild
outfd = outfile.fileno()
self.set_nonblock(outfd)
errfile = child.childerr
errfd = errfile.fileno()
self.set_nonblock(errfd)
outdata = errdata = ''
ready = select.select([outfd,errfd],[],[]) # Wait for input
if outfd in ready[0]:
    outchunk = outfile.read()
    if outchunk == '': outeof = 1
    outdata = outdata + outchunk
if errfd in ready[0]:
    errchunk = errfile.read()
    if errchunk == '': erreof = 1
    errdata = errdata + errchunk
if outeof and erreof: break
# end of "borrowed" code
if os.WIFEXITED(ret):
    rc = os.WEXITSTATUS(ret)
# Any stderr output counts as failure even with a zero exit status.
if rc or len(errdata):
    raise CommandError(self.lctl, errdata, rc)

def runcmd(self, *args):
    """run lctl using the command line"""
    cmd = string.join(map(str,args))
    debug("+", self.lctl, cmd)
    rc, out = run(self.lctl, cmd)
    raise CommandError(self.lctl, out, rc)
# NOTE(review): throughout these LCTLInterface methods the opening lines of
# the triple-quoted lctl command templates and the self.run(cmds) calls are
# missing (embedded numbering jumps); remaining lines verbatim.
def clear_log(self, dev, log):
    """ clear an existing log """
    quit """ % (dev, log)

def network(self, net, nid):
    quit """ % (net, nid)

# create a new connection
def add_uuid(self, net_type, uuid, nid):
    cmds = "\n  add_uuid %s %s %s" %(uuid, nid, net_type)

def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
    # Only tcp networks use autoconnect entries; skipped when dumping.
    if net_type in ('tcp',) and not config.lctl_dump:
    add_autoconn %s %s %d %s
    nid, hostaddr, port, flags )

def connect(self, srv):
    # Register the server's nid then (for tcp) its autoconnect entry.
    self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
    if srv.net_type in ('tcp',) and not config.lctl_dump:
        self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
                          srv.nid, srv.hostaddr, srv.port, flags)

def recover(self, dev_name, new_conn):
    recover %s""" %(dev_name, new_conn)

# add a route to a range
def add_route(self, net, gw, lo, hi):
    except CommandError, e:

def del_route(self, net, gw, lo, hi):
    quit """ % (net, gw, lo, hi)

# add a route to a host
def add_route_host(self, net, uuid, gw, tgt):
    self.add_uuid(net, uuid, tgt)
    except CommandError, e:

# delete a route to a host
def del_route_host(self, net, uuid, gw, tgt):
    quit """ % (net, gw, tgt)

def del_autoconn(self, net_type, nid, hostaddr):
    if net_type in ('tcp',) and not config.lctl_dump:

# disconnect one connection
def disconnect(self, srv):
    self.del_uuid(srv.nid_uuid)
    if srv.net_type in ('tcp',) and not config.lctl_dump:
        self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)

def del_uuid(self, uuid):

def disconnectAll(self, net):

def attach(self, type, name, uuid):
    quit""" % (type, name, uuid)

def setup(self, name, setup = ""):
    quit""" % (name, setup)

# create a new device with lctl
def newdev(self, type, name, uuid, setup = ""):
    # attach then setup; on setup failure tear the device back down.
    self.attach(type, name, uuid);
    self.setup(name, setup)
    except CommandError, e:
        self.cleanup(name, uuid, 0)

def cleanup(self, name, uuid, force, failover = 0):
    # failover cleanup implies force.
    if failover: force = 1
    quit""" % (name, ('', 'force')[force],
               ('', 'failover')[failover])

def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
              stripe_sz, stripe_off,
    lov_setup %s %d %d %d %s %s
    quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,

def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
    lov_setconfig %s %d %d %d %s %s
    quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)

def dump(self, dump_file):
    quit""" % (dump_file)

# get list of devices
def device_list(self):
    devices = '/proc/fs/lustre/devices'
    if os.access(devices, os.R_OK):
        fp = open(devices, 'r')

def lustre_version(self):
    rc, out = self.runcmd('version')

def mount_option(self, profile, osc, mdc):
    mount_option %s %s %s
    quit""" % (profile, osc, mdc)

# delete mount options
def del_mount_option(self, profile):

def set_timeout(self, timeout):

# set the lustre upcall program
def set_lustre_upcall(self, upcall):
# ============================================================
# Various system-level functions
# (ideally moved to their own module)

# Run a command and return the output and status.
# stderr is merged into stdout via '2>&1', so errors are captured too.
# NOTE(review): the 'def run(*args)'/'def runcmd(cmd)' headers and the
# return/close lines are missing from this region; lines verbatim.
if config.noexec: return (0, [])
f = os.popen(cmd + ' 2>&1')
cmd = string.join(map(str,args))

# Run a command in the background.
def run_daemon(*args):
    cmd = string.join(map(str,args))
    if config.noexec: return 0
    f = os.popen(cmd + ' 2>&1')

# Determine full path to use for an external command
# searches dirname(argv[0]) first, then PATH
# NOTE(review): the find_prog(cmd) def header and the return lines are
# missing here.
syspath = string.split(os.environ['PATH'], ':')
cmdpath = os.path.dirname(sys.argv[0])
syspath.insert(0, cmdpath);
syspath.insert(0, os.path.join(config.portals, 'utils/'))
prog = os.path.join(d,cmd)
if os.access(prog, os.X_OK):
# Recursively look for file starting at base dir
# NOTE(review): the return statements of do_find_file()/find_module() and
# the is_block() def header are missing; lines verbatim.
def do_find_file(base, mod):
    fullname = os.path.join(base, mod)
    if os.access(fullname, os.R_OK):
    for d in os.listdir(base):
        dir = os.path.join(base,d)
        if os.path.isdir(dir):
            module = do_find_file(dir, mod)

def find_module(src_dir, dev_dir, modname):
    # Try the 2.6-style '.ko' name first, then the 2.4-style '.o'.
    modbase = src_dir +'/'+ dev_dir +'/'+ modname
    for modext in '.ko', '.o':
        module = modbase + modext
        if os.access(module, os.R_OK):

# is the path a block device?
return stat.S_ISBLK(s[stat.ST_MODE])
# build fs according to type
# NOTE(review): several lines of mkfs() (size checks, journal-size branch
# scaffolding, force flags) are missing; remaining lines verbatim.
def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
    panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
    # devsize is in 1k, and fs block count is in 4k
    block_cnt = devsize/4
    if fstype in ('ext3', 'extN'):
        # ext3 journal size is in megabytes
        # Auto-size the journal from the device size when jsize is unset.
        if not is_block(dev):
            ret, out = runcmd("ls -l %s" %dev)
            devsize = int(string.split(out[0])[4]) / 1024
        ret, out = runcmd("sfdisk -s %s" %dev)
        devsize = int(out[0])
        if devsize > 1024 * 1024:
            jsize = ((devsize / 102400) * 4)
        if jsize:  jopt = "-J size=%d" %(jsize,)
        if isize:  iopt = "-I %d" %(isize,)
        mkfs = 'mkfs.ext2 -j -b 4096 '
        if not isblock or config.force:
    elif fstype == 'reiserfs':
        # reiserfs journal size is in blocks
        if jsize:  jopt = "--journal_size %d" %(jsize,)
        mkfs = 'mkreiserfs -ff'
        panic('unsupported fs type: ', fstype)
    if config.mkfsoptions != None:
        mkfs = mkfs + ' ' + config.mkfsoptions
    if mkfsoptions != None:
        mkfs = mkfs + ' ' + mkfsoptions
    (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
    panic("Unable to build fs:", dev, string.join(out))
    # enable hash tree indexing on the filesystem
    if fstype in ('ext3', 'extN'):
        htree = 'echo "feature FEATURE_C5" | debugfs -w'
        (ret, out) = run (htree, dev)
        panic("Unable to enable htree:", dev)
# some systems use /dev/loopN, some /dev/loop/N
# NOTE(review): the loop_base()/find_loop() def headers, the 'loop'/'dev'
# assignments, and several return lines are missing; code verbatim.
if not os.access(loop + str(0), os.R_OK):
    if not os.access(loop + str(0), os.R_OK):
        panic ("can't access loop devices")

# find loop device assigned to the file
for n in xrange(0, MAX_LOOP_DEVICES):
    if os.access(dev, os.R_OK):
        # losetup prints the backing file name in parentheses.
        (stat, out) = run('losetup', dev)
        if out and stat == 0:
            m = re.search(r'\((.*)\)', out[0])
            if m and file == m.group(1):

# create file if necessary and assign the first free loop device
def init_loop(file, size, fstype, journal_size, inode_size, mkfsoptions, reformat):
    dev = find_loop(file)
    print 'WARNING file:', file, 'already mapped to', dev
    if reformat or not os.access(file, os.R_OK | os.W_OK):
        panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
        # Sparse-create the backing file: zero writes, seek to 'size' KB.
        (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
        panic("Unable to create backing store:", file)
        mkfs(file, size, fstype, journal_size, inode_size, mkfsoptions, isblock=0)
    # find next free loop
    for n in xrange(0, MAX_LOOP_DEVICES):
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            run('losetup', dev, file)
    print "out of loop devices"
    print "out of loop devices"

# undo loop assignment
def clean_loop(file):
    dev = find_loop(file)
    ret, out = run('losetup -d', dev)
    log('unable to clean loop device:', dev, 'for file:', file)

# determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
    # FIXME don't know how to implement this
# initialize a block device if needed
# NOTE(review): the 'return dev' tail of block_dev() and the if2addr() def
# header are missing from this region; code verbatim.
def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
              inode_size, mkfsoptions):
    if config.noexec: return dev
    # Non-block paths are treated as loopback backing files.
    if not is_block(dev):
        dev = init_loop(dev, size, fstype, journal_size, inode_size,
                        mkfsoptions, reformat)
    elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
        mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
#        panic("device:", dev,
#              "not prepared, and autoformat is not set.\n",
#              "Rerun with --reformat option to format ALL filesystems")

"""lookup IP address for an interface"""
rc, out = run("/sbin/ifconfig", iface)
# ifconfig's second line looks like 'inet addr:10.0.0.1 ...'.
addr = string.split(out[1])[1]
ip = string.split(addr, ':')[1]
# NOTE(review): return statements, try/except scaffolding, and several
# branch lines are missing throughout these helpers; code verbatim.
def sys_get_elan_position_file():
    # Probe the known proc locations for the Quadrics Elan position file.
    procfiles = ["/proc/elan/device0/position",
                 "/proc/qsnet/elan4/device0/position",
                 "/proc/qsnet/elan3/device0/position"]
    if os.access(p, os.R_OK):

def sys_get_local_nid(net_type, wildcard, cluster_id):
    """Return the local nid."""
    # Prefer the elan address when an Elan position file is present.
    if sys_get_elan_position_file():
        local = sys_get_local_address('elan', '*', cluster_id)
    local = sys_get_local_address(net_type, wildcard, cluster_id)

def sys_get_local_address(net_type, wildcard, cluster_id):
    """Return the local address for the network type."""
    if net_type in ('tcp',):
        # 'iface:*' wildcards resolve via the interface's configured IP.
        iface, star = string.split(wildcard, ':')
        local = if2addr(iface)
        panic ("unable to determine ip for:", wildcard)
        host = socket.gethostname()
        local = socket.gethostbyname(host)
    elif net_type == 'elan':
        # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
        f = sys_get_elan_position_file()
        panic ("unable to determine local Elan ID")
        lines = fp.readlines()
        nid = my_int(cluster_id) + my_int(elan_id)
        except ValueError, e:
    elif net_type == 'gm':
        fixme("automatic local address for GM")
    elif net_type == 'scimac':
        scinode="/opt/scali/sbin/scinode"
        if os.path.exists(scinode):
            (rc,local) = run(scinode)
            panic (scinode, " not found on node with scimac networking")
            panic (scinode, " failed")
        local=string.rstrip(local[0])

def mod_loaded(modname):
    """Check if a module is already loaded. Look in /proc/modules for it."""
    fp = open('/proc/modules')
    lines = fp.readlines()
    # please forgive my tired fingers for this one
    # Keep only first words of /proc/modules lines that equal modname.
    ret = filter(lambda word, mod=modname: word == mod,
                 map(lambda line: string.split(line)[0], lines))
# XXX: instead of device_list, ask for $name and see what we get
# NOTE(review): return statements and try/for scaffolding are missing from
# these three helpers; code verbatim.
def is_prepared(name):
    """Return true if a device exists for the name"""
    # When replaying/cleaning a recorded config, pretend devices exist.
    if (config.noexec or config.record) and config.cleanup:
    # expect this format:
    # 1 UP ldlm ldlm ldlm_UUID 2
    out = lctl.device_list()
    if name == string.split(s)[3]:
    except CommandError, e:

def is_network_prepared():
    """If any device exists, then assume that all networking
    has been configured"""
    out = lctl.device_list()

def fs_is_mounted(path):
    """Return true if path is a mounted lustre filesystem"""
    fp = open('/proc/mounts')
    lines = fp.readlines()
    if a[1] == path and a[2] == 'lustre_lite':
989 """Manage kernel modules"""
990 def __init__(self, lustre_dir, portals_dir):
991 self.lustre_dir = lustre_dir
992 self.portals_dir = portals_dir
993 self.kmodule_list = []
def add_portals_module(self, dev_dir, modname):
    """Queue a portals-tree kernel module for later loading."""
    entry = (self.portals_dir, dev_dir, modname)
    self.kmodule_list += [entry]
def add_lustre_module(self, dev_dir, modname):
    """Queue a lustre-tree kernel module for later loading."""
    entry = (self.lustre_dir, dev_dir, modname)
    self.kmodule_list += [entry]
# NOTE(review): 'continue' lines, the find-vs-modprobe branch scaffolding,
# and 'rev.reverse()' are missing in this region; code verbatim.
def load_module(self):
    """Load all the modules in the list in the order they appear."""
    for src_dir, dev_dir, mod in self.kmodule_list:
        # Skip modules that are already loaded (unless just echoing).
        if mod_loaded(mod) and not config.noexec:
        log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
        module = find_module(src_dir, dev_dir, mod)
        panic('module not found:', mod)
        (rc, out) = run('/sbin/insmod', module)
        raise CommandError('insmod', out, rc)
        (rc, out) = run('/sbin/modprobe', mod)
        raise CommandError('modprobe', out, rc)

def cleanup_module(self):
    """Unload the modules in the list in reverse order."""
    rev = self.kmodule_list
    # NOTE(review): a 'rev.reverse()' call appears to be missing here.
    for src_dir, dev_dir, mod in rev:
        if not mod_loaded(mod) and not config.noexec:
        # debug hack: capture the portals debug log before it goes away.
        if mod == 'portals' and config.dump:
            lctl.dump(config.dump)
        log('unloading module:', mod)
        (rc, out) = run('/sbin/rmmod', mod)
        log('! unable to unload module:', mod)
# ============================================================
# Classes to prepare and cleanup the various objects
# NOTE(review): the 'class Module:' header, 'self.db = db', and the
# cleanup() def header are missing from this region; code verbatim.
""" Base class for the rest of the modules. The default cleanup method is
defined here, as well as some utility funcs.
"""
def __init__(self, module_name, db):
    self.module_name = module_name
    self.name = self.db.getName()
    self.uuid = self.db.getUUID()
    # Each module tracks its own kernel-module dependencies.
    self.kmod = kmod(config.lustre, config.portals)

def info(self, *args):
    # Tagged progress line: "<MODULE>: <name> <uuid> <details>".
    msg = string.join(map(str,args))
    print self.module_name + ":", self.name, self.uuid, msg

""" default cleanup, used for most modules """
lctl.cleanup(self.name, self.uuid, config.force)
except CommandError, e:
    log(self.module_name, "cleanup failed: ", self.name)
# Thin delegators: Module forwards kernel-module bookkeeping to self.kmod.
def add_portals_module(self, dev_dir, modname):
    """Append a module to list of modules to load."""
    self.kmod.add_portals_module(dev_dir, modname)

def add_lustre_module(self, dev_dir, modname):
    """Append a module to list of modules to load."""
    self.kmod.add_lustre_module(dev_dir, modname)

def load_module(self):
    """Load all the modules in the list in the order they appear."""
    self.kmod.load_module()
def cleanup_module(self):
    """Unload this module's kernel modules, newest first, when safe."""
    if not self.safe_to_clean():
        # Something still depends on the device; leave modules loaded.
        return
    self.kmod.cleanup_module()
def safe_to_clean(self):
    # NOTE(review): the body (presumably 'return 1') is missing here.

def safe_to_clean_modules(self):
    # By default module unload safety follows device cleanup safety.
    return self.safe_to_clean()
class Network(Module):
    # NOTE(review): if/else scaffolding around the wildcard-nid handling
    # is missing in this constructor; code lines verbatim.
    def __init__(self,db):
        Module.__init__(self, 'NETWORK', db)
        self.net_type = self.db.get_val('nettype')
        self.nid = self.db.get_val('nid', '*')
        self.cluster_id = self.db.get_val('clusterid', "0")
        self.port = self.db.get_val_int('port', 0)
        self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
        self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
        self.irq_affinity = self.db.get_val_int('irqaffinity', 0)

        # A '*' nid is resolved against the local host at runtime.
        self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
        # NOTE(review): bare 'cluster_id' below is likely meant to be
        # 'self.cluster_id' — verify once the missing lines are recovered.
        panic("unable to set nid for", self.net_type, self.nid, cluster_id)
        self.generic_nid = 1
        debug("nid:", self.nid)
        self.generic_nid = 0

        self.nid_uuid = self.nid_to_uuid(self.nid)

        self.hostaddr = self.db.get_val('hostaddr', self.nid)
        if '*' in self.hostaddr:
            self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
            if not self.hostaddr:
                panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
            debug("hostaddr:", self.hostaddr)

        self.add_portals_module("libcfs", 'libcfs')
        self.add_portals_module("portals", 'portals')
        if node_needs_router():
            self.add_portals_module("router", 'kptlrouter')
        # Pick the NAL kernel module matching the network type.
        if self.net_type == 'tcp':
            self.add_portals_module("knals/socknal", 'ksocknal')
        if self.net_type == 'elan':
            self.add_portals_module("knals/qswnal", 'kqswnal')
        if self.net_type == 'gm':
            self.add_portals_module("knals/gmnal", 'kgmnal')
        if self.net_type == 'scimac':
            self.add_portals_module("knals/scimacnal", 'kscimacnal')
def nid_to_uuid(self, nid):
    """Derive the canonical UUID string that identifies a nid."""
    return "NID_" + str(nid) + "_UUID"
# NOTE(review): this region holds Network.prepare()/cleanup() and the
# gateway-connection helpers; def headers for prepare()/cleanup(), the 'gw'
# assignments, and the connect/disconnect calls are missing.  Verbatim.
if is_network_prepared():
self.info(self.net_type, self.nid, self.port)
# Recorded generic nids are resolved on the target node, not here.
if not (config.record and self.generic_nid):
    lctl.network(self.net_type, self.nid)
if self.net_type == 'tcp':
if self.net_type == 'elan':
if self.port and node_is_router():
    run_one_acceptor(self.port)
    self.connect_peer_gateways()

def connect_peer_gateways(self):
    # Connect to every other router's gateway on our cluster/net type.
    for router in self.db.lookup_class('node'):
        if router.get_val_int('router', 0):
            for netuuid in router.get_networks():
                net = self.db.lookup(netuuid)
                if (gw.cluster_id == self.cluster_id and
                    gw.net_type == self.net_type):
                    if gw.nid != self.nid:

def disconnect_peer_gateways(self):
    for router in self.db.lookup_class('node'):
        if router.get_val_int('router', 0):
            for netuuid in router.get_networks():
                net = self.db.lookup(netuuid)
                if (gw.cluster_id == self.cluster_id and
                    gw.net_type == self.net_type):
                    if gw.nid != self.nid:
                        except CommandError, e:
                            print "disconnect failed: ", self.name

def safe_to_clean(self):
    return not is_network_prepared()

self.info(self.net_type, self.nid, self.port)
stop_acceptor(self.port)
if node_is_router():
    self.disconnect_peer_gateways()
class RouteTable(Module):
    # NOTE(review): parts of server_for_route()'s signature/returns and the
    # prepare()/cleanup() def headers are missing; code lines verbatim.
    def __init__(self,db):
        Module.__init__(self, 'ROUTES', db)

    def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
        # only setup connections for tcp NALs
        if not net_type in ('tcp',):
        # connect to target if route is to single node and this node is the gw
        if lo == hi and local_interface(net_type, gw_cluster_id, gw):
            if not local_cluster(net_type, tgt_cluster_id):
                panic("target", lo, " not on the local cluster")
            srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
        # connect to gateway if this node is not the gw
        elif (local_cluster(net_type, gw_cluster_id)
              and not local_interface(net_type, gw_cluster_id, gw)):
            srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
        panic("no server for nid", lo)
        return Network(srvdb)

    if is_network_prepared():
    for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
        lctl.add_route(net_type, gw, lo, hi)
        srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)

    def safe_to_clean(self):
        return not is_network_prepared()

    if is_network_prepared():
        # the network is still being used, don't clean it up
    for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
        srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
        lctl.disconnect(srv)
        except CommandError, e:
            print "disconnect failed: ", self.name
        lctl.del_route(net_type, gw, lo, hi)
        except CommandError, e:
            print "del_route failed: ", self.name
class Management(Module):
    def __init__(self, db):
        Module.__init__(self, 'MGMT', db)
        # Core lustre stack plus the management service module.
        self.add_lustre_module('lvfs', 'lvfs')
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')
        self.add_lustre_module('mgmt', 'mgmt_svc')

    # NOTE(review): the prepare() def header is missing above this line.
    if is_prepared(self.name):
    lctl.newdev("mgmt", self.name, self.uuid)

    def safe_to_clean(self):

    if is_prepared(self.name):
        Module.cleanup(self)

# This is only needed to load the modules; the LDLM device
# is now created automatically.
# NOTE(review): the 'class LDLM(Module):' header is missing here.
def __init__(self,db):
    Module.__init__(self, 'LDLM', db)
    self.add_lustre_module('lvfs', 'lvfs')
    self.add_lustre_module('obdclass', 'obdclass')
    self.add_lustre_module('ptlrpc', 'ptlrpc')
# NOTE(review): the 'class LOV(Module):' header, the if/else scaffolding
# around config_only, try/except blocks, and the prepare()/cleanup() def
# headers are missing in this region; code lines verbatim.
def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
    Module.__init__(self, 'LOV', db)
    if name_override != None:
        self.name = "lov_%s" % name_override
    self.add_lustre_module('lov', 'lov')
    self.mds_uuid = self.db.get_first_ref('mds')
    self.stripe_sz = self.db.get_val_int('stripesize', 65536)
    self.stripe_off = self.db.get_val_int('stripeoffset', 0)
    self.pattern = self.db.get_val_int('stripepattern', 0)
    self.devlist = self.db.get_refs('obd')
    # Default stripe count covers every OST in the devlist.
    self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
    # This instance acts as a client, so it gets a fresh client uuid and
    # keeps the db uuid as the descriptor uuid.
    self.desc_uuid = self.uuid
    self.uuid = generate_client_uuid(self.name)
    self.fs_name = fs_name
    self.config_only = 1
    self.config_only = None
    mds= self.db.lookup(self.mds_uuid)
    self.mds_name = mds.getName()
    for obd_uuid in self.devlist:
        obd = self.db.lookup(obd_uuid)
        osc = get_osc(obd, self.uuid, fs_name)
        self.osclist.append(osc)
        panic('osc not found:', obd_uuid)

if is_prepared(self.name):
if self.config_only:
    panic("Can't prepare config_only LOV ", self.name)
for osc in self.osclist:
    # Only ignore connect failures with --force, which
    # isn't implemented here yet.
    osc.prepare(ignore_connect_failure=0)
    except CommandError, e:
        print "Error preparing OSC %s\n" % osc.uuid
self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
          self.stripe_off, self.pattern, self.devlist, self.mds_name)
lctl.lov_setup(self.name, self.uuid,
               self.desc_uuid, self.mds_name, self.stripe_cnt,
               self.stripe_sz, self.stripe_off, self.pattern,
               string.join(self.devlist))

if is_prepared(self.name):
    Module.cleanup(self)
if self.config_only:
    panic("Can't clean up config_only LOV ", self.name)
for osc in self.osclist:

def load_module(self):
    if self.config_only:
        panic("Can't load modules for config_only LOV ", self.name)
    for osc in self.osclist:
    Module.load_module(self)

def cleanup_module(self):
    if self.config_only:
        panic("Can't cleanup modules for config_only LOV ", self.name)
    Module.cleanup_module(self)
    for osc in self.osclist:
        osc.cleanup_module()
class MDSDEV(Module):
    # NOTE(review): if/else scaffolding (failover, active checks, lov_uuid
    # check) is missing in this constructor; code lines verbatim.
    def __init__(self,db):
        Module.__init__(self, 'MDSDEV', db)
        self.devpath = self.db.get_val('devpath','')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.fstype = self.db.get_val('fstype', '')
        self.nspath = self.db.get_val('nspath', '')
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
        target_uuid = self.db.get_first_ref('target')
        mds = self.db.lookup(target_uuid)
        self.name = mds.getName()
        self.filesystem_uuids = mds.get_refs('filesystem')
        # FIXME: if fstype not set, then determine based on kernel version
        self.format = self.db.get_val('autoformat', "no")
        if mds.get_val('failover', 0):
            self.failover_mds = 'f'
        self.failover_mds = 'n'
        active_uuid = get_active_target(mds)
        panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
        if self.active and config.group and config.group != mds.get_val('group'):

        self.inode_size = self.db.get_val_int('inodesize', 0)
        if self.inode_size == 0:
            # find the LOV for this MDS
            lovconfig_uuid = mds.get_first_ref('lovconfig')
            if not lovconfig_uuid:
                panic("No LOV config found for MDS ", mds.name)
            lovconfig = mds.lookup(lovconfig_uuid)
            lov_uuid = lovconfig.get_first_ref('lov')
            panic("No LOV found for lovconfig ", lovconfig.name)
            lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)

            # default stripe count controls default inode_size
            # Wider stripes need bigger inodes to hold the EA striping data.
            stripe_count = lov.stripe_cnt
            if stripe_count > 77:
                self.inode_size = 4096
            elif stripe_count > 35:
                self.inode_size = 2048
            elif stripe_count > 13:
                self.inode_size = 1024
            elif stripe_count > 3:
                self.inode_size = 512
            self.inode_size = 256

        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        # Modules needed to run an MDS (client stack included for recording).
        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('osc', 'osc')
        self.add_lustre_module('lov', 'lov')
        self.add_lustre_module('mds', 'mds')
        self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
1425 def load_module(self):
1427 Module.load_module(self)
1430 if is_prepared(self.name):
1433 debug(self.uuid, "not active")
1436 # run write_conf automatically, if --reformat used
1438 self.info(self.devpath, self.fstype, self.size, self.format)
1440 # never reformat here
1441 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1442 self.format, self.journal_size, self.inode_size,
1444 if not is_prepared('MDT'):
1445 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1447 lctl.newdev("mds", self.name, self.uuid,
1448 setup ="%s %s %s" %(blkdev, self.fstype, self.name))
1449 except CommandError, e:
1451 panic("MDS is missing the config log. Need to run " +
1452 "lconf --write_conf.")
1456 def write_conf(self):
1457 if is_prepared(self.name):
1459 self.info(self.devpath, self.fstype, self.format)
1460 blkdev = block_dev(self.devpath, self.size, self.fstype,
1461 config.reformat, self.format, self.journal_size,
1462 self.inode_size, self.mkfsoptions)
1463 lctl.newdev("mds", self.name, self.uuid,
1464 setup ="%s %s" %(blkdev, self.fstype))
1466 # record logs for the MDS lov
1467 for uuid in self.filesystem_uuids:
1468 log("recording clients for filesystem:", uuid)
1469 fs = self.db.lookup(uuid)
1470 obd_uuid = fs.get_first_ref('obd')
1471 client_uuid = generate_client_uuid(self.name)
1472 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
1475 lctl.clear_log(self.name, self.name)
1476 lctl.record(self.name, self.name)
1478 lctl.mount_option(self.name, client.get_name(), "")
1482 lctl.clear_log(self.name, self.name + '-clean')
1483 lctl.record(self.name, self.name + '-clean')
1485 lctl.del_mount_option(self.name)
1490 # record logs for each client
1492 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1494 config_options = CONFIG_FILE
1496 for node_db in self.db.lookup_class('node'):
1497 client_name = node_db.getName()
1498 for prof_uuid in node_db.get_refs('profile'):
1499 prof_db = node_db.lookup(prof_uuid)
1500 # refactor this into a function to test "clientness"
1502 for ref_class, ref_uuid in prof_db.get_all_refs():
1503 if ref_class in ('mountpoint','echoclient'):
1504 debug("recording", client_name)
1505 old_noexec = config.noexec
1507 noexec_opt = ('', '-n')
1508 ret, out = run (sys.argv[0],
1509 noexec_opt[old_noexec == 1],
1510 " -v --record --nomod",
1511 "--record_log", client_name,
1512 "--record_device", self.name,
1513 "--node", client_name,
1516 for s in out: log("record> ", string.strip(s))
1517 ret, out = run (sys.argv[0],
1518 noexec_opt[old_noexec == 1],
1519 "--cleanup -v --record --nomod",
1520 "--record_log", client_name + "-clean",
1521 "--record_device", self.name,
1522 "--node", client_name,
1525 for s in out: log("record> ", string.strip(s))
1526 config.noexec = old_noexec
1528 lctl.cleanup(self.name, self.uuid, 0, 0)
1529 except CommandError, e:
1530 log(self.module_name, "cleanup failed: ", self.name)
1533 Module.cleanup(self)
1534 clean_loop(self.devpath)
1536 def msd_remaining(self):
1537 out = lctl.device_list()
1539 if string.split(s)[2] in ('mds',):
1542 def safe_to_clean(self):
1545 def safe_to_clean_modules(self):
1546 return not self.msd_remaining()
1550 debug(self.uuid, "not active")
1553 if is_prepared(self.name):
1555 lctl.cleanup(self.name, self.uuid, config.force,
1557 except CommandError, e:
1558 log(self.module_name, "cleanup failed: ", self.name)
1561 Module.cleanup(self)
1562 if not self.msd_remaining() and is_prepared('MDT'):
1564 lctl.cleanup("MDT", "MDT_UUID", config.force,
1566 except CommandError, e:
1567 print "cleanup failed: ", self.name
1570 clean_loop(self.devpath)
1573 def __init__(self, db):
1574 Module.__init__(self, 'OSD', db)
1575 self.osdtype = self.db.get_val('osdtype')
1576 self.devpath = self.db.get_val('devpath', '')
1577 self.size = self.db.get_val_int('devsize', 0)
1578 self.journal_size = self.db.get_val_int('journalsize', 0)
1579 self.inode_size = self.db.get_val_int('inodesize', 0)
1580 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1581 self.fstype = self.db.get_val('fstype', '')
1582 self.nspath = self.db.get_val('nspath', '')
1583 target_uuid = self.db.get_first_ref('target')
1584 ost = self.db.lookup(target_uuid)
1585 self.name = ost.getName()
1586 self.format = self.db.get_val('autoformat', 'yes')
1587 if ost.get_val('failover', 0):
1588 self.failover_ost = 'f'
1590 self.failover_ost = 'n'
1592 active_uuid = get_active_target(ost)
1594 panic("No target device found:", target_uuid)
1595 if active_uuid == self.uuid:
1599 if self.active and config.group and config.group != ost.get_val('group'):
1602 self.target_dev_uuid = self.uuid
1603 self.uuid = target_uuid
1605 self.add_lustre_module('ost', 'ost')
1606 # FIXME: should we default to ext3 here?
1608 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1609 self.add_lustre_module(self.osdtype, self.osdtype)
1611 def load_module(self):
1613 Module.load_module(self)
1615 # need to check /proc/mounts and /etc/mtab before
1616 # formatting anything.
1617 # FIXME: check if device is already formatted.
1619 if is_prepared(self.name):
1622 debug(self.uuid, "not active")
1624 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1625 self.format, self.journal_size, self.inode_size)
1627 if self.osdtype == 'obdecho':
1630 blkdev = block_dev(self.devpath, self.size, self.fstype,
1631 config.reformat, self.format, self.journal_size,
1632 self.inode_size, self.mkfsoptions)
1633 lctl.newdev(self.osdtype, self.name, self.uuid,
1634 setup ="%s %s %s" %(blkdev, self.fstype,
1636 if not is_prepared('OSS'):
1637 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
1639 def osd_remaining(self):
1640 out = lctl.device_list()
1642 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1645 def safe_to_clean(self):
1648 def safe_to_clean_modules(self):
1649 return not self.osd_remaining()
1653 debug(self.uuid, "not active")
1655 if is_prepared(self.name):
1658 lctl.cleanup(self.name, self.uuid, config.force,
1660 except CommandError, e:
1661 log(self.module_name, "cleanup failed: ", self.name)
1664 if not self.osd_remaining() and is_prepared('OSS'):
1666 lctl.cleanup("OSS", "OSS_UUID", config.force,
1668 except CommandError, e:
1669 print "cleanup failed: ", self.name
1672 if not self.osdtype == 'obdecho':
1673 clean_loop(self.devpath)
1675 def mgmt_uuid_for_fs(mtpt_name):
1678 mtpt_db = toplevel.lookup_name(mtpt_name)
1679 fs_uuid = mtpt_db.get_first_ref('filesystem')
1680 fs = toplevel.lookup(fs_uuid)
1683 return fs.get_first_ref('mgmt')
1685 # Generic client module, used by OSC and MDC
1686 class Client(Module):
1687 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
1689 self.target_name = tgtdb.getName()
1690 self.target_uuid = tgtdb.getUUID()
1693 self.tgt_dev_uuid = get_active_target(tgtdb)
1694 if not self.tgt_dev_uuid:
1695 panic("No target device found for target:", self.target_name)
1697 self.kmod = kmod(config.lustre, config.portals)
1701 self.module = module
1702 self.module_name = string.upper(module)
1704 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
1705 self.target_name, fs_name)
1707 self.name = self_name
1709 self.lookup_server(self.tgt_dev_uuid)
1710 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
1712 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
1715 self.fs_name = fs_name
1718 self.add_lustre_module(module_dir, module)
1720 def lookup_server(self, srv_uuid):
1721 """ Lookup a server's network information """
1722 self._server_nets = get_ost_net(self.db, srv_uuid)
1723 if len(self._server_nets) == 0:
1724 panic ("Unable to find a server for:", srv_uuid)
1726 def get_servers(self):
1727 return self._server_nets
1729 def prepare(self, ignore_connect_failure = 0):
1730 self.info(self.target_uuid)
1731 if is_prepared(self.name):
1734 srv = choose_local_server(self.get_servers())
1738 routes = find_route(self.get_servers())
1739 if len(routes) == 0:
1740 panic ("no route to", self.target_uuid)
1741 for (srv, r) in routes:
1742 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
1743 except CommandError, e:
1744 if not ignore_connect_failure:
1747 if self.target_uuid in config.inactive and self.permits_inactive():
1748 debug("%s inactive" % self.target_uuid)
1749 inactive_p = "inactive"
1751 debug("%s active" % self.target_uuid)
1753 lctl.newdev(self.module, self.name, self.uuid,
1754 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
1755 inactive_p, self.mgmt_name))
1758 if is_prepared(self.name):
1759 Module.cleanup(self)
1761 srv = choose_local_server(self.get_servers())
1763 lctl.disconnect(srv)
1765 for (srv, r) in find_route(self.get_servers()):
1766 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
1767 except CommandError, e:
1768 log(self.module_name, "cleanup failed: ", self.name)
1774 def __init__(self, db, uuid, fs_name):
1775 Client.__init__(self, db, uuid, 'mdc', fs_name)
1777 def permits_inactive(self):
1781 def __init__(self, db, uuid, fs_name):
1782 Client.__init__(self, db, uuid, 'osc', fs_name)
1784 def permits_inactive(self):
def mgmtcli_name_for_uuid(uuid):
    """Build the canonical management-client device name for a config UUID."""
    prefix = 'MGMTCLI_'
    return prefix + ('%s' % uuid)
class ManagementClient(Client):
    """Client module for the management service (mgmt_cli).

    Unlike OSC/MDC, the device name is derived from the management
    target's own UUID instead of being auto-generated by Client.
    """
    def __init__(self, db, uuid):
        name = mgmtcli_name_for_uuid(db.getUUID())
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = name,
                        module_dir = 'mgmt')
1797 def __init__(self, db):
1798 Module.__init__(self, 'COBD', db)
1799 self.real_uuid = self.db.get_first_ref('realobd')
1800 self.cache_uuid = self.db.get_first_ref('cacheobd')
1801 self.add_lustre_module('cobd' , 'cobd')
1803 # need to check /proc/mounts and /etc/mtab before
1804 # formatting anything.
1805 # FIXME: check if device is already formatted.
1807 if is_prepared(self.name):
1809 self.info(self.real_uuid, self.cache_uuid)
1810 lctl.newdev("cobd", self.name, self.uuid,
1811 setup ="%s %s" %(self.real_uuid, self.cache_uuid))
1814 # virtual interface for OSC and LOV
1816 def __init__(self, db, uuid, fs_name, name_override = None):
1817 Module.__init__(self, 'VOSC', db)
1818 if db.get_class() == 'lov':
1819 self.osc = LOV(db, uuid, fs_name, name_override)
1821 self.osc = get_osc(db, uuid, fs_name)
1823 return self.osc.uuid
1825 return self.osc.name
1830 def load_module(self):
1831 self.osc.load_module()
1832 def cleanup_module(self):
1833 self.osc.cleanup_module()
1836 class ECHO_CLIENT(Module):
1837 def __init__(self,db):
1838 Module.__init__(self, 'ECHO_CLIENT', db)
1839 self.add_lustre_module('obdecho', 'obdecho')
1840 self.obd_uuid = self.db.get_first_ref('obd')
1841 obd = self.db.lookup(self.obd_uuid)
1842 self.uuid = generate_client_uuid(self.name)
1843 self.osc = VOSC(obd, self.uuid, self.name)
1846 if is_prepared(self.name):
1849 self.osc.prepare() # XXX This is so cheating. -p
1850 self.info(self.obd_uuid)
1852 lctl.newdev("echo_client", self.name, self.uuid,
1853 setup = self.osc.get_name())
1856 if is_prepared(self.name):
1857 Module.cleanup(self)
1860 def load_module(self):
1861 self.osc.load_module()
1862 Module.load_module(self)
1864 def cleanup_module(self):
1865 Module.cleanup_module(self)
1866 self.osc.cleanup_module()
1869 def generate_client_uuid(name):
1870 client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
1872 int(random.random() * 1048576),
1873 int(random.random() * 1048576))
1874 return client_uuid[:36]
1877 class Mountpoint(Module):
1878 def __init__(self,db):
1879 Module.__init__(self, 'MTPT', db)
1880 self.path = self.db.get_val('path')
1881 self.fs_uuid = self.db.get_first_ref('filesystem')
1882 fs = self.db.lookup(self.fs_uuid)
1883 self.mds_uuid = fs.get_first_ref('mds')
1884 self.obd_uuid = fs.get_first_ref('obd')
1885 self.mgmt_uuid = fs.get_first_ref('mgmt')
1886 obd = self.db.lookup(self.obd_uuid)
1887 client_uuid = generate_client_uuid(self.name)
1888 self.vosc = VOSC(obd, client_uuid, self.name)
1889 self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
1891 self.add_lustre_module('mdc', 'mdc')
1892 self.add_lustre_module('llite', 'llite')
1894 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
1900 if fs_is_mounted(self.path):
1901 log(self.path, "already mounted.")
1905 self.mgmtcli.prepare()
1908 mdc_name = self.mdc.name
1910 self.info(self.path, self.mds_uuid, self.obd_uuid)
1911 if config.record or config.lctl_dump:
1912 lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
1914 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
1915 (self.vosc.get_name(), mdc_name, config.config, self.path)
1916 run("mkdir", self.path)
1921 panic("mount failed:", self.path, ":", string.join(val))
1924 self.info(self.path, self.mds_uuid,self.obd_uuid)
1926 if config.record or config.lctl_dump:
1927 lctl.del_mount_option(local_node_name)
1929 if fs_is_mounted(self.path):
1931 (rc, out) = run("umount", "-f", self.path)
1933 (rc, out) = run("umount", self.path)
1935 raise CommandError('umount', out, rc)
1937 if fs_is_mounted(self.path):
1938 panic("fs is still mounted:", self.path)
1943 self.mgmtcli.cleanup()
1945 def load_module(self):
1947 self.mgmtcli.load_module()
1948 self.vosc.load_module()
1949 Module.load_module(self)
1951 def cleanup_module(self):
1952 Module.cleanup_module(self)
1953 self.vosc.cleanup_module()
1955 self.mgmtcli.cleanup_module()
1958 # ============================================================
1959 # misc query functions
1961 def get_ost_net(self, osd_uuid):
1965 osd = self.lookup(osd_uuid)
1966 node_uuid = osd.get_first_ref('node')
1967 node = self.lookup(node_uuid)
1969 panic("unable to find node for osd_uuid:", osd_uuid,
1970 " node_ref:", node_uuid)
1971 for net_uuid in node.get_networks():
1972 db = node.lookup(net_uuid)
1973 srv_list.append(Network(db))
1977 # the order of initialization is based on level.
1978 def getServiceLevel(self):
1979 type = self.get_class()
1981 if type in ('network',):
1983 elif type in ('routetbl',):
1985 elif type in ('ldlm',):
1987 elif type in ('mgmt',):
1989 elif type in ('osd', 'cobd'):
1991 elif type in ('mdsdev',):
1993 elif type in ('mountpoint', 'echoclient'):
1996 panic("Unknown type: ", type)
1998 if ret < config.minlevel or ret > config.maxlevel:
2003 # return list of services in a profile. list is a list of tuples
2004 # [(level, db_object),]
2005 def getServices(self):
2007 for ref_class, ref_uuid in self.get_all_refs():
2008 servdb = self.lookup(ref_uuid)
2010 level = getServiceLevel(servdb)
2012 list.append((level, servdb))
2014 panic('service not found: ' + ref_uuid)
2020 ############################################################
2022 # FIXME: clean this mess up!
2024 # OSC is no longer in the xml, so we have to fake it.
2025 # this is getting ugly and begging for another refactoring
2026 def get_osc(ost_db, uuid, fs_name):
2027 osc = OSC(ost_db, uuid, fs_name)
2030 def get_mdc(db, uuid, fs_name, mds_uuid):
2031 mds_db = db.lookup(mds_uuid);
2033 panic("no mds:", mds_uuid)
2034 mdc = MDC(mds_db, uuid, fs_name)
2037 ############################################################
2038 # routing ("rooting")
2040 # list of (nettype, cluster_id, nid)
2043 def find_local_clusters(node_db):
2044 global local_clusters
2045 for netuuid in node_db.get_networks():
2046 net = node_db.lookup(netuuid)
2048 debug("add_local", netuuid)
2049 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
2051 if acceptors.has_key(srv.port):
2052 panic("duplicate port:", srv.port)
2053 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
2054 srv.send_mem, srv.recv_mem,
2057 # This node is a gateway.
2059 def node_is_router():
2062 # If there are any routers found in the config, then this will be true
2063 # and all nodes will load kptlrouter.
def node_needs_router():
    """Return true if this node must load kptlrouter: either it is a
    router itself, or routers exist in the config and it needs them."""
    if needs_router:
        return needs_router
    return is_router
2068 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2069 # Currently, these local routes are only added to kptlrouter route
2070 # table if they are needed to connect to a specific server. This
2071 # should be changed so all available routes are loaded, and the
2072 # ptlrouter can make all the decisions.
2075 def find_local_routes(lustre):
2076 """ Scan the lustre config looking for routers . Build list of
2078 global local_routes, needs_router
2080 list = lustre.lookup_class('node')
2082 if router.get_val_int('router', 0):
2084 for (local_type, local_cluster_id, local_nid) in local_clusters:
2086 for netuuid in router.get_networks():
2087 db = router.lookup(netuuid)
2088 if (local_type == db.get_val('nettype') and
2089 local_cluster_id == db.get_val('clusterid')):
2090 gw = db.get_val('nid')
2093 debug("find_local_routes: gw is", gw)
2094 for route in router.get_local_routes(local_type, gw):
2095 local_routes.append(route)
2096 debug("find_local_routes:", local_routes)
2099 def choose_local_server(srv_list):
2100 for srv in srv_list:
2101 if local_cluster(srv.net_type, srv.cluster_id):
2104 def local_cluster(net_type, cluster_id):
2105 for cluster in local_clusters:
2106 if net_type == cluster[0] and cluster_id == cluster[1]:
2110 def local_interface(net_type, cluster_id, nid):
2111 for cluster in local_clusters:
2112 if (net_type == cluster[0] and cluster_id == cluster[1]
2113 and nid == cluster[2]):
2117 def find_route(srv_list):
2119 frm_type = local_clusters[0][0]
2120 for srv in srv_list:
2121 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
2122 to_type = srv.net_type
2124 cluster_id = srv.cluster_id
2125 debug ('looking for route to', to_type, to)
2126 for r in local_routes:
2127 debug("find_route: ", r)
2128 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
2129 result.append((srv, r))
2132 def get_active_target(db):
2133 target_uuid = db.getUUID()
2134 target_name = db.getName()
2135 node_name = get_select(target_name)
2137 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
2139 tgt_dev_uuid = db.get_first_ref('active')
2142 def get_server_by_nid_uuid(db, nid_uuid):
2143 for n in db.lookup_class("network"):
2145 if net.nid_uuid == nid_uuid:
2149 ############################################################
2153 type = db.get_class()
2154 debug('Service:', type, db.getName(), db.getUUID())
2159 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2160 elif type == 'network':
2162 elif type == 'routetbl':
2166 elif type == 'cobd':
2168 elif type == 'mdsdev':
2170 elif type == 'mountpoint':
2172 elif type == 'echoclient':
2174 elif type == 'mgmt':
2177 panic ("unknown service type:", type)
2181 # Prepare the system to run lustre using a particular profile
2182 # in a the configuration.
2183 # * load & the modules
2184 # * setup networking for the current node
2185 # * make sure partitions are in place and prepared
2186 # * initialize devices with lctl
2187 # Levels is important, and needs to be enforced.
2188 def for_each_profile(db, prof_list, operation):
2189 for prof_uuid in prof_list:
2190 prof_db = db.lookup(prof_uuid)
2192 panic("profile:", profile, "not found.")
2193 services = getServices(prof_db)
2196 def doWriteconf(services):
2200 if s[1].get_class() == 'mdsdev':
2201 n = newService(s[1])
2204 def doSetup(services):
2208 n = newService(s[1])
2211 def doModules(services):
2215 n = newService(s[1])
2218 def doCleanup(services):
2223 n = newService(s[1])
2224 if n.safe_to_clean():
2227 def doUnloadModules(services):
2232 n = newService(s[1])
2233 if n.safe_to_clean_modules():
2238 def doHost(lustreDB, hosts):
2239 global is_router, local_node_name
2242 node_db = lustreDB.lookup_name(h, 'node')
2246 panic('No host entry found.')
2248 local_node_name = node_db.get_val('name', 0)
2249 is_router = node_db.get_val_int('router', 0)
2250 lustre_upcall = node_db.get_val('lustreUpcall', '')
2251 portals_upcall = node_db.get_val('portalsUpcall', '')
2252 timeout = node_db.get_val_int('timeout', 0)
2253 ptldebug = node_db.get_val('ptldebug', '')
2254 subsystem = node_db.get_val('subsystem', '')
2256 find_local_clusters(node_db)
2258 find_local_routes(lustreDB)
2260 # Two step process: (1) load modules, (2) setup lustre
2261 # if not cleaning, load modules first.
2262 prof_list = node_db.get_refs('profile')
2264 if config.write_conf:
2265 for_each_profile(node_db, prof_list, doModules)
2267 for_each_profile(node_db, prof_list, doWriteconf)
2268 for_each_profile(node_db, prof_list, doUnloadModules)
2270 elif config.recover:
2271 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
2272 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
2273 "--client_uuid <UUID> --conn_uuid <UUID>")
2274 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
2276 elif config.cleanup:
2278 # the command line can override this value
2280 # ugly hack, only need to run lctl commands for --dump
2281 if config.lctl_dump or config.record:
2282 for_each_profile(node_db, prof_list, doCleanup)
2285 sys_set_timeout(timeout)
2286 sys_set_ptldebug(ptldebug)
2287 sys_set_subsystem(subsystem)
2288 sys_set_lustre_upcall(lustre_upcall)
2289 sys_set_portals_upcall(portals_upcall)
2291 for_each_profile(node_db, prof_list, doCleanup)
2292 for_each_profile(node_db, prof_list, doUnloadModules)
2295 # ugly hack, only need to run lctl commands for --dump
2296 if config.lctl_dump or config.record:
2297 sys_set_timeout(timeout)
2298 sys_set_lustre_upcall(lustre_upcall)
2299 for_each_profile(node_db, prof_list, doSetup)
2303 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2304 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2306 for_each_profile(node_db, prof_list, doModules)
2308 sys_set_debug_path()
2309 sys_set_ptldebug(ptldebug)
2310 sys_set_subsystem(subsystem)
2311 script = config.gdb_script
2312 run(lctl.lctl, ' modules >', script)
2314 log ("The GDB module script is in", script)
2315 # pause, so user has time to break and
2318 sys_set_timeout(timeout)
2319 sys_set_lustre_upcall(lustre_upcall)
2320 sys_set_portals_upcall(portals_upcall)
2322 for_each_profile(node_db, prof_list, doSetup)
2324 def doRecovery(db, lctl, tgt_uuid, client_uuid, nid_uuid):
2325 tgt = db.lookup(tgt_uuid)
2327 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
2328 new_uuid = get_active_target(tgt)
2330 raise Lustre.LconfError("doRecovery: no active target found for: " +
2332 net = choose_local_server(get_ost_net(db, new_uuid))
2334 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
2336 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
2338 oldnet = get_server_by_nid_uuid(db, nid_uuid)
2340 lctl.disconnect(oldnet)
2341 except CommandError, e:
2342 log("recover: disconnect", nid_uuid, "failed: ")
2347 except CommandError, e:
2348 log("recover: connect failed")
2351 lctl.recover(client_uuid, net.nid_uuid)
2354 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2355 base = os.path.dirname(cmd)
2356 if development_mode():
2357 if not config.lustre:
2358 debug('using objdir module paths')
2359 config.lustre = (os.path.join(base, ".."))
2360 # normalize the portals dir, using command line arg if set
2362 portals_dir = config.portals
2363 dir = os.path.join(config.lustre, portals_dir)
2364 config.portals = dir
2365 debug('config.portals', config.portals)
2366 elif config.lustre and config.portals:
2368 # if --lustre and --portals, normalize portals
2369 # can ignore PORTALS_DIR here, since it is probably useless here
2370 config.portals = os.path.join(config.lustre, config.portals)
2371 debug('config.portals B', config.portals)
2373 def sysctl(path, val):
2374 debug("+ sysctl", path, val)
2378 fp = open(os.path.join('/proc/sys', path), 'w')
def sys_set_debug_path():
    """Point the portals debug dump at the configured debug path."""
    debug_path = config.debug_path
    sysctl('portals/debug_path', debug_path)
2388 def sys_set_lustre_upcall(upcall):
2389 # the command overrides the value in the node config
2390 if config.lustre_upcall:
2391 upcall = config.lustre_upcall
2393 upcall = config.upcall
2395 lctl.set_lustre_upcall(upcall)
2397 def sys_set_portals_upcall(upcall):
2398 # the command overrides the value in the node config
2399 if config.portals_upcall:
2400 upcall = config.portals_upcall
2402 upcall = config.upcall
2404 sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Set the lustre recovery timeout via lctl.

    The --timeout command line option overrides the value from the
    node config; a missing or non-positive timeout is ignored.
    """
    # the command overrides the value in the node config
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    # use identity comparison for None (PEP 8), not equality
    if timeout is not None and timeout > 0:
        lctl.set_timeout(timeout)
def sys_tweak_socknal():
    """Apply socknal tuning: --single_socket disables typed sockets."""
    if not config.single_socket:
        return
    sysctl("socknal/typed", 0)
2417 def sys_optimize_elan ():
2418 procfiles = ["/proc/elan/config/eventint_punt_loops",
2419 "/proc/qsnet/elan3/config/eventint_punt_loops",
2420 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
2422 if os.access(p, os.R_OK):
2423 run ("echo 0 > " + p)
2425 def sys_set_ptldebug(ptldebug):
2427 ptldebug = config.ptldebug
2430 val = eval(ptldebug, ptldebug_names)
2431 val = "0x%x" % (val)
2432 sysctl('portals/debug', val)
2433 except NameError, e:
2436 def sys_set_subsystem(subsystem):
2437 if config.subsystem:
2438 subsystem = config.subsystem
2441 val = eval(subsystem, subsystem_names)
2442 val = "0x%x" % (val)
2443 sysctl('portals/subsystem_debug', val)
2444 except NameError, e:
2447 def sys_set_netmem_max(path, max):
2448 debug("setting", path, "to at least", max)
2456 fp = open(path, 'w')
2457 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the portals and obd character device nodes if absent."""
    needed = (('/dev/portals', 'mknod /dev/portals c 10 240'),
              ('/dev/obd',     'mknod /dev/obd c 10 241'))
    for devnode, mknod_cmd in needed:
        # only create the node when it does not already exist/readable
        if not os.access(devnode, os.R_OK):
            run(mknod_cmd)
2468 # Add dir to the global PATH, if not already there.
2469 def add_to_path(new_dir):
2470 syspath = string.split(os.environ['PATH'], ':')
2471 if new_dir in syspath:
2473 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
2475 def default_debug_path():
2476 path = '/tmp/lustre-log'
2477 if os.path.isdir('/r'):
2482 def default_gdb_script():
2483 script = '/tmp/ogdb'
2484 if os.path.isdir('/r'):
2485 return '/r' + script
2490 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
2491 # ensure basic elements are in the system path
2492 def sanitise_path():
2493 for dir in DEFAULT_PATH:
2496 # global hack for the --select handling
2498 def init_select(args):
2499 # args = [service=nodeA,service2=nodeB service3=nodeC]
2502 list = string.split(arg, ',')
2504 srv, node = string.split(entry, '=')
2505 tgt_select[srv] = node
2507 def get_select(srv):
2508 if tgt_select.has_key(srv):
2509 return tgt_select[srv]
2513 FLAG = Lustre.Options.FLAG
2514 PARAM = Lustre.Options.PARAM
2515 INTPARAM = Lustre.Options.INTPARAM
2516 PARAMLIST = Lustre.Options.PARAMLIST
2518 ('verbose,v', "Print system commands as they are run"),
2519 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
2520 ('config', "Cluster config name used for LDAP query", PARAM),
2521 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
2522 ('node', "Load config for <nodename>", PARAM),
2523 ('cleanup,d', "Cleans up config. (Shutdown)"),
2524 ('force,f', "Forced unmounting and/or obd detach during cleanup",
2526 ('single_socket', "socknal option: only use one socket instead of bundle",
2528 ('failover',"""Used to shut down without saving state.
2529 This will allow this node to "give up" a service to a
2530 another node for failover purposes. This will not
2531 be a clean shutdown.""",
2533 ('gdb', """Prints message after creating gdb module script
2534 and sleeps for 5 seconds."""),
2535 ('noexec,n', """Prints the commands and steps that will be run for a
2536 config without executing them. This can used to check if a
2537 config file is doing what it should be doing"""),
2538 ('nomod', "Skip load/unload module step."),
2539 ('nosetup', "Skip device setup/cleanup step."),
2540 ('reformat', "Reformat all devices (without question)"),
2541 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
2542 ('dump', "Dump the kernel debug log to file before portals is unloaded",
2544 ('write_conf', "Save all the client config information on mds."),
2545 ('record', "Write config information on mds."),
2546 ('record_log', "Name of config record log.", PARAM),
2547 ('record_device', "MDS device name that will record the config commands",
2549 ('minlevel', "Minimum level of services to configure/cleanup",
2551 ('maxlevel', """Maximum level of services to configure/cleanup
2552 Levels are aproximatly like:
2557 70 - mountpoint, echo_client, osc, mdc, lov""",
2559 ('lustre', """Base directory of lustre sources. This parameter will
2560 cause lconf to load modules from a source tree.""", PARAM),
2561 ('portals', """Portals source directory. If this is a relative path,
2562 then it is assumed to be relative to lustre. """, PARAM),
2563 ('timeout', "Set recovery timeout", INTPARAM),
2564 ('upcall', "Set both portals and lustre upcall script", PARAM),
2565 ('lustre_upcall', "Set lustre upcall script", PARAM),
2566 ('portals_upcall', "Set portals upcall script", PARAM),
2567 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
2568 ('ptldebug', "Set the portals debug level", PARAM),
2569 ('subsystem', "Set the portals debug subsystem", PARAM),
2570 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
2571 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
2572 # Client recovery options
2573 ('recover', "Recover a device"),
2574 ('group', "The group of devices to configure or cleanup", PARAM),
2575 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
2576 ('client_uuid', "The failed client (required for recovery)", PARAM),
2577 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
2579 ('inactive', """The name of an inactive service, to be ignored during
2580 mounting (currently OST-only). Can be repeated.""",
2585 global lctl, config, toplevel, CONFIG_FILE
2587 # in the upcall this is set to SIG_IGN
2588 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
2590 cl = Lustre.Options("lconf", "config.xml", lconf_options)
2592 config, args = cl.parse(sys.argv[1:])
2593 except Lustre.OptionError, e:
2597 setupModulePath(sys.argv[0])
2599 host = socket.gethostname()
2601 # the PRNG is normally seeded with time(), which is not so good for starting
2602 # time-synchronized clusters
2603 input = open('/dev/urandom', 'r')
2605 print 'Unable to open /dev/urandom!'
2607 seed = input.read(32)
2613 init_select(config.select)
2616 # allow config to be fetched via HTTP, but only with python2
2617 if sys.version[0] != '1' and args[0].startswith('http://'):
2620 config_file = urllib2.urlopen(args[0])
2621 except (urllib2.URLError, socket.error), err:
2622 if hasattr(err, 'args'):
2624 print "Could not access '%s': %s" %(args[0], err)
2626 elif not os.access(args[0], os.R_OK):
2627 print 'File not found or readable:', args[0]
2631 config_file = open(args[0], 'r')
2633 dom = xml.dom.minidom.parse(config_file)
2635 panic("%s does not appear to be a config file." % (args[0]))
2636 sys.exit(1) # make sure to die here, even in debug mode.
2637 CONFIG_FILE = args[0]
2638 db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
2639 if not config.config:
2640 config.config = os.path.basename(args[0])# use full path?
2641 if config.config[-4:] == '.xml':
2642 config.config = config.config[:-4]
2643 elif config.ldapurl:
2644 if not config.config:
2645 panic("--ldapurl requires --config name")
2646 dn = "config=%s,fs=lustre" % (config.config)
2647 db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
2648 elif config.ptldebug or config.subsystem:
2649 sys_set_ptldebug(None)
2650 sys_set_subsystem(None)
2653 print 'Missing config file or ldap URL.'
2654 print 'see lconf --help for command summary'
2659 ver = db.get_version()
2661 panic("No version found in config data, please recreate.")
2662 if ver != Lustre.CONFIG_VERSION:
2663 panic("Config version", ver, "does not match lconf version",
2664 Lustre.CONFIG_VERSION)
2668 node_list.append(config.node)
2671 node_list.append(host)
2672 node_list.append('localhost')
2674 debug("configuring for host: ", node_list)
2677 config.debug_path = config.debug_path + '-' + host
2678 config.gdb_script = config.gdb_script + '-' + host
2680 lctl = LCTLInterface('lctl')
2682 if config.lctl_dump:
2683 lctl.use_save_file(config.lctl_dump)
2686 if not (config.record_device and config.record_log):
2687 panic("When recording, both --record_log and --record_device must be specified.")
2688 lctl.clear_log(config.record_device, config.record_log)
2689 lctl.record(config.record_device, config.record_log)
2691 doHost(db, node_list)
2696 if __name__ == "__main__":
2699 except Lustre.LconfError, e:
2701 # traceback.print_exc(file=sys.stdout)
2703 except CommandError, e:
2707 if first_cleanup_error:
2708 sys.exit(first_cleanup_error)