# Copyright (C) 2002-2003 Cluster File Systems, Inc.
# Authors: Robert Read <rread@clusterfs.com>
#          Mike Shaver <shaver@clusterfs.com>
# This file is part of Lustre, http://www.lustre.org.
#
# Lustre is free software; you can redistribute it and/or
# modify it under the terms of version 2 of the GNU General Public
# License as published by the Free Software Foundation.
#
# Lustre is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Lustre; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# lconf - lustre configuration tool
#
# lconf is the main driver script for starting and stopping
# lustre filesystem services.
#
# Based in part on the XML obdctl modifications done by Brian Behlendorf
import sys, getopt, types
import string, os, stat, popen2, socket, time, random, fcntl, select
import re, exceptions, signal, traceback
import xml.dom.minidom

if sys.version[0] == '1':
    from FCNTL import F_GETFL, F_SETFL
else:
    from fcntl import F_GETFL, F_SETFL
PYMOD_DIR = "/usr/lib/lustre/python"

def development_mode():
    base = os.path.dirname(sys.argv[0])
    if os.access(base+"/Makefile", os.R_OK):
        return 1
    return 0

if development_mode():
    sys.path.append('../utils')
else:
    sys.path.append(PYMOD_DIR)
MAXTCPBUF = 16777216
DEFAULT_TCPBUF = 8388608

# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
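# (loop devices are block major 7, minor N; if the nodes are missing they
# can be created by hand with, e.g.:
#    for i in `seq 0 255`; do mknod -m 660 /dev/loop$i b 7 $i; done )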
PORTALS_DIR = 'portals'

# Needed to call lconf --record
CONFIG_FILE = ""
# Please keep these in sync with the values in portals/kp30.h
ptldebug_names = {
    "warning"   : (1 << 10),
    "portals"   : (1 << 14),
    "dlmtrace"  : (1 << 16),
    "rpctrace"  : (1 << 20),
    "vfstrace"  : (1 << 21),
    }

subsystem_names = {
    "undefined" : (1 << 0),
    "portals"   : (1 << 10),
    "socknal"   : (1 << 11),
    "qswnal"    : (1 << 12),
    "pinger"    : (1 << 13),
    "filter"    : (1 << 14),
    "ptlrouter" : (1 << 20),
    }
first_cleanup_error = 0
def cleanup_error(rc):
    global first_cleanup_error
    if not first_cleanup_error:
        first_cleanup_error = rc
# ============================================================
# debugging and error funcs

def fixme(msg = "this feature"):
    raise Lustre.LconfError, msg + ' not implemented yet.'

def panic(*args):
    msg = string.join(map(str,args))
    if not config.noexec:
        raise Lustre.LconfError(msg)
    else:
        print "! " + msg

def log(*args):
    msg = string.join(map(str,args))
    print msg

def logall(msgs):
    for s in msgs:
        print string.strip(s)

def debug(*args):
    if config.verbose:
        msg = string.join(map(str,args))
        print msg
# ack, python's builtin int() does not support '0x123' syntax.
# eval can do it, although what a hack!
def my_int(s):
    try:
        if s[0:2] == '0x':
            return eval(s, {}, {})
        else:
            return int(s)
    except SyntaxError, e:
        raise ValueError("not a number")
    except NameError, e:
        raise ValueError("not a number")
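# (for what it's worth, Python 2's int(s, 0) parses '0x123' directly; the
# eval with empty globals is kept so the script still runs under 1.5.x)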
# ============================================================
# locally defined exceptions
class CommandError (exceptions.Exception):
    def __init__(self, cmd_name, cmd_err, rc=None):
        self.cmd_name = cmd_name
        self.cmd_err = cmd_err
        self.rc = rc

    def dump(self):
        if type(self.cmd_err) == types.StringType:
            if self.rc:
                print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
            else:
                print "! %s: %s" % (self.cmd_name, self.cmd_err)
        elif type(self.cmd_err) == types.ListType:
            if self.rc:
                print "! %s (error %d):" % (self.cmd_name, self.rc)
            else:
                print "! %s:" % (self.cmd_name)
            for s in self.cmd_err:
                print "> %s" %(string.strip(s))
# ============================================================
# handle daemons, like the acceptor
class DaemonHandler:
    """ Manage starting and stopping a daemon. Assumes daemon manages
    its own pid file. """
    def __init__(self, cmd):
        self.command = cmd
        self.path = ""

    def start(self):
        if self.running():
            log(self.command, "already running.")
        if not self.path:
            self.path = find_prog(self.command)
        if not self.path:
            panic(self.command, "not found.")
        ret, out = runcmd(self.path +' '+ self.command_line())
        if ret:
            raise CommandError(self.path, out, ret)

    def stop(self):
        if self.running():
            pid = self.read_pidfile()
            try:
                log ("killing process", pid)
                os.kill(pid, 15)
                #time.sleep(1) # let daemon die
            except OSError, e:
                log("unable to kill", self.command, e)
            if self.running():
                log("unable to kill", self.command)

    def running(self):
        pid = self.read_pidfile()
        if pid:
            try:
                os.kill(pid, 0)
            except OSError:
                self.clean_pidfile()
            else:
                return 1
        return 0

    def read_pidfile(self):
        try:
            fp = open(self.pidfile(), 'r')
            pid = int(fp.read())
            fp.close()
            return pid
        except IOError:
            return 0

    def clean_pidfile(self):
        """ Remove a stale pidfile """
        log("removing stale pidfile:", self.pidfile())
        try:
            os.unlink(self.pidfile())
        except OSError, e:
            log(self.pidfile(), e)
class AcceptorHandler(DaemonHandler):
    def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
        DaemonHandler.__init__(self, "acceptor")
        self.port = port
        self.flags = ''
        self.send_mem = send_mem
        self.recv_mem = recv_mem
        if irq_aff:
            self.flags = self.flags + ' -i'

    def pidfile(self):
        return "/var/run/%s-%d.pid" % (self.command, self.port)

    def command_line(self):
        return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem,
                                    self.flags, self.port)))
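# For illustration, the daemon then ends up invoked roughly as
#    acceptor -s 8388608 -r 8388608 -i 988
# (values hypothetical: 8388608 is DEFAULT_TCPBUF, 988 a typical port).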
acceptors = {}

# start the acceptors
def run_acceptors():
    if config.lctl_dump or config.record:
        return
    for port in acceptors.keys():
        daemon = acceptors[port]
        if not daemon.running():
            daemon.start()

def run_one_acceptor(port):
    if config.lctl_dump or config.record:
        return
    if acceptors.has_key(port):
        daemon = acceptors[port]
        if not daemon.running():
            daemon.start()
    else:
        panic("run_one_acceptor: No acceptor defined for port:", port)

def stop_acceptor(port):
    if acceptors.has_key(port):
        daemon = acceptors[port]
        if daemon.running():
            daemon.stop()
# ============================================================
# handle lctl interface
class LCTLInterface:
    """
    Manage communication with lctl
    """

    def __init__(self, cmd):
        """
        Initialize the class by finding the lctl binary.
        """
        self.lctl = find_prog(cmd)
        self.save_file = ''
        self.record_device = ''
        if not self.lctl:
            if config.noexec:
                debug('! lctl not found')
                self.lctl = 'lctl'
            else:
                raise CommandError('lctl', "unable to find lctl binary.")
    def use_save_file(self, file):
        self.save_file = file

    def record(self, dev_name, logname):
        log("Recording log", logname, "on", dev_name)
        self.record_device = dev_name
        self.record_log = logname

    def end_record(self):
        log("End recording log", self.record_log, "on", self.record_device)
        self.record_device = None
        self.record_log = None
    def set_nonblock(self, fd):
        fl = fcntl.fcntl(fd, F_GETFL)
        fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
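    # (stdout and stderr of the lctl child are read concurrently in run()
    # below; if either pipe stayed blocking, a full buffer on the other
    # side could deadlock us against lctl, hence O_NDELAY plus select())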
    def run(self, cmds):
        """
        run lctl
        the cmds are written to stdin of lctl
        lctl doesn't return errors when run in script mode, so
        stderr is checked
        should modify command line to accept multiple commands, or
        create complex command line options
        """
        cmd_line = self.lctl
        if self.save_file:
            cmds = '\n dump ' + self.save_file + '\n' + cmds
        elif self.record_device:
            cmds = """
    device $%s
    record %s
    %s""" % (self.record_device, self.record_log, cmds)

        debug("+", cmd_line, cmds)
        if config.noexec: return (0, [])

        child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
        child.tochild.write(cmds + "\n")
        child.tochild.close()

        # From "Python Cookbook" from O'Reilly
        outfile = child.fromchild
        outfd = outfile.fileno()
        self.set_nonblock(outfd)
        errfile = child.childerr
        errfd = errfile.fileno()
        self.set_nonblock(errfd)

        outdata = errdata = ''
        outeof = erreof = 0
        while 1:
            ready = select.select([outfd,errfd],[],[]) # Wait for input
            if outfd in ready[0]:
                outchunk = outfile.read()
                if outchunk == '': outeof = 1
                outdata = outdata + outchunk
            if errfd in ready[0]:
                errchunk = errfile.read()
                if errchunk == '': erreof = 1
                errdata = errdata + errchunk
            if outeof and erreof: break
        # end of "borrowed" code

        ret = child.wait()
        if os.WIFEXITED(ret):
            rc = os.WEXITSTATUS(ret)
        else:
            rc = 0
        if rc or len(errdata):
            raise CommandError(self.lctl, errdata, rc)
        return rc, outdata
    def runcmd(self, *args):
        """
        run lctl using the command line
        """
        cmd = string.join(map(str,args))
        debug("+", self.lctl, cmd)
        rc, out = run(self.lctl, cmd)
        if rc:
            raise CommandError(self.lctl, out, rc)
        return rc, out
    def clear_log(self, dev, log):
        """ clear an existing log """
        cmds = """
  device $%s
  probe
  clear_log %s
  quit """ % (dev, log)
        self.run(cmds)

    def network(self, net, nid):
        """ set mynid """
        cmds = """
  network %s
  mynid %s
  quit """ % (net, nid)
        self.run(cmds)
    # create a new connection
    def add_uuid(self, net_type, uuid, nid):
        cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
        self.run(cmds)

    def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
                     port, flags):
        if net_type in ('tcp',) and not config.lctl_dump:
            cmds = """
  network %s
  send_mem %d
  recv_mem %d
  add_autoconn %s %s %d %s
  quit""" % (net_type, send_mem, recv_mem,
             nid, hostaddr, port, flags )
            self.run(cmds)
    def connect(self, srv):
        self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
        if srv.net_type in ('tcp',) and not config.lctl_dump:
            flags = 's'
            if srv.irq_affinity:
                flags = flags + 'i'
            self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
                              srv.nid, srv.hostaddr, srv.port, flags)

    # recover a device
    def recover(self, dev_name, new_conn):
        cmds = """
    device $%s
    recover %s""" %(dev_name, new_conn)
        self.run(cmds)
    # add a route to a range
    def add_route(self, net, gw, lo, hi):
        cmds = """
  network %s
  add_route %s %s %s
  quit """ % (net, gw, lo, hi)
        try:
            self.run(cmds)
        except CommandError, e:
            log ("ignore: ")
            e.dump()

    def del_route(self, net, gw, lo, hi):
        cmds = """
  ignore_errors
  network %s
  del_route %s %s %s
  quit """ % (net, gw, lo, hi)
        self.run(cmds)

    # add a route to a host
    def add_route_host(self, net, uuid, gw, tgt):
        self.add_uuid(net, uuid, tgt)
        cmds = """
  network %s
  add_route %s %s
  quit """ % (net, gw, tgt)
        try:
            self.run(cmds)
        except CommandError, e:
            log ("ignore: ")
            e.dump()

    # delete a route to a host
    def del_route_host(self, net, uuid, gw, tgt):
        self.del_uuid(uuid)
        cmds = """
  ignore_errors
  network %s
  del_route %s %s
  quit """ % (net, gw, tgt)
        self.run(cmds)
    def del_autoconn(self, net_type, nid, hostaddr):
        if net_type in ('tcp',) and not config.lctl_dump:
            cmds = """
  ignore_errors
  network %s
  del_autoconn %s %s s
  quit""" % (net_type, nid, hostaddr)
            self.run(cmds)

    # disconnect one connection
    def disconnect(self, srv):
        self.del_uuid(srv.nid_uuid)
        if srv.net_type in ('tcp',) and not config.lctl_dump:
            self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)

    def del_uuid(self, uuid):
        cmds = """
  ignore_errors
  del_uuid %s
  quit""" % (uuid,)
        self.run(cmds)

    def disconnectAll(self, net):
        cmds = """
  ignore_errors
  network %s
  disconnect
  quit""" % (net)
        self.run(cmds)
    def attach(self, type, name, uuid):
        cmds = """
  attach %s %s %s
  quit""" % (type, name, uuid)
        self.run(cmds)

    def setup(self, name, setup = ""):
        cmds = """
  cfg_device %s
  setup %s
  quit""" % (name, setup)
        self.run(cmds)
    # create a new device with lctl
    def newdev(self, type, name, uuid, setup = ""):
        self.attach(type, name, uuid)
        try:
            self.setup(name, setup)
        except CommandError, e:
            self.cleanup(name, uuid, 0)
            raise e

    def cleanup(self, name, uuid, force, failover = 0):
        if failover: force = 1
        cmds = """
  ignore_errors
  cfg_device $%s
  cleanup %s %s
  detach
  quit""" % (name, ('', 'force')[force],
             ('', 'failover')[failover])
        self.run(cmds)
    def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
                  stripe_sz, stripe_off,
                  pattern, devlist):
        cmds = """
  attach lov %s %s
  lov_setup %s %d %d %d %s %s
  quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
             pattern, devlist)
        self.run(cmds)

    def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
                      pattern, devlist):
        cmds = """
  cfg_device $%s
  lov_setconfig %s %d %d %d %s %s
  quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
        self.run(cmds)
    def dump(self, dump_file):
        cmds = """
  debug_kernel %s 1
  quit""" % (dump_file)
        self.run(cmds)

    # get list of devices
    def device_list(self):
        devices = '/proc/fs/lustre/devices'
        ret = []
        if os.access(devices, os.R_OK):
            try:
                fp = open(devices, 'r')
                ret = fp.readlines()
                fp.close()
            except IOError, e:
                log(e)
        return ret

    def lustre_version(self):
        rc, out = self.runcmd('version')
        return out
    def mount_option(self, profile, osc, mdc):
        cmds = """
  mount_option %s %s %s
  quit""" % (profile, osc, mdc)
        self.run(cmds)

    # delete mount options
    def del_mount_option(self, profile):
        cmds = """
  del_mount_option %s
  quit""" % (profile,)
        self.run(cmds)

    def set_timeout(self, timeout):
        cmds = """
  set_timeout %s
  quit""" % (timeout,)
        self.run(cmds)

    # set the lustre upcall
    def set_lustre_upcall(self, upcall):
        cmds = """
  set_lustre_upcall %s
  quit""" % (upcall,)
        self.run(cmds)
# ============================================================
# Various system-level functions
# (ideally moved to their own module)

# Run a command and return the output and status.
# stderr is captured along with stdout (2>&1), so the caller
# sees everything the command printed
def runcmd(cmd):
    debug ("+", cmd)
    if config.noexec: return (0, [])
    f = os.popen(cmd + ' 2>&1')
    out = f.readlines()
    ret = f.close()
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return (ret, out)

def run(*args):
    cmd = string.join(map(str,args))
    return runcmd(cmd)

# Run a command in the background.
def run_daemon(*args):
    cmd = string.join(map(str,args))
    debug ("+", cmd)
    if config.noexec: return 0
    f = os.popen(cmd + ' 2>&1')
    ret = f.close()
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return ret
# Determine full path to use for an external command
# searches dirname(argv[0]) first, then PATH
def find_prog(cmd):
    syspath = string.split(os.environ['PATH'], ':')
    cmdpath = os.path.dirname(sys.argv[0])
    syspath.insert(0, cmdpath)
    if config.portals:
        syspath.insert(0, os.path.join(config.portals, 'utils/'))
    for d in syspath:
        prog = os.path.join(d,cmd)
        if os.access(prog, os.X_OK):
            return prog
    return ''
# Recursively look for file starting at base dir
def do_find_file(base, mod):
    fullname = os.path.join(base, mod)
    if os.access(fullname, os.R_OK):
        return fullname
    for d in os.listdir(base):
        dir = os.path.join(base,d)
        if os.path.isdir(dir):
            module = do_find_file(dir, mod)
            if module:
                return module
def find_module(src_dir, dev_dir, modname):
    modbase = src_dir +'/'+ dev_dir +'/'+ modname
    for modext in '.ko', '.o':
        module = modbase + modext
        try:
            if os.access(module, os.R_OK):
                return module
        except OSError:
            pass
    return None
# is the path a block device?
def is_block(path):
    s = ()
    try:
        s = os.stat(path)
    except OSError:
        return 0
    return stat.S_ISBLK(s[stat.ST_MODE])
# build fs according to type
def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
    block_cnt = ''
    jopt = ''
    iopt = ''
    if devsize:
        if devsize < 8000:
            panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
                  (dev, devsize))
        # devsize is in 1k, and fs block count is in 4k
        block_cnt = devsize/4

    if fstype in ('ext3', 'extN', 'ldiskfs'):
        # ext3 journal size is in megabytes
        if jsize == 0:
            if devsize == 0:
                if not is_block(dev):
                    ret, out = runcmd("ls -l %s" %dev)
                    devsize = int(string.split(out[0])[4]) / 1024
                else:
                    ret, out = runcmd("sfdisk -s %s" %dev)
                    devsize = int(out[0])
            if devsize > 1024 * 1024:
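                # (devsize is in KB, so this works out to roughly 4MB of
                # journal per 100MB of device)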
                jsize = ((devsize / 102400) * 4)
        if jsize: jopt = "-J size=%d" %(jsize,)
        if isize: iopt = "-I %d" %(isize,)
        mkfs = 'mkfs.ext2 -j -b 4096 '
        if not isblock or config.force:
            mkfs = mkfs + ' -F '
    elif fstype == 'reiserfs':
        # reiserfs journal size is in blocks
        if jsize: jopt = "--journal_size %d" %(jsize,)
        mkfs = 'mkreiserfs -ff'
    else:
        panic('unsupported fs type: ', fstype)

    if config.mkfsoptions != None:
        mkfs = mkfs + ' ' + config.mkfsoptions
    if mkfsoptions != None:
        mkfs = mkfs + ' ' + mkfsoptions
    (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
    if ret:
        panic("Unable to build fs:", dev, string.join(out))
    # enable hash tree indexing on the fs
    if fstype in ('ext3', 'extN', 'ldiskfs'):
        htree = 'echo "feature FEATURE_C5" | debugfs -w'
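        # ("FEATURE_C5" is the raw compat-flag name older debugfs versions
        # use for the dir_index/htree feature)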
        (ret, out) = run (htree, dev)
        if ret:
            panic("Unable to enable htree:", dev)
# some systems use /dev/loopN, some /dev/loop/N
def loop_base():
    loop = '/dev/loop'
    if not os.access(loop + str(0), os.R_OK):
        loop = loop + '/'
        if not os.access(loop + str(0), os.R_OK):
            panic ("can't access loop devices")
    return loop
# find loop device assigned to the file
def find_loop(file):
    loop = loop_base()
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if out and stat == 0:
                m = re.search(r'\((.*)\)', out[0])
                if m and file == m.group(1):
                    return dev
        else:
            break
    return ''
# create file if necessary and assign the first free loop device
def init_loop(file, size, fstype, journal_size, inode_size, mkfsoptions, reformat):
    dev = find_loop(file)
    if dev:
        print 'WARNING file:', file, 'already mapped to', dev
        return dev
    if reformat or not os.access(file, os.R_OK | os.W_OK):
        if size < 8000:
            panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
810 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
813 panic("Unable to create backing store:", file)
814 mkfs(file, size, fstype, journal_size, inode_size, mkfsoptions, isblock=0)
    # find next free loop
    loop = loop_base()
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if stat:
                run('losetup', dev, file)
                return dev
        else:
            print "out of loop devices"
            return ''
    print "out of loop devices"
    return ''
# undo loop assignment
def clean_loop(file):
    dev = find_loop(file)
    if dev:
        ret, out = run('losetup -d', dev)
        if ret:
            log('unable to clean loop device:', dev, 'for file:', file)
            logall(out)
# determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
    # FIXME don't know how to implement this
    return 0
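    # A possible sketch (not what this script does today): probe the
    # superblock with the fs tools and report "needs format" on failure,
    # e.g. for ext3:
    #    rc, out = runcmd("tune2fs -l %s" % dev)
    #    return rc != 0     # no ext2/ext3 superblock found
    # (reiserfs would need debugreiserfs instead)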
# initialize a block device if needed
def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
              inode_size, mkfsoptions):
    if config.noexec: return dev
    if not is_block(dev):
        dev = init_loop(dev, size, fstype, journal_size, inode_size,
                        mkfsoptions, reformat)
    elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
        mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
             isblock=0)
#    else:
#        panic("device:", dev,
#              "not prepared, and autoformat is not set.\n",
#              "Rerun with --reformat option to format ALL filesystems")
    return dev
863 """lookup IP address for an interface"""
864 rc, out = run("/sbin/ifconfig", iface)
867 addr = string.split(out[1])[1]
868 ip = string.split(addr, ':')[1]
def sys_get_elan_position_file():
    procfiles = ["/proc/elan/device0/position",
                 "/proc/qsnet/elan4/device0/position",
                 "/proc/qsnet/elan3/device0/position"]
    for p in procfiles:
        if os.access(p, os.R_OK):
            return p
    return ''
def sys_get_local_nid(net_type, wildcard, cluster_id):
    """Return the local nid."""
    local = ""
    if sys_get_elan_position_file():
        local = sys_get_local_address('elan', '*', cluster_id)
    else:
        local = sys_get_local_address(net_type, wildcard, cluster_id)
    return local
def sys_get_local_address(net_type, wildcard, cluster_id):
    """Return the local address for the network type."""
    local = ""
    if net_type in ('tcp',):
        if '*' in wildcard:
            iface, star = string.split(wildcard, ':')
            local = if2addr(iface)
            if not local:
                panic ("unable to determine ip for:", wildcard)
        else:
            host = socket.gethostname()
            local = socket.gethostbyname(host)
    elif net_type == 'elan':
        # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
        f = sys_get_elan_position_file()
        if not f:
            panic ("unable to determine local Elan ID")
        try:
            fp = open(f, 'r')
            lines = fp.readlines()
            fp.close()
            for l in lines:
                a = string.split(l)
                if a[0] == 'NodeId':
                    elan_id = a[1]
                    break
            try:
                nid = my_int(cluster_id) + my_int(elan_id)
                local = "%d" % (nid)
            except ValueError, e:
                local = elan_id
        except IOError, e:
            log(e)
    elif net_type == 'gm':
        fixme("automatic local address for GM")
    elif net_type == 'scimac':
        scinode="/opt/scali/sbin/scinode"
        if os.path.exists(scinode):
            (rc,local) = run(scinode)
        else:
            panic (scinode, " not found on node with scimac networking")
        if rc:
            panic (scinode, " failed")
        local=string.rstrip(local[0])
    return local
def mod_loaded(modname):
    """Check if a module is already loaded. Look in /proc/modules for it."""
    try:
        fp = open('/proc/modules')
        lines = fp.readlines()
        fp.close()
        # please forgive my tired fingers for this one
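        # (the filter/map below takes the first word of each /proc/modules
        # line and keeps only exact matches for modname)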
        ret = filter(lambda word, mod=modname: word == mod,
                     map(lambda line: string.split(line)[0], lines))
        return ret
    except Exception, e:
        return 0
# XXX: instead of device_list, ask for $name and see what we get
def is_prepared(name):
    """Return true if a device exists for the name"""
    if config.lctl_dump:
        return 0
    if (config.noexec or config.record) and config.cleanup:
        return 1
    try:
        # expect this format:
        # 1 UP ldlm ldlm ldlm_UUID 2
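        # (columns are: device number, state, type, name, uuid, refcount,
        # so the device name is field 3 of the split below)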
        out = lctl.device_list()
        for s in out:
            if name == string.split(s)[3]:
                return 1
    except CommandError, e:
        e.dump()
    return 0
def is_network_prepared():
    """If any device exists, then assume that all networking
       has been configured"""
    out = lctl.device_list()
    return len(out) > 0
def fs_is_mounted(path):
    """Return true if path is a mounted lustre filesystem"""
    try:
        fp = open('/proc/mounts')
        lines = fp.readlines()
        fp.close()
        for l in lines:
            a = string.split(l)
            if a[1] == path and a[2] == 'lustre_lite':
                return 1
    except IOError, e:
        log(e)
    return 0
989 """Manage kernel modules"""
990 def __init__(self, lustre_dir, portals_dir):
991 self.lustre_dir = lustre_dir
992 self.portals_dir = portals_dir
993 self.kmodule_list = []
995 def add_portals_module(self, dev_dir, modname):
996 """Append a module to list of modules to load."""
997 self.kmodule_list.append((self.portals_dir, dev_dir, modname))
999 def add_lustre_module(self, dev_dir, modname):
1000 """Append a module to list of modules to load."""
1001 self.kmodule_list.append((self.lustre_dir, dev_dir, modname))
    def load_module(self):
        """Load all the modules in the list in the order they appear."""
        for src_dir, dev_dir, mod in self.kmodule_list:
            if mod_loaded(mod) and not config.noexec:
                continue
            log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
            if src_dir:
                module = find_module(src_dir, dev_dir, mod)
                if not module:
                    panic('module not found:', mod)
                (rc, out) = run('/sbin/insmod', module)
                if rc:
                    raise CommandError('insmod', out, rc)
            else:
                (rc, out) = run('/sbin/modprobe', mod)
                if rc:
                    raise CommandError('modprobe', out, rc)
    def cleanup_module(self):
        """Unload the modules in the list in reverse order."""
        rev = self.kmodule_list
        rev.reverse()
        for src_dir, dev_dir, mod in rev:
            if not mod_loaded(mod) and not config.noexec:
                continue
            # debug hack
            if mod == 'portals' and config.dump:
                lctl.dump(config.dump)
            log('unloading module:', mod)
            (rc, out) = run('/sbin/rmmod', mod)
            if rc:
                log('! unable to unload module:', mod)
                logall(out)
# ============================================================
# Classes to prepare and cleanup the various objects
#
class Module:
    """ Base class for the rest of the modules. The default cleanup method is
    defined here, as well as some utility funcs.
    """
    def __init__(self, module_name, db):
        self.db = db
        self.module_name = module_name
        self.name = self.db.getName()
        self.uuid = self.db.getUUID()
        self.kmod = kmod(config.lustre, config.portals)

    def info(self, *args):
        msg = string.join(map(str,args))
        print self.module_name + ":", self.name, self.uuid, msg

    def cleanup(self):
        """ default cleanup, used for most modules """
        self.info()
        try:
            lctl.cleanup(self.name, self.uuid, config.force)
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)
            e.dump()
            cleanup_error(e.rc)

    def add_portals_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmod.add_portals_module(dev_dir, modname)

    def add_lustre_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmod.add_lustre_module(dev_dir, modname)

    def load_module(self):
        """Load all the modules in the list in the order they appear."""
        self.kmod.load_module()

    def cleanup_module(self):
        """Unload the modules in the list in reverse order."""
        if self.safe_to_clean():
            self.kmod.cleanup_module()

    def safe_to_clean(self):
        return 1

    def safe_to_clean_modules(self):
        return self.safe_to_clean()
class Network(Module):
    def __init__(self,db):
        Module.__init__(self, 'NETWORK', db)
        self.net_type = self.db.get_val('nettype')
        self.nid = self.db.get_val('nid', '*')
        self.cluster_id = self.db.get_val('clusterid', "0")
        self.port = self.db.get_val_int('port', 0)
        self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
        self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
        self.irq_affinity = self.db.get_val_int('irqaffinity', 0)

        if '*' in self.nid:
            self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
            if not self.nid:
                panic("unable to set nid for", self.net_type, self.nid,
                      self.cluster_id)
            self.generic_nid = 1
            debug("nid:", self.nid)
        else:
            self.generic_nid = 0

        self.nid_uuid = self.nid_to_uuid(self.nid)

        self.hostaddr = self.db.get_val('hostaddr', self.nid)
        if '*' in self.hostaddr:
            self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
            if not self.hostaddr:
                panic("unable to set hostaddr for", self.net_type,
                      self.hostaddr, self.cluster_id)
            debug("hostaddr:", self.hostaddr)

        self.add_portals_module("libcfs", 'libcfs')
        self.add_portals_module("portals", 'portals')
        if node_needs_router():
            self.add_portals_module("router", 'kptlrouter')
        if self.net_type == 'tcp':
            self.add_portals_module("knals/socknal", 'ksocknal')
        if self.net_type == 'elan':
            self.add_portals_module("knals/qswnal", 'kqswnal')
        if self.net_type == 'gm':
            self.add_portals_module("knals/gmnal", 'kgmnal')
        if self.net_type == 'scimac':
            self.add_portals_module("knals/scimacnal", 'kscimacnal')
    def nid_to_uuid(self, nid):
        return "NID_%s_UUID" %(nid,)

    def prepare(self):
        if is_network_prepared():
            return
        self.info(self.net_type, self.nid, self.port)
        if not (config.record and self.generic_nid):
            lctl.network(self.net_type, self.nid)
        if self.net_type == 'tcp':
            sys_tweak_socknal()
        if self.net_type == 'elan':
            sys_optimize_elan()
        if self.port and node_is_router():
            run_one_acceptor(self.port)
            self.connect_peer_gateways()
    def connect_peer_gateways(self):
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    gw = Network(net)
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            lctl.connect(gw)
    def disconnect_peer_gateways(self):
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    gw = Network(net)
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            try:
                                lctl.disconnect(gw)
                            except CommandError, e:
                                print "disconnect failed: ", self.name
                                e.dump()
                                cleanup_error(e.rc)
    def safe_to_clean(self):
        return not is_network_prepared()

    def cleanup(self):
        self.info(self.net_type, self.nid, self.port)
        if self.port:
            stop_acceptor(self.port)
        if node_is_router():
            self.disconnect_peer_gateways()
class RouteTable(Module):
    def __init__(self,db):
        Module.__init__(self, 'ROUTES', db)

    def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
                         lo, hi):
        # only setup connections for tcp NALs
        srvdb = None
        if not net_type in ('tcp',):
            return None

        # connect to target if route is to single node and this node is the gw
        if lo == hi and local_interface(net_type, gw_cluster_id, gw):
            if not local_cluster(net_type, tgt_cluster_id):
                panic("target", lo, " not on the local cluster")
            srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
        # connect to gateway if this node is not the gw
        elif (local_cluster(net_type, gw_cluster_id)
              and not local_interface(net_type, gw_cluster_id, gw)):
            srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
        else:
            return None

        if not srvdb:
            panic("no server for nid", lo)

        return Network(srvdb)
    def prepare(self):
        if is_network_prepared():
            return
        self.info()
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
            lctl.add_route(net_type, gw, lo, hi)
            srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
            if srv:
                lctl.connect(srv)

    def safe_to_clean(self):
        return not is_network_prepared()

    def cleanup(self):
        if is_network_prepared():
            # the network is still being used, don't clean it up
            return
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
            srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
            if srv:
                try:
                    lctl.disconnect(srv)
                except CommandError, e:
                    print "disconnect failed: ", self.name
                    e.dump()
                    cleanup_error(e.rc)
            try:
                lctl.del_route(net_type, gw, lo, hi)
            except CommandError, e:
                print "del_route failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
class Management(Module):
    def __init__(self, db):
        Module.__init__(self, 'MGMT', db)
        self.add_lustre_module('lvfs', 'lvfs')
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')
        self.add_lustre_module('mgmt', 'mgmt_svc')

    def prepare(self):
        if is_prepared(self.name):
            return
        self.info()
        lctl.newdev("mgmt", self.name, self.uuid)

    def safe_to_clean(self):
        return 1

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
# This is only needed to load the modules; the LDLM device
# is now created automatically.
class LDLM(Module):
    def __init__(self,db):
        Module.__init__(self, 'LDLM', db)
        self.add_lustre_module('lvfs', 'lvfs')
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')
class LOV(Module):
    def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
        Module.__init__(self, 'LOV', db)
        if name_override != None:
            self.name = "lov_%s" % name_override
        self.add_lustre_module('lov', 'lov')
        self.mds_uuid = self.db.get_first_ref('mds')
        self.stripe_sz = self.db.get_val_int('stripesize', 65536)
        self.stripe_off = self.db.get_val_int('stripeoffset', 0)
        self.pattern = self.db.get_val_int('stripepattern', 0)
        self.devlist = self.db.get_refs('obd')
        self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
        self.osclist = []
        self.desc_uuid = self.uuid
        self.uuid = generate_client_uuid(self.name)
        self.fs_name = fs_name
        if config_only:
            self.config_only = 1
            return
        self.config_only = None
        mds = self.db.lookup(self.mds_uuid)
        self.mds_name = mds.getName()
        for obd_uuid in self.devlist:
            obd = self.db.lookup(obd_uuid)
            osc = get_osc(obd, self.uuid, fs_name)
            if osc:
                self.osclist.append(osc)
            else:
                panic('osc not found:', obd_uuid)
    def prepare(self):
        if is_prepared(self.name):
            return
        if self.config_only:
            panic("Can't prepare config_only LOV ", self.name)

        for osc in self.osclist:
            try:
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                osc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing OSC %s\n" % osc.uuid
                raise e
        self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
                  self.stripe_off, self.pattern, self.devlist, self.mds_name)
        lctl.lov_setup(self.name, self.uuid,
                       self.desc_uuid, self.mds_name, self.stripe_cnt,
                       self.stripe_sz, self.stripe_off, self.pattern,
                       string.join(self.devlist))
    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        if self.config_only:
            panic("Can't clean up config_only LOV ", self.name)
        for osc in self.osclist:
            osc.cleanup()

    def load_module(self):
        if self.config_only:
            panic("Can't load modules for config_only LOV ", self.name)
        for osc in self.osclist:
            osc.load_module()
            break
        Module.load_module(self)

    def cleanup_module(self):
        if self.config_only:
            panic("Can't cleanup modules for config_only LOV ", self.name)
        Module.cleanup_module(self)
        for osc in self.osclist:
            osc.cleanup_module()
            break
class MDSDEV(Module):
    def __init__(self,db):
        Module.__init__(self, 'MDSDEV', db)
        self.devpath = self.db.get_val('devpath','')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.fstype = self.db.get_val('fstype', '')
        self.nspath = self.db.get_val('nspath', '')
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        # overwrite the original MDSDEV name and uuid with the MDS name and uuid
        target_uuid = self.db.get_first_ref('target')
        mds = self.db.lookup(target_uuid)
        self.name = mds.getName()
        self.filesystem_uuids = mds.get_refs('filesystem')
        # FIXME: if fstype not set, then determine based on kernel version
        self.format = self.db.get_val('autoformat', "no")
        if mds.get_val('failover', 0):
            self.failover_mds = 'f'
        else:
            self.failover_mds = 'n'
        active_uuid = get_active_target(mds)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
        else:
            self.active = 0
        if self.active and config.group and config.group != mds.get_val('group'):
            self.active = 0

        self.inode_size = self.db.get_val_int('inodesize', 0)
        if self.inode_size == 0:
            # find the LOV for this MDS
            lovconfig_uuid = mds.get_first_ref('lovconfig')
            if not lovconfig_uuid:
                panic("No LOV config found for MDS ", mds.name)
            lovconfig = mds.lookup(lovconfig_uuid)
            lov_uuid = lovconfig.get_first_ref('lov')
            if not lov_uuid:
                panic("No LOV found for lovconfig ", lovconfig.name)
            lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)

            # default stripe count controls default inode_size
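            # (the MDS inode must be large enough to hold the striping EA
            # for stripe_count OSTs, so wider default stripes call for
            # progressively larger inodes)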
            stripe_count = lov.stripe_cnt
            if stripe_count > 77:
                self.inode_size = 4096
            elif stripe_count > 35:
                self.inode_size = 2048
            elif stripe_count > 13:
                self.inode_size = 1024
            elif stripe_count > 3:
                self.inode_size = 512
            else:
                self.inode_size = 256

        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        # modules
        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('osc', 'osc')
        self.add_lustre_module('lov', 'lov')
        self.add_lustre_module('mds', 'mds')
        if self.fstype == 'ldiskfs':
            self.add_lustre_module('ldiskfs', 'ldiskfs')
        if self.fstype:
            self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
    def load_module(self):
        if self.active:
            Module.load_module(self)

    def prepare(self):
        if is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        if config.reformat:
            # run write_conf automatically, if --reformat used
            self.write_conf()
        self.info(self.devpath, self.fstype, self.size, self.format)
        run_acceptors()
        # never reformat here
        blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
                           self.format, self.journal_size, self.inode_size,
                           self.mkfsoptions)
        if not is_prepared('MDT'):
            lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
        try:
            lctl.newdev("mds", self.name, self.uuid,
                        setup ="%s %s %s" %(blkdev, self.fstype, self.name))
        except CommandError, e:
            if e.rc == 2:
                panic("MDS is missing the config log. Need to run " +
                      "lconf --write_conf.")
            else:
                raise e
    def write_conf(self):
        if is_prepared(self.name):
            return
        self.info(self.devpath, self.fstype, self.format)
        blkdev = block_dev(self.devpath, self.size, self.fstype,
                           config.reformat, self.format, self.journal_size,
                           self.inode_size, self.mkfsoptions)
        lctl.newdev("mds", self.name, self.uuid,
                    setup ="%s %s" %(blkdev, self.fstype))

        # record logs for the MDS lov
        for uuid in self.filesystem_uuids:
            log("recording clients for filesystem:", uuid)
            fs = self.db.lookup(uuid)
            obd_uuid = fs.get_first_ref('obd')
            client_uuid = generate_client_uuid(self.name)
            client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
                          self.name)

            lctl.clear_log(self.name, self.name)
            lctl.record(self.name, self.name)
            client.prepare()
            lctl.mount_option(self.name, client.get_name(), "")
            lctl.end_record()

            config.cleanup = 1
            lctl.clear_log(self.name, self.name + '-clean')
            lctl.record(self.name, self.name + '-clean')
            client.cleanup()
            lctl.del_mount_option(self.name)
            lctl.end_record()
            config.cleanup = 0
        # record logs for each client
        if config.ldapurl:
            config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
        else:
            config_options = CONFIG_FILE

        for node_db in self.db.lookup_class('node'):
            client_name = node_db.getName()
            for prof_uuid in node_db.get_refs('profile'):
                prof_db = node_db.lookup(prof_uuid)
                # refactor this into a function to test "clientness"
                # of a node.
                for ref_class, ref_uuid in prof_db.get_all_refs():
                    if ref_class in ('mountpoint','echoclient'):
                        debug("recording", client_name)
                        old_noexec = config.noexec
                        noexec_opt = ('', '-n')
                        ret, out = run (sys.argv[0],
                                        noexec_opt[old_noexec == 1],
                                        " -v --record --nomod",
                                        "--record_log", client_name,
                                        "--record_device", self.name,
                                        "--node", client_name,
                                        config_options)
                        if config.verbose:
                            for s in out: log("record> ", string.strip(s))
                        ret, out = run (sys.argv[0],
                                        noexec_opt[old_noexec == 1],
                                        "--cleanup -v --record --nomod",
                                        "--record_log", client_name + "-clean",
                                        "--record_device", self.name,
                                        "--node", client_name,
                                        config_options)
                        if config.verbose:
                            for s in out: log("record> ", string.strip(s))
                        config.noexec = old_noexec
        try:
            lctl.cleanup(self.name, self.uuid, 0, 0)
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)
            e.dump()
            cleanup_error(e.rc)
        Module.cleanup(self)
        clean_loop(self.devpath)
    def msd_remaining(self):
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('mds',):
                return 1

    def safe_to_clean(self):
        return self.active

    def safe_to_clean_modules(self):
        return not self.msd_remaining()
    def cleanup(self):
        if not self.active:
            debug(self.uuid, "not active")
            return
        self.info()
        if is_prepared(self.name):
            try:
                lctl.cleanup(self.name, self.uuid, config.force,
                             config.failover)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
        Module.cleanup(self)
        if not self.msd_remaining() and is_prepared('MDT'):
            try:
                lctl.cleanup("MDT", "MDT_UUID", config.force,
                             config.failover)
            except CommandError, e:
                print "cleanup failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
        clean_loop(self.devpath)
class OSD(Module):
    def __init__(self, db):
        Module.__init__(self, 'OSD', db)
        self.osdtype = self.db.get_val('osdtype')
        self.devpath = self.db.get_val('devpath', '')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.inode_size = self.db.get_val_int('inodesize', 0)
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        self.fstype = self.db.get_val('fstype', '')
        self.nspath = self.db.get_val('nspath', '')
        target_uuid = self.db.get_first_ref('target')
        ost = self.db.lookup(target_uuid)
        self.name = ost.getName()
        self.format = self.db.get_val('autoformat', 'yes')
        if ost.get_val('failover', 0):
            self.failover_ost = 'f'
        else:
            self.failover_ost = 'n'

        active_uuid = get_active_target(ost)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
        else:
            self.active = 0
        if self.active and config.group and config.group != ost.get_val('group'):
            self.active = 0

        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        # modules
        self.add_lustre_module('ost', 'ost')
        # FIXME: should we default to ext3 here?
        if self.fstype == 'ldiskfs':
            self.add_lustre_module('ldiskfs', 'ldiskfs')
        if self.fstype:
            self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
        self.add_lustre_module(self.osdtype, self.osdtype)
    def load_module(self):
        if self.active:
            Module.load_module(self)

    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    def prepare(self):
        if is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        self.info(self.osdtype, self.devpath, self.size, self.fstype,
                  self.format, self.journal_size, self.inode_size)
        run_acceptors()
        if self.osdtype == 'obdecho':
            blkdev = ''
        else:
            blkdev = block_dev(self.devpath, self.size, self.fstype,
                               config.reformat, self.format, self.journal_size,
                               self.inode_size, self.mkfsoptions)
        lctl.newdev(self.osdtype, self.name, self.uuid,
                    setup ="%s %s %s" %(blkdev, self.fstype,
                                        self.failover_ost))
        if not is_prepared('OSS'):
            lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
    def osd_remaining(self):
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('obdfilter', 'obdecho'):
                return 1

    def safe_to_clean(self):
        return self.active

    def safe_to_clean_modules(self):
        return not self.osd_remaining()
    def cleanup(self):
        if not self.active:
            debug(self.uuid, "not active")
            return
        if is_prepared(self.name):
            self.info()
            try:
                lctl.cleanup(self.name, self.uuid, config.force,
                             config.failover)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
        if not self.osd_remaining() and is_prepared('OSS'):
            try:
                lctl.cleanup("OSS", "OSS_UUID", config.force,
                             config.failover)
            except CommandError, e:
                print "cleanup failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
        if not self.osdtype == 'obdecho':
            clean_loop(self.devpath)
def mgmt_uuid_for_fs(mtpt_name):
    if not mtpt_name:
        return ''
    mtpt_db = toplevel.lookup_name(mtpt_name)
    fs_uuid = mtpt_db.get_first_ref('filesystem')
    fs = toplevel.lookup(fs_uuid)
    if not fs:
        return ''
    return fs.get_first_ref('mgmt')
# Generic client module, used by OSC and MDC
class Client(Module):
    def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
                 module_dir=None):
        self.target_name = tgtdb.getName()
        self.target_uuid = tgtdb.getUUID()
        self.db = tgtdb

        self.tgt_dev_uuid = get_active_target(tgtdb)
        if not self.tgt_dev_uuid:
            panic("No target device found for target:", self.target_name)

        self.kmod = kmod(config.lustre, config.portals)

        self.module = module
        self.module_name = string.upper(module)
        if not self_name:
            self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
                                         self.target_name, fs_name)
        else:
            self.name = self_name
        self.uuid = uuid

        self.lookup_server(self.tgt_dev_uuid)
        mgmt_uuid = mgmt_uuid_for_fs(fs_name)
        if mgmt_uuid:
            self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
        else:
            self.mgmt_name = ''
        self.fs_name = fs_name
        if not module_dir:
            module_dir = module
        self.add_lustre_module(module_dir, module)
    def lookup_server(self, srv_uuid):
        """ Lookup a server's network information """
        self._server_nets = get_ost_net(self.db, srv_uuid)
        if len(self._server_nets) == 0:
            panic ("Unable to find a server for:", srv_uuid)

    def get_servers(self):
        return self._server_nets
    def prepare(self, ignore_connect_failure = 0):
        self.info(self.target_uuid)
        if is_prepared(self.name):
            return
        try:
            srv = choose_local_server(self.get_servers())
            if srv:
                lctl.connect(srv)
            else:
                routes = find_route(self.get_servers())
                if len(routes) == 0:
                    panic ("no route to", self.target_uuid)
                for (srv, r) in routes:
                    lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
        except CommandError, e:
            if not ignore_connect_failure:
                raise e
        if srv:
            if self.target_uuid in config.inactive and self.permits_inactive():
                debug("%s inactive" % self.target_uuid)
                inactive_p = "inactive"
            else:
                debug("%s active" % self.target_uuid)
                inactive_p = ""
            lctl.newdev(self.module, self.name, self.uuid,
                        setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
                                                inactive_p, self.mgmt_name))
    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
            try:
                srv = choose_local_server(self.get_servers())
                if srv:
                    lctl.disconnect(srv)
                else:
                    for (srv, r) in find_route(self.get_servers()):
                        lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
class MDC(Client):
    def __init__(self, db, uuid, fs_name):
        Client.__init__(self, db, uuid, 'mdc', fs_name)

    def permits_inactive(self):
        return 0

class OSC(Client):
    def __init__(self, db, uuid, fs_name):
        Client.__init__(self, db, uuid, 'osc', fs_name)

    def permits_inactive(self):
        return 1
def mgmtcli_name_for_uuid(uuid):
    return 'MGMTCLI_%s' % uuid

class ManagementClient(Client):
    def __init__(self, db, uuid):
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = mgmtcli_name_for_uuid(db.getUUID()),
                        module_dir = 'mgmt')
class COBD(Module):
    def __init__(self, db):
        Module.__init__(self, 'COBD', db)
        self.real_uuid = self.db.get_first_ref('realobd')
        self.cache_uuid = self.db.get_first_ref('cacheobd')
        self.add_lustre_module('cobd' , 'cobd')

    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    def prepare(self):
        if is_prepared(self.name):
            return
        self.info(self.real_uuid, self.cache_uuid)
        lctl.newdev("cobd", self.name, self.uuid,
                    setup ="%s %s" %(self.real_uuid, self.cache_uuid))
# virtual interface for OSC and LOV
class VOSC(Module):
    def __init__(self, db, uuid, fs_name, name_override = None):
        Module.__init__(self, 'VOSC', db)
        if db.get_class() == 'lov':
            self.osc = LOV(db, uuid, fs_name, name_override)
        else:
            self.osc = get_osc(db, uuid, fs_name)
    def get_uuid(self):
        return self.osc.uuid
    def get_name(self):
        return self.osc.name
    def prepare(self):
        self.osc.prepare()
    def cleanup(self):
        self.osc.cleanup()
    def load_module(self):
        self.osc.load_module()
    def cleanup_module(self):
        self.osc.cleanup_module()
class ECHO_CLIENT(Module):
    def __init__(self,db):
        Module.__init__(self, 'ECHO_CLIENT', db)
        self.add_lustre_module('obdecho', 'obdecho')
        self.obd_uuid = self.db.get_first_ref('obd')
        obd = self.db.lookup(self.obd_uuid)
        self.uuid = generate_client_uuid(self.name)
        self.osc = VOSC(obd, self.uuid, self.name)

    def prepare(self):
        if is_prepared(self.name):
            return
        run_acceptors()
        self.osc.prepare() # XXX This is so cheating. -p
        self.info(self.obd_uuid)

        lctl.newdev("echo_client", self.name, self.uuid,
                    setup = self.osc.get_name())

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        self.osc.cleanup()

    def load_module(self):
        self.osc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.osc.cleanup_module()
def generate_client_uuid(name):
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
                                           name,
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    return client_uuid[:36]
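# (1048576 is 2**20, so each %05x field is a random 20-bit value; the name
# is truncated to 19 chars and the whole string clipped to uuid length, 36)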
class Mountpoint(Module):
    def __init__(self,db):
        Module.__init__(self, 'MTPT', db)
        self.path = self.db.get_val('path')
        self.fs_uuid = self.db.get_first_ref('filesystem')
        fs = self.db.lookup(self.fs_uuid)
        self.mds_uuid = fs.get_first_ref('mds')
        self.obd_uuid = fs.get_first_ref('obd')
        self.mgmt_uuid = fs.get_first_ref('mgmt')
        obd = self.db.lookup(self.obd_uuid)
        client_uuid = generate_client_uuid(self.name)
        self.vosc = VOSC(obd, client_uuid, self.name)
        self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)

        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('llite', 'llite')
        if self.mgmt_uuid:
            self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
                                            client_uuid)
        else:
            self.mgmtcli = None
    def prepare(self):
        if fs_is_mounted(self.path):
            log(self.path, "already mounted.")
            return
        run_acceptors()
        if self.mgmtcli:
            self.mgmtcli.prepare()
        self.vosc.prepare()
        self.mdc.prepare()
        mdc_name = self.mdc.name

        self.info(self.path, self.mds_uuid, self.obd_uuid)
        if config.record or config.lctl_dump:
            lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
            return
        cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
              (self.vosc.get_name(), mdc_name, config.config, self.path)
        run("mkdir", self.path)
        ret, val = run(cmd)
        if ret:
            self.mdc.cleanup()
            self.vosc.cleanup()
            panic("mount failed:", self.path, ":", string.join(val))
    def cleanup(self):
        self.info(self.path, self.mds_uuid, self.obd_uuid)

        if config.record or config.lctl_dump:
            lctl.del_mount_option(local_node_name)
        else:
            if fs_is_mounted(self.path):
                if config.force:
                    (rc, out) = run("umount", "-f", self.path)
                else:
                    (rc, out) = run("umount", self.path)
                if rc:
                    raise CommandError('umount', out, rc)

            if fs_is_mounted(self.path):
                panic("fs is still mounted:", self.path)

        self.mdc.cleanup()
        self.vosc.cleanup()
        if self.mgmtcli:
            self.mgmtcli.cleanup()
    def load_module(self):
        if self.mgmtcli:
            self.mgmtcli.load_module()
        self.vosc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.vosc.cleanup_module()
        if self.mgmtcli:
            self.mgmtcli.cleanup_module()
# ============================================================
# misc query functions

def get_ost_net(self, osd_uuid):
    srv_list = []
    if not osd_uuid:
        return srv_list
    osd = self.lookup(osd_uuid)
    node_uuid = osd.get_first_ref('node')
    node = self.lookup(node_uuid)
    if not node:
        panic("unable to find node for osd_uuid:", osd_uuid,
              " node_ref:", node_uuid)
    for net_uuid in node.get_networks():
        db = node.lookup(net_uuid)
        srv_list.append(Network(db))
    return srv_list
# the order of initialization is based on level.
def getServiceLevel(self):
    type = self.get_class()
    ret = 0
    if type in ('network',):
        ret = 5
    elif type in ('routetbl',):
        ret = 6
    elif type in ('ldlm',):
        ret = 20
    elif type in ('mgmt',):
        ret = 25
    elif type in ('osd', 'cobd'):
        ret = 30
    elif type in ('mdsdev',):
        ret = 40
    elif type in ('mountpoint', 'echoclient'):
        ret = 70
    else:
        panic("Unknown type: ", type)

    if ret < config.minlevel or ret > config.maxlevel:
        ret = 0
    return ret
# return list of services in a profile. list is a list of tuples
# [(level, db_object),]
def getServices(self):
    list = []
    for ref_class, ref_uuid in self.get_all_refs():
        servdb = self.lookup(ref_uuid)
        if servdb:
            level = getServiceLevel(servdb)
            if level > 0:
                list.append((level, servdb))
        else:
            panic('service not found: ' + ref_uuid)

    list.sort()
    return list
############################################################
# FIXME: clean this mess up!
#
# OSC is no longer in the xml, so we have to fake it.
# this is getting ugly and begging for another refactoring
def get_osc(ost_db, uuid, fs_name):
    osc = OSC(ost_db, uuid, fs_name)
    return osc

def get_mdc(db, uuid, fs_name, mds_uuid):
    mds_db = db.lookup(mds_uuid)
    if not mds_db:
        panic("no mds:", mds_uuid)
    mdc = MDC(mds_db, uuid, fs_name)
    return mdc
############################################################
# routing ("rooting")

# list of (nettype, cluster_id, nid)
local_clusters = []

def find_local_clusters(node_db):
    global local_clusters
    for netuuid in node_db.get_networks():
        net = node_db.lookup(netuuid)
        srv = Network(net)
        debug("add_local", netuuid)
        local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
        if srv.port > 0:
            if acceptors.has_key(srv.port):
                panic("duplicate port:", srv.port)
            acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
                                                  srv.send_mem, srv.recv_mem,
                                                  srv.irq_affinity)
# This node is a gateway.
is_router = 0
def node_is_router():
    return is_router

# If there are any routers found in the config, then this will be true
# and all nodes will load kptlrouter.
needs_router = 0
def node_needs_router():
    return needs_router or is_router
# list of (nettype, gw, tgt_cluster_id, lo, hi)
# Currently, these local routes are only added to kptlrouter route
# table if they are needed to connect to a specific server.  This
# should be changed so all available routes are loaded, and the
# ptlrouter can make all the decisions.
local_routes = []

def find_local_routes(lustre):
    """ Scan the lustre config looking for routers.  Build a list of
    routes. """
    global local_routes, needs_router
    local_routes = []
    list = lustre.lookup_class('node')
    for router in list:
        if router.get_val_int('router', 0):
            needs_router = 1
            for (local_type, local_cluster_id, local_nid) in local_clusters:
                gw = None
                for netuuid in router.get_networks():
                    db = router.lookup(netuuid)
                    if (local_type == db.get_val('nettype') and
                        local_cluster_id == db.get_val('clusterid')):
                        gw = db.get_val('nid')
                        break
                if gw:
                    debug("find_local_routes: gw is", gw)
                    for route in router.get_local_routes(local_type, gw):
                        local_routes.append(route)
    debug("find_local_routes:", local_routes)
def choose_local_server(srv_list):
    for srv in srv_list:
        if local_cluster(srv.net_type, srv.cluster_id):
            return srv

def local_cluster(net_type, cluster_id):
    for cluster in local_clusters:
        if net_type == cluster[0] and cluster_id == cluster[1]:
            return 1
    return 0

def local_interface(net_type, cluster_id, nid):
    for cluster in local_clusters:
        if (net_type == cluster[0] and cluster_id == cluster[1]
            and nid == cluster[2]):
            return 1
    return 0
def find_route(srv_list):
    result = []
    frm_type = local_clusters[0][0]
    for srv in srv_list:
        debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
        to_type = srv.net_type
        to = srv.nid
        cluster_id = srv.cluster_id
        debug ('looking for route to', to_type, to)
        for r in local_routes:
            debug("find_route: ", r)
            if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
                result.append((srv, r))
    return result
def get_active_target(db):
    target_uuid = db.getUUID()
    target_name = db.getName()
    node_name = get_select(target_name)
    if node_name:
        tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
    else:
        tgt_dev_uuid = db.get_first_ref('active')
    return tgt_dev_uuid

def get_server_by_nid_uuid(db, nid_uuid):
    for n in db.lookup_class("network"):
        net = Network(n)
        if net.nid_uuid == nid_uuid:
            return net
############################################################
# lconf level logic
# Start a service.
def newService(db):
    type = db.get_class()
    debug('Service:', type, db.getName(), db.getUUID())
    n = None
    if type == 'ldlm':
        n = LDLM(db)
    elif type == 'lov':
        n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
    elif type == 'network':
        n = Network(db)
    elif type == 'routetbl':
        n = RouteTable(db)
    elif type == 'osd':
        n = OSD(db)
    elif type == 'cobd':
        n = COBD(db)
    elif type == 'mdsdev':
        n = MDSDEV(db)
    elif type == 'mountpoint':
        n = Mountpoint(db)
    elif type == 'echoclient':
        n = ECHO_CLIENT(db)
    elif type == 'mgmt':
        n = Management(db)
    else:
        panic ("unknown service type:", type)
    return n
# Prepare the system to run lustre using a particular profile
# in the configuration.
#  * load the modules
#  * setup networking for the current node
#  * make sure partitions are in place and prepared
#  * initialize devices with lctl
# Levels are important, and need to be enforced.
def for_each_profile(db, prof_list, operation):
    for prof_uuid in prof_list:
        prof_db = db.lookup(prof_uuid)
        if not prof_db:
            panic("profile:", prof_uuid, "not found.")
        services = getServices(prof_db)
        operation(services)
def doWriteconf(services):
    if config.nosetup:
        return
    for s in services:
        if s[1].get_class() == 'mdsdev':
            n = newService(s[1])
            n.write_conf()

def doSetup(services):
    if config.nosetup:
        return
    for s in services:
        n = newService(s[1])
        n.prepare()

def doModules(services):
    if config.nomod:
        return
    for s in services:
        n = newService(s[1])
        n.load_module()

def doCleanup(services):
    if config.nosetup:
        return
    services.reverse()
    for s in services:
        n = newService(s[1])
        if n.safe_to_clean():
            n.cleanup()

def doUnloadModules(services):
    if config.nomod:
        return
    services.reverse()
    for s in services:
        n = newService(s[1])
        if n.safe_to_clean_modules():
            n.cleanup_module()
def doHost(lustreDB, hosts):
    global is_router, local_node_name
    node_db = None
    for h in hosts:
        node_db = lustreDB.lookup_name(h, 'node')
        if node_db:
            break
    if not node_db:
        panic('No host entry found.')

    local_node_name = node_db.get_val('name', 0)
    is_router = node_db.get_val_int('router', 0)
    lustre_upcall = node_db.get_val('lustreUpcall', '')
    portals_upcall = node_db.get_val('portalsUpcall', '')
    timeout = node_db.get_val_int('timeout', 0)
    ptldebug = node_db.get_val('ptldebug', '')
    subsystem = node_db.get_val('subsystem', '')

    find_local_clusters(node_db)
    if not is_router:
        find_local_routes(lustreDB)

    # Two step process: (1) load modules, (2) setup lustre
    # if not cleaning, load modules first.
    prof_list = node_db.get_refs('profile')

    if config.write_conf:
        for_each_profile(node_db, prof_list, doModules)
        sys_make_devices()
        for_each_profile(node_db, prof_list, doWriteconf)
        for_each_profile(node_db, prof_list, doUnloadModules)

    elif config.recover:
        if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
            raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
                                     "--client_uuid <UUID> --conn_uuid <UUID>")
        doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
                   config.conn_uuid)
    elif config.cleanup:
        if config.force:
            # the command line can override this value
            timeout = 5
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            for_each_profile(node_db, prof_list, doCleanup)
            return

        sys_set_timeout(timeout)
        sys_set_ptldebug(ptldebug)
        sys_set_subsystem(subsystem)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doCleanup)
        for_each_profile(node_db, prof_list, doUnloadModules)

    else:
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            sys_set_timeout(timeout)
            sys_set_lustre_upcall(lustre_upcall)
            for_each_profile(node_db, prof_list, doSetup)
            return

        sys_make_devices()
        sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
        sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)

        for_each_profile(node_db, prof_list, doModules)

        sys_set_debug_path()
        sys_set_ptldebug(ptldebug)
        sys_set_subsystem(subsystem)
        script = config.gdb_script
        run(lctl.lctl, ' modules >', script)
        if config.gdb:
            log ("The GDB module script is in", script)
            # pause, so user has time to break and
            # load the script
            time.sleep(5)
        sys_set_timeout(timeout)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doSetup)
def doRecovery(db, lctl, tgt_uuid, client_uuid, nid_uuid):
    tgt = db.lookup(tgt_uuid)
    if not tgt:
        raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
    new_uuid = get_active_target(tgt)
    if not new_uuid:
        raise Lustre.LconfError("doRecovery: no active target found for: " +
                                tgt_uuid)
    net = choose_local_server(get_ost_net(db, new_uuid))
    if not net:
        raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)

    log("Reconnecting", tgt_uuid, " to ", net.nid_uuid)
    try:
        oldnet = get_server_by_nid_uuid(db, nid_uuid)
        if oldnet:
            lctl.disconnect(oldnet)
    except CommandError, e:
        log("recover: disconnect", nid_uuid, "failed: ")
        e.dump()

    try:
        lctl.connect(net)
    except CommandError, e:
        log("recover: connect failed")
        e.dump()

    lctl.recover(client_uuid, net.nid_uuid)
def setupModulePath(cmd, portals_dir = PORTALS_DIR):
    base = os.path.dirname(cmd)
    if development_mode():
        if not config.lustre:
            debug('using objdir module paths')
            config.lustre = (os.path.join(base, ".."))
        # normalize the portals dir, using command line arg if set
        if config.portals:
            portals_dir = config.portals
        dir = os.path.join(config.lustre, portals_dir)
        config.portals = dir
        debug('config.portals', config.portals)
    elif config.lustre and config.portals:
        # production mode
        # if --lustre and --portals, normalize portals
        # can ignore PORTALS_DIR here, since it is probably useless here
        config.portals = os.path.join(config.lustre, config.portals)
        debug('config.portals B', config.portals)
def sysctl(path, val):
    debug("+ sysctl", path, val)
    if config.noexec:
        return
    try:
        fp = open(os.path.join('/proc/sys', path), 'w')
        fp.write(str(val))
        fp.close()
    except IOError, e:
        panic(str(e))
def sys_set_debug_path():
    sysctl('portals/debug_path', config.debug_path)
def sys_set_lustre_upcall(upcall):
    # the command overrides the value in the node config
    if config.lustre_upcall:
        upcall = config.lustre_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        lctl.set_lustre_upcall(upcall)
def sys_set_portals_upcall(upcall):
    # the command overrides the value in the node config
    if config.portals_upcall:
        upcall = config.portals_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    # the command overrides the value in the node config
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    if timeout != None and timeout > 0:
        lctl.set_timeout(timeout)
def sys_tweak_socknal ():
    if config.single_socket:
        sysctl("socknal/typed", 0)
def sys_optimize_elan ():
    procfiles = ["/proc/elan/config/eventint_punt_loops",
                 "/proc/qsnet/elan3/config/eventint_punt_loops",
                 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
    for p in procfiles:
        if os.access(p, os.R_OK):
            run ("echo 0 > " + p)
def sys_set_ptldebug(ptldebug):
    if config.ptldebug:
        ptldebug = config.ptldebug
    if ptldebug:
        try:
            val = eval(ptldebug, ptldebug_names)
            val = "0x%x" % (val)
            sysctl('portals/debug', val)
        except NameError, e:
            panic(str(e))
def sys_set_subsystem(subsystem):
    if config.subsystem:
        subsystem = config.subsystem
    if subsystem:
        try:
            val = eval(subsystem, subsystem_names)
            val = "0x%x" % (val)
            sysctl('portals/subsystem_debug', val)
        except NameError, e:
            panic(str(e))
def sys_set_netmem_max(path, max):
    debug("setting", path, "to at least", max)
    if config.noexec:
        return
    fp = open(path)
    str = fp.readline()
    fp.close()
    cur = int(str)
    if max > cur:
        fp = open(path, 'w')
        fp.write('%d\n' %(max))
        fp.close()
def sys_make_devices():
    if not os.access('/dev/portals', os.R_OK):
        run('mknod /dev/portals c 10 240')
    if not os.access('/dev/obd', os.R_OK):
        run('mknod /dev/obd c 10 241')
# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    syspath = string.split(os.environ['PATH'], ':')
    if new_dir in syspath:
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir

def default_debug_path():
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
        return '/r' + path
    else:
        return path

def default_gdb_script():
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    else:
        return script

DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
def sanitise_path():
    for dir in DEFAULT_PATH:
        add_to_path(dir)

# global hack for the --select handling
tgt_select = {}
def init_select(args):
    # args = [service=nodeA,service2=nodeB service3=nodeC]
    global tgt_select
    for arg in args:
        list = string.split(arg, ',')
        for entry in list:
            srv, node = string.split(entry, '=')
            tgt_select[srv] = node

def get_select(srv):
    if tgt_select.has_key(srv):
        return tgt_select[srv]
    return None
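
# Example (names illustrative): init_select(['mds1=nodeA,ost1=nodeB']) leaves
# tgt_select == {'mds1': 'nodeA', 'ost1': 'nodeB'}; get_select('mds1') then
# returns 'nodeA' and get_select('ost2') returns None.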

FLAG = Lustre.Options.FLAG
PARAM = Lustre.Options.PARAM
INTPARAM = Lustre.Options.INTPARAM
PARAMLIST = Lustre.Options.PARAMLIST
lconf_options = [
    ('verbose,v', "Print system commands as they are run"),
    ('ldapurl', "LDAP server URL, e.g. ldap://localhost", PARAM),
    ('config', "Cluster config name used for LDAP query", PARAM),
    ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
    ('node', "Load config for <nodename>", PARAM),
    ('cleanup,d', "Cleans up config. (Shutdown)"),
    ('force,f', "Forced unmounting and/or obd detach during cleanup",
     FLAG, 0),
    ('single_socket', "socknal option: only use one socket instead of bundle",
     FLAG, 0),
    ('failover', """Used to shut down without saving state.
                    This will allow this node to "give up" a service to
                    another node for failover purposes. This will not
                    be a clean shutdown.""",
     FLAG, 0),
    ('gdb', """Prints message after creating gdb module script
               and sleeps for 5 seconds."""),
    ('noexec,n', """Prints the commands and steps that will be run for a
                    config without executing them. This can be used to check
                    whether a config file is doing what it should be doing."""),
    ('nomod', "Skip load/unload module step."),
    ('nosetup', "Skip device setup/cleanup step."),
    ('reformat', "Reformat all devices (without question)"),
    ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
    ('dump', "Dump the kernel debug log to file before portals is unloaded",
     PARAM),
    ('write_conf', "Save all the client config information on mds."),
    ('record', "Write config information on mds."),
    ('record_log', "Name of config record log.", PARAM),
    ('record_device', "MDS device name that will record the config commands",
     PARAM),
    ('minlevel', "Minimum level of services to configure/cleanup",
     INTPARAM, 0),
    ('maxlevel', """Maximum level of services to configure/cleanup
                    Levels are approximately:
                            10 - network
                            20 - device, ldlm
                            30 - osd, mdd
                            40 - mds, ost
                            70 - mountpoint, echo_client, osc, mdc, lov""",
     INTPARAM, 100),
    ('lustre', """Base directory of lustre sources. This parameter will
                  cause lconf to load modules from a source tree.""", PARAM),
    ('portals', """Portals source directory. If this is a relative path,
                   then it is assumed to be relative to lustre.""", PARAM),
    ('timeout', "Set recovery timeout", INTPARAM),
    ('upcall', "Set both portals and lustre upcall script", PARAM),
    ('lustre_upcall', "Set lustre upcall script", PARAM),
    ('portals_upcall', "Set portals upcall script", PARAM),
    ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
    ('ptldebug', "Set the portals debug level", PARAM),
    ('subsystem', "Set the portals debug subsystem", PARAM),
    ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
    ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
    # Client recovery options
    ('recover', "Recover a device"),
    ('group', "The group of devices to configure or cleanup", PARAM),
    ('tgt_uuid', "The failed target (required for recovery)", PARAM),
    ('client_uuid', "The failed client (required for recovery)", PARAM),
    ('conn_uuid', "The failed connection (required for recovery)", PARAM),
    ('inactive', """The name of an inactive service, to be ignored during
                    mounting (currently OST-only). Can be repeated.""",
     PARAMLIST),
    ]
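
# Each entry above is (name[,short-option], help-text[, type[, default]]);
# entries with no type act as simple flags, and defaults such as
# default_gdb_script() are computed once, when this module is loaded.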

def main():
    global lctl, config, toplevel, CONFIG_FILE

    # in the upcall this is set to SIG_IGN
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    cl = Lustre.Options("lconf", "config.xml", lconf_options)
    try:
        config, args = cl.parse(sys.argv[1:])
    except Lustre.OptionError, e:
        print e
        sys.exit(1)

    setupModulePath(sys.argv[0])

    host = socket.gethostname()

    # the PRNG is normally seeded with time(), which is not so good for
    # starting time-synchronized clusters
    try:
        input = open('/dev/urandom', 'r')
    except IOError:
        print 'Unable to open /dev/urandom!'
        sys.exit(1)
    seed = input.read(32)
    input.close()
    random.seed(seed)

    sanitise_path()

    init_select(config.select)

    if len(args) > 0:
        # allow config to be fetched via HTTP, but only with python2
        if sys.version[0] != '1' and args[0].startswith('http://'):
            import urllib2
            try:
                config_file = urllib2.urlopen(args[0])
            except (urllib2.URLError, socket.error), err:
                if hasattr(err, 'args'):
                    err = err.args[-1]
                print "Could not access '%s': %s" % (args[0], err)
                sys.exit(1)
        elif not os.access(args[0], os.R_OK):
            print 'File not found or readable:', args[0]
            sys.exit(1)
        else:
            # regular file
            config_file = open(args[0], 'r')
        try:
            dom = xml.dom.minidom.parse(config_file)
        except Exception:
            panic("%s does not appear to be a config file." % (args[0]))
            sys.exit(1) # make sure to die here, even in debug mode.
        CONFIG_FILE = args[0]
        db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
        if not config.config:
            config.config = os.path.basename(args[0]) # use full path?
            if config.config[-4:] == '.xml':
                config.config = config.config[:-4]
    elif config.ldapurl:
        if not config.config:
            panic("--ldapurl requires --config name")
        dn = "config=%s,fs=lustre" % (config.config)
        db = Lustre.LustreDB_LDAP('', {}, base=dn, url=config.ldapurl)
    elif config.ptldebug or config.subsystem:
        sys_set_ptldebug(None)
        sys_set_subsystem(None)
        sys.exit(0)
    else:
        print 'Missing config file or ldap URL.'
        print 'see lconf --help for command summary'
        sys.exit(1)

    toplevel = db

    ver = db.get_version()
    if not ver:
        panic("No version found in config data, please recreate.")
    if ver != Lustre.CONFIG_VERSION:
        panic("Config version", ver, "does not match lconf version",
              Lustre.CONFIG_VERSION)

    node_list = []
    if config.node:
        node_list.append(config.node)
    else:
        if len(host) > 0:
            node_list.append(host)
        node_list.append('localhost')

    debug("configuring for host: ", node_list)

    if len(host) > 0:
        config.debug_path = config.debug_path + '-' + host
        config.gdb_script = config.gdb_script + '-' + host

    lctl = LCTLInterface('lctl')

    if config.lctl_dump:
        lctl.use_save_file(config.lctl_dump)

    if config.record:
        if not (config.record_device and config.record_log):
            panic("When recording, both --record_log and --record_device"
                  " must be specified.")
        lctl.clear_log(config.record_device, config.record_log)
        lctl.record(config.record_device, config.record_log)

    doHost(db, node_list)

    if config.record:
        lctl.end_record()

if __name__ == "__main__":
    try:
        main()
    except Lustre.LconfError, e:
        print e
        # traceback.print_exc(file=sys.stdout)
        sys.exit(1)
    except CommandError, e:
        e.dump()
        sys.exit(e.rc)

    if first_cleanup_error:
        sys.exit(first_cleanup_error)