3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # This file is part of Lustre, http://www.lustre.org.
7 # Lustre is free software; you can redistribute it and/or
8 # modify it under the terms of version 2 of the GNU General Public
9 # License as published by the Free Software Foundation.
11 # Lustre is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with Lustre; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # lconf - lustre configuration tool
22 # lconf is the main driver script for starting and stopping
23 # lustre filesystem services.
25 # Based in part on the XML obdctl modifications done by Brian Behlendorf
27 import sys, getopt, types
28 import string, os, stat, popen2, socket, time, random, fcntl, select
29 import re, exceptions, signal
30 import xml.dom.minidom
32 if sys.version[0] == '1':
33 from FCNTL import F_GETFL, F_SETFL
35 from fcntl import F_GETFL, F_SETFL
37 PYMOD_DIR = "/usr/lib/lustre/python"
39 def development_mode():
40 base = os.path.dirname(sys.argv[0])
41 if os.access(base+"/Makefile.am", os.R_OK):
45 if not development_mode():
46 sys.path.append(PYMOD_DIR)
52 DEFAULT_TCPBUF = 1048576
54 # Maximum number of devices to search for.
55 # (the /dev/loop* nodes need to be created beforehand)
56 MAX_LOOP_DEVICES = 256
57 PORTALS_DIR = 'portals'
60 # Please keep these uptodate with the values in portals/kp30.h
72 "warning" : (1 << 10),
76 "portals" : (1 << 14),
78 "dlmtrace" : (1 << 16),
82 "rpctrace" : (1 << 20),
83 "vfstrace" : (1 << 21),
87 "undefined" : (0 << 24),
96 "ext2obd" : (9 << 24),
97 "portals" : (10 << 24),
98 "socknal" : (11 << 24),
99 "qswnal" : (12 << 24),
100 "pinger" : (13 << 24),
101 "filter" : (14 << 24),
102 "trace" : (15 << 24),
106 "gmnal" : (19 << 24),
107 "ptlrouter" : (20 << 24),
109 "ptlbd" : (22 << 24),
# Remember only the first non-zero cleanup status seen; later failures
# must not clobber the original error that gets reported at exit.
first_cleanup_error = 0

def cleanup_error(rc):
    """Latch rc as the first cleanup error, unless one is already recorded."""
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
119 # ============================================================
120 # debugging and error funcs
122 def fixme(msg = "this feature"):
123 raise Lustre.LconfError, msg + ' not implmemented yet.'
126 msg = string.join(map(str,args))
127 if not config.noexec:
128 raise Lustre.LconfError(msg)
133 msg = string.join(map(str,args))
138 print string.strip(s)
142 msg = string.join(map(str,args))
146 # ack, python's builtin int() does not support '0x123' syntax.
147 # eval can do it, although what a hack!
151 return eval(s, {}, {})
154 except SyntaxError, e:
155 raise ValueError("not a number")
157 raise ValueError("not a number")
159 # ============================================================
160 # locally defined exceptions
161 class CommandError (exceptions.Exception):
162 def __init__(self, cmd_name, cmd_err, rc=None):
163 self.cmd_name = cmd_name
164 self.cmd_err = cmd_err
169 if type(self.cmd_err) == types.StringType:
171 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
173 print "! %s: %s" % (self.cmd_name, self.cmd_err)
174 elif type(self.cmd_err) == types.ListType:
176 print "! %s (error %d):" % (self.cmd_name, self.rc)
178 print "! %s:" % (self.cmd_name)
179 for s in self.cmd_err:
180 print "> %s" %(string.strip(s))
185 # ============================================================
186 # handle daemons, like the acceptor
188 """ Manage starting and stopping a daemon. Assumes daemon manages
189 it's own pid file. """
191 def __init__(self, cmd):
197 log(self.command, "already running.")
199 self.path = find_prog(self.command)
201 panic(self.command, "not found.")
202 ret, out = runcmd(self.path +' '+ self.command_line())
204 raise CommandError(self.path, out, ret)
208 pid = self.read_pidfile()
210 log ("killing process", pid)
212 #time.sleep(1) # let daemon die
214 log("unable to kill", self.command, e)
216 log("unable to kill", self.command)
219 pid = self.read_pidfile()
229 def read_pidfile(self):
231 fp = open(self.pidfile(), 'r')
238 def clean_pidfile(self):
239 """ Remove a stale pidfile """
240 log("removing stale pidfile:", self.pidfile())
242 os.unlink(self.pidfile())
244 log(self.pidfile(), e)
246 class AcceptorHandler(DaemonHandler):
247 def __init__(self, port, net_type, send_mem, recv_mem, irq_aff, nid_xchg):
248 DaemonHandler.__init__(self, "acceptor")
251 self.send_mem = send_mem
252 self.recv_mem = recv_mem
254 if net_type == 'toe':
255 self.flags = self.flags + ' -N 4'
257 self.flags = self.flags + ' -i'
259 self.flags = self.flags + ' -x'
262 return "/var/run/%s-%d.pid" % (self.command, self.port)
def command_line(self):
    # Build the acceptor's argument string: '-s'/'-r' socket send/recv
    # buffer sizes, the net-type specific flags accumulated in __init__,
    # and finally the listen port.
    return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
269 # start the acceptors
271 for port in acceptors.keys():
272 daemon = acceptors[port]
273 if not daemon.running():
276 def run_one_acceptor(port):
277 if acceptors.has_key(port):
278 daemon = acceptors[port]
279 if not daemon.running():
282 panic("run_one_acceptor: No acceptor defined for port:", port)
284 def stop_acceptor(port):
285 if acceptors.has_key(port):
286 daemon = acceptors[port]
291 # ============================================================
292 # handle lctl interface
295 Manage communication with lctl
298 def __init__(self, cmd):
300 Initialize close by finding the lctl binary.
302 self.lctl = find_prog(cmd)
306 debug('! lctl not found')
309 raise CommandError('lctl', "unable to find lctl binary.")
def use_save_file(self, file):
    """Remember the dump file; run() prepends a 'dump <file>' command when set."""
    setattr(self, 'save_file', file)
def set_nonblock(self, fd):
    """Switch fd into non-blocking mode, preserving its other status flags."""
    flags = fcntl.fcntl(fd, F_GETFL)
    fcntl.fcntl(fd, F_SETFL, os.O_NDELAY | flags)
321 the cmds are written to stdin of lctl
322 lctl doesn't return errors when run in script mode, so
324 should modify command line to accept multiple commands, or
325 create complex command line options
329 cmds = '\n dump ' + self.save_file + cmds
331 debug("+", cmd_line, cmds)
332 if config.noexec: return (0, [])
334 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
335 child.tochild.write(cmds + "\n")
336 child.tochild.close()
338 # From "Python Cookbook" from O'Reilly
339 outfile = child.fromchild
340 outfd = outfile.fileno()
341 self.set_nonblock(outfd)
342 errfile = child.childerr
343 errfd = errfile.fileno()
344 self.set_nonblock(errfd)
346 outdata = errdata = ''
349 ready = select.select([outfd,errfd],[],[]) # Wait for input
350 if outfd in ready[0]:
351 outchunk = outfile.read()
352 if outchunk == '': outeof = 1
353 outdata = outdata + outchunk
354 if errfd in ready[0]:
355 errchunk = errfile.read()
356 if errchunk == '': erreof = 1
357 errdata = errdata + errchunk
358 if outeof and erreof: break
359 # end of "borrowed" code
362 if os.WIFEXITED(ret):
363 rc = os.WEXITSTATUS(ret)
366 if rc or len(errdata):
367 raise CommandError(self.lctl, errdata, rc)
370 def runcmd(self, *args):
372 run lctl using the command line
374 cmd = string.join(map(str,args))
375 debug("+", self.lctl, cmd)
376 rc, out = run(self.lctl, cmd)
378 raise CommandError(self.lctl, out, rc)
382 def network(self, net, nid):
383 """ initialized network and add "self" """
387 quit """ % (net, nid)
390 # create a new connection
391 def connect(self, srv):
392 cmds = "\n add_uuid %s %s %s" % (srv.uuid, srv.nid, srv.net_type)
393 if srv.net_type in ('tcp', 'toe') and not config.lctl_dump:
403 connect %s %d %s""" % (cmds, srv.net_type,
406 srv.hostaddr, srv.port, flags )
408 cmds = cmds + "\n quit"
412 def recover(self, dev_uuid, new_conn):
416 recover %s""" %(dev_uuid, new_conn)
419 # add a route to a range
420 def add_route(self, net, gw, lo, hi):
429 def del_route(self, net, gw, lo, hi):
437 # add a route to a host
438 def add_route_host(self, net, uuid, gw, tgt):
448 # add a route to a range
449 def del_route_host(self, net, uuid, gw, tgt):
455 quit """ % (net, uuid, tgt)
458 # disconnect one connection
459 def disconnect(self, net, nid, port, servuuid):
465 quit""" % (net, nid, servuuid)
468 def del_uuid(self, servuuid):
472 quit""" % (servuuid,)
476 def disconnectAll(self, net):
484 # create a new device with lctl
485 def newdev(self, attach, setup = ""):
490 quit""" % (attach, setup)
494 def cleanup(self, name, uuid, force, failover = 0):
495 if failover: force = 1
501 quit""" % (name, ('', 'force')[force],
502 ('', 'failover')[failover])
506 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
511 lov_setconfig %s %d %d %d %s %s
512 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
516 def dump(self, dump_file):
519 quit""" % (dump_file)
522 # get list of devices
523 def device_list(self):
525 rc, out = self.runcmd('device_list')
526 except CommandError, e:
534 def lustre_version(self):
535 rc, out = self.runcmd('version')
539 def mount_option(self, option):
544 # ============================================================
545 # Various system-level functions
546 # (ideally moved to their own module)
548 # Run a command and return the output and status.
549 # stderr is sent to /dev/null, could use popen3 to
550 # save it if necessary
553 if config.noexec: return (0, [])
554 f = os.popen(cmd + ' 2>&1')
564 cmd = string.join(map(str,args))
567 # Run a command in the background.
568 def run_daemon(*args):
569 cmd = string.join(map(str,args))
571 if config.noexec: return 0
572 f = os.popen(cmd + ' 2>&1')
580 # Determine full path to use for an external command
581 # searches dirname(argv[0]) first, then PATH
583 syspath = string.split(os.environ['PATH'], ':')
584 cmdpath = os.path.dirname(sys.argv[0])
585 syspath.insert(0, cmdpath);
587 syspath.insert(0, os.path.join(config.portals, 'utils/'))
589 prog = os.path.join(d,cmd)
590 if os.access(prog, os.X_OK):
594 # Recursively look for file starting at base dir
595 def do_find_file(base, mod):
596 fullname = os.path.join(base, mod)
597 if os.access(fullname, os.R_OK):
599 for d in os.listdir(base):
600 dir = os.path.join(base,d)
601 if os.path.isdir(dir):
602 module = do_find_file(dir, mod)
606 def find_module(src_dir, dev_dir, modname):
607 mod = '%s.o' % (modname)
608 module = src_dir +'/'+ dev_dir +'/'+ mod
610 if os.access(module, os.R_OK):
616 # is the path a block device?
623 return stat.S_ISBLK(s[stat.ST_MODE])
625 # build fs according to type
627 def mkfs(dev, devsize, fstype,jsize):
632 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
634 # devsize is in 1k, and fs block count is in 4k
635 block_cnt = devsize/4
637 if fstype in ('ext3', 'extN'):
638 # ext3 journal size is in megabytes
639 if jsize: jopt = "-J size=%d" %(jsize,)
640 mkfs = 'mkfs.ext2 -j -b 4096 -F '
641 elif fstype == 'reiserfs':
642 # reiserfs journal size is in blocks
643 if jsize: jopt = "--journal_size %d" %(jsize,)
644 mkfs = 'mkreiserfs -ff'
646 print 'unsupported fs type: ', fstype
648 (ret, out) = run (mkfs, jopt, dev, block_cnt)
650 panic("Unable to build fs:", dev, string.join(out))
651 # enable hash tree indexing on fsswe
652 if fstype in ('ext3', 'extN'):
653 htree = 'echo "feature FEATURE_C5" | debugfs -w'
654 (ret, out) = run (htree, dev)
656 panic("Unable to enable htree:", dev)
658 # some systems use /dev/loopN, some /dev/loop/N
662 if not os.access(loop + str(0), os.R_OK):
664 if not os.access(loop + str(0), os.R_OK):
665 panic ("can't access loop devices")
668 # find loop device assigned to thefile
671 for n in xrange(0, MAX_LOOP_DEVICES):
673 if os.access(dev, os.R_OK):
674 (stat, out) = run('losetup', dev)
675 if out and stat == 0:
676 m = re.search(r'\((.*)\)', out[0])
677 if m and file == m.group(1):
683 # create file if necessary and assign the first free loop device
684 def init_loop(file, size, fstype, journal_size):
685 dev = find_loop(file)
687 print 'WARNING file:', file, 'already mapped to', dev
689 if config.reformat or not os.access(file, os.R_OK | os.W_OK):
691 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
692 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
695 panic("Unable to create backing store:", file)
696 mkfs(file, size, fstype, journal_size)
699 # find next free loop
700 for n in xrange(0, MAX_LOOP_DEVICES):
702 if os.access(dev, os.R_OK):
703 (stat, out) = run('losetup', dev)
705 run('losetup', dev, file)
708 print "out of loop devices"
710 print "out of loop devices"
713 # undo loop assignment
714 def clean_loop(file):
715 dev = find_loop(file)
717 ret, out = run('losetup -d', dev)
719 log('unable to clean loop device:', dev, 'for file:', file)
722 # determine if dev is formatted as a <fstype> filesystem
723 def need_format(fstype, dev):
724 # FIXME don't know how to implement this
727 # initialize a block device if needed
728 def block_dev(dev, size, fstype, format, journal_size):
729 if config.noexec: return dev
730 if not is_block(dev):
731 dev = init_loop(dev, size, fstype, journal_size)
732 elif config.reformat or (need_format(fstype, dev) and format == 'yes'):
733 mkfs(dev, size, fstype, journal_size)
736 # panic("device:", dev,
737 # "not prepared, and autoformat is not set.\n",
738 # "Rerun with --reformat option to format ALL filesystems")
743 """lookup IP address for an interface"""
744 rc, out = run("/sbin/ifconfig", iface)
747 addr = string.split(out[1])[1]
748 ip = string.split(addr, ':')[1]
751 def get_local_nid(net_type, wildcard):
752 """Return the local nid."""
754 if os.access('/proc/elan/device0/position', os.R_OK):
755 local = get_local_address('elan', '*')
757 local = get_local_address(net_type, wildcard)
760 def get_local_address(net_type, wildcard):
761 """Return the local address for the network type."""
763 if net_type in ('tcp', 'toe'):
765 iface, star = string.split(wildcard, ':')
766 local = if2addr(iface)
768 panic ("unable to determine ip for:", wildcard)
770 host = socket.gethostname()
771 local = socket.gethostbyname(host)
772 elif net_type == 'elan':
773 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
775 fp = open('/proc/elan/device0/position', 'r')
776 lines = fp.readlines()
785 elif net_type == 'gm':
786 fixme("automatic local address for GM")
787 elif net_type == 'scimac':
788 scinode="/opt/scali/sbin/scinode"
789 if os.path.exists(scinode):
790 (rc,local) = run(scinode)
792 panic (scinode, " not found on node with scimac networking")
794 panic (scinode, " failed")
795 local=string.rstrip(local[0])
800 def is_prepared(uuid):
801 """Return true if a device exists for the uuid"""
804 if config.noexec and config.cleanup:
807 # expect this format:
808 # 1 UP ldlm ldlm ldlm_UUID 2
809 out = lctl.device_list()
811 if uuid == string.split(s)[4]:
813 except CommandError, e:
817 def is_prepared_name(name):
818 """Return true if a device exists for the name"""
821 if config.noexec and config.cleanup:
824 # expect this format:
825 # 1 UP ldlm ldlm ldlm_UUID 2
826 out = lctl.device_list()
828 if name == string.split(s)[3]:
830 except CommandError, e:
834 def is_network_prepared():
835 """If the LDLM device exists, then assume that all networking
836 has been configured"""
837 return is_prepared('ldlm_UUID')
839 def fs_is_mounted(path):
840 """Return true if path is a mounted lustre filesystem"""
842 fp = open('/proc/mounts')
843 lines = fp.readlines()
847 if a[1] == path and a[2] == 'lustre_lite':
854 # ============================================================
855 # Classes to prepare and cleanup the various objects
858 """ Base class for the rest of the modules. The default cleanup method is
859 defined here, as well as some utilitiy funcs.
861 def __init__(self, module_name, db):
863 self.module_name = module_name
864 self.name = self.db.getName()
865 self.uuid = self.db.getUUID()
866 self.kmodule_list = []
def info(self, *args):
    """Print a one-line status message tagged with this module's name and uuid."""
    msg = string.join(map(str,args))
    print self.module_name + ":", self.name, self.uuid, msg
875 """ default cleanup, used for most modules """
878 lctl.cleanup(self.name, self.uuid, config.force)
879 except CommandError, e:
880 log(self.module_name, "cleanup failed: ", self.name)
884 def add_portals_module(self, dev_dir, modname):
885 """Append a module to list of modules to load."""
886 self.kmodule_list.append((config.portals, dev_dir, modname))
888 def add_lustre_module(self, dev_dir, modname):
889 """Append a module to list of modules to load."""
890 self.kmodule_list.append((config.lustre, dev_dir, modname))
892 def mod_loaded(self, modname):
893 """Check if a module is already loaded. Look in /proc/modules for it."""
894 fp = open('/proc/modules')
895 lines = fp.readlines()
897 # please forgive my tired fingers for this one
898 ret = filter(lambda word, mod=modname: word == mod,
899 map(lambda line: string.split(line)[0], lines))
902 def load_module(self):
903 """Load all the modules in the list in the order they appear."""
904 for src_dir, dev_dir, mod in self.kmodule_list:
905 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
906 if self.mod_loaded(mod) and not config.noexec:
908 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
910 module = find_module(src_dir, dev_dir, mod)
912 panic('module not found:', mod)
913 (rc, out) = run('/sbin/insmod', module)
915 raise CommandError('insmod', out, rc)
917 (rc, out) = run('/sbin/modprobe', mod)
919 raise CommandError('modprobe', out, rc)
921 def cleanup_module(self):
922 """Unload the modules in the list in reverse order."""
923 if not self.safe_to_clean():
925 rev = self.kmodule_list
927 for src_dir, dev_dir, mod in rev:
928 if not self.mod_loaded(mod) and not config.noexec:
931 if mod == 'portals' and config.dump:
932 lctl.dump(config.dump)
933 log('unloading module:', mod)
934 (rc, out) = run('/sbin/rmmod', mod)
936 log('! unable to unload module:', mod)
939 def safe_to_clean(self):
942 def safe_to_clean_modules(self):
943 return self.safe_to_clean()
945 class Network(Module):
946 def __init__(self,db):
947 Module.__init__(self, 'NETWORK', db)
948 self.net_type = self.db.get_val('nettype')
949 self.nid = self.db.get_val('nid', '*')
950 self.cluster_id = self.db.get_val('clusterid', "0")
951 self.port = self.db.get_val_int('port', 0)
952 self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
953 self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
954 self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
955 self.nid_exchange = self.db.get_val_int('nidexchange', 0)
958 if self.nid_exchange:
959 self.nid = get_local_nid(self.net_type, self.nid)
961 self.nid = get_local_address(self.net_type, self.nid)
963 panic("unable to set nid for", self.net_type, self.nid)
964 debug("nid:", self.nid)
966 self.hostaddr = self.db.get_val('hostaddr', self.nid)
967 if '*' in self.hostaddr:
968 self.hostaddr = get_local_address(self.net_type, self.hostaddr)
970 panic("unable to set nid for", self.net_type, self.hostaddr)
971 debug("hostaddr:", self.hostaddr)
973 self.add_portals_module("libcfs", 'portals')
974 if node_needs_router():
975 self.add_portals_module("router", 'kptlrouter')
976 if self.net_type == 'tcp':
977 self.add_portals_module("knals/socknal", 'ksocknal')
978 if self.net_type == 'toe':
979 self.add_portals_module("knals/toenal", 'ktoenal')
980 if self.net_type == 'elan':
981 self.add_portals_module("knals/qswnal", 'kqswnal')
982 if self.net_type == 'gm':
983 self.add_portals_module("knals/gmnal", 'kgmnal')
984 if self.net_type == 'scimac':
985 self.add_portals_module("knals/scimacnal", 'kscimacnal')
988 if is_network_prepared():
990 self.info(self.net_type, self.nid, self.port)
991 lctl.network(self.net_type, self.nid)
992 if self.port and node_is_router():
993 run_one_acceptor(self.port)
994 self.connect_peer_gateways()
996 def connect_peer_gateways(self):
997 for router in self.db.lookup_class('node'):
998 if router.get_val_int('router', 0):
999 # if this is a peer with a nid less than mine,
1001 for netuuid in router.get_networks():
1002 net = self.db.lookup(netuuid)
1004 if (gw.cluster_id == self.cluster_id and
1005 gw.net_type == self.net_type):
1006 # hack: compare as numbers if possible, this should all
1007 # go away once autoconnect is done.
1008 # This also conveniently prevents us from connecting to ourself.
1010 gw_nid = my_int(gw.nid)
1011 self_nid = my_int(self.nid)
1012 except ValueError, e:
1013 print "Error!", str(e)
1016 if gw_nid < self_nid:
1019 def disconnect_peer_gateways(self):
1020 for router in self.db.lookup_class('node'):
1021 if router.get_val_int('router', 0):
1022 # if this is a peer with a nid less than mine,
1024 if (gw.cluster_id == self.cluster_id and
1025 gw.net_type == self.net_type):
1026 # hack: compare as numbers if possible, this should all
1027 # go away once autoconnect is done.
1028 # This also conveniently prevents us from connecting to ourself.
1030 gw_nid = my_int(gw.nid)
1031 self_nid = my_int(self.nid)
1032 except ValueError, e:
1033 print "Error!", str(e)
1036 if gw_nid < self_nid:
1038 lctl.disconnect(router.net_type, router.nid, router.port,
1040 except CommandError, e:
1041 print "disconnectAll failed: ", self.name
def safe_to_clean(self):
    """Tearing down the network is safe only once nothing is configured on it."""
    prepared = is_network_prepared()
    return not prepared
1049 self.info(self.net_type, self.nid, self.port)
1051 stop_acceptor(self.port)
1052 if node_is_router():
1053 self.disconnect_peer_gateways()
1055 lctl.disconnectAll(self.net_type)
1056 except CommandError, e:
1057 print "disconnectAll failed: ", self.name
1061 class RouteTable(Module):
1062 def __init__(self,db):
1063 Module.__init__(self, 'ROUTES', db)
1065 if is_network_prepared():
1068 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1069 lctl.add_route(net_type, gw, lo, hi)
1070 if net_type in ('tcp', 'toe') and local_net_type(net_type) and lo == hi:
1071 srvdb = self.db.nid2server(lo, net_type)
1073 panic("no server for nid", lo)
1075 srv = Network(srvdb)
def safe_to_clean(self):
    # Routes may only be removed once the network itself is unconfigured.
    return not is_network_prepared()
1082 if is_network_prepared():
1083 # the network is still being used, don't clean it up
1085 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1086 if net_type in ('tcp', 'toe') and local_net_type(net_type) and hi == '':
1087 srvdb = self.db.nid2server(lo, net_type)
1089 panic("no server for nid", lo)
1091 srv = Network(srvdb)
1093 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
1094 except CommandError, e:
1095 print "disconnect failed: ", self.name
1099 lctl.del_route(net_type, gw, lo, hi)
1100 except CommandError, e:
1101 print "del_route failed: ", self.name
def __init__(self,db):
    """Register the LDLM device plus the kernel modules it depends on."""
    Module.__init__(self, 'LDLM', db)
    # For all three modules the source directory matches the module name.
    for mod in ('obdclass', 'ptlrpc', 'ldlm'):
        self.add_lustre_module(mod, mod)
1113 if is_prepared(self.uuid):
1116 lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid))
def safe_to_clean(self):
    """Only clean up LDLM when it is the last configured device (or none remain)."""
    remaining = lctl.device_list()
    return len(remaining) <= 1
1123 if is_prepared(self.uuid):
1124 Module.cleanup(self)
1127 def __init__(self,db):
1128 Module.__init__(self, 'LOV', db)
1129 self.add_lustre_module('mdc', 'mdc')
1130 self.add_lustre_module('lov', 'lov')
1131 self.mds_uuid = self.db.get_first_ref('mds')
1132 mds= self.db.lookup(self.mds_uuid)
1133 self.mds_name = mds.getName()
1134 self.stripe_sz = self.db.get_val_int('stripesize', 65536)
1135 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1136 self.pattern = self.db.get_val_int('stripepattern', 0)
1137 self.devlist = self.db.get_refs('obd')
1138 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1141 for obd_uuid in self.devlist:
1142 obd = self.db.lookup(obd_uuid)
1143 osc = get_osc(obd, self.name)
1145 self.osclist.append(osc)
1147 panic('osc not found:', obd_uuid)
1150 if is_prepared(self.uuid):
1152 for osc in self.osclist:
1154 # Only ignore connect failures with --force, which
1155 # isn't implemented here yet.
1156 osc.prepare(ignore_connect_failure=0)
1157 except CommandError, e:
1158 print "Error preparing OSC %s (inactive)\n" % osc.uuid
1160 self.mdc_uuid = prepare_mdc(self.db, self.name, self.mds_uuid)
1161 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1162 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1163 lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
1164 setup ="%s" % (self.mdc_uuid))
1167 if is_prepared(self.uuid):
1168 Module.cleanup(self)
1169 for osc in self.osclist:
1171 cleanup_mdc(self.db, self.name, self.mds_uuid)
1173 def load_module(self):
1174 for osc in self.osclist:
1177 Module.load_module(self)
def cleanup_module(self):
    """Unload this LOV's own modules first, then each constituent OSC's."""
    Module.cleanup_module(self)
    for client in self.osclist:
        client.cleanup_module()
1185 class LOVConfig(Module):
1186 def __init__(self,db):
1187 Module.__init__(self, 'LOVConfig', db)
1189 self.lov_uuid = self.db.get_first_ref('lov')
1190 l = self.db.lookup(self.lov_uuid)
1195 self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
1196 lov.pattern, lov.devlist, lov.mds_name)
1197 lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
1198 lov.stripe_sz, lov.stripe_off, lov.pattern,
1199 string.join(lov.devlist))
1205 class MDSDEV(Module):
1206 def __init__(self,db):
1207 Module.__init__(self, 'MDSDEV', db)
1208 self.devpath = self.db.get_val('devpath','')
1209 self.size = self.db.get_val_int('devsize', 0)
1210 self.journal_size = self.db.get_val_int('journalsize', 0)
1211 self.fstype = self.db.get_val('fstype', '')
1212 # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
1213 target_uuid = self.db.get_first_ref('target')
1214 mds = self.db.lookup(target_uuid)
1215 self.name = mds.getName()
1216 self.lovconfig_uuids = mds.get_refs('lovconfig')
1217 self.filesystem_uuids = mds.get_refs('filesystem')
1218 # FIXME: if fstype not set, then determine based on kernel version
1219 self.format = self.db.get_val('autoformat', "no")
1220 if mds.get_val('failover', 0):
1221 self.failover_mds = 'f'
1223 self.failover_mds = ''
1224 active_uuid = get_active_target(mds)
1226 panic("No target device found:", target_uuid)
1227 if active_uuid == self.uuid:
1231 if self.active and config.group and config.group != ost.get_val('group'):
1234 self.target_dev_uuid = self.uuid
1235 self.uuid = target_uuid
1237 self.add_lustre_module('mds', 'mds')
1239 self.add_lustre_module('obdclass', 'fsfilt_%s' % (self.fstype))
1241 def load_module(self):
1243 Module.load_module(self)
1246 if is_prepared(self.uuid):
1249 debug(self.uuid, "not active")
1251 self.info(self.devpath, self.fstype, self.format)
1253 blkdev = block_dev(self.devpath, self.size, self.fstype, self.format,
1255 if not is_prepared('MDT_UUID'):
1256 lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
1258 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
1259 setup ="%s %s" %(blkdev, self.fstype))
1260 for uuid in self.lovconfig_uuids:
1261 db = self.db.lookup(uuid)
1262 lovconfig = LOVConfig(db)
1264 if config.mds_ost_conn:
1265 for uuid in self.filesystem_uuids:
1266 log("open clients for filesystem:", uuid)
1267 fs = self.db.lookup(uuid)
1268 obd_uuid = fs.get_first_ref('obd')
1269 client = VOSC(self.db.lookup(obd_uuid), self.name)
1273 def msd_remaining(self):
1274 out = lctl.device_list()
1276 if string.split(s)[2] in ('mds',):
1279 def safe_to_clean(self):
def safe_to_clean_modules(self):
    # Only unload the MDS modules once no mds devices remain configured.
    return not self.msd_remaining()
1287 debug(self.uuid, "not active")
1289 if is_prepared(self.uuid):
1292 lctl.cleanup(self.name, self.uuid, config.force,
1294 except CommandError, e:
1295 log(self.module_name, "cleanup failed: ", self.name)
1298 Module.cleanup(self)
1299 if config.mds_ost_conn:
1300 for uuid in self.filesystem_uuids:
1301 log("clean clients for filesystem:", uuid)
1302 log("open clients for filesystem:", uuid)
1303 fs = self.db.lookup(uuid)
1304 obd_uuid = fs.get_first_ref('obd')
1305 client = VOSC(self.db.lookup(obd_uuid), self.name)
1307 if not self.msd_remaining() and is_prepared('MDT_UUID'):
1309 lctl.cleanup("MDT", "MDT_UUID", config.force,
1311 except CommandError, e:
1312 print "cleanup failed: ", self.name
1315 clean_loop(self.devpath)
1318 def __init__(self, db):
1319 Module.__init__(self, 'OSD', db)
1320 self.osdtype = self.db.get_val('osdtype')
1321 self.devpath = self.db.get_val('devpath', '')
1322 self.size = self.db.get_val_int('devsize', 0)
1323 self.journal_size = self.db.get_val_int('journalsize', 0)
1324 self.fstype = self.db.get_val('fstype', '')
1325 target_uuid = self.db.get_first_ref('target')
1326 ost = self.db.lookup(target_uuid)
1327 self.name = ost.getName()
1328 self.format = self.db.get_val('autoformat', 'yes')
1329 if ost.get_val('failover', 0):
1330 self.failover_ost = 'f'
1332 self.failover_ost = ''
1334 active_uuid = get_active_target(ost)
1336 panic("No target device found:", target_uuid)
1337 if active_uuid == self.uuid:
1341 if self.active and config.group and config.group != ost.get_val('group'):
1344 self.target_dev_uuid = self.uuid
1345 self.uuid = target_uuid
1347 self.add_lustre_module('ost', 'ost')
1348 # FIXME: should we default to ext3 here?
1350 self.add_lustre_module('obdclass' , 'fsfilt_%s' % (self.fstype))
1351 self.add_lustre_module(self.osdtype, self.osdtype)
1353 def load_module(self):
1355 Module.load_module(self)
1357 # need to check /proc/mounts and /etc/mtab before
1358 # formatting anything.
1359 # FIXME: check if device is already formatted.
1361 if is_prepared(self.uuid):
1364 debug(self.uuid, "not active")
1366 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1367 self.format, self.journal_size)
1369 if self.osdtype == 'obdecho':
1372 blkdev = block_dev(self.devpath, self.size, self.fstype,
1373 self.format, self.journal_size)
1374 lctl.newdev(attach="%s %s %s" % (self.osdtype, self.name, self.uuid),
1375 setup ="%s %s %s" %(blkdev, self.fstype,
1377 if not is_prepared('OSS_UUID'):
1378 lctl.newdev(attach="ost %s %s" % ('OSS', 'OSS_UUID'),
1381 def osd_remaining(self):
1382 out = lctl.device_list()
1384 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1387 def safe_to_clean(self):
def safe_to_clean_modules(self):
    # Only unload OST modules once no obdfilter/obdecho devices remain.
    return not self.osd_remaining()
1395 debug(self.uuid, "not active")
1397 if is_prepared(self.uuid):
1400 lctl.cleanup(self.name, self.uuid, config.force,
1402 except CommandError, e:
1403 log(self.module_name, "cleanup failed: ", self.name)
1406 if not self.osd_remaining() and is_prepared('OSS_UUID'):
1408 lctl.cleanup("OSS", "OSS_UUID", config.force,
1410 except CommandError, e:
1411 print "cleanup failed: ", self.name
1414 if not self.osdtype == 'obdecho':
1415 clean_loop(self.devpath)
1417 # Generic client module, used by OSC and MDC
1418 class Client(Module):
1419 def __init__(self, tgtdb, module, owner):
1420 self.target_name = tgtdb.getName()
1421 self.target_uuid = tgtdb.getUUID()
1424 self.tgt_dev_uuid = get_active_target(tgtdb)
1425 if not self.tgt_dev_uuid:
1426 panic("No target device found for target:", self.target_name)
1428 self.kmodule_list = []
1432 self.module = module
1433 self.module_name = string.upper(module)
1434 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
1435 self.target_name, owner)
1436 self.uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
1438 int(random.random() * 1048576),
1439 int(random.random() * 1048576))
1440 self.uuid = self.uuid[0:36]
1441 self.lookup_server(self.tgt_dev_uuid)
1442 self.add_lustre_module(module, module)
def lookup_server(self, srv_uuid):
    """Look up and cache the network endpoints serving srv_uuid.

    Panics when no server network is found (panic is a no-op raise
    under --noexec, in which case the empty list is cached).
    """
    nets = get_ost_net(self.db, srv_uuid)
    self._server_nets = nets
    if len(nets) == 0:
        panic ("Unable to find a server for:", srv_uuid)
def get_servers(self):
    # Network endpoints cached by lookup_server().
    return self._server_nets
1453 def prepare(self, ignore_connect_failure = 0):
1454 self.info(self.target_uuid)
1455 if is_prepared_name(self.name):
1458 srv = choose_local_server(self.get_servers())
1462 srv, r = find_route(self.get_servers())
1464 lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
1466 panic ("no route to", self.target_uuid)
1467 except CommandError, e:
1468 if not ignore_connect_failure:
1471 lctl.newdev(attach="%s %s %s" % (self.module, self.name, self.uuid),
1472 setup ="%s %s" %(self.target_uuid, srv.uuid))
1475 if is_prepared_name(self.name):
1476 Module.cleanup(self)
1478 srv = choose_local_server(self.get_servers())
1480 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
1482 srv, r = find_route(self.get_servers())
1484 lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
1485 except CommandError, e:
1486 log(self.module_name, "cleanup failed: ", self.name)
# MDC: metadata client -- thin subclass binding Client to the 'mdc' module.
# (the 'class MDC(Client):' header line is missing from this listing)
1492 def __init__(self, db, owner):
1493 Client.__init__(self, db, 'mdc', owner)
# OSC: object storage client ('class OSC(Client):' header line also missing)
1496 def __init__(self, db, owner):
1497 Client.__init__(self, db, 'osc', owner)
# COBD: cache OBD stacking a cache device over a real one.
# (the 'class COBD(Module):' header line is missing from this listing)
1501 def __init__(self, db):
1502 Module.__init__(self, 'COBD', db)
1503 self.real_uuid = self.db.get_first_ref('realobd')
1504 self.cache_uuid = self.db.get_first_ref('cacheobd')
1505 self.add_lustre_module('cobd' , 'cobd')
1507 # need to check /proc/mounts and /etc/mtab before
1508 # formatting anything.
1509 # FIXME: check if device is already formatted.
# prepare: attach/setup the cobd ('def prepare' line 1510 dropped)
1511 if is_prepared(self.uuid):
1513 self.info(self.real_uuid, self.cache_uuid)
1514 lctl.newdev(attach="cobd %s %s" % (self.name, self.uuid),
1515 setup ="%s %s" %(self.real_uuid, self.cache_uuid))
1518 # virtual interface for OSC and LOV
# VOSC delegates to a LOV or a plain OSC depending on the config class.
# ('class VOSC(Module):' header line is missing from this listing)
1520 def __init__(self,db, owner):
1521 Module.__init__(self, 'VOSC', db)
1522 if db.get_class() == 'lov':
# non-lov case: wrap a plain OSC
1525 self.osc = get_osc(db, owner)
# get_uuid body ('def get_uuid' line dropped)
1527 return self.osc.uuid
1532 def load_module(self):
1533 self.osc.load_module()
1534 def cleanup_module(self):
1535 self.osc.cleanup_module()
# need_mdc: a lov carries its own mdc; plain OSC mounts need a separate one
1537 return self.db.get_class() != 'lov'
1538 def get_mdc_uuid(self):
1539 if self.db.get_class() == 'lov':
1540 return self.osc.mdc_uuid
# Test client that attaches an echo_client device on top of a VOSC.
1544 class ECHO_CLIENT(Module):
1545 def __init__(self,db):
1546 Module.__init__(self, 'ECHO_CLIENT', db)
1547 self.add_lustre_module('obdecho', 'obdecho')
1548 self.obd_uuid = self.db.get_first_ref('obd')
1549 obd = self.db.lookup(self.obd_uuid)
1550 self.osc = VOSC(obd, self.name)
# prepare ('def prepare' line dropped): underlying osc comes up first
1553 if is_prepared(self.uuid):
1555 self.osc.prepare() # XXX This is so cheating. -p
1556 self.info(self.obd_uuid)
1558 lctl.newdev(attach="echo_client %s %s" % (self.name, self.uuid),
1559 setup = self.osc.get_uuid())
# cleanup ('def cleanup' line dropped)
1562 if is_prepared(self.uuid):
1563 Module.cleanup(self)
# module handling mirrors prepare/cleanup ordering: osc first on load,
# last on unload
1566 def load_module(self):
1567 self.osc.load_module()
1568 Module.load_module(self)
1569 def cleanup_module(self):
1570 Module.cleanup_module(self)
1571 self.osc.cleanup_module()
# Mountpoint: a client lustre_lite mount; wires a VOSC (lov or osc) and,
# when needed, an MDC, then drives mount(8)/umount(8).
# (several lines are dropped from this listing; kept verbatim, comments only)
1574 class Mountpoint(Module):
1575 def __init__(self,db):
1576 Module.__init__(self, 'MTPT', db)
1577 self.path = self.db.get_val('path')
1578 self.fs_uuid = self.db.get_first_ref('filesystem')
1579 fs = self.db.lookup(self.fs_uuid)
1580 self.mds_uuid = fs.get_first_ref('mds')
1581 self.obd_uuid = fs.get_first_ref('obd')
1582 obd = self.db.lookup(self.obd_uuid)
1583 self.vosc = VOSC(obd, self.name)
# only load mdc when the vosc does not already provide one (plain osc case)
1584 if self.vosc.need_mdc():
1585 self.add_lustre_module('mdc', 'mdc')
1586 self.add_lustre_module('llite', 'llite')
# prepare ('def prepare' line dropped): idempotent if already mounted
1590 if fs_is_mounted(self.path):
1591 log(self.path, "already mounted.")
1594 if self.vosc.need_mdc():
1595 mdc_uuid = prepare_mdc(self.db, self.name, self.mds_uuid)
1597 mdc_uuid = self.vosc.get_mdc_uuid()
1600 panic("Unable to determine MDC UUID. Probably need to cleanup before re-mounting.")
1601 self.info(self.path, self.mds_uuid, self.obd_uuid)
# with --lctl_dump only the mount options are recorded, no real mount
1602 if config.lctl_dump:
1603 cmd = "osc=%s,mdc=%s" % (self.vosc.get_uuid(), mdc_uuid)
1604 lctl.mount_option(cmd)
1606 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
1607 (self.vosc.get_uuid(), mdc_uuid, self.path)
1608 run("mkdir", self.path)
# on mount failure, undo the mdc we just prepared before bailing out
1612 if self.vosc.need_mdc():
1613 cleanup_mdc(self.db, self.name, self.mds_uuid)
1614 panic("mount failed:", self.path, ":", string.join(val))
# cleanup ('def cleanup' line dropped): force umount first under --force
1617 self.info(self.path, self.mds_uuid,self.obd_uuid)
1618 if fs_is_mounted(self.path):
1620 (rc, out) = run("umount", "-f", self.path)
1622 (rc, out) = run("umount", self.path)
1624 raise CommandError('umount', out, rc)
1626 if fs_is_mounted(self.path):
1627 panic("fs is still mounted:", self.path)
1630 if self.vosc.need_mdc():
1631 cleanup_mdc(self.db, self.name, self.mds_uuid)
1633 def load_module(self):
1634 self.vosc.load_module()
1635 Module.load_module(self)
1636 def cleanup_module(self):
1637 Module.cleanup_module(self)
1638 self.vosc.cleanup_module()
1641 # ============================================================
1642 # misc query functions
# get_ost_net: collect Network objects for the node hosting device osd_uuid.
# 'self' here is a db object -- called as get_ost_net(db, uuid) elsewhere
# (see doRecovery); several lines are dropped from this listing.
1644 def get_ost_net(self, osd_uuid):
1648 osd = self.lookup(osd_uuid)
1649 node_uuid = osd.get_first_ref('node')
1650 node = self.lookup(node_uuid)
1652 panic("unable to find node for osd_uuid:", osd_uuid,
1653 " node_ref:", node_uuid)
1654 for net_uuid in node.get_networks():
1655 db = node.lookup(net_uuid)
1656 srv_list.append(Network(db))
1660 # the order of initialization is based on level.
# Maps a config class to its startup level; the actual 'ret = NN' assignment
# lines are dropped from this listing (network lowest, mountpoint highest).
1661 def getServiceLevel(self):
1662 type = self.get_class()
1664 if type in ('network',):
1666 elif type in ('routetbl',):
1668 elif type in ('ldlm',):
1670 elif type in ('osd', 'cobd'):
1672 elif type in ('mdsdev',):
1674 elif type in ('mountpoint', 'echoclient'):
1677 panic("Unknown type: ", type)
# services outside [minlevel, maxlevel] are filtered out (return dropped)
1679 if ret < config.minlevel or ret > config.maxlevel:
1684 # return list of services in a profile. list is a list of tuples
1685 # [(level, db_object),]
# (lines dropped from this listing: list init, level-0 filter, sort, return)
1686 def getServices(self):
1688 for ref_class, ref_uuid in self.get_all_refs():
1689 servdb = self.lookup(ref_uuid)
1691 level = getServiceLevel(servdb)
1693 list.append((level, servdb))
1695 panic('service not found: ' + ref_uuid)
1701 ############################################################
1703 # FIXME: clean this mess up!
1705 # OSC is no longer in the xml, so we have to fake it.
1706 # this is getting ugly and begging for another refactoring
# get_osc: fabricate an OSC module object ('return osc' line dropped)
1707 def get_osc(ost_db, owner):
1708 osc = OSC(ost_db, owner)
# get_mdc: look up the mds and wrap it in an MDC ('return mdc' line dropped)
1711 def get_mdc(db, owner, mds_uuid):
1712 mds_db = db.lookup(mds_uuid);
1714 panic("no mds:", mds_uuid)
1715 mdc = MDC(mds_db, owner)
# prepare_mdc / cleanup_mdc: build the MDC and (in lines dropped from this
# listing) run its prepare/cleanup
1718 def prepare_mdc(db, owner, mds_uuid):
1719 mdc = get_mdc(db, owner, mds_uuid)
1723 def cleanup_mdc(db, owner, mds_uuid):
1724 mdc = get_mdc(db, owner, mds_uuid)
1728 ############################################################
1729 # routing ("rooting")
1731 # list of (nettype, cluster_id)
# Record this node's (net_type, cluster_id) pairs and create an
# AcceptorHandler per listening port. (lines dropped from this listing)
1734 def find_local_clusters(node_db):
1735 global local_clusters
1736 for netuuid in node_db.get_networks():
1737 net = node_db.lookup(netuuid)
# 'srv' is presumably built from 'net' on a dropped line (1738) -- confirm
1739 debug("add_local", netuuid)
1740 local_clusters.append((srv.net_type, srv.cluster_id))
1742 if acceptors.has_key(srv.port):
1743 panic("duplicate port:", srv.port)
1744 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
1745 srv.send_mem, srv.recv_mem,
1749 # This node is a gateway.
# node_is_router: body ('return is_router') is dropped from this listing
1751 def node_is_router():
1754 # If there are any routers found in the config, then this will be true
1755 # and all nodes will load kptlrouter.
def node_needs_router():
    """kptlrouter is needed if this node routes itself (is_router) or the
    config declared any routers (needs_router, set in find_local_routes)."""
    wants_router = needs_router or is_router
    return wants_router
1760 # list of (nettype, gw, tgt_cluster_id, lo, hi)
1761 # Currently, these local routes are only added to kptlrouter route
1762 # table if they are needed to connect to a specific server. This
1763 # should be changed so all available routes are loaded, and the
1764 # ptlrouter can make all the decisions.
# Scan all nodes marked router=1 and record routes reachable from this
# node's local clusters. (lines dropped from this listing, incl. docstring
# tail and the 'for router in list:' loop header)
1767 def find_local_routes(lustre):
1768 """ Scan the lustre config looking for routers . Build list of
1770 global local_routes, needs_router
1772 list = lustre.lookup_class('node')
1774 if router.get_val_int('router', 0):
1776 for (local_type, local_cluster_id) in local_clusters:
# find the gateway nid the router exposes on our local cluster
1778 for netuuid in router.get_networks():
1779 db = router.lookup(netuuid)
1780 if (local_type == db.get_val('nettype') and
1781 local_cluster_id == db.get_val('clusterid')):
1782 gw = db.get_val('nid')
1785 debug("find_local_routes: gw is", gw)
1786 for route in router.get_local_routes(local_type, gw):
1787 local_routes.append(route)
1788 debug("find_local_routes:", local_routes)
# choose_local_server: first server on a directly reachable net type
# ('return srv' / fall-through lines dropped from this listing)
1791 def choose_local_server(srv_list):
1792 for srv in srv_list:
1793 if local_net_type(srv.net_type):
# local_net_type: membership test against local_clusters (returns dropped)
1796 def local_net_type(net_type):
1797 for cluster in local_clusters:
1798 if net_type == cluster[0]:
# find_route: match each candidate server against local_routes entries
# (nettype, gw, tgt_cluster_id, lo, hi); the return lines are dropped.
1802 def find_route(srv_list):
1803 frm_type = local_clusters[0][0]
1804 for srv in srv_list:
1805 debug("find_route: srv:", srv.hostaddr, "type: ", srv.net_type)
1806 to_type = srv.net_type
1807 to = srv.hostaddr # XXX should this be hostaddr, or nid?
1808 cluster_id = srv.cluster_id
1809 debug ('looking for route to', to_type, to)
1810 for r in local_routes:
1811 debug("find_route: ", r)
# r[3]..r[4] is the lo..hi address range, r[2] the target cluster id
1812 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
# Resolve the currently active device uuid for a target: --select wins,
# otherwise the 'active' ref. (branch/return lines dropped from listing)
1816 def get_active_target(db):
1817 target_uuid = db.getUUID()
1818 target_name = db.getName()
1819 node_name = get_select(target_name)
1821 tgt_dev_uuid = db.get_target_device(target_uuid, node_name)
1823 tgt_dev_uuid = db.get_first_ref('active')
1827 ############################################################
# newService factory ('def newService(db):' line ~1830 is dropped from this
# listing): maps a config class name to a Module subclass instance; the
# constructor-call lines for each branch are dropped too.
1831 type = db.get_class()
1832 debug('Service:', type, db.getName(), db.getUUID())
1838 elif type == 'network':
1840 elif type == 'routetbl':
1844 elif type == 'cobd':
1846 elif type == 'mdsdev':
1848 elif type == 'mountpoint':
1850 elif type == 'echoclient':
1853 panic ("unknown service type:", type)
1857 # Prepare the system to run lustre using a particular profile
1858 # in a the configuration.
1859 # * load & the modules
1860 # * setup networking for the current node
1861 # * make sure partitions are in place and prepared
1862 # * initialize devices with lctl
1863 # Levels is important, and needs to be enforced.
def for_each_profile(db, prof_list, operation):
    """Apply `operation` to the services of each profile uuid in prof_list.

    Reconstructed from the garbled listing (the 'if not prof_db:' guard and
    the trailing operation(services) call were dropped by extraction).
    """
    for prof_uuid in prof_list:
        prof_db = db.lookup(prof_uuid)
        if not prof_db:
            # BUG FIX: original referenced the undefined name 'profile' here,
            # raising NameError instead of the intended panic message.
            panic("profile:", prof_uuid, "not found.")
        services = getServices(prof_db)
        operation(services)
# The four per-service operations driven by for_each_profile. Setup/module
# loading run in level order; cleanup/unload run in reverse (the sort and
# loop lines are dropped from this listing). 's' is a (level, db) tuple.
1872 def doSetup(services):
1876 n = newService(s[1])
1879 def doModules(services):
1883 n = newService(s[1])
1886 def doCleanup(services):
1891 n = newService(s[1])
# services report whether device cleanup is currently safe
1892 if n.safe_to_clean():
1895 def doUnloadModules(services):
1900 n = newService(s[1])
1901 if n.safe_to_clean_modules():
# doHost: top-level driver for one node -- find the node entry, learn its
# router/upcall/timeout settings, then run recovery, cleanup or setup.
# (many lines dropped from this listing; kept verbatim, comments only)
1906 def doHost(lustreDB, hosts):
1910 node_db = lustreDB.lookup_name(h, 'node')
1914 print 'No host entry found.'
# node-level tunables; command line may override (see sys_set_* helpers)
1917 is_router = node_db.get_val_int('router', 0)
1918 lustre_upcall = node_db.get_val('lustreUpcall', '')
1919 portals_upcall = node_db.get_val('portalsUpcall', '')
1920 timeout = node_db.get_val_int('timeout', 0)
1922 find_local_clusters(node_db)
# routes only matter for non-router nodes here (router branch dropped)
1924 find_local_routes(lustreDB)
1926 # Two step process: (1) load modules, (2) setup lustre
1927 # if not cleaning, load modules first.
1928 prof_list = node_db.get_refs('profile')
# --recover path needs all three uuids
1931 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
1932 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
1933 "--client_uuid <UUID> --conn_uuid <UUID>")
1934 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
1936 elif config.cleanup:
1938 # the command line can override this value
1940 # ugly hack, only need to run lctl commands for --dump
1941 if config.lctl_dump:
1942 for_each_profile(node_db, prof_list, doCleanup)
# cleanup path: push tunables down before tearing devices/modules down
1945 sys_set_timeout(timeout)
1948 sys_set_lustre_upcall(lustre_upcall)
1949 sys_set_portals_upcall(portals_upcall)
1951 for_each_profile(node_db, prof_list, doCleanup)
1952 for_each_profile(node_db, prof_list, doUnloadModules)
1955 # ugly hack, only need to run lctl commands for --dump
1956 if config.lctl_dump:
1957 for_each_profile(node_db, prof_list, doSetup)
# enlarge socket buffer limits before bringing up networking
1961 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
1962 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
1964 for_each_profile(node_db, prof_list, doModules)
1966 sys_set_debug_path()
# --gdb support: emit a module script and give the user time to attach
1969 script = config.gdb_script
1970 run(lctl.lctl, ' modules >', script)
1972 log ("The GDB module script is in", script)
1973 # pause, so user has time to break and
1976 sys_set_timeout(timeout)
1977 sys_set_lustre_upcall(lustre_upcall)
1978 sys_set_portals_upcall(portals_upcall)
1980 for_each_profile(node_db, prof_list, doSetup)
# doRecovery: re-point a client at the newly active device of a failed
# target. (guard-clause lines are dropped from this listing)
1982 def doRecovery(db, lctl, tgt_uuid, client_uuid, conn_uuid):
1983 tgt = db.lookup(tgt_uuid)
1985 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
1986 new_uuid = get_active_target(tgt)
1988 raise Lustre.LconfError("doRecovery: no active target found for: " +
# pick a server for the new device that we can reach directly
1990 net = choose_local_server(get_ost_net(db, new_uuid))
1992 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
1993 # XXX, better to do a full disconnect here
1994 log("Reconnecting", tgt_uuid, " to ", net.uuid);
# drop the stale connection uuid, then (dropped lines) reconnect and recover
1995 lctl.del_uuid(conn_uuid)
1997 lctl.recover(client_uuid, net.uuid)
# Derive config.lustre / config.portals source-tree paths so modules can be
# loaded from a build tree in development mode. (some lines dropped)
2000 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2001 base = os.path.dirname(cmd)
2002 if development_mode():
2003 if not config.lustre:
# default the lustre tree to the parent of this script's directory
2004 config.lustre = (os.path.join(base, ".."))
2005 # normalize the portals dir, using command line arg if set
2007 portals_dir = config.portals
2008 dir = os.path.join(config.lustre, portals_dir)
2009 config.portals = dir
2010 debug('config.portals', config.portals)
2011 elif config.lustre and config.portals:
2013 # if --lustre and --portals, normalize portals
2014 # can ignore PORTALS_DIR here, since it is probably useless here
2015 config.portals = os.path.join(config.lustre, config.portals)
2016 debug('config.portals B', config.portals)
# Write val to /proc/sys/<path>. (the noexec guard, try/except and the
# write/close lines are dropped from this listing)
2018 def sysctl(path, val):
2019 debug("+ sysctl", path, val)
2023 fp = open(os.path.join('/proc/sys', path), 'w')
def sys_set_debug_path():
    """Point the kernel's portals debug dump file at config.debug_path."""
    dump_path = config.debug_path
    sysctl('portals/debug_path', dump_path)
# Push the lustre upcall script path into /proc; command line values
# (--lustre_upcall, then --upcall) override the node config.
# (the elif/guard lines are dropped from this listing)
2033 def sys_set_lustre_upcall(upcall):
2034 # the command overrides the value in the node config
2035 if config.lustre_upcall:
2036 upcall = config.lustre_upcall
2038 upcall = config.upcall
2040 sysctl('lustre/upcall', upcall)
# Same pattern for the portals upcall.
2042 def sys_set_portals_upcall(upcall):
2043 # the command overrides the value in the node config
2044 if config.portals_upcall:
2045 upcall = config.portals_upcall
2047 upcall = config.upcall
2049 sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Set the lustre recovery timeout via /proc sysctl.

    The --timeout command line option (config.timeout) overrides the value
    from the node config; None or non-positive values leave the kernel
    default untouched.
    """
    # the command overrides the value in the node config
    if config.timeout > 0:
        timeout = config.timeout
    # PEP 8: compare to None with 'is not', not '!='
    if timeout is not None and timeout > 0:
        sysctl('lustre/timeout', timeout)
# Evaluate the symbolic --ptldebug expression against ptldebug_names and
# write the mask to /proc. (the 'try:' line and the except body are
# dropped from this listing)
2058 def sys_set_ptldebug():
2059 if config.ptldebug != None:
# eval lets users write expressions like "trace|dlmtrace" over the table
2061 val = eval(config.ptldebug, ptldebug_names)
2062 val = "0x%x" % (val,)
2063 sysctl('portals/debug', val)
2064 except NameError, e:
2067 def sys_set_subsystem():
2068 if config.subsystem != None:
2070 val = eval(config.ptldebug, ptldebug_names)
2071 val = "0x%x" % (val,)
2072 sysctl('portals/subsystem_debug', val)
2073 except NameError, e:
# Raise a /proc/sys/net/core limit to at least 'max'. (the lines that read
# the current value, compare, and close the file are dropped from this
# listing -- presumably it only writes when the current value is lower.)
2076 def sys_set_netmem_max(path, max):
2077 debug("setting", path, "to at least", max)
2085 fp = open(path, 'w')
2086 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the /dev/portals and /dev/obd character device nodes
    (major 10, minors 240/241) when they are missing or unreadable."""
    for node, minor in (('/dev/portals', 240), ('/dev/obd', 241)):
        if not os.access(node, os.R_OK):
            run('mknod %s c 10 %d' % (node, minor))
# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    """Append new_dir to $PATH unless it is already a component."""
    syspath = os.environ['PATH'].split(':')
    if new_dir in syspath:
        # already present: do nothing (this early return was lost in the
        # garbled listing; without it the dir is appended twice)
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
def default_debug_path():
    """Default file for kernel debug dumps; prefix with the /r root when a
    ramdisk-style /r directory exists (the return lines were lost in the
    garbled listing and are reconstructed here)."""
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
        return '/r' + path
    else:
        return path
def default_gdb_script():
    """Default path for the generated gdb module script; prefix with /r when
    that root exists (the else/return branch was lost in the garbled
    listing and is reconstructed here)."""
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    else:
        return script
# Standard system directories every invocation should be able to exec from.
2119 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
2120 # ensure basic elements are in the system path
# (loop body line 2123, presumably add_to_path(dir), is dropped)
2121 def sanitise_path():
2122 for dir in DEFAULT_PATH:
2125 # global hack for the --select handling
# Parse --select into the global tgt_select map (service -> node).
# (the 'global tgt_select' and 'for entry in list:' lines are dropped)
2127 def init_select(arg):
2128 # arg = "service=nodeA,service2=nodeB"
2130 list = string.split(arg, ',')
2132 srv, node = string.split(entry, '=')
2133 tgt_select[srv] = node
# get_select: node chosen for srv, or (dropped line) None when unselected
2135 def get_select(srv):
2136 if tgt_select.has_key(srv):
2137 return tgt_select[srv]
# Option-table shorthand aliases.
2141 PARAM = Lustre.Options.PARAM
2142 INTPARAM = Lustre.Options.INTPARAM
# lconf_options entries ('lconf_options = [' line itself is dropped from
# this listing): (name[,short], help[, type[, default]]).
2144 ('verbose,v', "Print system commands as they are run"),
2145 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
2146 ('config', "Cluster config name used for LDAP query", PARAM),
2147 ('select', "service=nodeA,service2=nodeB ", PARAM),
2148 ('node', "Load config for <nodename>", PARAM),
2149 ('cleanup,d', "Cleans up config. (Shutdown)"),
2150 ('force,f', "Forced unmounting and/or obd detach during cleanup",
2151 Lustre.Options.FLAG, 0),
2152 ('mds_ost_conn', "Open connections to OSTs on the MDS"),
2153 ('failover',"""Used to shut down without saving state.
2154 This will allow this node to "give up" a service to a
2155 another node for failover purposes. This will not
2156 be a clean shutdown.""",
2157 Lustre.Options.FLAG, 0),
2158 ('gdb', """Prints message after creating gdb module script
2159 and sleeps for 5 seconds."""),
2160 ('noexec,n', """Prints the commands and steps that will be run for a
2161 config without executing them. This can used to check if a
2162 config file is doing what it should be doing"""),
2163 ('nomod', "Skip load/unload module step."),
2164 ('nosetup', "Skip device setup/cleanup step."),
2165 ('reformat', "Reformat all devices (without question)"),
2166 ('dump', "Dump the kernel debug log to file before portals is unloaded",
# minlevel/maxlevel defaults (lines 2167, 2169, 2172-2175, 2177) dropped
2168 ('minlevel', "Minimum level of services to configure/cleanup",
2170 ('maxlevel', """Maximum level of services to configure/cleanup
2171 Levels are aproximatly like:
2176 70 - mountpoint, echo_client, osc, mdc, lov""",
2178 ('lustre', """Base directory of lustre sources. This parameter will
2179 cause lconf to load modules from a source tree.""", PARAM),
2180 ('portals', """Portals source directory. If this is a relative path,
2181 then it is assumed to be relative to lustre. """, PARAM),
2182 ('timeout', "Set recovery timeout", PARAM),
2183 ('upcall', "Set both portals and lustre upcall script", PARAM),
2184 ('lustre_upcall', "Set lustre upcall script", PARAM),
2185 ('portals_upcall', "Set portals upcall script", PARAM),
2186 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
2187 ('ptldebug', "Set the portals debug level", PARAM),
2188 ('subsystem', "Set the portals debug subsystem", PARAM),
2189 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
2190 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
2191 # Client recovery options
2192 ('recover', "Recover a device"),
2193 ('group', "The group of devices to configure or cleanup", PARAM),
2194 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
2195 ('client_uuid', "The failed client (required for recovery)", PARAM),
2196 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
# Body of main() (the 'def main():' line is dropped from this listing):
# parse options, seed the PRNG, load the config db, and drive doHost.
2202 # in the upcall this is set to SIG_IGN
2203 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
2205 cl = Lustre.Options("lconf", "config.xml", lconf_options)
2207 config, args = cl.parse(sys.argv[1:])
2208 except Lustre.OptionError, e:
2212 setupModulePath(sys.argv[0])
2214 host = socket.gethostname()
2216 # the PRNG is normally seeded with time(), which is not so good for starting
2217 # time-synchronized clusters
2218 input = open('/dev/urandom', 'r')
2220 print 'Unable to open /dev/urandom!'
2222 seed = input.read(32)
# config source: an XML file argument, or LDAP via --ldapurl
2229 if not os.access(args[0], os.R_OK):
2230 print 'File not found or readable:', args[0]
2233 dom = xml.dom.minidom.parse(args[0])
2235 panic("%s does not appear to be a config file." % (args[0]))
2236 sys.exit(1) # make sure to die here, even in debug mode.
2237 db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
2238 elif config.ldapurl:
2239 if not config.config:
2240 panic("--ldapurl requires --config name")
2241 dn = "config=%s,fs=lustre" % (config.config)
2242 db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
# refuse configs written by a different lconf version
2247 ver = db.get_version()
2249 panic("No version found in config data, please recreate.")
2250 if ver != Lustre.CONFIG_VERSION:
2251 panic("Config version", ver, "does not match lconf version",
2252 Lustre.CONFIG_VERSION)
# node selection: --node wins, otherwise hostname then localhost fallback
2256 node_list.append(config.node)
2259 node_list.append(host)
2260 node_list.append('localhost')
2262 debug("configuring for host: ", node_list)
# per-host suffixes keep debug/gdb artifacts from clashing between nodes
2265 config.debug_path = config.debug_path + '-' + host
2266 config.gdb_script = config.gdb_script + '-' + host
2268 lctl = LCTLInterface('lctl')
2270 if config.lctl_dump:
2271 lctl.use_save_file(config.lctl_dump)
2273 doHost(db, node_list)
# entry point with error handling (several lines dropped from this listing)
2275 if __name__ == "__main__":
2278 except Lustre.LconfError, e:
2280 except CommandError, e:
# propagate the first cleanup failure as the process exit status
2284 if first_cleanup_error:
2285 sys.exit(first_cleanup_error)