From c0c1ce99bfe11128c8d1a1cafc1c4d5524cbd77a Mon Sep 17 00:00:00 2001 From: eeb Date: Tue, 11 Jan 2005 03:37:38 +0000 Subject: [PATCH] * Made openib not use the subnet manager to discover connection parameters + Openib listens on a TCP/IP port for service queries and responds with the service id, port GID and pkey. + Openib peer table entries have become (NID, IP, port) tuples, where IP and port connect to the peer's service query server. + /proc interfaces for port, listener timeout and backlog (restarts kernel listener on update). + lmc/lconf support for new openib peers, including only running acceptor if net == 'tcp' + Changed connection daemon to conduct both sides of the service query as well as IB connection establishment. Spawning several since tcp/ip reads can block for a timeout. + Added a reaper thread to do connection cleanup and timeout checks (the single connection daemon used to do that too). * Removed some unused lconf default constants * Better openib automagic compilation check. * Consistent usage (flipping + optional checksum) of all openib messages (service query, connection requests and "normal" message flow). * Exhaustive openib "are-you-still-the-same-person" checks * Fixed bug which stopped idle persistent peers from getting removed from the peer table. * Fixed some ranal bugs which became obvious when similar problems were debugged in openibnal --- lustre/utils/lconf | 35 ++++++++++++++++++++++------------- lustre/utils/lmc | 4 ++-- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/lustre/utils/lconf b/lustre/utils/lconf index 06319fc..a4c793e 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -52,8 +52,6 @@ import Lustre # Global parameters MAXTCPBUF = 16777216 -DEFAULT_TCPBUF = 8388608 -DEFAULT_PORT = 988 # # Maximum number of devices to search for. # (the /dev/loop* nodes need to be created beforehand) @@ -265,6 +263,7 @@ class AcceptorHandler(DaemonHandler): def __init__(self, port, net_type): DaemonHandler.__init__(self, "acceptor") self.port = port + self.net_type = net_type self.flags = '' def pidfile(self): @@ -281,7 +280,7 @@ def run_acceptors(): return for port in acceptors.keys(): daemon = acceptors[port] - if not daemon.running(): + if daemon.net_type == 'tcp' and not daemon.running(): daemon.start() def run_one_acceptor(port): @@ -289,7 +288,7 @@ def run_one_acceptor(port): return if acceptors.has_key(port): daemon = acceptors[port] - if not daemon.running(): + if daemon.net_type == 'tcp' and not daemon.running(): daemon.start() else: panic("run_one_acceptor: No acceptor defined for port:", port) @@ -297,7 +296,7 @@ def run_one_acceptor(port): def stop_acceptor(port): if acceptors.has_key(port): daemon = acceptors[port] - if daemon.running(): + if daemon.net_type == 'tcp' and daemon.running(): daemon.stop() @@ -449,14 +448,14 @@ class LCTLInterface: self.run(cmds) def add_peer(self, net_type, nid, hostaddr, port): - if net_type in ('tcp','ra') and not config.lctl_dump: + if net_type in ('tcp','openib','ra') and not config.lctl_dump: cmds = """ network %s add_peer %s %s %d quit""" % (net_type, nid, hostaddr, port ) self.run(cmds) - elif net_type in ('openib','iib',) and not config.lctl_dump: + elif net_type in ('iib','vib') and not config.lctl_dump: cmds = """ network %s add_peer %s @@ -466,7 +465,7 @@ class LCTLInterface: def connect(self, srv): self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid) - if srv.net_type in ('tcp','openib','iib',) and not config.lctl_dump: + if srv.net_type in ('tcp','openib','iib','vib') and not config.lctl_dump: if srv.hostaddr[0]: hostaddr = string.split(srv.hostaddr[0], '/')[0] self.add_peer(srv.net_type, srv.nid, hostaddr, srv.port) @@ -533,7 +532,7 @@ class LCTLInterface: quit""" % (net_type, nid, hostaddr) self.run(cmds) - elif net_type in ('openib','iib','ra') and not config.lctl_dump: + elif net_type in ('openib','iib','vib','ra') and not config.lctl_dump: cmds = """ ignore_errors network %s @@ -545,7 +544,7 @@ class LCTLInterface: # disconnect one connection def disconnect(self, srv): self.del_uuid(srv.nid_uuid) - if srv.net_type in ('tcp','openib','iib','ra') and not config.lctl_dump: + if srv.net_type in ('tcp','openib','iib','vib','ra') and not config.lctl_dump: if srv.hostaddr[0]: hostaddr = string.split(srv.hostaddr[0], '/')[0] self.del_peer(srv.net_type, srv.nid, hostaddr) @@ -972,7 +971,7 @@ def sys_get_local_nid(net_type, wildcard, cluster_id): def sys_get_local_address(net_type, wildcard, cluster_id): """Return the local address for the network type.""" local = "" - if net_type in ('tcp','openib','iib','ra'): + if net_type in ('tcp','openib','iib','vib','ra'): if ':' in wildcard: iface, star = string.split(wildcard, ':') local = if2addr(iface) @@ -1220,6 +1219,8 @@ class Network(Module): self.add_portals_module("knals/openibnal", 'kopenibnal') if self.net_type == 'iib': self.add_portals_module("knals/iibnal", 'kiibnal') + if self.net_type == 'vib': + self.add_portals_module("knals/vibnal", 'kvibnal') if self.net_type == 'lo': self.add_portals_module("knals/lonal", 'klonal') if self.net_type == 'ra': @@ -1245,6 +1246,14 @@ class Network(Module): lctl.add_interface(self.net_type, ip, netmask) if self.net_type == 'elan': sys_optimize_elan() + if self.net_type == 'openib': + if self.port == 0: + panic("no port set for", self.net_type, self.hostaddr[0]) + sysctl('/proc/sys/openibnal/port', self.port) + if self.net_type == 'ra': + if self.port == 0: + panic("no port set for", self.net_type, self.hostaddr[0]) + sysctl('/proc/sys/ranal/port', self.port) if self.port and node_is_router(): run_one_acceptor(self.port) self.connect_peer_gateways() @@ -1296,9 +1305,9 @@ class RouteTable(Module): def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi): - # only setup connections for tcp, openib, and iib NALs + # only setup connections for tcp, ib, and ra NALs srvdb = None - if not net_type in ('tcp','openib','iib','ra'): + if not net_type in ('tcp','openib','iib','vib','ra'): return None # connect to target if route is to single node and this node is the gw diff --git a/lustre/utils/lmc b/lustre/utils/lmc index 23eeab3..ebcf2c5 100755 --- a/lustre/utils/lmc +++ b/lustre/utils/lmc @@ -646,9 +646,9 @@ def add_net(gen, lustre, options): hostaddr = get_option(options, 'hostaddr') net_type = get_option(options, 'nettype') - if net_type in ('tcp',): + if net_type in ('tcp','openib','ra'): port = get_option_int(options, 'port') - elif net_type in ('elan', 'gm', 'openib','iib','lo'): + elif net_type in ('elan','gm','iib','vib','lo'): port = 0 else: print "Unknown net_type: ", net_type -- 1.8.3.1