if test "${with_gm}" = yes; then
with_gm="-I/usr/local/gm/include"
else
- with_gm=-I"$with_gm/include"
+ with_gm="-I$with_gm/include -I$with_gm/drivers -I$with_gm/drivers/linux/gm"
fi
GMNAL="gmnal"
else
extern unsigned int portal_stack;
extern unsigned int portal_debug;
extern unsigned int portal_printk;
+extern unsigned int portal_cerror;
/* Debugging subsystems (32 bits, non-overlapping) */
#define S_UNDEFINED (1 << 0)
#define S_MDC (1 << 1)
#if 1
#define CDEBUG(mask, format, a...) \
do { \
+ if (portal_cerror == 0) \
+ break; \
CHECK_STACK(CDEBUG_STACK); \
if (!(mask) || ((mask) & (D_ERROR | D_EMERG)) || \
(portal_debug & (mask) && \
const int line);
#define LASSERT(e) ((e) ? 0 : kportal_assertion_failed( #e , __FILE__, \
__FUNCTION__, __LINE__))
+/* it would be great to dump_stack() here, but some kernels
+ * export it as show_stack() and I can't be bothered to
+ * properly engage in that dance right now */
+#define LASSERTF(cond, fmt...) \
+ do { \
+ if (unlikely(!(cond))) { \
+ portals_debug_msg(0, D_EMERG, __FILE__, __FUNCTION__,\
+ __LINE__, CDEBUG_STACK, \
+ "ASSERTION(" #cond ") failed:" fmt);\
+ LBUG(); \
+ } \
+ } while (0)
+
#else
#define LASSERT(e)
+#define LASSERTF(cond, fmt...) do { } while (0)
#endif
#ifdef __arch_um__
} kpr_fwd_desc_t;
typedef void (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd);
+typedef void (*kpr_notify_t)(void *arg, ptl_nid_t peer, int alive);
/* NAL's routing interface (Kernel Portals Routing Nal Interface) */
typedef const struct {
int kprni_nalid; /* NAL's id */
void *kprni_arg; /* Arg to pass when calling into NAL */
kpr_fwd_t kprni_fwd; /* NAL's forwarding entrypoint */
+ kpr_notify_t kprni_notify; /* NAL's notification entrypoint */
} kpr_nal_interface_t;
/* Router's routing interface (Kernel Portals Routing Router Interface) */
int (*kprri_register) (kpr_nal_interface_t *nal_interface,
void **router_arg);
- /* ask the router to find a gateway that forwards to 'nid' and is a peer
- * of the calling NAL */
- int (*kprri_lookup) (void *router_arg, ptl_nid_t nid,
+ /* ask the router to find a gateway that forwards to 'nid' and is a
+ * peer of the calling NAL; assume caller will send 'nob' bytes of
+ * payload there */
+ int (*kprri_lookup) (void *router_arg, ptl_nid_t nid, int nob,
ptl_nid_t *gateway_nid);
/* hand a packet over to the router for forwarding */
void (*kprri_fwd_done) (void *router_arg, kpr_fwd_desc_t *fwd,
int error);
+ /* notify the router about peer state */
+ void (*kprri_notify) (void *router_arg, ptl_nid_t peer,
+ int alive, time_t when);
+
/* the calling NAL is shutting down */
void (*kprri_shutdown) (void *router_arg);
typedef const struct {
int (*kprci_add_route)(int gateway_nal, ptl_nid_t gateway_nid,
ptl_nid_t lo_nid, ptl_nid_t hi_nid);
- int (*kprci_del_route)(ptl_nid_t nid);
+ int (*kprci_del_route)(int gateway_nal, ptl_nid_t gateway_nid,
+ ptl_nid_t lo_nid, ptl_nid_t hi_nid);
int (*kprci_get_route)(int index, int *gateway_nal,
- ptl_nid_t *gateway, ptl_nid_t *lo_nid,
- ptl_nid_t *hi_nid);
+ ptl_nid_t *gateway,
+ ptl_nid_t *lo_nid, ptl_nid_t *hi_nid,
+ int *alive);
+ int (*kprci_notify)(int gateway_nal, ptl_nid_t gateway_nid,
+ int alive, time_t when);
} kpr_control_interface_t;
extern kpr_control_interface_t kpr_control_interface;
}
static inline int
-kpr_lookup (kpr_router_t *router, ptl_nid_t nid, ptl_nid_t *gateway_nid)
+kpr_lookup (kpr_router_t *router, ptl_nid_t nid, int nob, ptl_nid_t *gateway_nid)
{
if (!kpr_routing (router))
- return (-EHOSTUNREACH);
+ return (-ENETUNREACH);
- return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid,
+ return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid, nob,
gateway_nid));
}
kpr_fwd_start (kpr_router_t *router, kpr_fwd_desc_t *fwd)
{
if (!kpr_routing (router))
- fwd->kprfd_callback (fwd->kprfd_callback_arg, -EHOSTUNREACH);
+ fwd->kprfd_callback (fwd->kprfd_callback_arg, -ENETUNREACH);
else
router->kpr_interface->kprri_fwd_start (router->kpr_arg, fwd);
}
}
static inline void
+kpr_notify (kpr_router_t *router,
+ ptl_nid_t peer, int alive, time_t when)
+{
+ if (!kpr_routing (router))
+ return;
+
+ router->kpr_interface->kprri_notify(router->kpr_arg, peer, alive, when);
+}
+
+static inline void
kpr_shutdown (kpr_router_t *router)
{
if (kpr_routing (router))
#endif /* PORTALS_PROFILING */
/* debug.c */
+void portals_run_upcall(char **argv);
void portals_run_lbug_upcall(char * file, const char *fn, const int line);
void portals_debug_dumplog(void);
int portals_debug_init(unsigned long bufsize);
# undef NDEBUG
# include <assert.h>
# define LASSERT(e) assert(e)
+# define LASSERTF(cond, args...) assert(cond)
# else
# define LASSERT(e)
+# define LASSERTF(cond, args...) do { } while (0)
# endif
# define printk(format, args...) printf (format, ## args)
# define PORTAL_ALLOC(ptr, size) do { (ptr) = malloc(size); } while (0);
# define CURRENT_TIME time(0)
#endif
+/******************************************************************************/
+/* Light-weight trace
+ * Support for temporary event tracing with minimal Heisenberg effect. */
+#define LWT_SUPPORT 1
+
+typedef struct {
+ cycles_t lwte_when;
+ char *lwte_where;
+ void *lwte_task;
+ long lwte_p1;
+ long lwte_p2;
+ long lwte_p3;
+ long lwte_p4;
+} lwt_event_t;
+
+#if LWT_SUPPORT
+#ifdef __KERNEL__
+#define LWT_EVENTS_PER_PAGE (PAGE_SIZE / sizeof (lwt_event_t))
+
+typedef struct _lwt_page {
+ struct list_head lwtp_list;
+ struct page *lwtp_page;
+ lwt_event_t *lwtp_events;
+} lwt_page_t;
+
+typedef struct {
+ int lwtc_current_index;
+ lwt_page_t *lwtc_current_page;
+} lwt_cpu_t;
+
+extern int lwt_enabled;
+extern lwt_cpu_t lwt_cpus[];
+
+extern int lwt_init (void);
+extern void lwt_fini (void);
+extern int lwt_lookup_string (int *size, char *knlptr,
+ char *usrptr, int usrsize);
+extern int lwt_control (int enable, int clear);
+extern int lwt_snapshot (int *ncpu, int *total_size,
+ void *user_ptr, int user_size);
+
+/* Note that we _don't_ define LWT_EVENT at all if LWT_SUPPORT isn't set.
+ * This stuff is meant for finding specific problems; it never stays in
+ * production code... */
+
+#define LWTSTR(n) #n
+#define LWTWHERE(f,l) f ":" LWTSTR(l)
+
+#define LWT_EVENT(p1, p2, p3, p4) \
+do { \
+ unsigned long flags; \
+ lwt_cpu_t *cpu; \
+ lwt_page_t *p; \
+ lwt_event_t *e; \
+ \
+ local_irq_save (flags); \
+ \
+ if (lwt_enabled) { \
+ cpu = &lwt_cpus[smp_processor_id()]; \
+ p = cpu->lwtc_current_page; \
+ e = &p->lwtp_events[cpu->lwtc_current_index++]; \
+ \
+ if (cpu->lwtc_current_index >= LWT_EVENTS_PER_PAGE) { \
+ cpu->lwtc_current_page = \
+ list_entry (p->lwtp_list.next, \
+ lwt_page_t, lwtp_list); \
+ cpu->lwtc_current_index = 0; \
+ } \
+ \
+ e->lwte_when = get_cycles(); \
+ e->lwte_where = LWTWHERE(__FILE__,__LINE__); \
+ e->lwte_task = current; \
+ e->lwte_p1 = (long)(p1); \
+ e->lwte_p2 = (long)(p2); \
+ e->lwte_p3 = (long)(p3); \
+ e->lwte_p4 = (long)(p4); \
+ } \
+ \
+ local_irq_restore (flags); \
+} while (0)
+#else /* __KERNEL__ */
+#define LWT_EVENT(p1,p2,p3,p4) /* no userland implementation yet */
+#endif /* __KERNEL__ */
+#endif /* LWT_SUPPORT */
+
+
#include <linux/portals_lib.h>
/*
#define IOC_PORTAL_GET_NID _IOWR('e', 39, long)
#define IOC_PORTAL_FAIL_NID _IOWR('e', 40, long)
#define IOC_PORTAL_SET_DAEMON _IOWR('e', 41, long)
-
-#define IOC_PORTAL_MAX_NR 41
+#define IOC_PORTAL_NOTIFY_ROUTER _IOWR('e', 42, long)
+#define IOC_PORTAL_LWT_CONTROL _IOWR('e', 43, long)
+#define IOC_PORTAL_LWT_SNAPSHOT _IOWR('e', 44, long)
+#define IOC_PORTAL_LWT_LOOKUP_STRING _IOWR('e', 45, long)
+#define IOC_PORTAL_MAX_NR 45
enum {
QSWNAL = 1,
int jt_ptl_nagle (int argc, char **argv);
int jt_ptl_add_route (int argc, char **argv);
int jt_ptl_del_route (int argc, char **argv);
+int jt_ptl_notify_router (int argc, char **argv);
int jt_ptl_print_routes (int argc, char **argv);
int jt_ptl_fail_nid (int argc, char **argv);
+int jt_ptl_lwt(int argc, char **argv);
int dbg_initialize(int argc, char **argv);
int jt_dbg_filter(int argc, char **argv);
int jt_ptl_nagle (int argc, char **argv);
int jt_ptl_add_route (int argc, char **argv);
int jt_ptl_del_route (int argc, char **argv);
+int jt_ptl_notify_router (int argc, char **argv);
int jt_ptl_print_routes (int argc, char **argv);
int jt_ptl_fail_nid (int argc, char **argv);
+int jt_ptl_lwt(int argc, char **argv);
int dbg_initialize(int argc, char **argv);
int jt_dbg_filter(int argc, char **argv);
modulenet_DATA = kgmnal.o
EXTRA_PROGRAMS = kgmnal
-DEFS =
-kgmnal_SOURCES = gmnal.c gmnal_cb.c gmnal.h
+DEFS = -DGM_KERNEL
+kgmnal_SOURCES = gmnal.h gmnal_api.c gmnal_cb.c gmnal_comm.c gmnal_utils.c gmnal_module.c
+++ /dev/null
-diff -ru gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c
---- gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c Mon Jul 1 10:35:09 2002
-+++ gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c Thu Sep 19 14:19:38 2002
-@@ -30,6 +30,8 @@
- *
- ************************************************************************/
-
-+#define EXPORT_SYMTAB
-+
- #include <linux/config.h>
- #include <linux/module.h>
-
-@@ -4075,6 +4077,28 @@
- return 0;
- }
-
-+EXPORT_SYMBOL(gm_blocking_receive_no_spin);
-+EXPORT_SYMBOL(gm_close);
-+EXPORT_SYMBOL(gm_dma_free);
-+EXPORT_SYMBOL(gm_dma_malloc);
-+EXPORT_SYMBOL(gm_drop_sends);
-+EXPORT_SYMBOL(gm_finalize);
-+EXPORT_SYMBOL(gm_get_node_id);
-+EXPORT_SYMBOL(gm_init);
-+EXPORT_SYMBOL(gm_initialize_alarm);
-+EXPORT_SYMBOL(gm_max_node_id_in_use);
-+EXPORT_SYMBOL(gm_min_size_for_length);
-+EXPORT_SYMBOL(gm_num_receive_tokens);
-+EXPORT_SYMBOL(gm_num_send_tokens);
-+EXPORT_SYMBOL(gm_open);
-+EXPORT_SYMBOL(gm_provide_receive_buffer);
-+EXPORT_SYMBOL(gm_resume_sending);
-+EXPORT_SYMBOL(gm_send_with_callback);
-+EXPORT_SYMBOL(gm_set_acceptable_sizes);
-+EXPORT_SYMBOL(gm_set_alarm);
-+EXPORT_SYMBOL(gm_unknown);
-+
-+
- /*
- This file uses GM standard indentation.
-
-Only in gm-1.5.2.1_Linux-cfs/drivers/linux/gm: gm_arch.c~
-Only in gm-1.5.2.1_Linux-cfs/: trace
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2003 Los Alamos National Laboratory (LANL)
+ *
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#ifndef _GMNAL_H
-#define _GMNAL_H
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/locks.h>
-#include <linux/unistd.h>
-#include <linux/init.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/list.h>
-#include <asm/uaccess.h>
-#include <asm/segment.h>
+
+
+/*
+ * Portals GM kernel NAL header file
+ * This file contains all declarations and prototypes
+ * for the API side and CB side of the NAL
+ */
+#ifndef __INCLUDE_GMNAL_H__
+#define __INCLUDE_GMNAL_H__
+
+#include "linux/config.h"
+#include "linux/module.h"
+#include "linux/tty.h"
+#include "linux/kernel.h"
+#include "linux/mm.h"
+#include "linux/string.h"
+#include "linux/stat.h"
+#include "linux/errno.h"
+#include "linux/locks.h"
+#include "linux/unistd.h"
+#include "linux/init.h"
+#include "linux/sem.h"
+#include "linux/vmalloc.h"
+#ifdef MODVERSIONS
+#include <linux/modversions.h>
+#endif
#define DEBUG_SUBSYSTEM S_GMNAL
-#include <linux/kp30.h>
-#include <portals/p30.h>
-#include <portals/lib-p30.h>
+#include "portals/nal.h"
+#include "portals/api.h"
+#include "portals/errno.h"
+#include "linux/kp30.h"
+#include "portals/p30.h"
+
+#include "portals/lib-nal.h"
+#include "portals/lib-p30.h"
+
+#define GM_STRONG_TYPES 1
+#include "gm.h"
+#include "gm_internal.h"
+
+
+/*
+ * Defines for the API NAL
+ */
+
+/*
+ * Small message size is configurable
+ * insmod can set small_msg_size
+ * which is used to populate nal_data.small_msg_size
+ */
+#define GMNAL_SMALL_MESSAGE 1078
+#define GMNAL_LARGE_MESSAGE_INIT 1079
+#define GMNAL_LARGE_MESSAGE_ACK 1080
+#define GMNAL_LARGE_MESSAGE_FINI 1081
+
+extern int gmnal_small_msg_size;
+extern int num_rx_threads;
+extern int num_stxds;
+#define GMNAL_SMALL_MSG_SIZE(a) a->small_msg_size
+#define GMNAL_IS_SMALL_MESSAGE(n,a,b,c) gmnal_is_small_msg(n, a, b, c)
+#define GMNAL_MAGIC 0x1234abcd
+
+
+/*
+ * Small Transmit Descriptor
+ * A structure to keep track of a small transmit operation
+ * This structure has a one-to-one relationship with a small
+ * transmit buffer (both created by gmnal_stxd_alloc).
+ * There are two free lists of stxds. One for use by clients of the NAL
+ * and the other by the NAL rxthreads when doing sends.
+ * This helps prevent deadlock caused by stxd starvation.
+ */
+typedef struct _gmnal_stxd_t {
+ void *buffer;
+ int buffer_size;
+ gm_size_t gm_size;
+ int msg_size;
+ int gm_target_node;
+ int gm_priority;
+ int type;
+ struct _gmnal_data_t *nal_data;
+ lib_msg_t *cookie;
+ int niov;
+ struct iovec iov[PTL_MD_MAX_IOV];
+ struct _gmnal_srxd_t *srxd;
+ struct _gmnal_stxd_t *next;
+ int rxt;
+ int kniov;
+ struct iovec *iovec_dup;
+} gmnal_stxd_t;
+
+/*
+ * as for gmnal_stxd_t
+ * a hash table in nal_data find srxds from
+ * the rx buffer address. hash table populated at init time
+ */
+typedef struct _gmnal_srxd_t {
+ void *buffer;
+ int size;
+ gm_size_t gmsize;
+ unsigned int gm_source_node;
+ gmnal_stxd_t *source_stxd;
+ int type;
+ int nsiov;
+ int nriov;
+ struct iovec *riov;
+ int ncallbacks;
+ spinlock_t callback_lock;
+ int callback_status;
+ lib_msg_t *cookie;
+ struct _gmnal_srxd_t *next;
+ struct _gmnal_data_t *nal_data;
+} gmnal_srxd_t;
+
+/*
+ * Header which gmnal puts at the start of each message
+ */
+typedef struct _gmnal_msghdr {
+ int magic;
+ int type;
+ unsigned int sender_node_id;
+ gmnal_stxd_t *stxd;
+ int niov;
+ } gmnal_msghdr_t;
+#define GMNAL_MSGHDR_SIZE sizeof(gmnal_msghdr_t)
+
+/*
+ * the caretaker thread (ct_thread) gets receive events
+ * (and other events) from the myrinet device via the GM2 API.
+ * caretaker thread populates one work entry for each receive event,
+ * puts it on a Q in nal_data and wakes a receive thread to
+ * process the receive.
+ * Processing a portals receive can involve a transmit operation.
+ * Because of this the caretaker thread cannot process receives
+ * as it may get deadlocked when supply of transmit descriptors
+ * is exhausted (as caretaker thread is responsible for replacing
+ * transmit descriptors on the free list)
+ */
+typedef struct _gmnal_rxtwe {
+ gm_recv_event_t *rx;
+ struct _gmnal_rxtwe *next;
+} gmnal_rxtwe_t;
+
+/*
+ * 1 receive thread started on each CPU
+ */
+#define NRXTHREADS 10 /* max number of receiver threads */
+
+typedef struct _gmnal_data_t {
+ int refcnt;
+ spinlock_t cb_lock;
+ spinlock_t stxd_lock;
+ struct semaphore stxd_token;
+ gmnal_stxd_t *stxd;
+ spinlock_t rxt_stxd_lock;
+ struct semaphore rxt_stxd_token;
+ gmnal_stxd_t *rxt_stxd;
+ spinlock_t srxd_lock;
+ struct semaphore srxd_token;
+ gmnal_srxd_t *srxd;
+ struct gm_hash *srxd_hash;
+ nal_t *nal;
+ nal_cb_t *nal_cb;
+ struct gm_port *gm_port;
+ unsigned int gm_local_nid;
+ unsigned int gm_global_nid;
+ spinlock_t gm_lock;
+ long rxthread_pid[NRXTHREADS];
+ int rxthread_stop_flag;
+ spinlock_t rxthread_flag_lock;
+ long rxthread_flag;
+ long ctthread_pid;
+ int ctthread_flag;
+ gm_alarm_t ctthread_alarm;
+ int small_msg_size;
+ int small_msg_gmsize;
+ gmnal_rxtwe_t *rxtwe_head;
+ gmnal_rxtwe_t *rxtwe_tail;
+ spinlock_t rxtwe_lock;
+ struct semaphore rxtwe_wait;
+} gmnal_data_t;
+
+/*
+ * Flags to start/stop and check status of threads
+ * each rxthread sets 1 bit (any bit) of the flag on startup
+ * and clears 1 bit when exiting
+ */
+#define GMNAL_THREAD_RESET 0
+#define GMNAL_THREAD_STOP 666
+#define GMNAL_CTTHREAD_STARTED 333
+#define GMNAL_RXTHREADS_STARTED ( (1<<num_rx_threads)-1)
+
+
+extern gmnal_data_t *global_nal_data;
+
+/*
+ * The gm_port to use for gmnal
+ */
+#define GMNAL_GM_PORT 4
+
+/*
+ * for ioctl get pid
+ */
+#define GMNAL_IOC_GET_GNID 1
+
+/*
+ * Return codes
+ */
+#define GMNAL_STATUS_OK 0
+#define GMNAL_STATUS_FAIL 1
+#define GMNAL_STATUS_NOMEM 2
-#include <gm.h>
+/*
+ * FUNCTION PROTOTYPES
+ */
+
+/*
+ * Locking macros
+ */
/*
- * Myrinet GM NAL
+ * For the Small tx and rx descriptor lists
*/
-#define NPAGES_LARGE 16
-#define NPAGES_SMALL 1
-#define MSG_LEN_LARGE NPAGES_LARGE*PAGE_SIZE
-#define MSG_LEN_SMALL NPAGES_SMALL*PAGE_SIZE
-#define MSG_SIZE_LARGE (gm_min_size_for_length(MSG_LEN_LARGE))
-#define MSG_SIZE_SMALL (gm_min_size_for_length(MSG_LEN_SMALL))
+#define GMNAL_TXD_LOCK_INIT(a) spin_lock_init(&a->stxd_lock);
+#define GMNAL_TXD_LOCK(a) spin_lock(&a->stxd_lock);
+#define GMNAL_TXD_UNLOCK(a) spin_unlock(&a->stxd_lock);
+#define GMNAL_TXD_TOKEN_INIT(a, n) sema_init(&a->stxd_token, n);
+#define GMNAL_TXD_GETTOKEN(a) down(&a->stxd_token);
+#define GMNAL_TXD_TRYGETTOKEN(a) down_trylock(&a->stxd_token)
+#define GMNAL_TXD_RETURNTOKEN(a) up(&a->stxd_token);
-#define TXMSGS 64 /* Number of Transmit Messages */
-#define ENVELOPES 8 /* Number of outstanding receive msgs */
+#define GMNAL_RXT_TXD_LOCK_INIT(a) spin_lock_init(&a->rxt_stxd_lock);
+#define GMNAL_RXT_TXD_LOCK(a) spin_lock(&a->rxt_stxd_lock);
+#define GMNAL_RXT_TXD_UNLOCK(a) spin_unlock(&a->rxt_stxd_lock);
+#define GMNAL_RXT_TXD_TOKEN_INIT(a, n) sema_init(&a->rxt_stxd_token, n);
+#define GMNAL_RXT_TXD_GETTOKEN(a) down(&a->rxt_stxd_token);
+#define GMNAL_RXT_TXD_TRYGETTOKEN(a) down_trylock(&a->rxt_stxd_token)
+#define GMNAL_RXT_TXD_RETURNTOKEN(a) up(&a->rxt_stxd_token);
-#define KGM_PORT_NUM 3
-#define KGM_HOSTNAME "kgmnal"
+#define GMNAL_RXD_LOCK_INIT(a) spin_lock_init(&a->srxd_lock);
+#define GMNAL_RXD_LOCK(a) spin_lock(&a->srxd_lock);
+#define GMNAL_RXD_UNLOCK(a) spin_unlock(&a->srxd_lock);
+#define GMNAL_RXD_TOKEN_INIT(a, n) sema_init(&a->srxd_token, n);
+#define GMNAL_RXD_GETTOKEN(a) down(&a->srxd_token);
+#define GMNAL_RXD_TRYGETTOKEN(a) down_trylock(&a->srxd_token)
+#define GMNAL_RXD_RETURNTOKEN(a) up(&a->srxd_token);
+#define GMNAL_GM_LOCK_INIT(a) spin_lock_init(&a->gm_lock);
+#define GMNAL_GM_LOCK(a) spin_lock(&a->gm_lock);
+#define GMNAL_GM_UNLOCK(a) spin_unlock(&a->gm_lock);
+#define GMNAL_CB_LOCK_INIT(a) spin_lock_init(&a->cb_lock);
-typedef struct {
- char *krx_buffer;
- unsigned long krx_len;
- unsigned int krx_size;
- unsigned int krx_priority;
- struct list_head krx_item;
-} kgmnal_rx_t;
+/*
+ * Memory Allocator
+ */
+
+/*
+ * API NAL
+ */
+int gmnal_api_forward(nal_t *, int, void *, size_t, void *, size_t);
+
+int gmnal_api_shutdown(nal_t *, int);
+
+int gmnal_api_validate(nal_t *, void *, size_t);
+
+void gmnal_api_yield(nal_t *);
+
+void gmnal_api_lock(nal_t *, unsigned long *);
+
+void gmnal_api_unlock(nal_t *, unsigned long *);
+
+
+#define GMNAL_INIT_NAL(a) do { \
+ a->forward = gmnal_api_forward; \
+ a->shutdown = gmnal_api_shutdown; \
+ a->validate = NULL; \
+ a->yield = gmnal_api_yield; \
+ a->lock = gmnal_api_lock; \
+ a->unlock = gmnal_api_unlock; \
+ a->timeout = NULL; \
+ a->refct = 1; \
+ a->nal_data = NULL; \
+ } while (0)
+
+
+/*
+ * CB NAL
+ */
+
+int gmnal_cb_send(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *,
+ int, ptl_nid_t, ptl_pid_t, unsigned int, struct iovec *, size_t);
+
+int gmnal_cb_send_pages(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *,
+ int, ptl_nid_t, ptl_pid_t, unsigned int, ptl_kiov_t *, size_t);
+
+int gmnal_cb_recv(nal_cb_t *, void *, lib_msg_t *,
+ unsigned int, struct iovec *, size_t, size_t);
+
+int gmnal_cb_recv_pages(nal_cb_t *, void *, lib_msg_t *,
+ unsigned int, ptl_kiov_t *, size_t, size_t);
+
+int gmnal_cb_read(nal_cb_t *, void *private, void *, user_ptr, size_t);
+
+int gmnal_cb_write(nal_cb_t *, void *private, user_ptr, void *, size_t);
+
+int gmnal_cb_callback(nal_cb_t *, void *, lib_eq_t *, ptl_event_t *);
+
+void *gmnal_cb_malloc(nal_cb_t *, size_t);
+
+void gmnal_cb_free(nal_cb_t *, void *, size_t);
+
+void gmnal_cb_unmap(nal_cb_t *, unsigned int, struct iovec*, void **);
+
+int gmnal_cb_map(nal_cb_t *, unsigned int, struct iovec*, void **);
+
+void gmnal_cb_printf(nal_cb_t *, const char *fmt, ...);
+
+void gmnal_cb_cli(nal_cb_t *, unsigned long *);
+
+void gmnal_cb_sti(nal_cb_t *, unsigned long *);
+
+int gmnal_cb_dist(nal_cb_t *, ptl_nid_t, unsigned long *);
+
+nal_t *gmnal_init(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t rpid);
+
+void gmnal_fini(void);
+
+
+
+#define GMNAL_INIT_NAL_CB(a) do { \
+ a->cb_send = gmnal_cb_send; \
+ a->cb_send_pages = gmnal_cb_send_pages; \
+ a->cb_recv = gmnal_cb_recv; \
+ a->cb_recv_pages = gmnal_cb_recv_pages; \
+ a->cb_read = gmnal_cb_read; \
+ a->cb_write = gmnal_cb_write; \
+ a->cb_callback = gmnal_cb_callback; \
+ a->cb_malloc = gmnal_cb_malloc; \
+ a->cb_free = gmnal_cb_free; \
+ a->cb_map = NULL; \
+ a->cb_unmap = NULL; \
+ a->cb_printf = gmnal_cb_printf; \
+ a->cb_cli = gmnal_cb_cli; \
+ a->cb_sti = gmnal_cb_sti; \
+ a->cb_dist = gmnal_cb_dist; \
+ a->nal_data = NULL; \
+ } while (0)
+
+
+/*
+ * Small Transmit and Receive Descriptor Functions
+ */
+int gmnal_alloc_stxd(gmnal_data_t *);
+void gmnal_free_stxd(gmnal_data_t *);
+gmnal_stxd_t* gmnal_get_stxd(gmnal_data_t *, int);
+void gmnal_return_stxd(gmnal_data_t *, gmnal_stxd_t *);
+
+int gmnal_alloc_srxd(gmnal_data_t *);
+void gmnal_free_srxd(gmnal_data_t *);
+gmnal_srxd_t* gmnal_get_srxd(gmnal_data_t *, int);
+void gmnal_return_srxd(gmnal_data_t *, gmnal_srxd_t *);
+
+/*
+ * general utility functions
+ */
+gmnal_srxd_t *gmnal_rxbuffer_to_srxd(gmnal_data_t *, void*);
+void gmnal_stop_rxthread(gmnal_data_t *);
+void gmnal_stop_ctthread(gmnal_data_t *);
+void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t);
+void gmnal_drop_sends_callback(gm_port_t *, void *, gm_status_t);
+char *gmnal_gm_error(gm_status_t);
+char *gmnal_rxevent(gm_recv_event_t*);
+int gmnal_is_small_msg(gmnal_data_t*, int, struct iovec*, int);
+void gmnal_yield(int);
+int gmnal_start_kernel_threads(gmnal_data_t *);
+
+
+/*
+ * Communication functions
+ */
+
+/*
+ * Receive threads
+ */
+int gmnal_ct_thread(void *); /* caretaker thread */
+int gmnal_rx_thread(void *); /* receive thread */
+int gmnal_pre_receive(gmnal_data_t*, gm_recv_t*, int);
+int gmnal_rx_bad(gmnal_data_t *, gm_recv_t *, gmnal_srxd_t *);
+int gmnal_rx_requeue_buffer(gmnal_data_t *, gmnal_srxd_t *);
+int gmnal_add_rxtwe(gmnal_data_t *, gm_recv_event_t *);
+gmnal_rxtwe_t * gmnal_get_rxtwe(gmnal_data_t *);
+void gmnal_remove_rxtwe(gmnal_data_t *);
+
+
+/*
+ * Small messages
+ */
+int gmnal_small_rx(nal_cb_t *, void *, lib_msg_t *, unsigned int,
+ struct iovec *, size_t, size_t);
+int gmnal_small_tx(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *,
+ int, ptl_nid_t, ptl_pid_t,
+ unsigned int, struct iovec*, int);
+void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t);
+
+
+
+/*
+ * Large messages
+ */
+int gmnal_large_rx(nal_cb_t *, void *, lib_msg_t *, unsigned int,
+ struct iovec *, size_t, size_t);
-typedef struct {
- nal_cb_t *ktx_nal;
- void *ktx_private;
- lib_msg_t *ktx_cookie;
- char *ktx_buffer;
- size_t ktx_len;
- unsigned long ktx_size;
- int ktx_ndx;
- unsigned int ktx_priority;
- unsigned int ktx_tgt_node;
- unsigned int ktx_tgt_port_id;
-} kgmnal_tx_t;
+int gmnal_large_tx(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *,
+ int, ptl_nid_t, ptl_pid_t, unsigned int,
+ struct iovec*, int);
+void gmnal_large_tx_callback(gm_port_t *, void *, gm_status_t);
-typedef struct {
- char kgm_init;
- char kgm_shuttingdown;
- struct gm_port *kgm_port;
- struct list_head kgm_list;
- ptl_nid_t kgm_nid;
- nal_cb_t *kgm_cb;
- struct kgm_trans *kgm_trans;
- struct tq_struct kgm_ready_tq;
- spinlock_t kgm_dispatch_lock;
- spinlock_t kgm_update_lock;
- spinlock_t kgm_send_lock;
-} kgmnal_data_t;
+int gmnal_remote_get(gmnal_srxd_t *, int, struct iovec*, int,
+ struct iovec*);
-int kgm_init(kgmnal_data_t *kgm_data);
-int kgmnal_recv_thread(void *);
-int gm_return_mynid(void);
-void kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+void gmnal_remote_get_callback(gm_port_t *, void *, gm_status_t);
-extern kgmnal_data_t kgmnal_data;
-extern nal_t kgmnal_api;
-extern nal_cb_t kgmnal_lib;
+int gmnal_copyiov(int, gmnal_srxd_t *, int, struct iovec*, int,
+ struct iovec*);
-#endif /* _GMNAL_H */
+void gmnal_large_tx_ack(gmnal_data_t *, gmnal_srxd_t *);
+void gmnal_large_tx_ack_callback(gm_port_t *, void *, gm_status_t);
+void gmnal_large_tx_ack_received(gmnal_data_t *, gmnal_srxd_t *);
+#endif /*__INCLUDE_GMNAL_H__*/
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*
- * Based on ksocknal and qswnal
+ * Copyright (c) 2003 Los Alamos National Laboratory (LANL)
*
- * Copyright (C) 2002 Cluster File Systems, Inc.
- * Author: Robert Read <rread@datarithm.net>
+ * This file is part of Lustre, http://www.lustre.org/
*
- * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- *
- * Portals is free software; you can redistribute it and/or
+ * Lustre is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
- * Portals is distributed in the hope that it will be useful,
+ * Lustre is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with Portals; if not, write to the Free Software
+ * along with Lustre; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-/* TODO
- * preallocate send buffers, store on list
- * put receive buffers on queue, handle with receive threads
- * use routing
- */
-
-#include "gmnal.h"
-
-extern kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *,int);
-
-static kgmnal_tx_t *
-get_trans(void)
-{
- kgmnal_tx_t *t;
- PORTAL_ALLOC(t, (sizeof(kgmnal_tx_t)));
- return t;
-}
-
-static void
-put_trans(kgmnal_tx_t *t)
-{
- PORTAL_FREE(t, sizeof(kgmnal_tx_t));
-}
-
-int
-kgmnal_ispeer (ptl_nid_t nid)
-{
- unsigned int gmnid = (unsigned int)nid;
- unsigned int nnids;
-
- gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids);
-
- return ((ptl_nid_t)gmnid == nid &&/* didn't lose high bits on conversion ? */
- gmnid < nnids); /* it's in this machine */
-}
/*
- * LIB functions follow
- *
+ * This file implements the nal cb functions
*/
-static int
-kgmnal_read (nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
- size_t len)
-{
- CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n",
- nal->ni.nid, (long)len, src_addr, dst_addr );
- memcpy( dst_addr, src_addr, len );
- return 0;
-}
-
-static int
-kgmnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
- size_t len)
-{
- CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n",
- nal->ni.nid, (long)len, src_addr, dst_addr );
- memcpy( dst_addr, src_addr, len );
- return 0;
-}
-static void *
-kgmnal_malloc(nal_cb_t *nal, size_t len)
-{
- void *buf;
- PORTAL_ALLOC(buf, len);
- return buf;
-}
+#include "gmnal.h"
-static void
-kgmnal_free(nal_cb_t *nal, void *buf, size_t len)
+int gmnal_cb_recv(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
+ unsigned int niov, struct iovec *iov, size_t mlen,
+ size_t rlen)
{
- PORTAL_FREE(buf, len);
+ gmnal_srxd_t *srxd = (gmnal_srxd_t*)private;
+ int status = PTL_OK;
+
+
+ CDEBUG(D_TRACE, "gmnal_cb_recv nal_cb [%p], private[%p], cookie[%p],
+ niov[%d], iov [%p], mlen["LPSZ"], rlen["LPSZ"]\n",
+ nal_cb, private, cookie, niov, iov, mlen, rlen);
+
+ switch(srxd->type) {
+ case(GMNAL_SMALL_MESSAGE):
+ CDEBUG(D_INFO, "gmnal_cb_recv got small message\n");
+ status = gmnal_small_rx(nal_cb, private, cookie, niov,
+ iov, mlen, rlen);
+ break;
+ case(GMNAL_LARGE_MESSAGE_INIT):
+ CDEBUG(D_INFO, "gmnal_cb_recv got large message init\n");
+ status = gmnal_large_rx(nal_cb, private, cookie, niov,
+ iov, mlen, rlen);
+ }
+
+
+ CDEBUG(D_INFO, "gmnal_cb_recv gmnal_return status [%d]\n", status);
+ return(status);
}
-static void
-kgmnal_printf(nal_cb_t *nal, const char *fmt, ...)
+int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
+ unsigned int kniov, ptl_kiov_t *kiov, size_t mlen,
+ size_t rlen)
{
- va_list ap;
- char msg[256];
-
- if (portal_debug & D_NET) {
- va_start( ap, fmt );
- vsnprintf( msg, sizeof(msg), fmt, ap );
- va_end( ap );
-
- printk("CPUId: %d %s",smp_processor_id(), msg);
- }
+ gmnal_srxd_t *srxd = (gmnal_srxd_t*)private;
+ int status = PTL_OK;
+ struct iovec *iovec = NULL, *iovec_dup = NULL;
+ int i = 0;
+
+
+ CDEBUG(D_TRACE, "gmnal_cb_recv_pages nal_cb [%p],private[%p],
+ cookie[%p], kniov[%d], kiov [%p], mlen["LPSZ"], rlen["LPSZ"]\n",
+ nal_cb, private, cookie, kniov, kiov, mlen, rlen);
+
+ if (srxd->type == GMNAL_SMALL_MESSAGE) {
+ PORTAL_ALLOC(iovec, sizeof(struct iovec)*kniov);
+ if (!iovec) {
+ CDEBUG(D_ERROR, "Can't malloc\n");
+ return(GMNAL_STATUS_FAIL);
+ }
+ iovec_dup = iovec;
+
+ /*
+ * map each page and create an iovec for it
+ */
+ for (i=0; i<kniov; i++) {
+ CDEBUG(D_INFO, "processing kniov [%d] [%p]\n", i, kiov);
+ CDEBUG(D_INFO, "kniov page [%p] len [%d] offset[%d]\n",
+ kiov->kiov_page, kiov->kiov_len,
+ kiov->kiov_offset);
+ iovec->iov_len = kiov->kiov_len;
+ CDEBUG(D_INFO, "Calling kmap[%p]", kiov->kiov_page);
+
+ iovec->iov_base = kmap(kiov->kiov_page) +
+ kiov->kiov_offset;
+
+ CDEBUG(D_INFO, "iov_base is [%p]\n", iovec->iov_base);
+ iovec++;
+ kiov++;
+ }
+ CDEBUG(D_INFO, "calling gmnal_small_rx\n");
+ status = gmnal_small_rx(nal_cb, private, cookie, kniov,
+ iovec_dup, mlen, rlen);
+ PORTAL_FREE(iovec_dup, sizeof(struct iovec)*kniov);
+ }
+
+
+ CDEBUG(D_INFO, "gmnal_return status [%d]\n", status);
+ return(status);
}
-static void
-kgmnal_cli(nal_cb_t *nal, unsigned long *flags)
+int gmnal_cb_send(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
+ ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid,
+ unsigned int niov, struct iovec *iov, size_t len)
{
- kgmnal_data_t *data= nal->nal_data;
- spin_lock_irqsave(&data->kgm_dispatch_lock,*flags);
+ gmnal_data_t *nal_data;
+
+
+ CDEBUG(D_TRACE, "gmnal_cb_send niov[%d] len["LPSZ"] nid["LPU64"]\n",
+ niov, len, nid);
+ nal_data = nal_cb->nal_data;
+
+ if (GMNAL_IS_SMALL_MESSAGE(nal_data, niov, iov, len)) {
+ CDEBUG(D_INFO, "This is a small message send\n");
+ gmnal_small_tx(nal_cb, private, cookie, hdr, type, nid, pid,
+ niov, iov, len);
+ } else {
+ CDEBUG(D_ERROR, "Large message send it is not supported\n");
+ lib_finalize(nal_cb, private, cookie);
+ return(PTL_FAIL);
+ gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid, pid,
+ niov, iov, len);
+ }
+ return(PTL_OK);
}
-
-static void
-kgmnal_sti(nal_cb_t *nal, unsigned long *flags)
+int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
+ ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int kniov, ptl_kiov_t *kiov, size_t len)
{
- kgmnal_data_t *data= nal->nal_data;
- spin_unlock_irqrestore(&data->kgm_dispatch_lock,*flags);
+ int i = 0;
+ gmnal_data_t *nal_data;
+ struct iovec *iovec = NULL, *iovec_dup = NULL;
+
+ CDEBUG(D_TRACE, "gmnal_cb_send_pages nid ["LPU64"] niov[%d] len["LPSZ"]\n", nid, kniov, len);
+ nal_data = nal_cb->nal_data;
+ PORTAL_ALLOC(iovec, kniov*sizeof(struct iovec));
+ iovec_dup = iovec;
+ if (GMNAL_IS_SMALL_MESSAGE(nal_data, 0, NULL, len)) {
+ CDEBUG(D_INFO, "This is a small message send\n");
+
+ for (i=0; i<kniov; i++) {
+ CDEBUG(D_INFO, "processing kniov [%d] [%p]\n", i, kiov);
+ CDEBUG(D_INFO, "kniov page [%p] len [%d] offset[%d]\n",
+ kiov->kiov_page, kiov->kiov_len,
+ kiov->kiov_offset);
+
+ iovec->iov_base = kmap(kiov->kiov_page)
+ + kiov->kiov_offset;
+
+ iovec->iov_len = kiov->kiov_len;
+ iovec++;
+ kiov++;
+ }
+ gmnal_small_tx(nal_cb, private, cookie, hdr, type, nid,
+ pid, kniov, iovec_dup, len);
+ } else {
+ CDEBUG(D_ERROR, "Large message send it is not supported yet\n");
+		PORTAL_FREE(iovec_dup, kniov*sizeof(struct iovec)); return(PTL_FAIL);
+ for (i=0; i<kniov; i++) {
+ CDEBUG(D_INFO, "processing kniov [%d] [%p]\n", i, kiov);
+ CDEBUG(D_INFO, "kniov page [%p] len [%d] offset[%d]\n",
+ kiov->kiov_page, kiov->kiov_len,
+ kiov->kiov_offset);
+
+ iovec->iov_base = kmap(kiov->kiov_page)
+ + kiov->kiov_offset;
+ iovec->iov_len = kiov->kiov_len;
+ iovec++;
+ kiov++;
+ }
+ gmnal_large_tx(nal_cb, private, cookie, hdr, type, nid,
+ pid, kniov, iovec, len);
+ }
+ PORTAL_FREE(iovec_dup, kniov*sizeof(struct iovec));
+ return(PTL_OK);
}
-
-static int
-kgmnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+int gmnal_cb_read(nal_cb_t *nal_cb, void *private, void *dst,
+ user_ptr src, size_t len)
{
- /* network distance doesn't mean much for this nal */
- if ( nal->ni.nid == nid ) {
- *dist = 0;
- } else {
- *dist = 1;
- }
-
- return 0;
+ gm_bcopy(src, dst, len);
+ return(PTL_OK);
}
-/* FIXME rmr: add rounting code here */
-static void
-kgmnal_tx_done(kgmnal_tx_t *trans, int error)
-{
- lib_finalize(trans->ktx_nal, trans->ktx_private, trans->ktx_cookie);
-
- gm_dma_free(kgmnal_data.kgm_port, trans->ktx_buffer);
-
- trans->ktx_buffer = NULL;
- trans->ktx_len = 0;
-
- put_trans(trans);
-}
-static char * gm_error_strings[GM_NUM_STATUS_CODES] = {
- [GM_SUCCESS] = "GM_SUCCESS",
- [GM_SEND_TIMED_OUT] = "GM_SEND_TIMED_OUT",
- [GM_SEND_REJECTED] = "GM_SEND_REJECTED",
- [GM_SEND_TARGET_PORT_CLOSED] = "GM_SEND_TARGET_PORT_CLOSED",
- [GM_SEND_TARGET_NODE_UNREACHABLE] = "GM_SEND_TARGET_NODE_UNREACHABLE",
- [GM_SEND_DROPPED] = "GM_SEND_DROPPED",
- [GM_SEND_PORT_CLOSED] = "GM_SEND_PORT_CLOSED",
-};
-
-inline char * get_error(int status)
+int gmnal_cb_write(nal_cb_t *nal_cb, void *private, user_ptr dst,
+ void *src, size_t len)
{
- if (gm_error_strings[status] != NULL)
- return gm_error_strings[status];
- else
- return "Unknown error";
+ gm_bcopy(src, dst, len);
+ return(PTL_OK);
}
-static void
-kgmnal_errhandler(struct gm_port *p, void *context, gm_status_t status)
+int gmnal_cb_callback(nal_cb_t *nal_cb, void *private, lib_eq_t *eq,
+ ptl_event_t *ev)
{
- CDEBUG(D_NET,"error callback: ktx %p status %d\n", context, status);
-}
-static void
-kgmnal_txhandler(struct gm_port *p, void *context, gm_status_t status)
-{
- kgmnal_tx_t *ktx = (kgmnal_tx_t *)context;
- int err = 0;
-
- LASSERT (p != NULL);
- LASSERT (ktx != NULL);
-
- CDEBUG(D_NET,"ktx %p status %d nid 0x%x pid %d\n", ktx, status,
- ktx->ktx_tgt_node, ktx->ktx_tgt_port_id);
-
- switch((int)status) {
- case GM_SUCCESS: /* normal */
- break;
- case GM_SEND_TIMED_OUT: /* application error */
- case GM_SEND_REJECTED: /* size of msg unacceptable */
- case GM_SEND_TARGET_PORT_CLOSED:
- CERROR("%s (%d):\n", get_error(status), status);
- gm_resume_sending(kgmnal_data.kgm_port, ktx->ktx_priority,
- ktx->ktx_tgt_node, ktx->ktx_tgt_port_id,
- kgmnal_errhandler, NULL);
- err = -EIO;
- break;
- case GM_SEND_TARGET_NODE_UNREACHABLE:
- case GM_SEND_PORT_CLOSED:
- CERROR("%s (%d):\n", get_error(status), status);
- gm_drop_sends(kgmnal_data.kgm_port, ktx->ktx_priority,
- ktx->ktx_tgt_node, ktx->ktx_tgt_port_id,
- kgmnal_errhandler, NULL);
- err = -EIO;
- break;
- case GM_SEND_DROPPED:
- CERROR("%s (%d):\n", get_error(status), status);
- err = -EIO;
- break;
- default:
- CERROR("Unknown status: %d\n", status);
- err = -EIO;
- break;
- }
-
- kgmnal_tx_done(ktx, err);
+ if (eq->event_callback != NULL) {
+ CDEBUG(D_INFO, "found callback\n");
+ eq->event_callback(ev);
+ }
+
+ return(PTL_OK);
}
-/*
- */
-
-static int
-kgmnal_send(nal_cb_t *nal,
- void *private,
- lib_msg_t *cookie,
- ptl_hdr_t *hdr,
- int type,
- ptl_nid_t nid,
- ptl_pid_t pid,
- int options,
- unsigned int niov,
- lib_md_iov_t *iov,
- size_t len)
+void *gmnal_cb_malloc(nal_cb_t *nal_cb, size_t len)
{
- /*
- * ipnal assumes that this is the private as passed to lib_dispatch..
- * so do we :/
- */
- kgmnal_tx_t *ktx=NULL;
- int rc=0;
- void * buf;
- int buf_len = sizeof(ptl_hdr_t) + len;
- int buf_size = 0;
-
- LASSERT ((options & PTL_MD_KIOV) == 0);
-
- PROF_START(gmnal_send);
-
-
- CDEBUG(D_NET, "sending %d bytes from %p to nid: 0x%Lx pid %d\n",
- len, iov, nid, KGM_PORT_NUM);
-
- /* ensure there is an available tx handle */
-
- /* save transaction info to trans for later finalize and cleanup */
- ktx = get_trans();
- if (ktx == NULL) {
- rc = -ENOMEM;
- goto send_exit;
- }
-
- /* hmmm... GM doesn't support vectored write, so need to allocate buffer to coalesce
- header and data.
- Also, memory must be dma'able or registered with GM. */
-
- if (buf_len <= MSG_LEN_SMALL) {
- buf_size = MSG_SIZE_SMALL;
- } else if (buf_len <= MSG_LEN_LARGE) {
- buf_size = MSG_SIZE_LARGE;
- } else {
- printk("kgmnal:request exceeds TX MTU size (%d).\n",
- MSG_SIZE_LARGE);
- rc = -1;
- goto send_exit;
- }
-
- buf = gm_dma_malloc(kgmnal_data.kgm_port, buf_len);
- if (buf == NULL) {
- rc = -ENOMEM;
- goto send_exit;
- }
- memcpy(buf, hdr, sizeof(ptl_hdr_t));
-
- if (len != 0)
- lib_copy_iov2buf(((char *)buf) + sizeof (ptl_hdr_t),
- options, niov, iov, len);
-
- ktx->ktx_nal = nal;
- ktx->ktx_private = private;
- ktx->ktx_cookie = cookie;
- ktx->ktx_len = buf_len;
- ktx->ktx_size = buf_size;
- ktx->ktx_buffer = buf;
- ktx->ktx_priority = GM_LOW_PRIORITY;
- ktx->ktx_tgt_node = nid;
- ktx->ktx_tgt_port_id = KGM_PORT_NUM;
-
- CDEBUG(D_NET, "gm_send %d bytes (size %d) from %p to nid: 0x%Lx "
- "pid %d pri %d\n", buf_len, buf_size, iov, nid, KGM_PORT_NUM,
- GM_LOW_PRIORITY);
-
- gm_send_with_callback(kgmnal_data.kgm_port, buf, buf_size,
- buf_len, GM_LOW_PRIORITY,
- nid, KGM_PORT_NUM,
- kgmnal_txhandler, ktx);
-
- PROF_FINISH(gmnal_send);
- send_exit:
- return rc;
-}
-void
-kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
-{
- CERROR ("forwarding not implemented\n");
+ void *ptr = NULL;
+ CDEBUG(D_TRACE, "gmnal_cb_malloc len["LPSZ"]\n", len);
+ PORTAL_ALLOC(ptr, len);
+ return(ptr);
}
-void
-kqswnal_fwd_callback (void *arg, int error)
+void gmnal_cb_free(nal_cb_t *nal_cb, void *buf, size_t len)
{
- CERROR ("forwarding not implemented\n");
+ CDEBUG(D_TRACE, "gmnal_cb_free :: buf[%p] len["LPSZ"]\n", buf, len);
+ PORTAL_FREE(buf, len);
+ return;
}
-
-static inline void
-kgmnal_requeue_rx(kgmnal_rx_t *krx)
+void gmnal_cb_unmap(nal_cb_t *nal_cb, unsigned int niov, struct iovec *iov,
+ void **addrkey)
{
- gm_provide_receive_buffer(kgmnal_data.kgm_port, krx->krx_buffer,
- krx->krx_size, krx->krx_priority);
+ return;
}
-/* Process a received portals packet */
-
-/* Receive Interrupt Handler */
-static void kgmnal_rx(kgmnal_data_t *kgm, unsigned long len, unsigned int size,
- void * buf, unsigned int pri)
+int gmnal_cb_map(nal_cb_t *nal_cb, unsigned int niov, struct iovec *iov,
+ void**addrkey)
{
- ptl_hdr_t *hdr = buf;
- kgmnal_rx_t krx;
-
- CDEBUG(D_NET,"buf %p, len %ld\n", buf, len);
-
- if ( len < sizeof( ptl_hdr_t ) ) {
- /* XXX what's this for? */
- if (kgm->kgm_shuttingdown)
- return;
- CERROR("kgmnal: did not receive complete portal header, "
- "len= %ld", len);
- gm_provide_receive_buffer(kgm->kgm_port, buf, size, pri);
- return;
- }
-
- /* might want to use seperate threads to handle receive */
- krx.krx_buffer = buf;
- krx.krx_len = len;
- krx.krx_size = size;
- krx.krx_priority = pri;
-
- if ( hdr->dest_nid == kgmnal_lib.ni.nid ) {
- PROF_START(lib_parse);
- lib_parse(&kgmnal_lib, (ptl_hdr_t *)krx.krx_buffer, &krx);
- PROF_FINISH(lib_parse);
- } else if (kgmnal_ispeer(hdr->dest_nid)) {
- /* should have gone direct to peer */
- CERROR("dropping packet from 0x%llx to 0x%llx: target is "
- "a peer", hdr->src_nid, hdr->dest_nid);
- kgmnal_requeue_rx(&krx);
- } else {
- /* forward to gateway */
- CERROR("forwarding not implemented yet");
- kgmnal_requeue_rx(&krx);
- }
-
- return;
+ return(PTL_OK);
}
-
-static int kgmnal_recv(nal_cb_t *nal,
- void *private,
- lib_msg_t *cookie,
- int options,
- unsigned int niov,
- lib_md_iov_t *iov,
- size_t mlen,
- size_t rlen)
+void gmnal_cb_printf(nal_cb_t *nal_cb, const char *fmt, ...)
{
- kgmnal_rx_t *krx = private;
-
- LASSERT ((options & PTL_MD_KIOV) == 0);
-
- CDEBUG(D_NET,"mlen=%d, rlen=%d\n", mlen, rlen);
-
- /* What was actually received must be >= what sender claims to
- * have sent. This is an LASSERT, since lib-move doesn't
- * check cb return code yet. */
- LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen);
- LASSERT (mlen <= rlen);
-
- PROF_START(gmnal_recv);
-
- if(mlen != 0) {
- PROF_START(memcpy);
- lib_copy_buf2iov (options, niov, iov,
- krx->krx_buffer + sizeof (ptl_hdr_t), mlen);
- PROF_FINISH(memcpy);
- }
-
- PROF_START(lib_finalize);
- lib_finalize(nal, private, cookie);
- PROF_FINISH(lib_finalize);
-
- kgmnal_requeue_rx(krx);
-
- PROF_FINISH(gmnal_recv);
-
- return rlen;
+ CDEBUG(D_TRACE, "gmnal_cb_printf\n");
+ printk(fmt);
+ return;
}
-
-static void kgmnal_shutdown(void * none)
+void gmnal_cb_cli(nal_cb_t *nal_cb, unsigned long *flags)
{
- CERROR("called\n");
- return;
+ gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data;
+
+ spin_lock_irqsave(&nal_data->cb_lock, *flags);
+ return;
}
-/*
- * Set terminate and use alarm to wake up the recv thread.
- */
-static void recv_shutdown(kgmnal_data_t *kgm)
+void gmnal_cb_sti(nal_cb_t *nal_cb, unsigned long *flags)
{
- gm_alarm_t alarm;
+ gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data;
- kgm->kgm_shuttingdown = 1;
- gm_initialize_alarm(&alarm);
- gm_set_alarm(kgm->kgm_port, &alarm, 1, kgmnal_shutdown, NULL);
+ spin_unlock_irqrestore(&nal_data->cb_lock, *flags);
+ return;
}
-int kgmnal_end(kgmnal_data_t *kgm)
+int gmnal_cb_dist(nal_cb_t *nal_cb, ptl_nid_t nid, unsigned long *dist)
{
+ CDEBUG(D_TRACE, "gmnal_cb_dist\n");
+ if (dist)
+ *dist = 27;
+ return(PTL_OK);
+}
- /* wait for sends to finish ? */
- /* remove receive buffers */
- /* shutdown receive thread */
- recv_shutdown(kgm);
- return 0;
-}
-
-/* Used only for the spinner */
-int kgmnal_recv_thread(void *arg)
-{
- kgmnal_data_t *kgm = arg;
-
- LASSERT(kgm != NULL);
-
- kportal_daemonize("kgmnal_rx");
-
- while(1) {
- gm_recv_event_t *e;
- int priority = GM_LOW_PRIORITY;
- if (kgm->kgm_shuttingdown)
- break;
-
- e = gm_blocking_receive_no_spin(kgm->kgm_port);
- if (e == NULL) {
- CERROR("gm_blocking_receive returned NULL\n");
- break;
- }
-
- switch(gm_ntohc(e->recv.type)) {
- case GM_HIGH_RECV_EVENT:
- priority = GM_HIGH_PRIORITY;
- /* fall through */
- case GM_RECV_EVENT:
- kgmnal_rx(kgm, gm_ntohl(e->recv.length),
- gm_ntohc(e->recv.size),
- gm_ntohp(e->recv.buffer), priority);
- break;
- case GM_ALARM_EVENT:
- CERROR("received alarm");
- gm_unknown(kgm->kgm_port, e);
- break;
- case GM_BAD_SEND_DETECTED_EVENT: /* ?? */
- CERROR("received bad send!\n");
- break;
- default:
- gm_unknown(kgm->kgm_port, e);
- }
- }
-
- CERROR("shuttting down.\n");
- return 0;
-}
-nal_cb_t kgmnal_lib = {
- nal_data: &kgmnal_data, /* NAL private data */
- cb_send: kgmnal_send,
- cb_recv: kgmnal_recv,
- cb_read: kgmnal_read,
- cb_write: kgmnal_write,
- cb_malloc: kgmnal_malloc,
- cb_free: kgmnal_free,
- cb_printf: kgmnal_printf,
- cb_cli: kgmnal_cli,
- cb_sti: kgmnal_sti,
- cb_dist: kgmnal_dist
-};
+EXPORT_SYMBOL(gmnal_cb_send);
+EXPORT_SYMBOL(gmnal_cb_send_pages);
+EXPORT_SYMBOL(gmnal_cb_recv);
+EXPORT_SYMBOL(gmnal_cb_recv_pages);
+EXPORT_SYMBOL(gmnal_cb_read);
+EXPORT_SYMBOL(gmnal_cb_write);
+EXPORT_SYMBOL(gmnal_cb_cli);
+EXPORT_SYMBOL(gmnal_cb_sti);
+EXPORT_SYMBOL(gmnal_cb_dist);
+EXPORT_SYMBOL(gmnal_cb_printf);
+EXPORT_SYMBOL(gmnal_cb_map);
+EXPORT_SYMBOL(gmnal_cb_unmap);
+EXPORT_SYMBOL(gmnal_cb_callback);
+EXPORT_SYMBOL(gmnal_cb_free);
+EXPORT_SYMBOL(gmnal_cb_malloc);
+++ /dev/null
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Based on ksocknal and qswnal
- *
- * Copyright (C) 2002 Cluster File Systems, Inc.
- * Author: Robert Read <rread@datarithm.net>
- *
- * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- *
- * Portals is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Portals is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Portals; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include "gmnal.h"
-
-ptl_handle_ni_t kgmnal_ni;
-nal_t kgmnal_api;
-
-kgmnal_data_t kgmnal_data;
-int gmnal_debug = 0;
-
-kpr_nal_interface_t kqswnal_router_interface = {
- kprni_nalid: GMNAL,
- kprni_arg: NULL,
- kprni_fwd: kgmnal_fwd_packet,
-};
-
-static int kgmnal_forward(nal_t *nal,
- int id,
- void *args, size_t args_len,
- void *ret, size_t ret_len)
-{
- kgmnal_data_t *k = nal->nal_data;
- nal_cb_t *nal_cb = k->kgm_cb;
-
- LASSERT (nal == &kgmnal_api);
- LASSERT (k == &kgmnal_data);
- LASSERT (nal_cb == &kgmnal_lib);
-
- lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */
- return PTL_OK;
-}
-
-static void kgmnal_lock(nal_t *nal, unsigned long *flags)
-{
- kgmnal_data_t *k = nal->nal_data;
- nal_cb_t *nal_cb = k->kgm_cb;
-
-
- LASSERT (nal == &kgmnal_api);
- LASSERT (k == &kgmnal_data);
- LASSERT (nal_cb == &kgmnal_lib);
-
- nal_cb->cb_cli(nal_cb,flags);
-}
-
-static void kgmnal_unlock(nal_t *nal, unsigned long *flags)
-{
- kgmnal_data_t *k = nal->nal_data;
- nal_cb_t *nal_cb = k->kgm_cb;
-
-
- LASSERT (nal == &kgmnal_api);
- LASSERT (k == &kgmnal_data);
- LASSERT (nal_cb == &kgmnal_lib);
-
- nal_cb->cb_sti(nal_cb,flags);
-}
-
-static int kgmnal_shutdown(nal_t *nal, int ni)
-{
- LASSERT (nal == &kgmnal_api);
- return 0;
-}
-
-static void kgmnal_yield( nal_t *nal )
-{
- LASSERT (nal == &kgmnal_api);
-
- if (current->need_resched)
- schedule();
- return;
-}
-
-kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *data,int ndx)
-{
- kgmnal_rx_t *conn;
-
- PORTAL_ALLOC(conn, sizeof(kgmnal_rx_t));
- /* Check for out of mem here */
- if (conn==NULL) {
- printk("kgm_add_recv: memory alloc failed\n");
- return NULL;
- }
-
- list_add(&conn->krx_item,(struct list_head *)&data->kgm_list);
- // conn->ndx=ndx;
- // conn->len=conn->ptlhdr_copied=0;
- // conn->loopback=0;
- return conn;
-}
-
-static nal_t *kgmnal_init(int interface, ptl_pt_index_t ptl_size,
- ptl_ac_index_t ac_size, ptl_pid_t requested_pid)
-{
- unsigned int nnids;
-
- gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids);
-
- CDEBUG(D_NET, "calling lib_init with nid 0x%Lx of %d\n",
- kgmnal_data.kgm_nid, nnids);
- lib_init(&kgmnal_lib, kgmnal_data.kgm_nid, 0, nnids,ptl_size, ac_size);
- return &kgmnal_api;
-}
-
-static void /*__exit*/
-kgmnal_finalize(void)
-{
- struct list_head *tmp;
-
- PORTAL_SYMBOL_UNREGISTER (kgmnal_ni);
- PtlNIFini(kgmnal_ni);
- lib_fini(&kgmnal_api);
-
- if (kgmnal_data.kgm_port) {
- gm_close(kgmnal_data.kgm_port);
- }
-
- /* FIXME: free dma buffers */
- /* FIXME: kill receiver thread */
-
- PORTAL_FREE (kgmnal_data.kgm_trans, bsizeof(kgmnal_tx_t)*TXMSGS);
-
- list_for_each(tmp, &kgmnal_data.kgm_list) {
- kgmnal_rx_t *conn;
- conn = list_entry(tmp, kgmnal_rx_t, krx_item);
- CDEBUG(D_IOCTL, "freeing conn %p\n",conn);
- tmp = tmp->next;
- list_del(&conn->krx_item);
- PORTAL_FREE(conn, sizeof(*conn));
- }
-
- CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory));
-
- return;
-}
-
-static int __init
-kgmnal_initialize(void)
-{
- int rc;
- int ntok;
- unsigned long sizemask;
- unsigned int nid;
-
- CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory));
-
- kgmnal_api.forward = kgmnal_forward;
- kgmnal_api.shutdown = kgmnal_shutdown;
- kgmnal_api.yield = kgmnal_yield;
- kgmnal_api.validate = NULL; /* our api validate is a NOOP */
- kgmnal_api.lock= kgmnal_lock;
- kgmnal_api.unlock= kgmnal_unlock;
- kgmnal_api.nal_data = &kgmnal_data;
-
- kgmnal_lib.nal_data = &kgmnal_data;
-
- memset(&kgmnal_data, 0, sizeof(kgmnal_data));
-
- INIT_LIST_HEAD(&kgmnal_data.kgm_list);
- kgmnal_data.kgm_cb = &kgmnal_lib;
-
- /* Allocate transmit descriptors */
- PORTAL_ALLOC (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS);
- if (kgmnal_data.kgm_trans==NULL) {
- printk("kgmnal: init: failed to allocate transmit "
- "descriptors\n");
- return -1;
- }
- memset(kgmnal_data.kgm_trans,-1,sizeof(kgmnal_tx_t)*(TXMSGS));
-
- spin_lock_init(&kgmnal_data.kgm_dispatch_lock);
- spin_lock_init(&kgmnal_data.kgm_update_lock);
- spin_lock_init(&kgmnal_data.kgm_send_lock);
-
- /* Do the receiver and xmtr allocation */
-
- rc = gm_init();
- if (rc != GM_SUCCESS) {
- CERROR("gm_init failed: %d\n", rc);
- return -1;
- }
-
- rc = gm_open(&kgmnal_data.kgm_port, 0 , KGM_PORT_NUM, KGM_HOSTNAME,
- GM_API_VERSION_1_1);
- if (rc != GM_SUCCESS) {
- gm_finalize();
- kgmnal_data.kgm_port = NULL;
- CERROR("gm_open failed: %d\n", rc);
- return -1;
- }
- gm_get_node_id(kgmnal_data.kgm_port, &nid);
- kgmnal_data.kgm_nid = nid;
- /* Allocate 2 different sizes of buffers. For new, use half
- the tokens for each. */
- ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2;
- CDEBUG(D_NET, "gmnal_init: creating %d large %d byte recv buffers\n",
- ntok, MSG_LEN_LARGE);
- while (ntok-- > 0) {
- void * buffer = gm_dma_malloc(kgmnal_data.kgm_port,
- MSG_LEN_LARGE);
- if (buffer == NULL) {
- CERROR("gm_init failed: %d\n", rc);
- return (-ENOMEM);
- }
- CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d "
- "pri %d\n ", kgmnal_data.kgm_port, buffer,
- MSG_LEN_LARGE, MSG_SIZE_LARGE, GM_LOW_PRIORITY);
-
- gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer,
- MSG_SIZE_LARGE, GM_LOW_PRIORITY);
- }
-
- ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2;
- CDEBUG(D_NET, "gmnal_init: creating %d small %d byte recv buffers\n",
- ntok, MSG_LEN_SMALL);
- while (ntok-- > 0) {
- void * buffer = gm_dma_malloc(kgmnal_data.kgm_port,
- MSG_LEN_SMALL);
- if (buffer == NULL) {
- CERROR("gm_init failed: %d\n", rc);
- return (-ENOMEM);
- }
- CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d "
- "pri %d\n ", kgmnal_data.kgm_port, buffer,
- MSG_LEN_SMALL, MSG_SIZE_SMALL, GM_LOW_PRIORITY);
-
- gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer,
- MSG_SIZE_SMALL, GM_LOW_PRIORITY);
-
- }
- sizemask = (1 << MSG_SIZE_LARGE) | (1 << MSG_SIZE_SMALL);
- CDEBUG(D_NET, "gm_set_acceptable_sizes port %p pri %d mask 0x%x\n",
- kgmnal_data.kgm_port, GM_LOW_PRIORITY, sizemask);
- gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_LOW_PRIORITY,
- sizemask);
- gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_HIGH_PRIORITY, 0);
-
- /* Initialize Network Interface */
- rc = PtlNIInit(kgmnal_init, 32, 4, 0, &kgmnal_ni);
- if (rc) {
- CERROR("PtlNIInit failed %d\n", rc);
- return (-ENOMEM);
- }
-
- /* Start receiver thread */
- kernel_thread(kgmnal_recv_thread, &kgmnal_data, 0);
-
- PORTAL_SYMBOL_REGISTER(kgmnal_ni);
-
- kgmnal_data.kgm_init = 1;
-
- return 0;
-}
-
-MODULE_AUTHOR("Robert Read <rread@datarithm.net>");
-MODULE_DESCRIPTION("Kernel Myrinet GM NAL v0.1");
-MODULE_LICENSE("GPL");
-
-module_init (kgmnal_initialize);
-module_exit (kgmnal_finalize);
-
-EXPORT_SYMBOL (kgmnal_ni);
kprni_nalid: QSWNAL,
kprni_arg: NULL,
kprni_fwd: kqswnal_fwd_packet,
+ kprni_notify: NULL, /* we're connectionless */
};
CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory));
- printk (KERN_INFO "Routing QSW NAL unloaded (final mem %d)\n",
+ printk (KERN_INFO "Lustre: Routing QSW NAL unloaded (final mem %d)\n",
atomic_read(&portal_kmemory));
}
PORTAL_SYMBOL_REGISTER(kqswnal_ni);
kqswnal_data.kqn_init = KQN_INIT_ALL;
- printk(KERN_INFO "Routing QSW NAL loaded on node %d of %d "
+ printk(KERN_INFO "Lustre: Routing QSW NAL loaded on node %d of %d "
"(Routing %s, initial mem %d)\n",
kqswnal_data.kqn_elanid, kqswnal_data.kqn_nnodes,
kpr_routing (&kqswnal_data.kqn_router) ? "enabled" : "disabled",
void *ktx_args[2]; /* completion passthru */
E3_Addr ktx_ebuffer; /* elan address of ktx_buffer */
char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */
+ unsigned long ktx_launchtime; /* when (in jiffies) the transmit was launched */
/* debug/info fields */
pid_t ktx_launcher; /* pid of launching process */
}
void
+kqswnal_notify_peer_down(kqswnal_tx_t *ktx)
+{
+ struct timeval now;
+ time_t then;
+
+ do_gettimeofday (&now);
+ then = now.tv_sec - (jiffies - ktx->ktx_launchtime)/HZ;
+
+ kpr_notify(&kqswnal_data.kqn_router, ktx->ktx_nid, 0, then);
+}
+
+void
kqswnal_unmap_tx (kqswnal_tx_t *ktx)
{
if (ktx->ktx_nmappedpages == 0)
if (status != EP_SUCCESS)
{
- CERROR ("kqswnal: Transmit failed with %d\n", status);
+ CERROR ("Tx completion to "LPX64" failed: %d\n",
+ ktx->ktx_nid, status);
+
+ kqswnal_notify_peer_down(ktx);
status = -EIO;
} else if (ktx->ktx_state == KTX_GETTING) {
int dest = kqswnal_nid2elanid (ktx->ktx_nid);
long flags;
int rc;
-
+
+ ktx->ktx_launchtime = jiffies;
+
LASSERT (dest >= 0); /* must be a peer */
if (ktx->ktx_state == KTX_GETTING) {
LASSERT (KQSW_OPTIMIZE_GETS);
ktx, ktx->ktx_frags.iov, ktx->ktx_nfrag);
}
- if (rc != ENOMEM)
- return (rc);
-
- /* can't allocate ep txd => queue for later */
+ switch (rc) {
+ case ESUCCESS: /* success */
+ return (0);
- LASSERT (in_interrupt()); /* not called by thread (not looping) */
+ case ENOMEM: /* can't allocate ep txd => queue for later */
+ LASSERT (in_interrupt());
- spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+ spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
- list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds);
- if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
- wake_up (&kqswnal_data.kqn_sched_waitq);
+ list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds);
+ if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
+ wake_up (&kqswnal_data.kqn_sched_waitq);
- spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+ spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+ return (0);
- return (0);
+ default: /* fatal error */
+ CERROR ("Tx to "LPX64" failed: %d\n", ktx->ktx_nid, rc);
+ kqswnal_notify_peer_down(ktx);
+ return (rc);
+ }
}
-
static char *
hdr_type_string (ptl_hdr_t *hdr)
{
targetnid = nid;
if (kqswnal_nid2elanid (nid) < 0) { /* Can't send direct: find gateway? */
- rc = kpr_lookup (&kqswnal_data.kqn_router, nid, &targetnid);
+ rc = kpr_lookup (&kqswnal_data.kqn_router, nid,
+ sizeof (ptl_hdr_t) + payload_nob, &targetnid);
if (rc != 0) {
CERROR("Can't route to "LPX64": router error %d\n",
nid, rc);
#if KQSW_OPTIMIZE_GETS
if (type == PTL_MSG_REPLY &&
ep_rxd_isrpc(((kqswnal_rx_t *)private)->krx_rxd)) {
+ if (nid != targetnid ||
+ kqswnal_nid2elanid(nid) !=
+ ep_rxd_node(((kqswnal_rx_t *)private)->krx_rxd)) {
+ CERROR("Optimized reply nid conflict: "
+ "nid "LPX64" via "LPX64" elanID %d\n",
+ nid, targetnid,
+ ep_rxd_node(((kqswnal_rx_t *)private)->krx_rxd));
+ return(PTL_FAIL);
+ }
+
/* peer expects RPC completion with GET data */
rc = kqswnal_dma_reply (ktx,
payload_niov, payload_iov,
return (PTL_FAIL);
}
- CDEBUG(D_NET, "send to "LPSZ" bytes to "LPX64"\n", payload_nob, targetnid);
+ CDEBUG(D_NET, "sent "LPSZ" bytes to "LPX64" via "LPX64"\n",
+ payload_nob, nid, targetnid);
return (PTL_OK);
}
vsnprintf( msg, sizeof(msg), fmt, ap );
va_end( ap );
- printk("CPUId: %d %s",smp_processor_id(), msg);
+ printk("Lustre: CPUId: %d %s",smp_processor_id(), msg);
}
}
kprni_nalid: SOCKNAL,
kprni_arg: &ksocknal_data,
kprni_fwd: ksocknal_fwd_packet,
+ kprni_notify: ksocknal_notify,
};
+#define SOCKNAL_SYSCTL 200
+
+#define SOCKNAL_SYSCTL_TIMEOUT 1
+#define SOCKNAL_SYSCTL_EAGER_ACK 2
+#define SOCKNAL_SYSCTL_ZERO_COPY 3
+
+static ctl_table ksocknal_ctl_table[] = {
+ {SOCKNAL_SYSCTL_TIMEOUT, "timeout",
+ &ksocknal_data.ksnd_io_timeout, sizeof (int),
+ 0644, NULL, &proc_dointvec},
+ {SOCKNAL_SYSCTL_EAGER_ACK, "eager_ack",
+ &ksocknal_data.ksnd_eager_ack, sizeof (int),
+ 0644, NULL, &proc_dointvec},
+#if SOCKNAL_ZC
+ {SOCKNAL_SYSCTL_ZERO_COPY, "zero_copy",
+ &ksocknal_data.ksnd_zc_min_frag, sizeof (int),
+ 0644, NULL, &proc_dointvec},
+#endif
+ { 0 }
+};
+
+static ctl_table ksocknal_top_ctl_table[] = {
+ {SOCKNAL_SYSCTL, "socknal", NULL, 0, 0555, ksocknal_ctl_table},
+ { 0 }
+};
int
ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len,
snprintf (cmdline, sizeof (cmdline),
"echo %d > /proc/irq/%u/smp_affinity", 1 << info->ksni_sched, irq);
- printk (KERN_INFO "Binding irq %u to CPU %d with cmd: %s\n",
+ printk (KERN_INFO "Lustre: Binding irq %u to CPU %d with cmd: %s\n",
irq, info->ksni_sched, cmdline);
/* FIXME: Find a better method of setting IRQ affinity...
ksock_route_t *
ksocknal_create_route (__u32 ipaddr, int port, int buffer_size,
- int irq_affinity, int xchange_nids, int nonagel)
+ int nonagel, int xchange_nids, int irq_affinity, int eager)
{
ksock_route_t *route;
atomic_set (&route->ksnr_refcount, 1);
route->ksnr_sharecount = 0;
route->ksnr_peer = NULL;
- route->ksnr_timeout = jiffies_64;
+ route->ksnr_timeout = jiffies;
route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL;
route->ksnr_ipaddr = ipaddr;
route->ksnr_port = port;
route->ksnr_irq_affinity = irq_affinity;
route->ksnr_xchange_nids = xchange_nids;
route->ksnr_nonagel = nonagel;
+ route->ksnr_eager = eager;
route->ksnr_connecting = 0;
route->ksnr_deleted = 0;
route->ksnr_generation = 0;
int
ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob,
- int nonagle, int xchange_nids, int bind_irq, int share)
+ int nonagle, int xchange_nids, int bind_irq,
+ int share, int eager)
{
unsigned long flags;
ksock_peer_t *peer;
if (peer == NULL)
return (-ENOMEM);
- route = ksocknal_create_route (ipaddr, port, bufnob,
- nonagle, xchange_nids, bind_irq);
+ route = ksocknal_create_route (ipaddr, port, bufnob, nonagle,
+ xchange_nids, bind_irq, eager);
if (route == NULL) {
ksocknal_put_peer (peer);
return (-ENOMEM);
if (conn != NULL) {
if (!keep_conn)
- ksocknal_close_conn_locked (conn);
+ ksocknal_close_conn_locked (conn, 0);
else {
/* keeping the conn; just dissociate it and route... */
conn->ksnc_route = NULL;
struct sockaddr_in sin;
int len = sizeof (sin);
int rc;
-
- rc = ksocknal_getconnsock (conn);
- LASSERT (rc == 0);
rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock,
(struct sockaddr *)&sin, &len, 2);
+ /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+ LASSERT (!conn->ksnc_closing);
LASSERT (len <= sizeof (sin));
- ksocknal_putconnsock (conn);
if (rc != 0) {
CERROR ("Error %d getting sock peer IP\n", rc);
ksocknal_conn_irq (ksock_conn_t *conn)
{
int irq = 0;
- int rc;
struct dst_entry *dst;
- rc = ksocknal_getconnsock (conn);
- LASSERT (rc == 0);
-
dst = sk_dst_get (conn->ksnc_sock->sk);
if (dst != NULL) {
if (dst->dev != NULL) {
dst_release (dst);
}
- ksocknal_putconnsock (conn);
+ /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+ LASSERT (!conn->ksnc_closing);
return (irq);
}
int rc;
/* NB, sock has an associated file since (a) this connection might
- * have been created in userland and (b) we need the refcounting so
- * that we don't close the socket while I/O is being done on it. */
+ * have been created in userland and (b) we need to refcount the
+ * socket so that we don't close it while I/O is being done on
+ * it, and sock->file has that pre-cooked... */
LASSERT (sock->file != NULL);
+ LASSERT (file_count(sock->file) > 0);
- rc = ksocknal_set_linger (sock);
+ rc = ksocknal_setup_sock (sock);
if (rc != 0)
return (rc);
ksocknal_new_packet (conn, 0);
INIT_LIST_HEAD (&conn->ksnc_tx_queue);
-#if SOCKNAL_ZC
- INIT_LIST_HEAD (&conn->ksnc_tx_pending);
-#endif
conn->ksnc_tx_ready = 0;
conn->ksnc_tx_scheduled = 0;
atomic_set (&conn->ksnc_tx_nob, 0);
conn->ksnc_peer = peer;
atomic_inc (&peer->ksnp_refcount);
+ peer->ksnp_last_alive = jiffies;
+ peer->ksnp_error = 0;
list_add (&conn->ksnc_list, &peer->ksnp_conns);
atomic_inc (&conn->ksnc_refcount);
}
void
-ksocknal_close_conn_locked (ksock_conn_t *conn)
+ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
{
/* This just does the immmediate housekeeping, and queues the
* connection for the reaper to terminate.
ksock_peer_t *peer = conn->ksnc_peer;
ksock_route_t *route;
+ LASSERT (peer->ksnp_error == 0);
LASSERT (!conn->ksnc_closing);
conn->ksnc_closing = 1;
atomic_inc (&ksocknal_data.ksnd_nclosing_conns);
/* ksnd_deathrow_conns takes over peer's ref */
list_del (&conn->ksnc_list);
- if (list_empty (&peer->ksnp_conns) &&
- list_empty (&peer->ksnp_routes)) {
- /* I've just closed last conn belonging to a
- * non-autoconnecting peer */
- ksocknal_unlink_peer_locked (peer);
+ if (list_empty (&peer->ksnp_conns)) {
+ /* No more connections to this peer */
+
+ peer->ksnp_error = error; /* stash last conn close reason */
+
+ if (list_empty (&peer->ksnp_routes)) {
+ /* I've just closed last conn belonging to a
+ * non-autoconnecting peer */
+ ksocknal_unlink_peer_locked (peer);
+ }
}
spin_lock (&ksocknal_data.ksnd_reaper_lock);
}
int
-ksocknal_close_conn_unlocked (ksock_conn_t *conn)
+ksocknal_close_conn_unlocked (ksock_conn_t *conn, int why)
{
unsigned long flags;
int did_it = 0;
if (!conn->ksnc_closing) {
did_it = 1;
- ksocknal_close_conn_locked (conn);
+ ksocknal_close_conn_locked (conn, why);
}
write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
* ksnc_refcount will eventually hit zero, and then the reaper will
* destroy it. */
unsigned long flags;
+ ksock_peer_t *peer = conn->ksnc_peer;
+ struct timeval now;
+ time_t then = 0;
+ int notify = 0;
/* serialise with callbacks */
write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
conn->ksnc_scheduler->kss_nconns--;
+ if (peer->ksnp_error != 0) {
+ /* peer's last conn closed in error */
+ LASSERT (list_empty (&peer->ksnp_conns));
+
+ /* convert peer's last-known-alive timestamp from jiffies */
+ do_gettimeofday (&now);
+ then = now.tv_sec - (jiffies - peer->ksnp_last_alive)/HZ;
+ notify = 1;
+ }
+
write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
/* The socket is closed on the final put; either here, or in
* immediately, aborting anything buffered in it. Any hung
* zero-copy transmits will therefore complete in finite time. */
ksocknal_putconnsock (conn);
+
+ if (notify)
+ kpr_notify (&ksocknal_data.ksnd_router, peer->ksnp_nid,
+ 0, then);
}
void
LASSERT (conn->ksnc_route == NULL);
LASSERT (!conn->ksnc_tx_scheduled);
LASSERT (!conn->ksnc_rx_scheduled);
-#if SOCKNAL_ZC
- LASSERT (list_empty (&conn->ksnc_tx_pending));
-#endif
+
/* complete queued packets */
while (!list_empty (&conn->ksnc_tx_queue)) {
ksock_tx_t *tx = list_entry (conn->ksnc_tx_queue.next,
continue;
rc = 0;
- ksocknal_close_conn_locked (conn);
+ ksocknal_close_conn_locked (conn, 0);
}
}
}
return (rc);
}
+void
+ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive)
+{
+ /* The router is telling me she's been notified of a change in
+ * gateway state.... */
+
+ CDEBUG (D_NET, "gw "LPX64" %s\n", gw_nid, alive ? "up" : "down");
+
+ if (!alive) {
+ /* If the gateway crashed, close all open connections... */
+ ksocknal_close_conn (gw_nid, 0);
+ return;
+ }
+
+ /* ...otherwise do nothing. We can only establish new connections
+ * if we have autoroutes, and these connect on demand. */
+}
+
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
struct tcp_opt *sock2tcp_opt(struct sock *sk)
{
data->ioc_wait = route->ksnr_sharecount;
data->ioc_flags = (route->ksnr_nonagel ? 1 : 0) |
(route->ksnr_xchange_nids ? 2 : 0) |
- (route->ksnr_irq_affinity ? 4 : 0);
+ (route->ksnr_irq_affinity ? 4 : 0) |
+ (route->ksnr_eager ? 8 : 0);
ksocknal_put_route (route);
}
break;
case NAL_CMD_ADD_AUTOCONN: {
rc = ksocknal_add_route (data->ioc_nid, data->ioc_id,
data->ioc_misc, data->ioc_size,
- (data->ioc_flags & 1) != 0,
- (data->ioc_flags & 2) != 0,
- (data->ioc_flags & 4) != 0,
- (data->ioc_flags & 8) != 0);
+ (data->ioc_flags & 0x01) != 0,
+ (data->ioc_flags & 0x02) != 0,
+ (data->ioc_flags & 0x04) != 0,
+ (data->ioc_flags & 0x08) != 0,
+ (data->ioc_flags & 0x10) != 0);
break;
}
case NAL_CMD_DEL_AUTOCONN: {
LASSERT (0);
case SOCKNAL_INIT_ALL:
+#ifdef CONFIG_SYSCTL
+ if (ksocknal_data.ksnd_sysctl != NULL)
+ unregister_sysctl_table (ksocknal_data.ksnd_sysctl);
+#endif
kportal_nal_unregister(SOCKNAL);
PORTAL_SYMBOL_UNREGISTER (ksocknal_ni);
/* fall through */
CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
atomic_read (&portal_kmemory));
- printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n",
+ printk(KERN_INFO "Lustre: Routing socket NAL unloaded (final mem %d)\n",
atomic_read(&portal_kmemory));
}
/* packet descriptor must fit in a router descriptor's scratchpad */
LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t));
+ /* the following must be sizeof(int) for proc_dointvec() */
+ LASSERT(sizeof (ksocknal_data.ksnd_io_timeout) == sizeof (int));
+ LASSERT(sizeof (ksocknal_data.ksnd_eager_ack) == sizeof (int));
LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */
+ ksocknal_data.ksnd_io_timeout = SOCKNAL_IO_TIMEOUT;
+ ksocknal_data.ksnd_eager_ack = SOCKNAL_EAGER_ACK;
+#if SOCKNAL_ZC
+ ksocknal_data.ksnd_zc_min_frag = SOCKNAL_ZC_MIN_FRAG;
+#endif
+
ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE;
PORTAL_ALLOC (ksocknal_data.ksnd_peers,
sizeof (struct list_head) * ksocknal_data.ksnd_peer_hash_size);
PORTAL_SYMBOL_REGISTER(ksocknal_ni);
+#ifdef CONFIG_SYSCTL
+ /* Press on regardless even if registering sysctl doesn't work */
+ ksocknal_data.ksnd_sysctl = register_sysctl_table (ksocknal_top_ctl_table, 0);
+#endif
/* flag everything initialised */
ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
- printk(KERN_INFO "Routing socket NAL loaded (Routing %s, initial "
- "mem %d)\n",
+ printk(KERN_INFO "Lustre: Routing socket NAL loaded "
+ "(Routing %s, initial mem %d)\n",
kpr_routing (&ksocknal_data.ksnd_router) ?
"enabled" : "disabled", pkmem);
#include <linux/stat.h>
#include <linux/list.h>
#include <linux/kmod.h>
+#include <linux/sysctl.h>
#include <asm/uaccess.h>
#include <asm/segment.h>
#include <asm/div64.h>
#define SOCKNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */
#define SOCKNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */
-#define SOCKNAL_IO_TIMEOUT (60*HZ) /* default comms timeout */
+/* default vals for runtime tunables */
+#define SOCKNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */
+#define SOCKNAL_EAGER_ACK 1 /* default eager ack (boolean) */
+#define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */
+
+#define SOCKNAL_USE_KEEPALIVES 0 /* use tcp/ip keepalive? */
#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */
# define SOCKNAL_MAX_FWD_PAYLOAD (64<<10) /* biggest payload I can forward */
#endif
-#define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */
-
#define SOCKNAL_NLTXS 128 /* # normal transmit messages */
#define SOCKNAL_NNBLK_LTXS 128 /* # transmit messages reserved if can't block */
#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sk_sndbuf*8)/10)
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-# define jiffies_64 jiffies
-#endif
-
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72))
# define sk_data_ready data_ready
# define sk_write_space write_space
typedef struct {
int ksnd_init; /* initialisation state */
+ int ksnd_io_timeout; /* "stuck" socket timeout (seconds) */
+ int ksnd_eager_ack; /* make TCP ack eagerly? */
+#if SOCKNAL_ZC
+ unsigned int ksnd_zc_min_frag; /* minimum zero copy frag size */
+#endif
+ struct ctl_table_header *ksnd_sysctl; /* sysctl interface */
rwlock_t ksnd_global_lock; /* stabilize peer/conn ops */
struct list_head *ksnd_peers; /* hash table of all my known peers */
typedef struct /* transmit packet */
{
struct list_head tx_list; /* queue on conn for transmission etc */
- __u64 tx_deadline; /* when (in jiffies) tx times out */
char tx_isfwd; /* forwarding / sourced here */
int tx_nob; /* # packet bytes */
int tx_resid; /* residual bytes */
__u32 ksnc_ipaddr; /* peer's IP */
int ksnc_port; /* peer's port */
int ksnc_closing; /* being shut down */
-
+
/* READER */
struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */
- __u64 ksnc_rx_deadline; /* when receive times out */
+ unsigned long ksnc_rx_deadline; /* when (in jiffies) receive times out */
+ int ksnc_rx_started; /* started receiving a message */
int ksnc_rx_ready; /* data ready to read */
int ksnc_rx_scheduled; /* being progressed */
int ksnc_rx_state; /* what is being read */
/* WRITER */
struct list_head ksnc_tx_list; /* where I enq waiting for output space */
struct list_head ksnc_tx_queue; /* packets waiting to be sent */
-#if SOCKNAL_ZC
- struct list_head ksnc_tx_pending; /* zc packets pending callback */
-#endif
+ unsigned long ksnc_tx_deadline; /* when (in jiffies) tx times out */
atomic_t ksnc_tx_nob; /* # bytes queued */
int ksnc_tx_ready; /* write space */
int ksnc_tx_scheduled; /* being progressed */
struct ksock_peer *ksnr_peer; /* owning peer */
atomic_t ksnr_refcount; /* # users */
int ksnr_sharecount; /* lconf usage counter */
- __u64 ksnr_timeout; /* when reconnection can happen next */
+ unsigned long ksnr_timeout; /* when (in jiffies) reconnection can happen next */
unsigned int ksnr_retry_interval; /* how long between retries */
__u32 ksnr_ipaddr; /* an IP address for this peer */
int ksnr_port; /* port to connect to */
unsigned int ksnr_irq_affinity:1; /* set affinity? */
unsigned int ksnr_xchange_nids:1; /* do hello protocol? */
unsigned int ksnr_nonagel:1; /* disable nagle? */
- unsigned int ksnr_connecting; /* autoconnect in progress? */
- unsigned int ksnr_deleted; /* been removed from peer? */
+ unsigned int ksnr_eager:1; /* connect eagerly? */
+ unsigned int ksnr_connecting:1; /* autoconnect in progress? */
+ unsigned int ksnr_deleted:1; /* been removed from peer? */
int ksnr_generation; /* connection incarnation # */
ksock_conn_t *ksnr_conn; /* NULL/active connection */
} ksock_route_t;
ptl_nid_t ksnp_nid; /* who's on the other end(s) */
atomic_t ksnp_refcount; /* # users */
int ksnp_closing; /* being closed */
+ int ksnp_error; /* errno on closing last conn */
struct list_head ksnp_conns; /* all active connections */
struct list_head ksnp_routes; /* routes */
struct list_head ksnp_tx_queue; /* waiting packets */
+ unsigned long ksnp_last_alive; /* when (in jiffies) I was last alive */
} ksock_peer_t;
-
extern nal_cb_t ksocknal_lib;
extern ksock_nal_data_t ksocknal_data;
int single, int keep_conn);
extern int ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route,
struct socket *sock, int bind_irq);
-extern void ksocknal_close_conn_locked (ksock_conn_t *conn);
-extern int ksocknal_close_conn_unlocked (ksock_conn_t *conn);
+extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why);
+extern int ksocknal_close_conn_unlocked (ksock_conn_t *conn, int why);
extern void ksocknal_terminate_conn (ksock_conn_t *conn);
extern void ksocknal_destroy_conn (ksock_conn_t *conn);
extern void ksocknal_put_conn (ksock_conn_t *conn);
extern void ksocknal_tx_done (ksock_tx_t *tx, int asynch);
extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
extern void ksocknal_fmb_callback (void *arg, int error);
+extern void ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive);
extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg);
extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
extern int ksocknal_scheduler (void *arg);
extern void ksocknal_write_space(struct sock *sk);
extern int ksocknal_autoconnectd (void *arg);
extern int ksocknal_reaper (void *arg);
-extern int ksocknal_set_linger (struct socket *sock);
+extern int ksocknal_setup_sock (struct socket *sock);
#include "socknal.h"
-int ksocknal_io_timeout = SOCKNAL_IO_TIMEOUT;
-#if SOCKNAL_ZC
-int ksocknal_do_zc = 1;
-int ksocknal_zc_min_frag = SOCKNAL_ZC_MIN_FRAG;
-#endif
-
/*
* LIB functions follow
*
struct iovec *iov = tx->tx_iov;
int fragsize = iov->iov_len;
unsigned long vaddr = (unsigned long)iov->iov_base;
- int more = !list_empty (&conn->ksnc_tx_queue) |
+ int more = (!list_empty (&conn->ksnc_tx_queue)) |
(tx->tx_niov > 1) |
(tx->tx_nkiov > 1);
#if SOCKNAL_ZC
LASSERT (tx->tx_niov > 0);
#if SOCKNAL_ZC
- if (ksocknal_do_zc &&
+ if (zcsize >= ksocknal_data.ksnd_zc_min_frag &&
(sock->sk->route_caps & NETIF_F_SG) &&
(sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
- zcsize >= ksocknal_zc_min_frag &&
(page = ksocknal_kvaddr_to_page (vaddr)) != NULL) {
CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n",
int fragsize = kiov->kiov_len;
struct page *page = kiov->kiov_page;
int offset = kiov->kiov_offset;
- int more = !list_empty (&conn->ksnc_tx_queue) |
+ int more = (!list_empty (&conn->ksnc_tx_queue)) |
(tx->tx_nkiov > 1);
int rc;
LASSERT (tx->tx_nkiov > 0);
#if SOCKNAL_ZC
- if (ksocknal_do_zc &&
+ if (fragsize >= ksocknal_data.ksnd_zc_min_frag &&
(sock->sk->route_caps & NETIF_F_SG) &&
- (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
- fragsize >= ksocknal_zc_min_frag) {
+ (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) {
CDEBUG(D_NET, "page %p + offset %x for %d\n",
page, offset, fragsize);
set_fs (KERNEL_DS);
rc = sock_sendmsg(sock, &msg, fragsize);
set_fs (oldmm);
+
kunmap (page);
}
break;
}
+ /* Consider the connection alive since we managed to chuck
+ * more data into it. Really, we'd like to consider it
+ * alive only when the peer ACKs something, but
+ * write_space() only gets called back while SOCK_NOSPACE
+ * is set. Instead, we presume peer death has occurred if
+ * the socket doesn't drain within a timeout */
+ conn->ksnc_tx_deadline = jiffies +
+ ksocknal_data.ksnd_io_timeout * HZ;
+ conn->ksnc_peer->ksnp_last_alive = jiffies;
+
if (tx->tx_resid == 0) { /* sent everything */
rc = 0;
break;
RETURN (rc);
}
+void
+ksocknal_eager_ack (ksock_conn_t *conn)
+{
+ int opt = 1;
+ mm_segment_t oldmm = get_fs();
+ struct socket *sock = conn->ksnc_sock;
+
+ /* Remind the socket to ACK eagerly. If I don't, the socket might
+ * think I'm about to send something it could piggy-back the ACK
+ * on, introducing delay in completing zero-copy sends in my
+ * peer. */
+
+ set_fs(KERNEL_DS);
+ sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK,
+ (char *)&opt, sizeof (opt));
+ set_fs(oldmm);
+}
+
int
ksocknal_recv_iov (ksock_conn_t *conn)
{
if (rc <= 0)
return (rc);
+ /* received something... */
+ conn->ksnc_peer->ksnp_last_alive = jiffies;
+ conn->ksnc_rx_deadline = jiffies +
+ ksocknal_data.ksnd_io_timeout * HZ;
+ mb(); /* order with setting rx_started */
+ conn->ksnc_rx_started = 1;
+
conn->ksnc_rx_nob_wanted -= rc;
conn->ksnc_rx_nob_left -= rc;
rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT);
/* NB this is just a boolean............................^ */
set_fs (oldmm);
+
kunmap (page);
if (rc <= 0)
return (rc);
+ /* received something... */
+ conn->ksnc_peer->ksnp_last_alive = jiffies;
+ conn->ksnc_rx_deadline = jiffies +
+ ksocknal_data.ksnd_io_timeout * HZ;
+ mb(); /* order with setting rx_started */
+ conn->ksnc_rx_started = 1;
+
conn->ksnc_rx_nob_wanted -= rc;
conn->ksnc_rx_nob_left -= rc;
rc = -ESHUTDOWN;
break;
}
-
+
if (conn->ksnc_rx_niov != 0)
rc = ksocknal_recv_iov (conn);
else
rc = ksocknal_recv_kiov (conn);
-
+
if (rc <= 0) {
/* error/EOF or partial receive */
- if (rc == -EAGAIN)
+ if (rc == -EAGAIN) {
rc = 1;
+ } else if (rc == 0 && conn->ksnc_rx_started) {
+ /* EOF in the middle of a message */
+ rc = -EPROTO;
+ }
break;
}
+ /* Completed a fragment */
+
if (conn->ksnc_rx_nob_wanted == 0) {
+ /* Completed a message segment (header or payload) */
+ if (ksocknal_data.ksnd_eager_ack &&
+ (conn->ksnc_rx_state == SOCKNAL_RX_BODY ||
+ conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD)) {
+ /* Remind the socket to ack eagerly... */
+ ksocknal_eager_ack(conn);
+ }
rc = 1;
break;
}
spin_lock_irqsave (&sched->kss_lock, flags);
- list_del (&tx->tx_list); /* remove from kss_zctxpending_list */
list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list);
if (waitqueue_active (&sched->kss_waitq))
wake_up (&sched->kss_waitq);
{
#if SOCKNAL_ZC
if (atomic_read (&tx->tx_zccd.zccd_count) != 1) {
- unsigned long flags;
ksock_conn_t *conn = tx->tx_conn;
- ksock_sched_t *sched = conn->ksnc_scheduler;
/* zccd skbufs are still in-flight. First take a ref on
* conn, so it hangs about for ksocknal_tx_done... */
atomic_inc (&conn->ksnc_refcount);
- /* Stash it for timeout...
- * NB We have to hold a lock to stash the tx, and we have
- * stash it before we zcc_put(), but we have to _not_ hold
- * this lock when we zcc_put(), otherwise we could deadlock
- * if it turns out to be the last put. Aaaaarrrrggghhh! */
- spin_lock_irqsave (&sched->kss_lock, flags);
- list_add_tail (&tx->tx_list, &conn->ksnc_tx_pending);
- spin_unlock_irqrestore (&sched->kss_lock, flags);
-
/* ...then drop the initial ref on zccd, so the zero copy
* callback can occur */
zccd_put (&tx->tx_zccd);
void
ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags)
{
- ksock_conn_t *conn;
- ksock_tx_t *tx;
- int rc;
-
+ ksock_conn_t *conn;
+ ksock_tx_t *tx;
+ int rc;
+
LASSERT (!list_empty (&sched->kss_tx_conns));
conn = list_entry(sched->kss_tx_conns.next, ksock_conn_t, ksnc_tx_list);
list_del (&conn->ksnc_tx_list);
CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc);
if (rc != 0) {
- if (ksocknal_close_conn_unlocked (conn)) {
+ if (ksocknal_close_conn_unlocked (conn, rc)) {
/* I'm the first to close */
CERROR ("[%p] Error %d on write to "LPX64" ip %08x:%d\n",
conn, rc, conn->ksnc_peer->ksnp_nid,
spin_lock_irqsave (&sched->kss_lock, *irq_flags);
} else if (tx->tx_resid == 0) {
-
/* everything went; assume more can go, and avoid
* write_space locking */
conn->ksnc_tx_ready = 1;
return (NULL);
}
- rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, &target_nid);
+ rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, tx->tx_nob,
+ &target_nid);
if (rc != 0) {
CERROR ("Can't route to "LPX64": router error %d\n", nid, rc);
return (NULL);
#endif
spin_lock_irqsave (&sched->kss_lock, flags);
-
- tx->tx_deadline = jiffies_64 + ksocknal_io_timeout;
+
list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
if (conn->ksnc_tx_ready && /* able to send */
}
ksock_route_t *
-ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
+ksocknal_find_connectable_route_locked (ksock_peer_t *peer, int eager_only)
{
struct list_head *tmp;
ksock_route_t *route;
if (route->ksnr_conn == NULL && /* not connected */
!route->ksnr_connecting && /* not connecting */
- route->ksnr_timeout <= jiffies_64) /* OK to retry */
+ (!eager_only || route->ksnr_eager) && /* wants to connect */
+ time_after_eq (jiffies, route->ksnr_timeout)) /* OK to retry */
return (route);
}
ksock_conn_t *conn;
ksock_route_t *route;
rwlock_t *g_lock;
-
+
/* Ensure the frags we've been given EXACTLY match the number of
* bytes we want to send. Many TCP/IP stacks disregard any total
* size parameters passed to them and just look at the frags.
return (PTL_FAIL);
}
- /* Any routes need to be connected? (need write lock if so) */
- if (ksocknal_find_connectable_route_locked (peer) == NULL) {
+ if (ksocknal_find_connectable_route_locked(peer, 1) == NULL) {
conn = ksocknal_find_conn_locked (tx, peer);
if (conn != NULL) {
+ /* I've got no unconnected autoconnect routes that
+ * need to be connected, and I do have an actual
+ * connection... */
ksocknal_queue_tx_locked (tx, conn);
read_unlock (g_lock);
return (PTL_OK);
}
}
- /* need a write lock now to change peer state... */
+ /* Making one or more connections; I'll need a write lock... */
atomic_inc (&peer->ksnp_refcount); /* +1 ref for me while I unlock */
read_unlock (g_lock);
}
ksocknal_put_peer (peer); /* drop ref I got above */
- /* I may launch autoconnects, now we're write locked... */
- while ((route = ksocknal_find_connectable_route_locked (peer)) != NULL)
+
+ for (;;) {
+ /* launch all eager autoconnections */
+ route = ksocknal_find_connectable_route_locked (peer, 1);
+ if (route == NULL)
+ break;
+
ksocknal_launch_autoconnect_locked (route);
+ }
conn = ksocknal_find_conn_locked (tx, peer);
if (conn != NULL) {
+ /* Connection exists; queue message on it */
ksocknal_queue_tx_locked (tx, conn);
write_unlock_irqrestore (g_lock, flags);
return (PTL_OK);
}
-
+
if (ksocknal_find_connecting_route_locked (peer) == NULL) {
- /* no routes actually connecting now */
- write_unlock_irqrestore (g_lock, flags);
- return (PTL_FAIL);
+ /* no autoconnect routes actually connecting now. Scrape
+ * the barrel for non-eager autoconnects */
+ route = ksocknal_find_connectable_route_locked (peer, 0);
+ if (route != NULL) {
+ ksocknal_launch_autoconnect_locked (route);
+ } else {
+ write_unlock_irqrestore (g_lock, flags);
+ return (PTL_FAIL);
+ }
}
+ /* At least 1 connection is being established; queue the message... */
list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue);
write_unlock_irqrestore (g_lock, flags);
CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": OK\n",
NTOH__u64 (hdr->src_nid), NTOH__u64 (hdr->dest_nid));
+ /* drop peer ref taken on init */
+ ksocknal_put_peer (fmb->fmb_peer);
+
spin_lock_irqsave (&fmp->fmp_lock, flags);
list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs);
spin_unlock_irqrestore (&fmp->fmp_lock, flags);
- /* drop peer ref taken on init */
- ksocknal_put_peer (fmb->fmb_peer);
-
if (conn == NULL)
return;
conn->ksnc_cookie = fmb; /* stash fmb for later */
conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */
- conn->ksnc_rx_deadline = jiffies_64 + ksocknal_io_timeout; /* start timeout */
/* payload is desc's iov-ed buffer, but skipping the hdr */
LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) /
dest_nid, body_len);
ksocknal_new_packet (conn, 0); /* on to new packet */
- ksocknal_close_conn_unlocked (conn); /* give up on conn */
+ ksocknal_close_conn_unlocked (conn, -EINVAL); /* give up on conn */
return;
}
int skipped;
if (nob_to_skip == 0) { /* right at next packet boundary now */
+ conn->ksnc_rx_started = 0;
+ mb (); /* racing with timeout thread */
+
conn->ksnc_rx_state = SOCKNAL_RX_HEADER;
conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t);
conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t);
rc = ksocknal_recvmsg(conn);
if (rc <= 0) {
- if (ksocknal_close_conn_unlocked (conn)) {
+ if (ksocknal_close_conn_unlocked (conn, rc)) {
/* I'm the first to close */
if (rc < 0)
CERROR ("[%p] Error %d on read from "LPX64" ip %08x:%d\n",
conn, rc, conn->ksnc_peer->ksnp_nid,
conn->ksnc_ipaddr, conn->ksnc_port);
else
- CERROR ("[%p] EOF from "LPX64" ip %08x:%d\n",
- conn, conn->ksnc_peer->ksnp_nid,
- conn->ksnc_ipaddr, conn->ksnc_port);
+ CWARN ("[%p] EOF from "LPX64" ip %08x:%d\n",
+ conn, conn->ksnc_peer->ksnp_nid,
+ conn->ksnc_ipaddr, conn->ksnc_port);
}
goto out;
}
+
if (conn->ksnc_rx_nob_wanted != 0) /* short read */
goto out; /* try again later */
/* sets wanted_len, iovs etc */
lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn);
- /* start timeout (lib is waiting for finalize) */
- conn->ksnc_rx_deadline = jiffies_64 + ksocknal_io_timeout;
-
if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */
conn->ksnc_rx_state = SOCKNAL_RX_BODY;
goto try_read; /* go read the payload */
case SOCKNAL_RX_BODY:
/* payload all received */
- conn->ksnc_rx_deadline = 0; /* cancel timeout */
lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie);
/* Fall through */
NTOH__u64 (conn->ksnc_hdr.dest_nid),
conn->ksnc_rx_nob_left);
- /* cancel timeout (only needed it while fmb allocated) */
- conn->ksnc_rx_deadline = 0;
-
/* forward the packet. NB ksocknal_init_fmb() put fmb into
* conn->ksnc_cookie */
fmb = (ksock_fmb_t *)conn->ksnc_cookie;
int id = sched - ksocknal_data.ksnd_schedulers;
char name[16];
- snprintf (name, sizeof (name),"ksocknald[%d]", id);
+ snprintf (name, sizeof (name),"ksocknald_%02d", id);
kportal_daemonize (name);
kportal_blockallsigs ();
#if (CONFIG_SMP && CPU_AFFINITY)
- if ((cpu_online_map & (1 << id)) != 0)
+ if ((cpu_online_map & (1 << id)) != 0) {
+#if 1
current->cpus_allowed = (1 << id);
- else
+#else
+ set_cpus_allowed (current, 1<<id);
+#endif
+ } else {
CERROR ("Can't set CPU affinity for %s\n", name);
+ }
#endif /* CONFIG_SMP && CPU_AFFINITY */
spin_lock_irqsave (&sched->kss_lock, flags);
if (conn == NULL) { /* raced with ksocknal_close_sock */
LASSERT (sk->sk_data_ready != &ksocknal_data_ready);
sk->sk_data_ready (sk, n);
- } else if (!conn->ksnc_rx_ready) { /* new news */
+ goto out;
+ }
+
+ if (!conn->ksnc_rx_ready) { /* new news */
/* Set ASAP in case of concurrent calls to me */
conn->ksnc_rx_ready = 1;
spin_unlock_irqrestore (&sched->kss_lock, flags);
}
+ out:
read_unlock (&ksocknal_data.ksnd_global_lock);
EXIT;
if (conn == NULL) { /* raced with ksocknal_close_sock */
LASSERT (sk->sk_write_space != &ksocknal_write_space);
sk->sk_write_space (sk);
- } else if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */
+
+ read_unlock (&ksocknal_data.ksnd_global_lock);
+ return;
+ }
+
+ if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */
clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags);
if (!conn->ksnc_tx_ready) { /* new news */
}
int
-ksocknal_set_linger (struct socket *sock)
+ksocknal_setup_sock (struct socket *sock)
{
mm_segment_t oldmm = get_fs ();
int rc;
CERROR ("Can't set SO_LINGER2: %d\n", rc);
return (rc);
}
+
+#if SOCKNAL_USE_KEEPALIVES
+ /* Keepalives: If 3/4 of the timeout elapses, start probing every
+ * second until the timeout elapses. */
+
+ option = (ksocknal_data.ksnd_io_timeout * 3) / 4;
+ set_fs (KERNEL_DS);
+ rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE,
+ (char *)&option, sizeof (option));
+ set_fs (oldmm);
+ if (rc != 0) {
+ CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc);
+ return (rc);
+ }
+
+ option = 1;
+ set_fs (KERNEL_DS);
+ rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL,
+ (char *)&option, sizeof (option));
+ set_fs (oldmm);
+ if (rc != 0) {
+ CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc);
+ return (rc);
+ }
+ option = ksocknal_data.ksnd_io_timeout / 4;
+ set_fs (KERNEL_DS);
+ rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT,
+ (char *)&option, sizeof (option));
+ set_fs (oldmm);
+ if (rc != 0) {
+ CERROR ("Can't set TCP_KEEPCNT: %d\n", rc);
+ return (rc);
+ }
+
+ option = 1;
+ set_fs (KERNEL_DS);
+ rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE,
+ (char *)&option, sizeof (option));
+ set_fs (oldmm);
+ if (rc != 0) {
+ CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
+ return (rc);
+ }
+#endif
return (0);
}
{
struct sockaddr_in peer_addr;
mm_segment_t oldmm = get_fs();
- __u64 n;
struct timeval tv;
int fd;
struct socket *sock;
}
/* Ugh; have to map_fd for compatibility with sockets passed in
- * from userspace. And we actually need the refcounting that
- * this gives you :) */
+ * from userspace. And we actually need the sock->file refcounting
+ * that this gives you :) */
fd = sock_map_fd (sock);
if (fd < 0) {
/* NB the fd now owns the ref on sock->file */
LASSERT (sock->file != NULL);
LASSERT (file_count(sock->file) == 1);
-
+
/* Set the socket timeouts, so our connection attempt completes in
* finite time */
- tv.tv_sec = ksocknal_io_timeout / HZ;
- n = ksocknal_io_timeout % HZ;
- n = n * 1000000 + HZ - 1;
- do_div (n, HZ);
- tv.tv_usec = n;
+ tv.tv_sec = ksocknal_data.ksnd_io_timeout;
+ tv.tv_usec = 0;
set_fs (KERNEL_DS);
rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO,
(char *)&tv, sizeof (tv));
set_fs (oldmm);
if (rc != 0) {
- CERROR ("Can't set send timeout %d (in HZ): %d\n",
- ksocknal_io_timeout, rc);
+ CERROR ("Can't set send timeout %d: %d\n",
+ ksocknal_data.ksnd_io_timeout, rc);
goto out;
}
(char *)&tv, sizeof (tv));
set_fs (oldmm);
if (rc != 0) {
- CERROR ("Can't set receive timeout %d (in HZ): %d\n",
- ksocknal_io_timeout, rc);
+ CERROR ("Can't set receive timeout %d: %d\n",
+ ksocknal_data.ksnd_io_timeout, rc);
goto out;
}
route->ksnr_connecting = 0;
LASSERT (route->ksnr_retry_interval != 0);
- route->ksnr_timeout = jiffies_64 + route->ksnr_retry_interval;
+ route->ksnr_timeout = jiffies + route->ksnr_retry_interval;
route->ksnr_retry_interval = MIN (route->ksnr_retry_interval * 2,
SOCKNAL_MAX_RECONNECT_INTERVAL);
ksock_route_t *route;
int rc;
- snprintf (name, sizeof (name), "ksocknal_ad[%ld]", id);
+ snprintf (name, sizeof (name), "ksocknal_ad%02ld", id);
kportal_daemonize (name);
kportal_blockallsigs ();
ksocknal_find_timed_out_conn (ksock_peer_t *peer)
{
/* We're called with a shared lock on ksnd_global_lock */
- unsigned long flags;
ksock_conn_t *conn;
struct list_head *ctmp;
- ksock_tx_t *tx;
- struct list_head *ttmp;
ksock_sched_t *sched;
list_for_each (ctmp, &peer->ksnp_conns) {
conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
sched = conn->ksnc_scheduler;
-
- if (conn->ksnc_rx_deadline != 0 &&
- conn->ksnc_rx_deadline <= jiffies_64)
- goto timed_out;
- spin_lock_irqsave (&sched->kss_lock, flags);
+ /* Don't need the {get,put}connsock dance to deref ksnc_sock... */
+ LASSERT (!conn->ksnc_closing);
- list_for_each (ttmp, &conn->ksnc_tx_queue) {
- tx = list_entry (ttmp, ksock_tx_t, tx_list);
- LASSERT (tx->tx_deadline != 0);
-
- if (tx->tx_deadline <= jiffies_64)
- goto timed_out_locked;
+ if (conn->ksnc_rx_started &&
+ time_after_eq (jiffies, conn->ksnc_rx_deadline)) {
+ /* Timed out incomplete incoming message */
+ atomic_inc (&conn->ksnc_refcount);
+ CERROR ("Timed out RX from "LPX64" %p\n",
+ peer->ksnp_nid, conn);
+ return (conn);
}
-#if SOCKNAL_ZC
- list_for_each (ttmp, &conn->ksnc_tx_pending) {
- tx = list_entry (ttmp, ksock_tx_t, tx_list);
- LASSERT (tx->tx_deadline != 0);
-
- if (tx->tx_deadline <= jiffies_64)
- goto timed_out_locked;
+
+ if ((!list_empty (&conn->ksnc_tx_queue) ||
+ conn->ksnc_sock->sk->wmem_queued != 0) &&
+ time_after_eq (jiffies, conn->ksnc_tx_deadline)) {
+ /* Timed out messages queued for sending, or
+ * messages buffered in the socket's send buffer */
+ atomic_inc (&conn->ksnc_refcount);
+ CERROR ("Timed out TX to "LPX64" %s%d %p\n",
+ peer->ksnp_nid,
+ list_empty (&conn->ksnc_tx_queue) ? "" : "Q ",
+ conn->ksnc_sock->sk->wmem_queued, conn);
+ return (conn);
}
-#endif
- spin_unlock_irqrestore (&sched->kss_lock, flags);
- continue;
-
- timed_out_locked:
- spin_unlock_irqrestore (&sched->kss_lock, flags);
- timed_out:
- atomic_inc (&conn->ksnc_refcount);
- return (conn);
}
return (NULL);
}
void
-ksocknal_check_peer_timeouts (struct list_head *peers)
+ksocknal_check_peer_timeouts (int idx)
{
+ struct list_head *peers = &ksocknal_data.ksnd_peers[idx];
struct list_head *ptmp;
ksock_peer_t *peer;
ksock_conn_t *conn;
if (conn != NULL) {
read_unlock (&ksocknal_data.ksnd_global_lock);
- if (ksocknal_close_conn_unlocked (conn)) {
+ if (ksocknal_close_conn_unlocked (conn, -ETIMEDOUT)) {
/* I actually closed... */
CERROR ("Timeout out conn->"LPX64" ip %x:%d\n",
peer->ksnp_nid, conn->ksnc_ipaddr,
unsigned long flags;
ksock_conn_t *conn;
int timeout;
+ int i;
int peer_index = 0;
- __u64 deadline = jiffies_64;
+ unsigned long deadline = jiffies;
kportal_daemonize ("ksocknal_reaper");
kportal_blockallsigs ();
spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
- while ((timeout = deadline - jiffies_64) <= 0) {
- /* Time to check for timeouts on a few more peers */
- ksocknal_check_peer_timeouts (&ksocknal_data.ksnd_peers[peer_index]);
+ /* careful with the jiffy wrap... */
+ while ((timeout = ((int)deadline - (int)jiffies)) <= 0) {
+ const int n = 4;
+ const int p = 1;
+ int chunk = ksocknal_data.ksnd_peer_hash_size;
+
+ /* Time to check for timeouts on a few more peers: I do
+ * checks every 'p' seconds on a proportion of the peer
+ * table and I need to check every connection 'n' times
+ * within a timeout interval, to ensure I detect a
+ * timeout on any connection within (n+1)/n times the
+ * timeout interval. */
+
+ if (ksocknal_data.ksnd_io_timeout > n * p)
+ chunk = (chunk * n * p) /
+ ksocknal_data.ksnd_io_timeout;
+ if (chunk == 0)
+ chunk = 1;
+
+ for (i = 0; i < chunk; i++) {
+ ksocknal_check_peer_timeouts (peer_index);
+ peer_index = (peer_index + 1) %
+ ksocknal_data.ksnd_peer_hash_size;
+ }
- peer_index = (peer_index + 1) % SOCKNAL_PEER_HASH_SIZE;
- deadline += HZ;
+ deadline += p * HZ;
}
add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
atomic_read (&portal_kmemory));
- printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n",
+ printk(KERN_INFO "Lustre: Routing socket NAL unloaded (final mem %d)\n",
atomic_read(&portal_kmemory));
}
/* flag everything initialised */
ktoenal_data.ksnd_init = SOCKNAL_INIT_ALL;
- printk(KERN_INFO"Routing TOE NAL loaded (Routing %s, initial mem %d)\n",
+ printk(KERN_INFO "Lustre: Routing TOE NAL loaded (Routing %s, initial mem %d)\n",
kpr_routing(&ktoenal_data.ksnd_router) ? "enabled" : "disabled",
pkmem);
if ((conn = ktoenal_get_conn (nid)) == NULL)
{
/* It's not a peer; try to find a gateway */
- rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, &gatewaynid);
+ rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, payload_niov,
+ &gatewaynid);
if (rc != 0)
{
CERROR ("Can't route to "LPX64": router error %d\n", nid, rc);
echo timestamp > link-stamp
DEFS =
-portals_SOURCES = $(LINKS) module.c proc.c debug.c
+portals_SOURCES = $(LINKS) module.c proc.c debug.c lwt.c
# Don't distribute any patched files.
dist-hook:
PTR_ERR(file));
GOTO(out, PTR_ERR(file));
} else {
- printk(KERN_ALERT "dumping log to %s ... writing ...\n",
+ printk(KERN_ALERT "LustreError: dumping log to %s ... writing ...\n",
debug_file_name);
}
} else {
rc = file->f_op->write(file, debug_buf, debug_off,&file->f_pos);
}
- printk("wrote %d bytes\n", rc);
+ printk("LustreError: wrote %d bytes\n", rc);
set_fs(oldfs);
rc = file->f_op->fsync(file, file->f_dentry, 1);
CERROR("cannot open %s for logging", debug_daemon_file_path);
GOTO(out1, PTR_ERR(file));
} else {
- printk(KERN_ALERT "daemon dumping log to %s ... writing ...\n",
+ printk(KERN_ALERT "LustreError: daemon dumping log to %s ... writing ...\n",
debug_daemon_file_path);
}
size, &file->f_pos);
if (rc < 0) {
printk(KERN_ALERT
- "Debug_daemon write error %d\n", rc);
+ "LustreError: Debug_daemon write error %d\n", rc);
goto out;
}
start += rc;
rc = file->f_op->fsync(file, file->f_dentry, 1);
if (rc < 0) {
printk(KERN_ALERT
- "Debug_daemon sync error %d\n", rc);
+ "LustreError: Debug_daemon sync error %d\n", rc);
goto out;
}
if (debug_daemon_state.stopped)
while (start1 < end1) {
int count = MIN(1024, end1 - start1);
- printk("%*s", count, start1);
+ printk("LustreError: %*s", count, start1);
start1 += 1024;
}
while (start2 < end2) {
int count = MIN(1024, end2 - start2);
- printk("%*s", count, start2);
+ printk("LustreError: %*s", count, start2);
start2 += 1024;
}
}
rc = kernel_thread(portals_do_debug_dumplog,
NULL, CLONE_VM | CLONE_FS | CLONE_FILES);
if (rc < 0) {
- printk(KERN_ERR "cannot start dump thread\n");
+ printk(KERN_ERR "LustreError: cannot start dump thread\n");
return;
}
sleep_on(&debug_ctlwq);
debug_daemon_state.lctl_event = 0;
rc = kernel_thread(portals_debug_daemon, NULL, 0);
if (rc < 0) {
- printk(KERN_ERR "cannot start debug daemon thread\n");
+ printk(KERN_ERR "LustreError: cannot start debug daemon thread\n");
strncpy(debug_daemon_file_path, "\0", 1);
return rc;
}
unsigned long debug_off;
if (debug_buf == NULL) {
- printk("portals_debug_msg: debug_buf is NULL!\n");
+ printk("LustreError: portals_debug_msg: debug_buf is NULL!\n");
return;
}
max_nob = debug_size - debug_off + DEBUG_OVERFLOW;
if (max_nob <= 0) {
spin_unlock_irqrestore(&portals_debug_lock, flags);
- printk("logic error in portals_debug_msg: <0 bytes to write\n");
+ printk("LustreError: logic error in portals_debug_msg: <0 bytes to write\n");
return;
}
/* Print to console, while msg is contiguous in debug_buf */
/* NB safely terminated see above */
if ((mask & D_EMERG) != 0)
- printk(KERN_EMERG "%s", debug_buf + debug_off + prefix_nob);
+ printk(KERN_EMERG "LustreError: %s",
+ debug_buf + debug_off + prefix_nob);
if ((mask & D_ERROR) != 0)
- printk(KERN_ERR "%s", debug_buf + debug_off + prefix_nob);
+ printk(KERN_ERR "LustreError: %s",
+ debug_buf + debug_off + prefix_nob);
else if (portal_printk)
- printk("<%d>%s", portal_printk, debug_buf+debug_off+prefix_nob);
+ printk("<%d>LustreError: %s", portal_printk, debug_buf+debug_off+prefix_nob);
base_offset = debug_off & 0xFFFF;
debug_off += prefix_nob + msg_nob;
void portals_debug_set_level(unsigned int debug_level)
{
- printk("Setting portals debug level to %08x\n", debug_level);
+ printk("Lustre: Setting portals debug level to %08x\n", debug_level);
portal_debug = debug_level;
}
+/* Run the configured userspace upcall (path taken from the global
+ * 'portals_upcall', settable via /proc/sys/portals/upcall).
+ *
+ * argv: NULL-terminated argument vector; argv[0] is overwritten here with
+ * the upcall path, so callers only fill argv[1..].  At least one real
+ * argument (argv[1]) must be present (asserted below).  A minimal
+ * HOME/PATH environment is supplied.
+ *
+ * NOTE(review): both the failure AND the success paths log via CERROR —
+ * presumably deliberate so invocations are always visible in the log,
+ * but worth confirming. */
+void portals_run_upcall(char **argv)
+{
+ int rc;
+ int argc;
+ char *envp[] = {
+ "HOME=/",
+ "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+ NULL};
+ ENTRY;
+
+ argv[0] = portals_upcall;
+ /* count the caller-supplied arguments (vector must be NULL-terminated) */
+ argc = 1;
+ while (argv[argc] != NULL)
+ argc++;
+
+ LASSERT(argc >= 2);
+
+ rc = call_usermodehelper(argv[0], argv, envp);
+ if (rc < 0) {
+ CERROR("Error %d invoking portals upcall %s %s%s%s%s%s%s%s%s; "
+ "check /proc/sys/portals/upcall\n",
+ rc, argv[0], argv[1],
+ argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
+ argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
+ argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
+ argc < 6 ? "" : ",...");
+ } else {
+ CERROR("Invoked portals upcall %s %s%s%s%s%s%s%s%s\n",
+ argv[0], argv[1],
+ argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
+ argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
+ argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
+ argc < 6 ? "" : ",...");
+ }
+}
+
+/* Report an LBUG to userspace: builds the argument vector
+ * ("LBUG", file, function, line) and delegates to portals_run_upcall(),
+ * which fills in argv[0] and the environment.  This hunk replaces the
+ * previous open-coded USERMODEHELPER invocation. */
void portals_run_lbug_upcall(char *file, const char *fn, const int line)
{
        char *argv[6];
-        char *envp[3];
        char buf[32];
-        int rc;
        ENTRY;
+        /* render the line number as a decimal string for the upcall */
        snprintf (buf, sizeof buf, "%d", line);
-        argv[0] = portals_upcall;
        argv[1] = "LBUG";
        argv[2] = file;
        argv[3] = (char *)fn;
        argv[4] = buf;
        argv[5] = NULL;
-        envp[0] = "HOME=/";
-        envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
-        envp[2] = NULL;
-
-        rc = USERMODEHELPER(argv[0], argv, envp);
-        if (rc < 0) {
-                CERROR("Error invoking lbug upcall %s %s %s %s %s: %d; check "
-                       "/proc/sys/portals/upcall\n",
-                       argv[0], argv[1], argv[2], argv[3], argv[4], rc);
-
-        } else {
-                CERROR("Invoked upcall %s %s %s %s %s\n",
-                       argv[0], argv[1], argv[2], argv[3], argv[4]);
-        }
+        portals_run_upcall (argv);
}
-
EXPORT_SYMBOL(portals_debug_dumplog);
EXPORT_SYMBOL(portals_debug_msg);
EXPORT_SYMBOL(portals_debug_set_level);
+EXPORT_SYMBOL(portals_run_upcall);
EXPORT_SYMBOL(portals_run_lbug_upcall);
}
static int
-kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid,
- ptl_nid_t hi_nid)
+kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid,
+ ptl_nid_t lo_nid, ptl_nid_t hi_nid)
{
int rc;
kpr_control_interface_t *ci;
}
static int
-kportal_del_route(ptl_nid_t target)
+kportal_del_route(int gw_nalid, ptl_nid_t gw_nid,
+ ptl_nid_t lo, ptl_nid_t hi)
{
int rc;
kpr_control_interface_t *ci;
if (ci == NULL)
return (-ENODEV);
- rc = ci->kprci_del_route (target);
+ rc = ci->kprci_del_route (gw_nalid, gw_nid, lo, hi);
+
+ PORTAL_SYMBOL_PUT(kpr_control_interface);
+ return (rc);
+}
+
+static int
+kportal_notify_router (int gw_nalid, ptl_nid_t gw_nid,
+ int alive, time_t when)
+{
+ int rc;
+ kpr_control_interface_t *ci;
+
+ ci = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface);
+ if (ci == NULL)
+ return (-ENODEV);
+
+ rc = ci->kprci_notify (gw_nalid, gw_nid, alive, when);
PORTAL_SYMBOL_PUT(kpr_control_interface);
return (rc);
static int
kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp,
- ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp)
+ ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp, int *alivep)
{
int gateway_nalid;
ptl_nid_t gateway_nid;
ptl_nid_t lo_nid;
ptl_nid_t hi_nid;
+ int alive;
int rc;
kpr_control_interface_t *ci;
if (ci == NULL)
return (-ENODEV);
- rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, &lo_nid,
- &hi_nid);
+ rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid,
+ &lo_nid, &hi_nid, &alive);
if (rc == 0) {
- CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64"\n",
- index, gateway_nalid, gateway_nid, lo_nid, hi_nid);
+ CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64", %s\n",
+ index, gateway_nalid, gateway_nid, lo_nid, hi_nid,
+ alive ? "up" : "down");
*gateway_nalidp = (__u32)gateway_nalid;
- *gateway_nidp = (__u32)gateway_nid;
- *lo_nidp = (__u32)lo_nid;
- *hi_nidp = (__u32)hi_nid;
+ *gateway_nidp = gateway_nid;
+ *lo_nidp = lo_nid;
+ *hi_nidp = hi_nid;
+ *alivep = alive;
}
PORTAL_SYMBOL_PUT (kpr_control_interface);
case IOC_PORTAL_ADD_ROUTE:
CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n",
- data->ioc_nal, data->ioc_nid, data->ioc_nid2,
- data->ioc_nid3);
+ data->ioc_nal, data->ioc_nid,
+ data->ioc_nid2, data->ioc_nid3);
err = kportal_add_route(data->ioc_nal, data->ioc_nid,
- MIN (data->ioc_nid2, data->ioc_nid3),
- MAX (data->ioc_nid2, data->ioc_nid3));
+ data->ioc_nid2, data->ioc_nid3);
break;
case IOC_PORTAL_DEL_ROUTE:
- CDEBUG (D_IOCTL, "Removing route to "LPU64"\n", data->ioc_nid);
- err = kportal_del_route (data->ioc_nid);
+ CDEBUG (D_IOCTL, "Removing routes via [%d] "LPU64" : "LPU64" - "LPU64"\n",
+ data->ioc_nal, data->ioc_nid,
+ data->ioc_nid2, data->ioc_nid3);
+ err = kportal_del_route (data->ioc_nal, data->ioc_nid,
+ data->ioc_nid2, data->ioc_nid3);
break;
+ case IOC_PORTAL_NOTIFY_ROUTER: {
+ CDEBUG (D_IOCTL, "Notifying peer [%d] "LPU64" %s @ %ld\n",
+ data->ioc_nal, data->ioc_nid,
+ data->ioc_flags ? "Enabling" : "Disabling",
+ (time_t)data->ioc_nid3);
+
+ err = kportal_notify_router (data->ioc_nal, data->ioc_nid,
+ data->ioc_flags,
+ (time_t)data->ioc_nid3);
+ break;
+ }
+
case IOC_PORTAL_GET_ROUTE:
CDEBUG (D_IOCTL, "Getting route [%d]\n", data->ioc_count);
err = kportal_get_route(data->ioc_count, &data->ioc_nal,
- &data->ioc_nid, &data->ioc_nid2,
- &data->ioc_nid3);
+ &data->ioc_nid,
+ &data->ioc_nid2, &data->ioc_nid3,
+ &data->ioc_flags);
if (err == 0)
if (copy_to_user((char *)arg, data, sizeof (*data)))
err = -EFAULT;
kportal_put_ni (data->ioc_nal);
break;
}
-
+#if LWT_SUPPORT
+ case IOC_PORTAL_LWT_CONTROL:
+ err = lwt_control (data->ioc_flags, data->ioc_misc);
+ break;
+
+ case IOC_PORTAL_LWT_SNAPSHOT:
+ err = lwt_snapshot (&data->ioc_count, &data->ioc_misc,
+ data->ioc_pbuf1, data->ioc_plen1);
+ if (err == 0 &&
+ copy_to_user((char *)arg, data, sizeof (*data)))
+ err = -EFAULT;
+ break;
+
+ case IOC_PORTAL_LWT_LOOKUP_STRING:
+ err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1,
+ data->ioc_pbuf2, data->ioc_plen2);
+ if (err == 0 &&
+ copy_to_user((char *)arg, data, sizeof (*data)))
+ err = -EFAULT;
+ break;
+#endif
default:
err = -EINVAL;
break;
rc = portals_debug_init(5 * 1024 * 1024);
if (rc < 0) {
- printk(KERN_ERR "portals_debug_init: %d\n", rc);
+ printk(KERN_ERR "LustreError: portals_debug_init: %d\n", rc);
return (rc);
}
+#if LWT_SUPPORT
+ rc = lwt_init();
+ if (rc != 0) {
+ CERROR("lwt_init: error %d\n", rc);
+ goto cleanup_debug;
+ }
+#endif
sema_init(&nal_cmd_sem, 1);
rc = misc_register(&portal_dev);
if (rc) {
CERROR("misc_register: error %d\n", rc);
- goto cleanup_debug;
+ goto cleanup_lwt;
}
rc = PtlInit();
PtlFini();
cleanup_deregister:
misc_deregister(&portal_dev);
+ cleanup_lwt:
+#if LWT_SUPPORT
+ lwt_fini();
+#endif
cleanup_debug:
portals_debug_cleanup();
return rc;
if (rc)
CERROR("misc_deregister error %d\n", rc);
+#if LWT_SUPPORT
+ lwt_fini();
+#endif
+
if (atomic_read(&portal_kmemory) != 0)
CERROR("Portals memory leaked: %d bytes\n",
atomic_read(&portal_kmemory));
rc = portals_debug_cleanup();
if (rc)
- printk(KERN_ERR "portals_debug_cleanup: %d\n", rc);
+ printk(KERN_ERR "LustreError: portals_debug_cleanup: %d\n", rc);
}
EXPORT_SYMBOL(lib_dispatch);
EXPORT_SYMBOL(portal_debug);
EXPORT_SYMBOL(portal_stack);
EXPORT_SYMBOL(portal_printk);
+EXPORT_SYMBOL(portal_cerror);
EXPORT_SYMBOL(PtlEQWait);
EXPORT_SYMBOL(PtlEQFree);
EXPORT_SYMBOL(PtlEQGet);
#define PSDEV_DEBUG 1 /* control debugging */
#define PSDEV_SUBSYSTEM_DEBUG 2 /* control debugging */
#define PSDEV_PRINTK 3 /* force all errors to console */
-#define PSDEV_DEBUG_PATH 4 /* crashdump log location */
-#define PSDEV_DEBUG_DUMP_PATH 5 /* crashdump tracelog location */
-#define PSDEV_PORTALS_UPCALL 6 /* User mode upcall script */
+#define PSDEV_CONSOLE 4 /* allow _any_ messages to console */
+#define PSDEV_DEBUG_PATH 5 /* crashdump log location */
+#define PSDEV_DEBUG_DUMP_PATH 6 /* crashdump tracelog location */
+#define PSDEV_PORTALS_UPCALL 7 /* User mode upcall script */
-#define PORTALS_PRIMARY_CTLCNT 6
+#define PORTALS_PRIMARY_CTLCNT 7
static struct ctl_table portals_table[PORTALS_PRIMARY_CTLCNT + 1] = {
{PSDEV_DEBUG, "debug", &portal_debug, sizeof(int), 0644, NULL,
&proc_dointvec},
sizeof(int), 0644, NULL, &proc_dointvec},
{PSDEV_PRINTK, "printk", &portal_printk, sizeof(int), 0644, NULL,
&proc_dointvec},
+ {PSDEV_CONSOLE, "console", &portal_cerror, sizeof(int), 0644, NULL,
+ &proc_dointvec},
{PSDEV_DEBUG_PATH, "debug_path", debug_file_path,
sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string},
{PSDEV_DEBUG_DUMP_PATH, "debug_daemon_path", debug_daemon_file_path,
int ptl_init;
unsigned int portal_subsystem_debug = ~0 - (S_PORTALS | S_QSWNAL | S_SOCKNAL | S_GMNAL);
unsigned int portal_debug = ~0;
+unsigned int portal_cerror = 1;
unsigned int portal_printk;
unsigned int portal_stack;
": simulated failure\n",
nal->ni.nid, hdr_type_string (hdr),
hdr->src_nid);
+ lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
return (-1);
}
#include "router.h"
LIST_HEAD(kpr_routes);
+LIST_HEAD(kpr_gateways);
LIST_HEAD(kpr_nals);
unsigned long long kpr_fwd_bytes;
kprri_lookup: kpr_lookup_target,
kprri_fwd_start: kpr_forward_packet,
kprri_fwd_done: kpr_complete_packet,
+ kprri_notify: kpr_nal_notify,
kprri_shutdown: kpr_shutdown_nal,
kprri_deregister: kpr_deregister_nal,
};
kprci_add_route: kpr_add_route,
kprci_del_route: kpr_del_route,
kprci_get_route: kpr_get_route,
+ kprci_notify: kpr_sys_notify,
};
int
struct list_head *e;
kpr_nal_entry_t *ne;
- CDEBUG (D_OTHER, "Registering NAL %d\n", nalif->kprni_nalid);
+ CDEBUG (D_NET, "Registering NAL %d\n", nalif->kprni_nalid);
PORTAL_ALLOC (ne, sizeof (*ne));
if (ne == NULL)
}
+/* Deferred-work handler for gateway up/down notifications: formats the
+ * NAL id, NID and timestamp from the kpr_upcall_t into strings, builds a
+ * "ROUTER_NOTIFY" argument vector and runs the userspace upcall.
+ * argv[0] is left NULL for portals_run_upcall() to fill in.  Frees the
+ * kpr_upcall_t allocated (with kmalloc) by kpr_upcall(). */
void
+kpr_do_upcall (void *arg)
+{
+ kpr_upcall_t *u = (kpr_upcall_t *)arg;
+ char nalstr[10];
+ char nidstr[36];
+ char whenstr[36];
+ char *argv[] = {
+ NULL,
+ "ROUTER_NOTIFY",
+ nalstr,
+ nidstr,
+ u->kpru_alive ? "up" : "down",
+ whenstr,
+ NULL};
+
+ snprintf (nalstr, sizeof(nalstr), "%d", u->kpru_nal_id);
+ snprintf (nidstr, sizeof(nidstr), LPX64, u->kpru_nid);
+ snprintf (whenstr, sizeof(whenstr), "%ld", u->kpru_when);
+
+ portals_run_upcall (argv);
+
+ /* allocated with kmalloc in kpr_upcall(), so plain kfree here */
+ kfree (u);
+}
+
+/* Queue a gateway up/down notification for userspace.  Safe to call from
+ * any context: the allocation is GFP_ATOMIC and the actual upcall (which
+ * may sleep) runs later from the work queue via kpr_do_upcall().  On
+ * allocation failure the notification is logged and dropped. */
+void
+kpr_upcall (int gw_nalid, ptl_nid_t gw_nid, int alive, time_t when)
+{
+ /* May be in arbitrary context */
+ kpr_upcall_t *u = kmalloc (sizeof (kpr_upcall_t), GFP_ATOMIC);
+
+ if (u == NULL) {
+ CERROR ("Upcall out of memory: nal %d nid "LPX64" %s\n",
+ gw_nalid, gw_nid, alive ? "up" : "down");
+ return;
+ }
+
+ u->kpru_nal_id = gw_nalid;
+ u->kpru_nid = gw_nid;
+ u->kpru_alive = alive;
+ u->kpru_when = when;
+
+ prepare_work (&u->kpru_tq, kpr_do_upcall, u);
+ schedule_work (&u->kpru_tq);
+}
+
+
+/* Core handler for gateway liveness notifications.
+ *
+ * byNal != 0 means a NAL observed the state change (then userspace is told
+ * via kpr_upcall()); byNal == 0 means userspace told us (then the owning
+ * NAL's kprni_notify entrypoint is called, if registered).
+ *
+ * gateway_nalid == 0 matches the gateway on any NAL.  'when' timestamps
+ * the observation: future times are rejected, and stale ones (older than
+ * the gateway's recorded kpge_timestamp) are ignored.  When a gateway
+ * comes back up, all gateway weights are reset so it doesn't have to
+ * play catch-up in the load balancer.
+ *
+ * Returns 0 on success (including "old news"), -ENOENT if the gateway is
+ * unknown. */
+int
+kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid,
+ int alive, time_t when)
+{
+ unsigned long flags;
+ int rc = -ENOENT;
+ kpr_nal_entry_t *ne = NULL;
+ kpr_gateway_entry_t *ge = NULL;
+ struct timeval now;
+ struct list_head *e;
+ struct list_head *n;
+
+ CDEBUG (D_ERROR, "%s notifying [%d] "LPX64": %s\n",
+ byNal ? "NAL" : "userspace",
+ gateway_nalid, gateway_nid, alive ? "up" : "down");
+
+ /* can't do predictions... */
+ do_gettimeofday (&now);
+ if (when > now.tv_sec) {
+ CWARN ("Ignoring prediction from %s of [%d] "LPX64" %s "
+ "%ld seconds in the future\n",
+ byNal ? "NAL" : "userspace",
+ gateway_nalid, gateway_nid,
+ alive ? "up" : "down",
+ when - now.tv_sec);
+ /* NOTE(review): positive EINVAL, unlike the negative error
+ * codes used elsewhere in this function — confirm intended */
+ return (EINVAL);
+ }
+
+ LASSERT (when <= now.tv_sec);
+
+ /* Serialise with lookups (i.e. write lock) */
+ write_lock_irqsave(&kpr_rwlock, flags);
+
+ /* find the gateway entry; nalid 0 acts as a wildcard */
+ list_for_each_safe (e, n, &kpr_gateways) {
+
+ ge = list_entry(e, kpr_gateway_entry_t, kpge_list);
+ if ((gateway_nalid != 0 &&
+ ge->kpge_nalid != gateway_nalid) ||
+ ge->kpge_nid != gateway_nid)
+ continue;
+
+ rc = 0;
+ break;
+ }
+
+ if (rc != 0) {
+ /* gateway not found */
+ write_unlock_irqrestore(&kpr_rwlock, flags);
+ CDEBUG (D_NET, "Gateway not found\n");
+ return (rc);
+ }
+
+ if (when < ge->kpge_timestamp) {
+ /* out of date information */
+ write_unlock_irqrestore (&kpr_rwlock, flags);
+ CDEBUG (D_NET, "Out of date\n");
+ return (0);
+ }
+
+ /* update timestamp */
+ ge->kpge_timestamp = when;
+
+ if ((!ge->kpge_alive) == (!alive)) {
+ /* new date for old news */
+ write_unlock_irqrestore (&kpr_rwlock, flags);
+ CDEBUG (D_NET, "Old news\n");
+ return (0);
+ }
+
+ ge->kpge_alive = alive;
+ CDEBUG(D_NET, "set "LPX64" [%p] %d\n", gateway_nid, ge, alive);
+
+ if (alive) {
+ /* Reset all gateway weights so the newly-enabled gateway
+ * doesn't have to play catch-up */
+ list_for_each_safe (e, n, &kpr_gateways) {
+ /* NB this inner 'ge' shadows the outer one */
+ kpr_gateway_entry_t *ge = list_entry(e, kpr_gateway_entry_t,
+ kpge_list);
+ atomic_set (&ge->kpge_weight, 0);
+ }
+ }
+
+ if (!byNal) {
+ /* userland notified me: notify NAL? */
+ ne = kpr_find_nal_entry_locked (ge->kpge_nalid);
+ if (ne != NULL) {
+ if (ne->kpne_shutdown ||
+ ne->kpne_interface.kprni_notify == NULL) {
+ /* no need to notify */
+ ne = NULL;
+ } else {
+ /* take a ref on this NAL until notifying
+ * it has completed... */
+ atomic_inc (&ne->kpne_refcount);
+ }
+ }
+ }
+
+ write_unlock_irqrestore(&kpr_rwlock, flags);
+
+ if (ne != NULL) {
+ /* call out to the NAL without holding kpr_rwlock */
+ ne->kpne_interface.kprni_notify (ne->kpne_interface.kprni_arg,
+ gateway_nid, alive);
+ /* 'ne' can disappear now... */
+ atomic_dec (&ne->kpne_refcount);
+ }
+
+ if (byNal) {
+ /* It wasn't userland that notified me... */
+ CWARN ("Upcall: NAL %d NID "LPX64" is %s\n",
+ gateway_nalid, gateway_nid,
+ alive ? "alive" : "dead");
+ kpr_upcall (gateway_nalid, gateway_nid, alive, when);
+ } else {
+ CDEBUG (D_NET, " NOT Doing upcall\n");
+ }
+
+ return (0);
+}
+
+
+/* NAL-facing notification entrypoint (installed as kprni_notify in
+ * kpr_nal_interface): forwards the peer's liveness change into
+ * kpr_do_notify() with byNal == 1, using the calling NAL's id. */
+void
+kpr_nal_notify (void *arg, ptl_nid_t peer, int alive, time_t when)
+{
+ kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
+
+ kpr_do_notify (1, ne->kpne_interface.kprni_nalid, peer, alive, when);
+}
+
+
+void
kpr_shutdown_nal (void *arg)
{
unsigned long flags;
kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
- CDEBUG (D_OTHER, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid);
+ CDEBUG (D_NET, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid);
LASSERT (!ne->kpne_shutdown);
LASSERT (!in_interrupt());
unsigned long flags;
kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
- CDEBUG (D_OTHER, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid);
+ CDEBUG (D_NET, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid);
LASSERT (ne->kpne_shutdown); /* caller must have issued shutdown already */
LASSERT (atomic_read (&ne->kpne_refcount) == 0); /* can't be busy */
PORTAL_MODULE_UNUSE;
}
+/* Wrap-safe comparison of two gateways' load-balancing weights, used to
+ * pick the least-loaded gateway for a route.  Returns nonzero when ge1
+ * should be preferred over ge2 — i.e. ge1's weight is "before" ge2's in
+ * the modular ordering restricted to 'significant_bits'. */
+int
+kpr_ge_isbetter (kpr_gateway_entry_t *ge1, kpr_gateway_entry_t *ge2)
+{
+ const int significant_bits = 0x00ffffff;
+ /* We use atomic_t to record/compare route weights for
+ * load-balancing. Here we limit ourselves to only using
+ * 'significant_bits' when we do an 'after' comparison */
+
+ int diff = (atomic_read (&ge1->kpge_weight) -
+ atomic_read (&ge2->kpge_weight)) & significant_bits;
+ int rc = (diff > (significant_bits >> 1));
+
+ CDEBUG(D_NET, "[%p]"LPX64"=%d %s [%p]"LPX64"=%d\n",
+ ge1, ge1->kpge_nid, atomic_read (&ge1->kpge_weight),
+ rc ? ">" : "<",
+ ge2, ge2->kpge_nid, atomic_read (&ge2->kpge_weight));
+
+ return (rc);
+}
+
+
+/* Charge a forwarding decision to gateway 'ge': add a weight proportional
+ * to the payload size 'nob' (scaled to portals-header units, see comment
+ * below) so this gateway becomes less favoured in subsequent
+ * kpr_ge_isbetter() comparisons. */
+void
+kpr_update_weight (kpr_gateway_entry_t *ge, int nob)
+{
+ int weight = 1 + (nob + sizeof (ptl_hdr_t)/2)/sizeof (ptl_hdr_t);
+
+ /* We've chosen this route entry (i.e. gateway) to forward payload
+ * of length 'nob'; update the route's weight to make it less
+ * favoured. Note that the weight is 1 plus the payload size
+ * rounded and scaled to the portals header size, so we get better
+ * use of the significant bits in kpge_weight. */
+
+ CDEBUG(D_NET, "gateway [%p]"LPX64" += %d\n", ge,
+ ge->kpge_nid, weight);
+
+ atomic_add (weight, &ge->kpge_weight);
+}
int
-kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp)
+kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob,
+ ptl_nid_t *gateway_nidp)
{
- kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
- struct list_head *e;
- int rc = -ENOENT;
+ kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
+ struct list_head *e;
+ kpr_route_entry_t *re;
+ kpr_gateway_entry_t *ge = NULL;
+ int rc = -ENOENT;
+
+ /* Caller wants to know if 'target_nid' can be reached via a gateway
+ * ON HER OWN NETWORK */
- CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d\n", target_nid, ne->kpne_interface.kprni_nalid);
+ CDEBUG (D_NET, "lookup "LPX64" from NAL %d\n", target_nid,
+ ne->kpne_interface.kprni_nalid);
if (ne->kpne_shutdown) /* caller is shutting down */
return (-ENOENT);
/* Search routes for one that has a gateway to target_nid on the callers network */
- for (e = kpr_routes.next; e != &kpr_routes; e = e->next)
- {
- kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list);
+ list_for_each (e, &kpr_routes) {
+ re = list_entry (e, kpr_route_entry_t, kpre_list);
if (re->kpre_lo_nid > target_nid ||
re->kpre_hi_nid < target_nid)
/* found table entry */
- if (re->kpre_gateway_nalid != ne->kpne_interface.kprni_nalid) /* different NAL */
- rc = -EHOSTUNREACH;
- else
- {
- rc = 0;
- *gateway_nidp = re->kpre_gateway_nid;
- }
- break;
+ if (re->kpre_gateway->kpge_nalid != ne->kpne_interface.kprni_nalid ||
+ !re->kpre_gateway->kpge_alive) {
+ /* different NAL or gateway down */
+ rc = -EHOSTUNREACH;
+ continue;
+ }
+
+ if (ge == NULL ||
+ kpr_ge_isbetter (re->kpre_gateway, ge))
+ ge = re->kpre_gateway;
}
+ if (ge != NULL) {
+ kpr_update_weight (ge, nob);
+ *gateway_nidp = ge->kpge_nid;
+ rc = 0;
+ }
+
read_unlock (&kpr_rwlock);
- CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d: %d ("LPX64")\n",
+ /* NB can't deref 're' now; it might have been removed! */
+
+ CDEBUG (D_NET, "lookup "LPX64" from NAL %d: %d ("LPX64")\n",
target_nid, ne->kpne_interface.kprni_nalid, rc,
(rc == 0) ? *gateway_nidp : (ptl_nid_t)0);
return (rc);
}
+/* Look up the registered NAL entry with the given id on the kpr_nals
+ * list, or NULL if not registered.  Caller must hold kpr_rwlock (the
+ * returned pointer is only stable while the lock is held, unless the
+ * caller takes a refcount — see kpr_do_notify()). */
+kpr_nal_entry_t *
+kpr_find_nal_entry_locked (int nal_id)
+{
+ struct list_head *e;
+
+ /* Called with kpr_rwlock held */
+
+ list_for_each (e, &kpr_nals) {
+ kpr_nal_entry_t *ne = list_entry (e, kpr_nal_entry_t, kpne_list);
+
+ if (nal_id != ne->kpne_interface.kprni_nalid) /* no match */
+ continue;
+
+ return (ne);
+ }
+
+ return (NULL);
+}
+
void
kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd)
{
- kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg;
- ptl_nid_t target_nid = fwd->kprfd_target_nid;
- int nob = fwd->kprfd_nob;
- struct list_head *e;
-
- CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d\n", fwd,
+ kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg;
+ ptl_nid_t target_nid = fwd->kprfd_target_nid;
+ int nob = fwd->kprfd_nob;
+ kpr_gateway_entry_t *ge = NULL;
+ kpr_nal_entry_t *dst_ne = NULL;
+ struct list_head *e;
+ kpr_route_entry_t *re;
+ kpr_nal_entry_t *tmp_ne;
+
+ CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %d\n", fwd,
target_nid, src_ne->kpne_interface.kprni_nalid);
LASSERT (nob >= sizeof (ptl_hdr_t)); /* at least got a packet header */
/* Search routes for one that has a gateway to target_nid NOT on the caller's network */
- for (e = kpr_routes.next; e != &kpr_routes; e = e->next)
- {
- kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list);
+ list_for_each (e, &kpr_routes) {
+ re = list_entry (e, kpr_route_entry_t, kpre_list);
if (re->kpre_lo_nid > target_nid || /* no match */
re->kpre_hi_nid < target_nid)
continue;
- CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: match "LPX64" on NAL %d\n", fwd,
- target_nid, src_ne->kpne_interface.kprni_nalid,
- re->kpre_gateway_nid, re->kpre_gateway_nalid);
+ if (re->kpre_gateway->kpge_nalid == src_ne->kpne_interface.kprni_nalid)
+ continue; /* don't route to same NAL */
- if (re->kpre_gateway_nalid == src_ne->kpne_interface.kprni_nalid)
- break; /* don't route to same NAL */
+ if (!re->kpre_gateway->kpge_alive)
+ continue; /* gateway is dead */
+
+ tmp_ne = kpr_find_nal_entry_locked (re->kpre_gateway->kpge_nalid);
- /* Search for gateway's NAL's entry */
-
- for (e = kpr_nals.next; e != &kpr_nals; e = e->next)
- {
- kpr_nal_entry_t *dst_ne = list_entry (e, kpr_nal_entry_t, kpne_list);
-
- if (re->kpre_gateway_nalid != dst_ne->kpne_interface.kprni_nalid) /* no match */
- continue;
+ if (tmp_ne == NULL ||
+ tmp_ne->kpne_shutdown) {
+ /* NAL must be registered and not shutting down */
+ continue;
+ }
- if (dst_ne->kpne_shutdown) /* don't route if NAL is shutting down */
- break;
+ if (ge == NULL ||
+ kpr_ge_isbetter (re->kpre_gateway, ge)) {
+ ge = re->kpre_gateway;
+ dst_ne = tmp_ne;
+ }
+ }
+
+ if (ge != NULL) {
+ LASSERT (dst_ne != NULL);
+
+ kpr_update_weight (ge, nob);
- fwd->kprfd_gateway_nid = re->kpre_gateway_nid;
- atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */
+ fwd->kprfd_gateway_nid = ge->kpge_nid;
+ atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */
- read_unlock (&kpr_rwlock);
+ read_unlock (&kpr_rwlock);
- CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: "LPX64" on NAL %d\n", fwd,
- target_nid, src_ne->kpne_interface.kprni_nalid,
- fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid);
+ CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %d: "
+ "to "LPX64" on NAL %d\n",
+ fwd, target_nid, src_ne->kpne_interface.kprni_nalid,
+ fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid);
- dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd);
- return;
- }
- break;
+ dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd);
+ return;
}
- read_unlock (&kpr_rwlock);
+ read_unlock (&kpr_rwlock);
out:
kpr_fwd_errors++;
- CDEBUG (D_OTHER, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd,
+ CDEBUG (D_NET, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd,
target_nid, src_ne->kpne_interface.kprni_nalid);
/* Can't find anywhere to forward to */
kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg;
kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg;
- CDEBUG (D_OTHER, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd,
+ CDEBUG (D_NET, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd,
src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error);
atomic_dec (&dst_ne->kpne_refcount); /* CAVEAT EMPTOR dst_ne can disappear now!!! */
(fwd->kprfd_callback)(fwd->kprfd_callback_arg, error);
- CDEBUG (D_OTHER, "complete(2) [%p] from NAL %d: %d\n", fwd,
+ CDEBUG (D_NET, "complete(2) [%p] from NAL %d: %d\n", fwd,
src_ne->kpne_interface.kprni_nalid, error);
atomic_dec (&kpr_queue_depth);
}
int
-kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid,
- ptl_nid_t hi_nid)
+kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid,
+ ptl_nid_t lo_nid, ptl_nid_t hi_nid)
{
- unsigned long flags;
- struct list_head *e;
- kpr_route_entry_t *re;
+ unsigned long flags;
+ struct list_head *e;
+ kpr_route_entry_t *re;
+ kpr_gateway_entry_t *ge;
+ int dup = 0;
- CDEBUG(D_OTHER, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n",
+ CDEBUG(D_NET, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n",
gateway_nalid, gateway_nid, lo_nid, hi_nid);
- LASSERT(lo_nid <= hi_nid);
+ if (gateway_nalid == PTL_NID_ANY ||
+ lo_nid == PTL_NID_ANY ||
+ hi_nid == PTL_NID_ANY ||
+ lo_nid > hi_nid)
+ return (-EINVAL);
+
+ PORTAL_ALLOC (ge, sizeof (*ge));
+ if (ge == NULL)
+ return (-ENOMEM);
+
+ ge->kpge_nalid = gateway_nalid;
+ ge->kpge_nid = gateway_nid;
+ ge->kpge_alive = 1;
+ ge->kpge_timestamp = 0;
+ ge->kpge_refcount = 0;
+ atomic_set (&ge->kpge_weight, 0);
PORTAL_ALLOC (re, sizeof (*re));
if (re == NULL)
return (-ENOMEM);
- re->kpre_gateway_nalid = gateway_nalid;
- re->kpre_gateway_nid = gateway_nid;
re->kpre_lo_nid = lo_nid;
re->kpre_hi_nid = hi_nid;
LASSERT(!in_interrupt());
write_lock_irqsave (&kpr_rwlock, flags);
- for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
- kpr_route_entry_t *re2 = list_entry(e, kpr_route_entry_t,
- kpre_list);
-
- if (re->kpre_lo_nid > re2->kpre_hi_nid ||
- re->kpre_hi_nid < re2->kpre_lo_nid)
- continue;
+ list_for_each (e, &kpr_gateways) {
+ kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t,
+ kpge_list);
+
+ if (ge2->kpge_nalid == gateway_nalid &&
+ ge2->kpge_nid == gateway_nid) {
+ PORTAL_FREE (ge, sizeof (*ge));
+ ge = ge2;
+ dup = 1;
+ break;
+ }
+ }
- CERROR ("Attempt to add duplicate routes ["LPX64" - "LPX64"]"
- "to ["LPX64" - "LPX64"]\n",
- re->kpre_lo_nid, re->kpre_hi_nid,
- re2->kpre_lo_nid, re2->kpre_hi_nid);
+ if (!dup) {
+ /* Adding a new gateway... */
+
+ list_add (&ge->kpge_list, &kpr_gateways);
- write_unlock_irqrestore (&kpr_rwlock, flags);
+ /* ...zero all gateway weights so this one doesn't have to
+ * play catch-up */
- PORTAL_FREE (re, sizeof (*re));
- return (-EINVAL);
+ list_for_each (e, &kpr_gateways) {
+ kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t,
+ kpge_list);
+ atomic_set (&ge2->kpge_weight, 0);
+ }
+
}
+ re->kpre_gateway = ge;
+ ge->kpge_refcount++;
list_add (&re->kpre_list, &kpr_routes);
write_unlock_irqrestore (&kpr_rwlock, flags);
}
+/* Userspace-facing notification entrypoint (exported via
+ * kpr_control_interface as kprci_notify): forwards into kpr_do_notify()
+ * with byNal == 0, i.e. "userland told us". */
int
-kpr_del_route (ptl_nid_t target)
+kpr_sys_notify (int gateway_nalid, ptl_nid_t gateway_nid,
+ int alive, time_t when)
{
+ return (kpr_do_notify (0, gateway_nalid, gateway_nid, alive, when));
+}
+
+/* Delete route table entries through gateway [gw_nalid, gw_nid].
+ * With lo/hi == PTL_NID_ANY every route via that gateway is removed;
+ * with actual NIDs only the exactly-matching entry is removed (and the
+ * scan stops at the first match).  The gateway entry itself is freed
+ * when its refcount drops to zero.  Returns 0 if anything was deleted,
+ * -ENOENT if nothing matched, -EINVAL for an inconsistent lo/hi pair. */
+int
+kpr_del_route (int gw_nalid, ptl_nid_t gw_nid,
+ ptl_nid_t lo, ptl_nid_t hi)
+{
+ int specific = (lo != PTL_NID_ANY);
 unsigned long flags;
+ int rc = -ENOENT;
 struct list_head *e;
+ struct list_head *n;
- CDEBUG(D_OTHER, "Del route "LPX64"\n", nid);
+ CDEBUG(D_NET, "Del route [%d] "LPX64" : "LPX64" - "LPX64"\n",
+ gw_nalid, gw_nid, lo, hi);
 LASSERT(!in_interrupt());
+
+ /* NB Caller may specify either all routes via the given gateway
+ * (lo/hi == PTL_NID_ANY) or a specific route entry (lo/hi are
+ * actual NIDs) */
+
+ if (specific ? (hi == PTL_NID_ANY || hi < lo) : (hi != PTL_NID_ANY))
+ return (-EINVAL);
+
 write_lock_irqsave(&kpr_rwlock, flags);
- for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
- kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t,
+ list_for_each_safe (e, n, &kpr_routes) {
+ kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t,
 kpre_list);
-
- if (re->kpre_lo_nid > nid || re->kpre_hi_nid < nid)
+ kpr_gateway_entry_t *ge = re->kpre_gateway;
+
+ if (ge->kpge_nalid != gw_nalid ||
+ ge->kpge_nid != gw_nid ||
+ (specific &&
+ (lo != re->kpre_lo_nid || hi != re->kpre_hi_nid)))
 continue;
- list_del (&re->kpre_list);
- write_unlock_irqrestore(&kpr_rwlock, flags);
+ rc = 0;
+ /* drop the route's reference on its gateway; free the
+ * gateway entry when the last route using it goes */
+ if (--ge->kpge_refcount == 0) {
+ list_del (&ge->kpge_list);
+ PORTAL_FREE (ge, sizeof (*ge));
+ }
+
+ list_del (&re->kpre_list);
 PORTAL_FREE(re, sizeof (*re));
- return (0);
+
+ if (specific)
+ break;
 }
 write_unlock_irqrestore(&kpr_rwlock, flags);
- return (-ENOENT);
+ return (rc);
}
int
-kpr_get_route(int idx, int *gateway_nalid, ptl_nid_t *gateway_nid,
- ptl_nid_t *lo_nid, ptl_nid_t *hi_nid)
+kpr_get_route (int idx, int *gateway_nalid, ptl_nid_t *gateway_nid,
+ ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, int *alive)
{
struct list_head *e;
read_lock(&kpr_rwlock);
for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
- kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t,
- kpre_list);
-
+ kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t,
+ kpre_list);
+ kpr_gateway_entry_t *ge = re->kpre_gateway;
+
if (idx-- == 0) {
- *gateway_nalid = re->kpre_gateway_nalid;
- *gateway_nid = re->kpre_gateway_nid;
+ *gateway_nalid = ge->kpge_nalid;
+ *gateway_nid = ge->kpge_nid;
+ *alive = ge->kpge_alive;
*lo_nid = re->kpre_lo_nid;
*hi_nid = re->kpre_hi_nid;
typedef struct
{
+ struct list_head kpge_list;
+ atomic_t kpge_weight;
+ time_t kpge_timestamp;
+ int kpge_alive;
+ int kpge_nalid;
+ int kpge_refcount;
+ ptl_nid_t kpge_nid;
+} kpr_gateway_entry_t;
+
+typedef struct
+{
struct list_head kpre_list;
- int kpre_gateway_nalid;
- ptl_nid_t kpre_gateway_nid;
+ kpr_gateway_entry_t *kpre_gateway;
ptl_nid_t kpre_lo_nid;
ptl_nid_t kpre_hi_nid;
} kpr_route_entry_t;
+typedef struct
+{
+ struct tq_struct kpru_tq;
+ int kpru_nal_id;
+ ptl_nid_t kpru_nid;
+ int kpru_alive;
+ time_t kpru_when;
+} kpr_upcall_t;
+
extern int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp);
-extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp);
+extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob,
+ ptl_nid_t *gateway_nidp);
+extern kpr_nal_entry_t *kpr_find_nal_entry_locked (int nal_id);
extern void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd);
extern void kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error);
+extern void kpr_nal_notify (void *arg, ptl_nid_t peer,
+ int alive, time_t when);
extern void kpr_shutdown_nal (void *arg);
extern void kpr_deregister_nal (void *arg);
extern int kpr_add_route (int gateway_nal, ptl_nid_t gateway_nid,
ptl_nid_t lo_nid, ptl_nid_t hi_nid);
-extern int kpr_del_route (ptl_nid_t nid);
+extern int kpr_del_route (int gw_nal, ptl_nid_t gw_nid,
+ ptl_nid_t lo, ptl_nid_t hi);
extern int kpr_get_route (int idx, int *gateway_nal, ptl_nid_t *gateway_nid,
- ptl_nid_t *lo_nid, ptl_nid_t *hi_nid);
+ ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, int *alive);
+extern int kpr_sys_notify (int gw_nalid, ptl_nid_t gw_nid,
+ int alive, time_t when);
extern unsigned long long kpr_fwd_bytes;
extern unsigned long kpr_fwd_packets;
magic = *(int *)(ev->mem_desc.start + ev->offset);
if(magic != 0xcafebabe) {
- printk ("Unexpected response \n");
+ printk ("LustreError: Unexpected response \n");
return 1;
}
if((i == count) || !count)
wake_up_process (client->tsk);
else
- printk ("Received response after timeout for %d\n",i);
+ printk ("LustreError: Received response after timeout for %d\n",i);
return 1;
}
pingcli_shutdown (1);
return NULL;
}
- printk ("sent msg no %d", count);
+ printk ("Lustre: sent msg no %d", count);
set_current_state (TASK_INTERRUPTIBLE);
rc = schedule_timeout (20 * args->ioc_timeout);
if (rc == 0) {
- printk (" :: timeout .....\n");
+ printk ("LustreError: :: timeout .....\n");
} else {
do_gettimeofday (&tv2);
- printk(" :: Reply in %u usec\n",
+ printk("Lustre: :: Reply in %u usec\n",
(unsigned)((tv2.tv_sec - tv1.tv_sec)
* 1000000 + (tv2.tv_usec - tv1.tv_usec)));
}
if(magic != 0xdeadbeef) {
- printk("Unexpected Packet to the server\n");
+ printk("LustreError: Unexpected Packet to the server\n");
}
memcpy (server->in_buf, &ping_bulk_magic, sizeof(ping_bulk_magic));
}
server->evnt = *ev;
- printk ("received ping from nid "LPX64" "
+ printk ("Lustre: received ping from nid "LPX64" "
"(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n",
ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
*((int *)(ev->mem_desc.start + ev->offset)),
set_current_state (TASK_INTERRUPTIBLE);
rc = schedule_timeout (20 * args->ioc_timeout);
if (rc == 0) {
- printk (" Time out on the server\n");
+ printk ("LustreError: Time out on the server\n");
pingcli_shutdown (2);
return NULL;
} else
- printk("Received respose from the server \n");
+ printk("Lustre: Received respose from the server \n");
pingcli_shutdown (2);
}
server->evnt = *ev;
- printk ("received ping from nid "LPX64" "
+ printk ("Lustre: received ping from nid "LPX64" "
"(off=%u rlen=%u mlen=%u head=%x)\n",
ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
*((int *)(ev->mem_desc.start + ev->offset)));
/sbin/insmod ./$PING
echo kqswnal > /tmp/nal
;;
+
+ gm)
+ /sbin/insmod portals
+ /sbin/insmod kgmnal
+ /sbin/insmod ./$PING
+ echo kgmnal > /tmp/nal
+ ;;
*)
- echo "Usage : ${0} < tcp | toe | elan >"
+ echo "Usage : ${0} < tcp | toe | elan | gm>"
exit 1;
esac
exit 0;
/sbin/insmod ./$PING nal=4
echo kqswnal > /tmp/nal
;;
+
+ gm)
+ /sbin/insmod portals
+ /sbin/insmod kgmnal
+ /sbin/insmod ./$PING nal=3
+ echo kgmnal > /tmp/nal
+ ;;
*)
- echo "Usage : ${0} < tcp | toe | elan >"
+ echo "Usage : ${0} < tcp | toe | elan | gm>"
exit 1;
esac
../utils/acceptor 9999&
.deps
routerstat
wirecheck
+gmnalnid
.*.cmd
if LIBLUSTRE
sbin_PROGRAMS = ptlctl debugctl routerstat wirecheck
else
-sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck
+sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid
endif
lib_LIBRARIES = libptlctl.a
libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h
+gmnalnid_SOURCES = gmnalnid.c
+
ptlctl_SOURCES = ptlctl.c
ptlctl_LDADD = -L. -lptlctl -lncurses # -lefence
ptlctl_DEPENDENCIES = libptlctl.a
if (!strcasecmp (str, "no") ||
!strcasecmp (str, "n") ||
!strcasecmp (str, "off") ||
+ !strcasecmp (str, "down") ||
!strcasecmp (str, "disable"))
{
*b = 0;
if (!strcasecmp (str, "yes") ||
!strcasecmp (str, "y") ||
!strcasecmp (str, "on") ||
+ !strcasecmp (str, "up") ||
!strcasecmp (str, "enable"))
{
*b = 1;
unsigned int portal_debug;
unsigned int portal_printk;
unsigned int portal_stack;
-
+unsigned int portal_cerror;
static unsigned int g_nal = 0;
} name2num_t;
static name2num_t nalnames[] = {
+ {"any", 0},
{"tcp", SOCKNAL},
{"toe", TOENAL},
{"elan", QSWNAL},
{
name2num_t *e = name2num_lookup_name (nalnames, str);
- return ((e == NULL) ? 0 : e->num);
+ return ((e == NULL) ? -1 : e->num);
}
static char *
}
int
+ptl_parse_port (int *port, char *str)
+{
+ char *end;
+
+ *port = strtol (str, &end, 0);
+
+ if (*end == 0 && /* parsed whole string */
+ *port > 0 && *port < 65536) /* minimal sanity check */
+ return (0);
+
+ return (-1);
+}
+
+int
+ptl_parse_time (time_t *t, char *str)
+{
+ char *end;
+ int n;
+ struct tm tm;
+
+ *t = strtol (str, &end, 0);
+ if (*end == 0) /* parsed whole string */
+ return (0);
+
+ memset (&tm, 0, sizeof (tm));
+ n = sscanf (str, "%d-%d-%d %d:%d:%d",
+ &tm.tm_year, &tm.tm_mon, &tm.tm_mday,
+ &tm.tm_hour, &tm.tm_min, &tm.tm_sec);
+ if (n != 6)
+ return (-1);
+
+ tm.tm_mon--; /* convert to 0 == Jan */
+ tm.tm_year -= 1900; /* y2k quirk */
+ tm.tm_isdst = -1; /* dunno if it's daylight savings... */
+
+ *t = mktime (&tm);
+ if (*t == (time_t)-1)
+ return (-1);
+
+ return (0);
+}
+
+int
ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
{
struct hostent *he;
int
ptl_parse_nid (ptl_nid_t *nidp, char *str)
{
- __u32 ipaddr;
- long lval;
+ __u32 ipaddr;
+ char *end;
+ unsigned long long ullval;
if (!strcmp (str, "_all_")) {
*nidp = PTL_NID_ANY;
return (0);
}
- if (sscanf (str, "%li", &lval) == 1)
- {
- *nidp = (ptl_nid_t)lval;
- return (0);
- }
-
- if (sscanf (str, "%lx", &lval) == 1)
- {
- *nidp = (ptl_nid_t)lval;
+ ullval = strtoull(str, &end, 0);
+ if (*end == 0) {
+ /* parsed whole string */
+ *nidp = (ptl_nid_t)ullval;
return (0);
}
if (he != NULL)
strcpy (buffer, he->h_name);
else
- sprintf (buffer, "0x"LPX64, nid);
+ sprintf (buffer, LPX64, nid);
return (buffer);
}
-int g_nal_is_compatible (char *cmd, ...)
+int g_nal_is_set ()
{
- va_list ap;
- int nal;
-
if (g_nal == 0) {
fprintf (stderr, "Error: you must run the 'network' command first.\n");
return (0);
}
-
+
+ return (1);
+}
+
+int g_nal_is_compatible (char *cmd, ...)
+{
+ va_list ap;
+ int nal;
+
+ if (!g_nal_is_set ())
+ return (0);
+
va_start (ap, cmd);
do {
if (g_nal == nal)
return (1);
-
- fprintf (stderr, "Command %s not compatible with nal %s\n",
- cmd, nal2name (g_nal));
+
+ if (cmd != NULL) {
+ /* Don't complain verbosely if we've not been passed a command
+ * name to complain about! */
+ fprintf (stderr, "Command %s not compatible with nal %s\n",
+ cmd, nal2name (g_nal));
+ }
return (0);
}
int nal;
if (argc == 2 &&
- (nal = ptl_name2nal (argv[1])) != 0) {
+ (nal = ptl_name2nal (argv[1])) >= 0) {
g_nal = nal;
return (0);
}
if (rc != 0)
break;
- printf (LPX64"@%s:%d #%d buffer %d nonagle %s xchg %s affinity %s share %d\n",
+ printf (LPX64"@%s:%d #%d buffer %d nonagle %s xchg %s "
+ "affinity %s eager %s share %d\n",
data.ioc_nid, ptl_ipaddr_2_str (data.ioc_id, buffer),
data.ioc_misc, data.ioc_count, data.ioc_size,
(data.ioc_flags & 1) ? "on" : "off",
(data.ioc_flags & 2) ? "on" : "off",
(data.ioc_flags & 4) ? "on" : "off",
+ (data.ioc_flags & 8) ? "on" : "off",
data.ioc_wait);
}
int xchange_nids = 0;
int irq_affinity = 0;
int share = 0;
+ int eager = 0;
int rc;
if (argc < 4 || argc > 5) {
- fprintf (stderr, "usage: %s nid ipaddr port [ixs]\n", argv[0]);
+ fprintf (stderr, "usage: %s nid ipaddr port [ixse]\n", argv[0]);
return 0;
}
return -1;
}
- port = atol (argv[3]);
-
+ if (ptl_parse_port (&port, argv[3]) != 0) {
+ fprintf (stderr, "Can't parse port: %s\n", argv[3]);
+ return -1;
+ }
+
if (argc > 4) {
char *opts = argv[4];
case 's':
share = 1;
break;
+ case 'e':
+ eager = 1;
+ break;
default:
fprintf (stderr, "Can't parse options: %s\n",
argv[4]);
data.ioc_misc = port;
/* only passing one buffer size! */
data.ioc_size = MAX (g_socket_rxmem, g_socket_txmem);
- data.ioc_flags = (g_socket_nonagle ? 1 : 0) |
- (xchange_nids ? 2 : 0) |
- (irq_affinity ? 4 : 0) |
- (share ? 8 : 0);
+ data.ioc_flags = (g_socket_nonagle ? 0x01 : 0) |
+ (xchange_nids ? 0x02 : 0) |
+ (irq_affinity ? 0x04 : 0) |
+ (share ? 0x08 : 0) |
+ (eager ? 0x10 : 0);
rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
if (rc != 0) {
if (rc != 0)
break;
- printf (LPD64"@%s:%d\n",
+ printf (LPX64"@%s:%d\n",
data.ioc_nid,
ptl_ipaddr_2_str (data.ioc_id, buffer),
data.ioc_misc);
return -1;
}
- port = atol(argv[2]);
+ if (ptl_parse_port (&port, argv[2]) != 0) {
+ fprintf (stderr, "Can't parse port: %s\n", argv[2]);
+ return -1;
+ }
+
if (argc > 3)
for (flag = argv[3]; *flag != 0; flag++)
switch (*flag)
return 0;
}
- if (!g_nal_is_compatible (argv[0], SOCKNAL, TOENAL, 0))
- return -1;
+ if (!g_nal_is_compatible (NULL, SOCKNAL, TOENAL, 0))
+ return 0;
if (argc >= 2 &&
ptl_parse_nid (&nid, argv[1]) != 0) {
return 0;
}
- if (g_nal == 0) {
- fprintf(stderr, "Error: you must run the 'network' command "
- "first.\n");
+ if (!g_nal_is_set())
return -1;
- }
if (ptl_parse_nid (&nid, argv[1]) != 0)
{
return 0;
}
- if (g_nal == 0) {
- fprintf(stderr, "Error: you must run the 'network' command first\n");
+ if (!g_nal_is_set())
return -1;
- }
-
+
PORTAL_IOC_INIT (data);
data.ioc_nal = g_nal;
rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data);
return 0;
}
- if (g_nal == 0) {
- fprintf(stderr, "Error: you must run the 'network' command "
- "first.\n");
+ if (!g_nal_is_set())
return -1;
- }
if (argc >= 2)
nidstr = argv[1];
return (0);
}
- if (g_nal == 0) {
- fprintf(stderr, "Error: you must run the 'network' command "
- "first.\n");
+ if (!g_nal_is_set())
return (-1);
- }
if (!strcmp (argv[1], "_all_"))
nid = PTL_NID_ANY;
if (Parser_bool (&enable, argv[1]) != 0)
{
fprintf (stderr, "Can't parse boolean %s\n", argv[1]);
- return (0);
+ return (-1);
}
g_socket_nonagle = !enable;
}
return (0);
}
- if (g_nal == 0) {
- fprintf(stderr, "Error: you must run the 'network' command "
- "first.\n");
+ if (!g_nal_is_set())
return (-1);
- }
if (ptl_parse_nid (&gateway_nid, argv[1]) != 0)
{
{
struct portal_ioctl_data data;
ptl_nid_t nid;
+ ptl_nid_t nid1 = PTL_NID_ANY;
+ ptl_nid_t nid2 = PTL_NID_ANY;
int rc;
if (argc < 2)
return (0);
}
+ if (!g_nal_is_set())
+ return (-1);
+
if (ptl_parse_nid (&nid, argv[1]) != 0)
{
- fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]);
+ fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]);
return (-1);
}
+ if (argc >= 3 &&
+ ptl_parse_nid (&nid1, argv[2]) != 0)
+ {
+ fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[2]);
+ return (-1);
+ }
+
+ if (argc < 4) {
+ nid2 = nid1;
+ } else {
+ if (ptl_parse_nid (&nid2, argv[3]) != 0) {
+ fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[3]);
+ return (-1);
+ }
+
+ if (nid1 > nid2) {
+ ptl_nid_t tmp = nid1;
+
+ nid1 = nid2;
+ nid2 = tmp;
+ }
+ }
+
PORTAL_IOC_INIT(data);
+ data.ioc_nal = g_nal;
data.ioc_nid = nid;
+ data.ioc_nid2 = nid1;
+ data.ioc_nid3 = nid2;
rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_DEL_ROUTE, &data);
if (rc != 0)
}
int
+jt_ptl_notify_router (int argc, char **argv)
+{
+ struct portal_ioctl_data data;
+ int enable;
+ ptl_nid_t nid;
+ int rc;
+ struct timeval now;
+ time_t when;
+
+ if (argc < 3)
+ {
+ fprintf (stderr, "usage: %s targetNID <up/down> [<time>]\n",
+ argv[0]);
+ return (0);
+ }
+
+ if (ptl_parse_nid (&nid, argv[1]) != 0)
+ {
+ fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]);
+ return (-1);
+ }
+
+ if (Parser_bool (&enable, argv[2]) != 0) {
+ fprintf (stderr, "Can't parse boolean %s\n", argv[2]);
+ return (-1);
+ }
+
+ gettimeofday(&now, NULL);
+
+ if (argc < 4) {
+ when = now.tv_sec;
+ } else if (ptl_parse_time (&when, argv[3]) != 0) {
+ fprintf(stderr, "Can't parse time %s\n"
+ "Please specify either 'YYYY-MM-DD HH:MM:SS'\n"
+ "or an absolute unix time in seconds\n", argv[3]);
+ return (-1);
+ } else if (when > now.tv_sec) {
+ fprintf (stderr, "%s specifies a time in the future\n",
+ argv[3]);
+ return (-1);
+ }
+
+ PORTAL_IOC_INIT(data);
+ data.ioc_nal = g_nal;
+ data.ioc_nid = nid;
+ data.ioc_flags = enable;
+ /* Yeuch; 'cept I need a __u64 on 64 bit machines... */
+ data.ioc_nid3 = (__u64)when;
+
+ rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NOTIFY_ROUTER, &data);
+ if (rc != 0)
+ {
+ fprintf (stderr, "IOC_PORTAL_NOTIFY_ROUTER ("LPX64") failed: %s\n",
+ nid, strerror (errno));
+ return (-1);
+ }
+
+ return (0);
+}
+
+int
jt_ptl_print_routes (int argc, char **argv)
{
char buffer[3][128];
ptl_nid_t gateway_nid;
ptl_nid_t nid1;
ptl_nid_t nid2;
-
-
+ int alive;
+
for (index = 0;;index++)
{
PORTAL_IOC_INIT(data);
gateway_nid = data.ioc_nid;
nid1 = data.ioc_nid2;
nid2 = data.ioc_nid3;
-
- printf ("%8s %18s : %s - %s\n",
+ alive = data.ioc_flags;
+
+ printf ("%8s %18s : %s - %s, %s\n",
nal2name (gateway_nal),
ptl_nid2str (buffer[0], gateway_nid),
ptl_nid2str (buffer[1], nid1),
- ptl_nid2str (buffer[2], nid2));
+ ptl_nid2str (buffer[2], nid2),
+ alive ? "up" : "down");
}
return (0);
}
+static int
+lwt_control(int enable, int clear)
+{
+ struct portal_ioctl_data data;
+ int rc;
+
+ PORTAL_IOC_INIT(data);
+ data.ioc_flags = enable;
+ data.ioc_misc = clear;
+
+ rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_CONTROL, &data);
+ if (rc == 0)
+ return (0);
+
+ fprintf(stderr, "IOC_PORTAL_LWT_CONTROL failed: %s\n",
+ strerror(errno));
+ return (-1);
+}
+
+static int
+lwt_snapshot(int *ncpu, int *totalsize, lwt_event_t *events, int size)
+{
+ struct portal_ioctl_data data;
+ int rc;
+
+ PORTAL_IOC_INIT(data);
+ data.ioc_pbuf1 = (char *)events;
+ data.ioc_plen1 = size;
+
+ rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_SNAPSHOT, &data);
+ if (rc != 0) {
+ fprintf(stderr, "IOC_PORTAL_LWT_SNAPSHOT failed: %s\n",
+ strerror(errno));
+ return (-1);
+ }
+
+ LASSERT (data.ioc_count != 0);
+ LASSERT (data.ioc_misc != 0);
+
+ if (ncpu != NULL)
+ *ncpu = data.ioc_count;
+
+ if (totalsize != NULL)
+ *totalsize = data.ioc_misc;
+
+ return (0);
+}
+
+static char *
+lwt_get_string(char *kstr)
+{
+ char *ustr;
+ struct portal_ioctl_data data;
+ int size;
+ int rc;
+
+ /* FIXME: this could maintain a symbol table since we expect to be
+ * looking up the same strings all the time... */
+
+ PORTAL_IOC_INIT(data);
+ data.ioc_pbuf1 = kstr;
+ data.ioc_plen1 = 1; /* non-zero just to fool portal_ioctl_is_invalid() */
+ data.ioc_pbuf2 = NULL;
+ data.ioc_plen2 = 0;
+
+ rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_LOOKUP_STRING, &data);
+ if (rc != 0) {
+ fprintf(stderr, "IOC_PORTAL_LWT_LOOKUP_STRING failed: %s\n",
+ strerror(errno));
+ return (NULL);
+ }
+
+ size = data.ioc_count;
+ ustr = (char *)malloc(size);
+ if (ustr == NULL) {
+ fprintf(stderr, "Can't allocate string storage of size %d\n",
+ size);
+ return (NULL);
+ }
+
+ PORTAL_IOC_INIT(data);
+ data.ioc_pbuf1 = kstr;
+ data.ioc_plen1 = 1; /* non-zero just to fool portal_ioctl_is_invalid() */
+ data.ioc_pbuf2 = ustr;
+ data.ioc_plen2 = size;
+
+ rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_LOOKUP_STRING, &data);
+ if (rc != 0) {
+ fprintf(stderr, "IOC_PORTAL_LWT_LOOKUP_STRING failed: %s\n",
+ strerror(errno));
+ return (NULL);
+ }
+
+ LASSERT(strlen(ustr) == size - 1);
+ return (ustr);
+}
+
+static void
+lwt_put_string(char *ustr)
+{
+ free(ustr);
+}
+
+static int
+lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t *e)
+{
+ char whenstr[32];
+ char *where = lwt_get_string(e->lwte_where);
+
+ if (where == NULL)
+ return (-1);
+
+ sprintf(whenstr, LPD64, e->lwte_when - t0);
+
+ fprintf(f, "%#010lx %#010lx %#010lx %#010lx: %#010lx %1d %10.6f %10.2f %s\n",
+ e->lwte_p1, e->lwte_p2, e->lwte_p3, e->lwte_p4,
+ (long)e->lwte_task, cpu, (e->lwte_when - t0) / (mhz * 1000000.0),
+ (t0 == e->lwte_when) ? 0.0 : (e->lwte_when - tlast) / mhz,
+ where);
+
+ lwt_put_string(where);
+
+ return (0);
+}
+
+double
+get_cycles_per_usec ()
+{
+ FILE *f = fopen ("/proc/cpuinfo", "r");
+ double mhz;
+ char line[64];
+
+ if (f != NULL) {
+ while (fgets (line, sizeof (line), f) != NULL)
+ if (sscanf (line, "cpu MHz : %lf", &mhz) == 1) {
+ fclose (f);
+ return (mhz);
+ }
+ fclose (f);
+ }
+
+ fprintf (stderr, "Can't read/parse /proc/cpuinfo\n");
+ return (1000.0);
+}
+
+int
+jt_ptl_lwt(int argc, char **argv)
+{
+#define MAX_CPUS 8
+ int ncpus;
+ int totalspace;
+ int nevents_per_cpu;
+ lwt_event_t *events;
+ lwt_event_t *cpu_event[MAX_CPUS + 1];
+ lwt_event_t *next_event[MAX_CPUS];
+ lwt_event_t *first_event[MAX_CPUS];
+ int cpu;
+ lwt_event_t *e;
+ int rc;
+ int i;
+ double mhz;
+ cycles_t t0;
+ cycles_t tlast;
+ FILE *f = stdout;
+
+ if (argc < 2 ||
+ (strcmp(argv[1], "start") &&
+ strcmp(argv[1], "stop"))) {
+ fprintf(stderr,
+ "usage: %s start\n"
+ " %s stop [fname]\n", argv[0], argv[0]);
+ return (-1);
+ }
+
+ if (!strcmp(argv[1], "start")) {
+ /* disable */
+ if (lwt_control(0, 0) != 0)
+ return (-1);
+
+ /* clear */
+ if (lwt_control(0, 1) != 0)
+ return (-1);
+
+ /* enable */
+ if (lwt_control(1, 0) != 0)
+ return (-1);
+
+ return (0);
+ }
+
+ if (lwt_snapshot(&ncpus, &totalspace, NULL, 0) != 0)
+ return (-1);
+
+ if (ncpus > MAX_CPUS) {
+ fprintf(stderr, "Too many cpus: %d (%d)\n", ncpus, MAX_CPUS);
+ return (-1);
+ }
+
+ events = (lwt_event_t *)malloc(totalspace);
+ if (events == NULL) {
+ fprintf(stderr, "Can't allocate %d\n", totalspace);
+ return (-1);
+ }
+
+ if (lwt_control(0, 0) != 0) { /* disable */
+ free(events);
+ return (-1);
+ }
+
+ if (lwt_snapshot(NULL, NULL, events, totalspace)) {
+ free(events);
+ return (-1);
+ }
+
+ if (argc > 2) {
+ f = fopen (argv[2], "w");
+ if (f == NULL) {
+ fprintf(stderr, "Can't open %s for writing: %s\n", argv[2], strerror (errno));
+ free(events);
+ return (-1);
+ }
+ }
+
+ mhz = get_cycles_per_usec();
+
+ /* carve events into per-cpu slices */
+ nevents_per_cpu = totalspace / (ncpus * sizeof(lwt_event_t));
+ for (cpu = 0; cpu <= ncpus; cpu++)
+ cpu_event[cpu] = &events[cpu * nevents_per_cpu];
+
+ /* find the earliest event on each cpu */
+ for (cpu = 0; cpu < ncpus; cpu++) {
+ first_event[cpu] = NULL;
+
+ for (e = cpu_event[cpu]; e < cpu_event[cpu + 1]; e++) {
+
+ if (e->lwte_where == NULL) /* not an event */
+ continue;
+
+ if (first_event[cpu] == NULL ||
+ first_event[cpu]->lwte_when > e->lwte_when)
+ first_event[cpu] = e;
+ }
+
+ next_event[cpu] = first_event[cpu];
+ }
+
+ t0 = tlast = 0;
+ for (cpu = 0; cpu < ncpus; cpu++) {
+ e = first_event[cpu];
+ if (e == NULL) /* no events this cpu */
+ continue;
+
+ if (e == cpu_event[cpu])
+ e = cpu_event[cpu + 1] - 1;
+ else
+ e = e - 1;
+
+ /* If there's an event immediately before the first one, this
+ * cpu wrapped its event buffer */
+ if (e->lwte_where == NULL)
+ continue;
+
+ /* We should only start outputting events from the most recent
+ * first event in any wrapped cpu. Events before this time on
+ * other cpus won't have any events from this CPU to interleave
+ * with. */
+ if (t0 < first_event[cpu]->lwte_when)
+ t0 = first_event[cpu]->lwte_when;
+ }
+
+ for (;;) {
+ /* find which cpu has the next event */
+ cpu = -1;
+ for (i = 0; i < ncpus; i++) {
+
+ if (next_event[i] == NULL) /* this cpu exhausted */
+ continue;
+
+ if (cpu < 0 ||
+ next_event[i]->lwte_when < next_event[cpu]->lwte_when)
+ cpu = i;
+ }
+
+ if (cpu < 0) /* all cpus exhausted */
+ break;
+
+ if (t0 == 0) {
+ /* no wrapped cpus and this is the first ever event */
+ t0 = next_event[cpu]->lwte_when;
+ }
+
+ if (t0 <= next_event[cpu]->lwte_when) {
+ /* on or after the first event */
+ rc = lwt_print(f, t0, tlast, mhz, cpu, next_event[cpu]);
+ if (rc != 0)
+ break;
+ }
+
+ tlast = next_event[cpu]->lwte_when;
+
+ next_event[cpu]++;
+ if (next_event[cpu] == cpu_event[cpu + 1])
+ next_event[cpu] = cpu_event[cpu];
+
+ if (next_event[cpu]->lwte_where == NULL ||
+ next_event[cpu] == first_event[cpu])
+ next_event[cpu] = NULL;
+ }
+
+ if (f != stdout)
+ fclose(f);
+
+ free(events);
+ return (0);
+#undef MAX_CPUS
+}
command_t list[] = {
{"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"},
{"print_autoconns", jt_ptl_print_autoconnects, 0, "print autoconnect entries (no args)"},
- {"add_autoconn", jt_ptl_add_autoconnect, 0, "add autoconnect entry (args: nid host [ixs])"},
+ {"add_autoconn", jt_ptl_add_autoconnect, 0, "add autoconnect entry (args: nid host [ixse])"},
{"del_autoconn", jt_ptl_del_autoconnect, 0, "delete autoconnect entry (args: [nid] [host] [ks])"},
{"print_conns", jt_ptl_print_connections, 0, "print connections (no args)"},
{"connect", jt_ptl_connect, 0, "connect to a remote nid (args: host port [xi])"},
{"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"},
{"shownid", jt_ptl_shownid, 0, "print the local NID"},
{"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"},
- {"add_route", jt_ptl_add_route, 0, "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"},
- {"del_route", jt_ptl_del_route, 0, "delete an entry from the routing table (args: targetNID"},
+ {"add_route", jt_ptl_add_route, 0,
+ "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"},
+ {"del_route", jt_ptl_del_route, 0,
+ "delete all routes via a gateway from the routing table (args: gatewayNID)"},
+ {"set_route", jt_ptl_notify_router, 0,
+ "enable/disable a route in the routing table (args: gatewayNID up/down [time])"},
{"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"},
{"recv_mem", jt_ptl_rxmem, 0, "Set socket receive buffer size (args: [size])"},
{"send_mem", jt_ptl_txmem, 0, "Set socket send buffer size (args: [size])"},
obj-y += ost/
obj-y += lov/
obj-y += llite/
+obj-y += lvfs/
# portals needs to be before utils/, which pulls in ptlctl objects
obj-m += utils/
--- /dev/null
+#
+# Automatically generated by make menuconfig: don't edit
+#
+CONFIG_X86=y
+# CONFIG_SBUS is not set
+CONFIG_UID16=y
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODVERSIONS=y
+CONFIG_KMOD=y
+
+#
+# Processor type and features
+#
+CONFIG_LOLAT=y
+# CONFIG_M386 is not set
+# CONFIG_M486 is not set
+# CONFIG_M586 is not set
+# CONFIG_M586TSC is not set
+# CONFIG_M586MMX is not set
+CONFIG_M686=y
+# CONFIG_MPENTIUMIII is not set
+# CONFIG_MPENTIUM4 is not set
+# CONFIG_MK6 is not set
+# CONFIG_MK7 is not set
+# CONFIG_MELAN is not set
+# CONFIG_MCRUSOE is not set
+# CONFIG_MWINCHIPC6 is not set
+# CONFIG_MWINCHIP2 is not set
+# CONFIG_MWINCHIP3D is not set
+# CONFIG_MCYRIXIII is not set
+CONFIG_X86_WP_WORKS_OK=y
+CONFIG_X86_INVLPG=y
+CONFIG_X86_CMPXCHG=y
+CONFIG_X86_XADD=y
+CONFIG_X86_BSWAP=y
+CONFIG_X86_POPAD_OK=y
+# CONFIG_RWSEM_GENERIC_SPINLOCK is not set
+CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_HAS_TSC=y
+CONFIG_X86_GOOD_APIC=y
+CONFIG_X86_PGE=y
+CONFIG_X86_USE_PPRO_CHECKSUM=y
+CONFIG_X86_PPRO_FENCE=y
+CONFIG_X86_F00F_WORKS_OK=y
+CONFIG_X86_MCE=y
+
+#
+# CPU Frequency scaling
+#
+# CONFIG_CPU_FREQ is not set
+CONFIG_TOSHIBA=m
+CONFIG_I8K=m
+CONFIG_MICROCODE=m
+CONFIG_X86_MSR=m
+CONFIG_X86_CPUID=m
+# CONFIG_E820_PROC is not set
+CONFIG_EDD=m
+# CONFIG_NOHIGHMEM is not set
+CONFIG_HIGHMEM4G=y
+# CONFIG_HIGHMEM64G is not set
+CONFIG_HIGHMEM=y
+CONFIG_HIGHPTE=y
+CONFIG_HIGHIO=y
+# CONFIG_MATH_EMULATION is not set
+CONFIG_MTRR=y
+CONFIG_SMP=y
+# CONFIG_X86_NUMA is not set
+# CONFIG_X86_TSC_DISABLE is not set
+CONFIG_X86_TSC=y
+CONFIG_HAVE_DEC_LOCK=y
+
+#
+# General setup
+#
+CONFIG_NET=y
+CONFIG_X86_IO_APIC=y
+CONFIG_X86_LOCAL_APIC=y
+CONFIG_PCI=y
+# CONFIG_PCI_GOBIOS is not set
+# CONFIG_PCI_GODIRECT is not set
+CONFIG_PCI_GOANY=y
+CONFIG_PCI_BIOS=y
+CONFIG_PCI_DIRECT=y
+CONFIG_ISA=y
+CONFIG_PCI_NAMES=y
+CONFIG_EISA=y
+# CONFIG_MCA is not set
+CONFIG_HOTPLUG=y
+
+#
+# PCMCIA/CardBus support
+#
+CONFIG_PCMCIA=m
+CONFIG_CARDBUS=y
+CONFIG_TCIC=y
+CONFIG_I82092=y
+CONFIG_I82365=y
+
+#
+# PCI Hotplug Support
+#
+CONFIG_HOTPLUG_PCI=y
+# CONFIG_HOTPLUG_PCI_ACPI is not set
+CONFIG_HOTPLUG_PCI_COMPAQ=m
+# CONFIG_HOTPLUG_PCI_COMPAQ_NVRAM is not set
+CONFIG_HOTPLUG_PCI_IBM=m
+# CONFIG_HOTPLUG_PCI_H2999 is not set
+CONFIG_SYSVIPC=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_SYSCTL=y
+CONFIG_KCORE_ELF=y
+# CONFIG_KCORE_AOUT is not set
+CONFIG_BINFMT_AOUT=m
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=m
+CONFIG_PM=y
+# CONFIG_ACPI is not set
+CONFIG_APM=y
+# CONFIG_APM_IGNORE_USER_SUSPEND is not set
+# CONFIG_APM_DO_ENABLE is not set
+CONFIG_APM_CPU_IDLE=y
+# CONFIG_APM_DISPLAY_BLANK is not set
+CONFIG_APM_RTC_IS_GMT=y
+# CONFIG_APM_ALLOW_INTS is not set
+# CONFIG_APM_REAL_MODE_POWER_OFF is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Parallel port support
+#
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_PARPORT_PC_CML1=m
+CONFIG_PARPORT_SERIAL=m
+# CONFIG_PARPORT_PC_FIFO is not set
+# CONFIG_PARPORT_PC_SUPERIO is not set
+CONFIG_PARPORT_PC_PCMCIA=m
+# CONFIG_PARPORT_AMIGA is not set
+# CONFIG_PARPORT_MFC3 is not set
+# CONFIG_PARPORT_ATARI is not set
+# CONFIG_PARPORT_GSC is not set
+# CONFIG_PARPORT_SUNBPP is not set
+# CONFIG_PARPORT_OTHER is not set
+CONFIG_PARPORT_1284=y
+
+#
+# Plug and Play configuration
+#
+CONFIG_PNP=y
+CONFIG_ISAPNP=y
+
+#
+# Block devices
+#
+CONFIG_BLK_DEV_FD=y
+CONFIG_BLK_DEV_XD=m
+CONFIG_PARIDE=m
+CONFIG_PARIDE_PARPORT=m
+CONFIG_PARIDE_PD=m
+CONFIG_PARIDE_PCD=m
+CONFIG_PARIDE_PF=m
+CONFIG_PARIDE_PT=m
+CONFIG_PARIDE_PG=m
+CONFIG_PARIDE_ATEN=m
+CONFIG_PARIDE_BPCK=m
+CONFIG_PARIDE_BPCK6=m
+CONFIG_PARIDE_COMM=m
+CONFIG_PARIDE_DSTR=m
+CONFIG_PARIDE_FIT2=m
+CONFIG_PARIDE_FIT3=m
+CONFIG_PARIDE_EPAT=m
+CONFIG_PARIDE_EPATC8=y
+CONFIG_PARIDE_EPIA=m
+CONFIG_PARIDE_FRIQ=m
+CONFIG_PARIDE_FRPW=m
+CONFIG_PARIDE_KBIC=m
+CONFIG_PARIDE_KTTI=m
+CONFIG_PARIDE_ON20=m
+CONFIG_PARIDE_ON26=m
+CONFIG_BLK_CPQ_DA=m
+CONFIG_BLK_CPQ_CISS_DA=m
+CONFIG_CISS_SCSI_TAPE=y
+CONFIG_BLK_DEV_DAC960=m
+CONFIG_BLK_DEV_UMEM=m
+CONFIG_BLK_DEV_LOOP=m
+CONFIG_BLK_DEV_NBD=m
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=4096
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_BLK_STATS=y
+
+#
+# Multi-device support (RAID and LVM)
+#
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_LINEAR=m
+CONFIG_MD_RAID0=m
+CONFIG_MD_RAID1=m
+CONFIG_MD_RAID5=m
+CONFIG_MD_MULTIPATH=m
+CONFIG_BLK_DEV_LVM=m
+
+#
+# Cryptography support (CryptoAPI)
+#
+CONFIG_CRYPTO=m
+CONFIG_CIPHERS=m
+CONFIG_CIPHER_AES=m
+CONFIG_CIPHER_IDENTITY=m
+CONFIG_CRYPTODEV=m
+CONFIG_CRYPTOLOOP=m
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_NETLINK_DEV=y
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_DEBUG is not set
+CONFIG_FILTER=y
+CONFIG_UNIX=y
+CONFIG_INET=y
+CONFIG_TUX=m
+CONFIG_TUX_EXTCGI=y
+# CONFIG_TUX_EXTENDED_LOG is not set
+# CONFIG_TUX_DEBUG is not set
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_FWMARK=y
+CONFIG_IP_ROUTE_NAT=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_TOS=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_ROUTE_LARGE_TABLES=y
+# CONFIG_IP_PNP is not set
+CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE=m
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+# CONFIG_ARPD is not set
+# CONFIG_INET_ECN is not set
+CONFIG_SYN_COOKIES=y
+
+#
+# IP: Netfilter Configuration
+#
+CONFIG_IP_NF_CONNTRACK=m
+CONFIG_IP_NF_FTP=m
+CONFIG_IP_NF_AMANDA=m
+CONFIG_IP_NF_TFTP=m
+CONFIG_IP_NF_IRC=m
+CONFIG_IP_NF_QUEUE=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_LIMIT=m
+CONFIG_IP_NF_MATCH_MAC=m
+CONFIG_IP_NF_MATCH_PKTTYPE=m
+CONFIG_IP_NF_MATCH_MARK=m
+CONFIG_IP_NF_MATCH_MULTIPORT=m
+CONFIG_IP_NF_MATCH_TOS=m
+CONFIG_IP_NF_MATCH_ECN=m
+CONFIG_IP_NF_MATCH_DSCP=m
+CONFIG_IP_NF_MATCH_AH_ESP=m
+CONFIG_IP_NF_MATCH_LENGTH=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_MATCH_TCPMSS=m
+CONFIG_IP_NF_MATCH_HELPER=m
+CONFIG_IP_NF_MATCH_STATE=m
+CONFIG_IP_NF_MATCH_CONNTRACK=m
+CONFIG_IP_NF_MATCH_UNCLEAN=m
+CONFIG_IP_NF_MATCH_OWNER=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_TARGET_MIRROR=m
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_NF_NAT_AMANDA=m
+CONFIG_IP_NF_NAT_LOCAL=y
+CONFIG_IP_NF_NAT_SNMP_BASIC=m
+CONFIG_IP_NF_NAT_IRC=m
+CONFIG_IP_NF_NAT_FTP=m
+CONFIG_IP_NF_NAT_TFTP=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_IP_NF_TARGET_TOS=m
+CONFIG_IP_NF_TARGET_ECN=m
+CONFIG_IP_NF_TARGET_DSCP=m
+CONFIG_IP_NF_TARGET_MARK=m
+CONFIG_IP_NF_TARGET_LOG=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_IP_NF_TARGET_TCPMSS=m
+CONFIG_IP_NF_ARPTABLES=m
+CONFIG_IP_NF_ARPFILTER=m
+CONFIG_IP_NF_COMPAT_IPCHAINS=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_COMPAT_IPFWADM=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IPV6=m
+
+#
+# IPv6: Netfilter Configuration
+#
+# CONFIG_IP6_NF_QUEUE is not set
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_MATCH_LIMIT=m
+CONFIG_IP6_NF_MATCH_MAC=m
+CONFIG_IP6_NF_MATCH_RT=m
+CONFIG_IP6_NF_MATCH_OPTS=m
+CONFIG_IP6_NF_MATCH_FRAG=m
+CONFIG_IP6_NF_MATCH_HL=m
+CONFIG_IP6_NF_MATCH_MULTIPORT=m
+CONFIG_IP6_NF_MATCH_OWNER=m
+CONFIG_IP6_NF_MATCH_MARK=m
+CONFIG_IP6_NF_MATCH_IPV6HEADER=m
+CONFIG_IP6_NF_MATCH_AHESP=m
+CONFIG_IP6_NF_MATCH_LENGTH=m
+CONFIG_IP6_NF_MATCH_EUI64=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_LOG=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_TARGET_MARK=m
+# CONFIG_KHTTPD is not set
+CONFIG_ATM=y
+CONFIG_ATM_CLIP=y
+# CONFIG_ATM_CLIP_NO_ICMP is not set
+CONFIG_ATM_LANE=m
+CONFIG_ATM_MPOA=m
+CONFIG_ATM_BR2684=m
+CONFIG_ATM_BR2684_IPFILTER=y
+CONFIG_VLAN_8021Q=m
+CONFIG_IPX=m
+# CONFIG_IPX_INTERN is not set
+CONFIG_ATALK=m
+
+#
+# Appletalk devices
+#
+CONFIG_DEV_APPLETALK=y
+CONFIG_LTPC=m
+CONFIG_COPS=m
+CONFIG_COPS_DAYNA=y
+CONFIG_COPS_TANGENT=y
+CONFIG_IPDDP=m
+CONFIG_IPDDP_ENCAP=y
+CONFIG_IPDDP_DECAP=y
+CONFIG_DECNET=m
+CONFIG_DECNET_SIOCGIFCONF=y
+CONFIG_DECNET_ROUTER=y
+CONFIG_DECNET_ROUTE_FWMARK=y
+CONFIG_BRIDGE=m
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_LLC is not set
+CONFIG_NET_DIVERT=y
+# CONFIG_ECONET is not set
+CONFIG_WAN_ROUTER=m
+# CONFIG_NET_FASTROUTE is not set
+# CONFIG_NET_HW_FLOWCONTROL is not set
+
+#
+# QoS and/or fair queueing
+#
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_CSZ=m
+# CONFIG_NET_SCH_ATM is not set
+CONFIG_NET_SCH_PRIO=m
+CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_SCH_TEQL=m
+CONFIG_NET_SCH_TBF=m
+CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_DSMARK=m
+CONFIG_NET_SCH_INGRESS=m
+CONFIG_NET_QOS=y
+CONFIG_NET_ESTIMATOR=y
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_TCINDEX=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_ROUTE=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_RSVP6=m
+CONFIG_NET_CLS_POLICE=y
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+
+#
+# Telephony Support
+#
+CONFIG_PHONE=m
+CONFIG_PHONE_IXJ=m
+CONFIG_PHONE_IXJ_PCMCIA=m
+
+#
+# ATA/IDE/MFM/RLL support
+#
+CONFIG_IDE=y
+
+#
+# IDE, ATA and ATAPI Block devices
+#
+CONFIG_BLK_DEV_IDE=y
+# CONFIG_BLK_DEV_HD_IDE is not set
+# CONFIG_BLK_DEV_HD is not set
+CONFIG_BLK_DEV_IDEDISK=y
+CONFIG_IDEDISK_MULTI_MODE=y
+# CONFIG_IDEDISK_STROKE is not set
+CONFIG_BLK_DEV_IDECS=m
+CONFIG_BLK_DEV_IDECD=m
+CONFIG_BLK_DEV_IDETAPE=m
+CONFIG_BLK_DEV_IDEFLOPPY=y
+CONFIG_BLK_DEV_IDESCSI=m
+# CONFIG_IDE_TASK_IOCTL is not set
+CONFIG_BLK_DEV_CMD640=y
+# CONFIG_BLK_DEV_CMD640_ENHANCED is not set
+CONFIG_BLK_DEV_ISAPNP=y
+CONFIG_BLK_DEV_IDEPCI=y
+CONFIG_BLK_DEV_GENERIC=y
+CONFIG_IDEPCI_SHARE_IRQ=y
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+# CONFIG_BLK_DEV_OFFBOARD is not set
+# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
+CONFIG_IDEDMA_PCI_AUTO=y
+# CONFIG_IDEDMA_ONLYDISK is not set
+CONFIG_BLK_DEV_IDEDMA=y
+# CONFIG_IDEDMA_PCI_WIP is not set
+CONFIG_BLK_DEV_ADMA100=y
+CONFIG_BLK_DEV_AEC62XX=y
+CONFIG_BLK_DEV_ALI15X3=y
+# CONFIG_WDC_ALI15X3 is not set
+CONFIG_BLK_DEV_AMD74XX=y
+# CONFIG_AMD74XX_OVERRIDE is not set
+CONFIG_BLK_DEV_CMD64X=y
+CONFIG_BLK_DEV_TRIFLEX=y
+CONFIG_BLK_DEV_CY82C693=y
+CONFIG_BLK_DEV_CS5530=y
+CONFIG_BLK_DEV_HPT34X=y
+# CONFIG_HPT34X_AUTODMA is not set
+CONFIG_BLK_DEV_HPT366=y
+CONFIG_BLK_DEV_PIIX=y
+# CONFIG_BLK_DEV_NS87415 is not set
+# CONFIG_BLK_DEV_OPTI621 is not set
+CONFIG_BLK_DEV_PDC202XX_OLD=y
+# CONFIG_PDC202XX_BURST is not set
+CONFIG_BLK_DEV_PDC202XX_NEW=y
+CONFIG_PDC202XX_FORCE=y
+CONFIG_BLK_DEV_RZ1000=y
+# CONFIG_BLK_DEV_SC1200 is not set
+CONFIG_BLK_DEV_SVWKS=y
+CONFIG_BLK_DEV_SIIMAGE=y
+CONFIG_BLK_DEV_SIS5513=y
+CONFIG_BLK_DEV_SLC90E66=y
+# CONFIG_BLK_DEV_TRM290 is not set
+CONFIG_BLK_DEV_VIA82CXXX=y
+# CONFIG_IDE_CHIPSETS is not set
+CONFIG_IDEDMA_AUTO=y
+# CONFIG_IDEDMA_IVB is not set
+# CONFIG_DMA_NONPCI is not set
+CONFIG_BLK_DEV_PDC202XX=y
+CONFIG_BLK_DEV_IDE_MODES=y
+CONFIG_BLK_DEV_ATARAID=m
+CONFIG_BLK_DEV_ATARAID_PDC=m
+CONFIG_BLK_DEV_ATARAID_HPT=m
+CONFIG_BLK_DEV_ATARAID_SII=m
+
+#
+# SCSI support
+#
+CONFIG_SCSI=m
+CONFIG_BLK_DEV_SD=m
+CONFIG_SD_EXTRA_DEVS=40
+CONFIG_CHR_DEV_ST=m
+CONFIG_CHR_DEV_OSST=m
+CONFIG_BLK_DEV_SR=m
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_SR_EXTRA_DEVS=4
+CONFIG_CHR_DEV_SG=m
+# CONFIG_SCSI_DEBUG_QUEUES is not set
+# CONFIG_SCSI_MULTI_LUN is not set
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_LOGGING=y
+
+#
+# SCSI low-level drivers
+#
+CONFIG_BLK_DEV_3W_XXXX_RAID=m
+CONFIG_SCSI_7000FASST=m
+CONFIG_SCSI_ACARD=m
+CONFIG_SCSI_AHA152X=m
+CONFIG_SCSI_AHA1542=m
+CONFIG_SCSI_AHA1740=m
+CONFIG_SCSI_AACRAID=m
+CONFIG_SCSI_AIC7XXX=m
+CONFIG_AIC7XXX_CMDS_PER_DEVICE=253
+CONFIG_AIC7XXX_RESET_DELAY_MS=15000
+# CONFIG_AIC7XXX_PROBE_EISA_VL is not set
+# CONFIG_AIC7XXX_BUILD_FIRMWARE is not set
+CONFIG_SCSI_AIC79XX=m
+CONFIG_AIC79XX_CMDS_PER_DEVICE=253
+CONFIG_AIC79XX_RESET_DELAY_MS=15000
+# CONFIG_AIC79XX_BUILD_FIRMWARE is not set
+CONFIG_AIC79XX_ENABLE_RD_STRM=y
+# CONFIG_AIC79XX_DEBUG_ENABLE is not set
+CONFIG_AIC79XX_DEBUG_MASK=0
+CONFIG_SCSI_AIC7XXX_OLD=m
+CONFIG_AIC7XXX_OLD_TCQ_ON_BY_DEFAULT=y
+CONFIG_AIC7XXX_OLD_CMDS_PER_DEVICE=32
+CONFIG_AIC7XXX_OLD_PROC_STATS=y
+CONFIG_SCSI_DPT_I2O=m
+CONFIG_SCSI_ADVANSYS=m
+CONFIG_SCSI_IN2000=m
+CONFIG_SCSI_AM53C974=m
+CONFIG_SCSI_MEGARAID=m
+CONFIG_SCSI_BUSLOGIC=m
+# CONFIG_SCSI_OMIT_FLASHPOINT is not set
+CONFIG_SCSI_CPQFCTS=m
+CONFIG_SCSI_DMX3191D=m
+CONFIG_SCSI_DTC3280=m
+CONFIG_SCSI_EATA=m
+CONFIG_SCSI_EATA_TAGGED_QUEUE=y
+# CONFIG_SCSI_EATA_LINKED_COMMANDS is not set
+CONFIG_SCSI_EATA_MAX_TAGS=16
+CONFIG_SCSI_EATA_DMA=m
+CONFIG_SCSI_EATA_PIO=m
+CONFIG_SCSI_FUTURE_DOMAIN=m
+CONFIG_SCSI_GDTH=m
+CONFIG_SCSI_GENERIC_NCR5380=m
+# CONFIG_SCSI_GENERIC_NCR53C400 is not set
+CONFIG_SCSI_G_NCR5380_PORT=y
+# CONFIG_SCSI_G_NCR5380_MEM is not set
+CONFIG_SCSI_IPS=m
+CONFIG_SCSI_INITIO=m
+CONFIG_SCSI_INIA100=m
+CONFIG_SCSI_PPA=m
+CONFIG_SCSI_IMM=m
+# CONFIG_SCSI_IZIP_EPP16 is not set
+# CONFIG_SCSI_IZIP_SLOW_CTR is not set
+CONFIG_SCSI_NCR53C406A=m
+CONFIG_SCSI_NCR53C7xx=m
+# CONFIG_SCSI_NCR53C7xx_sync is not set
+CONFIG_SCSI_NCR53C7xx_FAST=y
+CONFIG_SCSI_NCR53C7xx_DISCONNECT=y
+CONFIG_SCSI_SYM53C8XX_2=m
+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1
+CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
+CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
+# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set
+CONFIG_SCSI_NCR53C8XX=m
+CONFIG_SCSI_SYM53C8XX=m
+CONFIG_SCSI_NCR53C8XX_DEFAULT_TAGS=8
+CONFIG_SCSI_NCR53C8XX_MAX_TAGS=32
+CONFIG_SCSI_NCR53C8XX_SYNC=40
+# CONFIG_SCSI_NCR53C8XX_PROFILE is not set
+# CONFIG_SCSI_NCR53C8XX_IOMAPPED is not set
+# CONFIG_SCSI_NCR53C8XX_PQS_PDS is not set
+# CONFIG_SCSI_NCR53C8XX_SYMBIOS_COMPAT is not set
+CONFIG_SCSI_PAS16=m
+CONFIG_SCSI_PCI2000=m
+CONFIG_SCSI_PCI2220I=m
+CONFIG_SCSI_PSI240I=m
+CONFIG_SCSI_QLOGIC_FAS=m
+CONFIG_SCSI_QLOGIC_ISP=m
+CONFIG_SCSI_QLOGIC_FC=m
+# CONFIG_SCSI_QLOGIC_FC_FIRMWARE is not set
+CONFIG_SCSI_QLOGIC_1280=m
+CONFIG_SCSI_NEWISP=m
+CONFIG_SCSI_SEAGATE=m
+CONFIG_SCSI_SIM710=m
+CONFIG_SCSI_SYM53C416=m
+CONFIG_SCSI_DC390T=m
+# CONFIG_SCSI_DC390T_NOGENSUPP is not set
+CONFIG_SCSI_T128=m
+CONFIG_SCSI_U14_34F=m
+# CONFIG_SCSI_U14_34F_LINKED_COMMANDS is not set
+CONFIG_SCSI_U14_34F_MAX_TAGS=8
+CONFIG_SCSI_ULTRASTOR=m
+CONFIG_SCSI_NSP32=m
+CONFIG_SCSI_DEBUG=m
+
+#
+# PCMCIA SCSI adapter support
+#
+CONFIG_SCSI_PCMCIA=y
+CONFIG_PCMCIA_AHA152X=m
+CONFIG_PCMCIA_FDOMAIN=m
+CONFIG_PCMCIA_NINJA_SCSI=m
+CONFIG_PCMCIA_QLOGIC=m
+
+#
+# Fusion MPT device support
+#
+CONFIG_FUSION=m
+# CONFIG_FUSION_BOOT is not set
+CONFIG_FUSION_MAX_SGE=40
+# CONFIG_FUSION_ISENSE is not set
+CONFIG_FUSION_CTL=m
+CONFIG_FUSION_LAN=m
+CONFIG_NET_FC=y
+
+#
+# IEEE 1394 (FireWire) support (EXPERIMENTAL)
+#
+CONFIG_IEEE1394=m
+# CONFIG_IEEE1394_PCILYNX is not set
+CONFIG_IEEE1394_OHCI1394=m
+CONFIG_IEEE1394_VIDEO1394=m
+CONFIG_IEEE1394_SBP2=m
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
+CONFIG_IEEE1394_ETH1394=m
+CONFIG_IEEE1394_DV1394=m
+CONFIG_IEEE1394_RAWIO=m
+CONFIG_IEEE1394_CMP=m
+CONFIG_IEEE1394_AMDTP=m
+# CONFIG_IEEE1394_VERBOSEDEBUG is not set
+
+#
+# I2O device support
+#
+CONFIG_I2O=m
+CONFIG_I2O_PCI=m
+CONFIG_I2O_BLOCK=m
+CONFIG_I2O_LAN=m
+CONFIG_I2O_SCSI=m
+CONFIG_I2O_PROC=m
+
+#
+# Network device support
+#
+CONFIG_NETDEVICES=y
+
+#
+# ARCnet devices
+#
+# CONFIG_ARCNET is not set
+CONFIG_DUMMY=m
+CONFIG_BONDING=m
+CONFIG_EQUALIZER=m
+CONFIG_TUN=m
+CONFIG_ETHERTAP=m
+CONFIG_NET_SB1000=m
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+# CONFIG_SUNLANCE is not set
+CONFIG_HAPPYMEAL=m
+# CONFIG_SUNBMAC is not set
+# CONFIG_SUNQE is not set
+CONFIG_SUNGEM=m
+CONFIG_NET_VENDOR_3COM=y
+CONFIG_EL1=m
+CONFIG_EL2=m
+CONFIG_ELPLUS=m
+CONFIG_EL16=m
+CONFIG_EL3=m
+CONFIG_3C515=m
+# CONFIG_ELMC is not set
+# CONFIG_ELMC_II is not set
+CONFIG_VORTEX=m
+CONFIG_TYPHOON=m
+CONFIG_LANCE=m
+CONFIG_NET_VENDOR_SMC=y
+CONFIG_WD80x3=m
+# CONFIG_ULTRAMCA is not set
+CONFIG_ULTRA=m
+CONFIG_ULTRA32=m
+CONFIG_SMC9194=m
+CONFIG_NET_VENDOR_RACAL=y
+CONFIG_NI5010=m
+CONFIG_NI52=m
+CONFIG_NI65=m
+CONFIG_AT1700=m
+CONFIG_DEPCA=m
+CONFIG_HP100=m
+CONFIG_NET_ISA=y
+CONFIG_E2100=m
+# CONFIG_EWRK3 is not set
+CONFIG_EEXPRESS=m
+CONFIG_EEXPRESS_PRO=m
+CONFIG_HPLAN_PLUS=m
+CONFIG_HPLAN=m
+CONFIG_LP486E=m
+CONFIG_ETH16I=m
+CONFIG_NE2000=m
+CONFIG_NET_PCI=y
+CONFIG_PCNET32=m
+CONFIG_AMD8111_ETH=m
+CONFIG_ADAPTEC_STARFIRE=m
+CONFIG_AC3200=m
+CONFIG_APRICOT=m
+CONFIG_CS89x0=m
+CONFIG_TULIP=m
+# CONFIG_TULIP_MWI is not set
+CONFIG_TULIP_MMIO=y
+CONFIG_DE4X5=m
+CONFIG_DGRS=m
+CONFIG_DM9102=m
+CONFIG_EEPRO100=m
+# CONFIG_EEPRO100_PIO is not set
+CONFIG_E100=m
+CONFIG_LNE390=m
+CONFIG_FEALNX=m
+CONFIG_NATSEMI=m
+CONFIG_NE2K_PCI=m
+CONFIG_NE3210=m
+CONFIG_ES3210=m
+CONFIG_8139CP=m
+CONFIG_8139TOO=m
+# CONFIG_8139TOO_PIO is not set
+# CONFIG_8139TOO_TUNE_TWISTER is not set
+CONFIG_8139TOO_8129=y
+# CONFIG_8139_OLD_RX_RESET is not set
+CONFIG_SIS900=m
+CONFIG_EPIC100=m
+CONFIG_SUNDANCE=m
+# CONFIG_SUNDANCE_MMIO is not set
+CONFIG_TLAN=m
+CONFIG_TC35815=m
+CONFIG_VIA_RHINE=m
+# CONFIG_VIA_RHINE_MMIO is not set
+CONFIG_WINBOND_840=m
+CONFIG_NET_POCKET=y
+CONFIG_ATP=m
+CONFIG_DE600=m
+CONFIG_DE620=m
+
+#
+# Ethernet (1000 Mbit)
+#
+CONFIG_ACENIC=m
+# CONFIG_ACENIC_OMIT_TIGON_I is not set
+CONFIG_DL2K=m
+CONFIG_E1000=m
+# CONFIG_MYRI_SBUS is not set
+CONFIG_NS83820=m
+CONFIG_HAMACHI=m
+CONFIG_YELLOWFIN=m
+CONFIG_R8169=m
+CONFIG_SK98LIN=m
+CONFIG_TIGON3=m
+CONFIG_FDDI=y
+CONFIG_DEFXX=m
+CONFIG_SKFP=m
+CONFIG_NETCONSOLE=m
+# CONFIG_HIPPI is not set
+CONFIG_PLIP=m
+CONFIG_PPP=m
+CONFIG_PPP_MULTILINK=y
+CONFIG_PPP_FILTER=y
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+CONFIG_PPP_DEFLATE=m
+# CONFIG_PPP_BSDCOMP is not set
+CONFIG_PPPOE=m
+CONFIG_PPPOATM=m
+CONFIG_SLIP=m
+CONFIG_SLIP_COMPRESSED=y
+CONFIG_SLIP_SMART=y
+CONFIG_SLIP_MODE_SLIP6=y
+
+#
+# Wireless LAN (non-hamradio)
+#
+CONFIG_NET_RADIO=y
+CONFIG_STRIP=m
+CONFIG_WAVELAN=m
+CONFIG_ARLAN=m
+CONFIG_AIRONET4500=m
+CONFIG_AIRONET4500_NONCS=m
+CONFIG_AIRONET4500_PNP=y
+CONFIG_AIRONET4500_PCI=y
+CONFIG_AIRONET4500_ISA=y
+CONFIG_AIRONET4500_I365=y
+CONFIG_AIRONET4500_PROC=m
+CONFIG_AIRO=m
+CONFIG_HERMES=m
+CONFIG_PLX_HERMES=m
+CONFIG_PCI_HERMES=m
+CONFIG_PCMCIA_HERMES=m
+CONFIG_AIRO_CS=m
+CONFIG_NET_WIRELESS=y
+CONFIG_PCMCIA_HERMES_OLD=m
+
+#
+# Token Ring devices
+#
+CONFIG_TR=y
+CONFIG_IBMTR=m
+CONFIG_IBMOL=m
+CONFIG_IBMLS=m
+CONFIG_3C359=m
+CONFIG_TMS380TR=m
+CONFIG_TMSPCI=m
+CONFIG_TMSISA=m
+CONFIG_ABYSS=m
+# CONFIG_MADGEMC is not set
+CONFIG_SMCTR=m
+CONFIG_NET_FC=y
+CONFIG_IPHASE5526=m
+CONFIG_RCPCI=m
+CONFIG_SHAPER=m
+
+#
+# Wan interfaces
+#
+CONFIG_WAN=y
+CONFIG_HOSTESS_SV11=m
+CONFIG_COSA=m
+# CONFIG_COMX is not set
+# CONFIG_DSCC4 is not set
+# CONFIG_LANMEDIA is not set
+CONFIG_ATI_XX20=m
+CONFIG_SEALEVEL_4021=m
+# CONFIG_SYNCLINK_SYNCPPP is not set
+# CONFIG_HDLC is not set
+CONFIG_DLCI=m
+CONFIG_DLCI_COUNT=24
+CONFIG_DLCI_MAX=8
+CONFIG_SDLA=m
+CONFIG_WAN_ROUTER_DRIVERS=y
+CONFIG_VENDOR_SANGOMA=m
+CONFIG_WANPIPE_CHDLC=y
+CONFIG_WANPIPE_FR=y
+CONFIG_WANPIPE_X25=y
+CONFIG_WANPIPE_PPP=y
+CONFIG_WANPIPE_MULTPPP=y
+CONFIG_CYCLADES_SYNC=m
+CONFIG_CYCLOMX_X25=y
+# CONFIG_LAPBETHER is not set
+# CONFIG_X25_ASY is not set
+CONFIG_SBNI=m
+CONFIG_SBNI_MULTILINE=y
+
+#
+# PCMCIA network device support
+#
+CONFIG_NET_PCMCIA=y
+CONFIG_PCMCIA_3C589=m
+CONFIG_PCMCIA_3C574=m
+CONFIG_PCMCIA_FMVJ18X=m
+CONFIG_PCMCIA_PCNET=m
+CONFIG_PCMCIA_AXNET=m
+CONFIG_PCMCIA_NMCLAN=m
+CONFIG_PCMCIA_SMC91C92=m
+CONFIG_PCMCIA_XIRC2PS=m
+# CONFIG_ARCNET_COM20020_CS is not set
+CONFIG_PCMCIA_IBMTR=m
+CONFIG_PCMCIA_XIRCOM=m
+CONFIG_PCMCIA_XIRTULIP=m
+CONFIG_NET_PCMCIA_RADIO=y
+CONFIG_PCMCIA_RAYCS=m
+CONFIG_PCMCIA_NETWAVE=m
+CONFIG_PCMCIA_WAVELAN=m
+CONFIG_PCMCIA_WVLAN=m
+CONFIG_AIRONET4500_CS=m
+
+#
+# ATM drivers
+#
+CONFIG_ATM_TCP=m
+CONFIG_ATM_LANAI=m
+CONFIG_ATM_ENI=m
+# CONFIG_ATM_ENI_DEBUG is not set
+# CONFIG_ATM_ENI_TUNE_BURST is not set
+CONFIG_ATM_FIRESTREAM=m
+CONFIG_ATM_ZATM=m
+# CONFIG_ATM_ZATM_DEBUG is not set
+CONFIG_ATM_ZATM_EXACT_TS=y
+CONFIG_ATM_NICSTAR=m
+CONFIG_ATM_NICSTAR_USE_SUNI=y
+CONFIG_ATM_NICSTAR_USE_IDT77105=y
+CONFIG_ATM_IDT77252=m
+# CONFIG_ATM_IDT77252_DEBUG is not set
+# CONFIG_ATM_IDT77252_RCV_ALL is not set
+CONFIG_ATM_IDT77252_USE_SUNI=y
+CONFIG_ATM_AMBASSADOR=m
+# CONFIG_ATM_AMBASSADOR_DEBUG is not set
+CONFIG_ATM_HORIZON=m
+# CONFIG_ATM_HORIZON_DEBUG is not set
+CONFIG_ATM_IA=m
+# CONFIG_ATM_IA_DEBUG is not set
+CONFIG_ATM_FORE200E_MAYBE=m
+CONFIG_ATM_FORE200E_PCA=y
+CONFIG_ATM_FORE200E_PCA_DEFAULT_FW=y
+CONFIG_ATM_FORE200E_TX_RETRY=16
+CONFIG_ATM_FORE200E_DEBUG=0
+CONFIG_ATM_FORE200E=m
+
+#
+# Amateur Radio support
+#
+CONFIG_HAMRADIO=y
+CONFIG_AX25=m
+# CONFIG_AX25_DAMA_SLAVE is not set
+CONFIG_NETROM=m
+CONFIG_ROSE=m
+
+#
+# AX.25 network device drivers
+#
+# CONFIG_MKISS is not set
+# CONFIG_6PACK is not set
+# CONFIG_BPQETHER is not set
+# CONFIG_DMASCC is not set
+# CONFIG_SCC is not set
+# CONFIG_BAYCOM_SER_FDX is not set
+# CONFIG_BAYCOM_SER_HDX is not set
+# CONFIG_BAYCOM_PAR is not set
+# CONFIG_BAYCOM_EPP is not set
+CONFIG_SOUNDMODEM=m
+CONFIG_SOUNDMODEM_SBC=y
+CONFIG_SOUNDMODEM_WSS=y
+CONFIG_SOUNDMODEM_AFSK1200=y
+CONFIG_SOUNDMODEM_AFSK2400_7=y
+CONFIG_SOUNDMODEM_AFSK2400_8=y
+CONFIG_SOUNDMODEM_AFSK2666=y
+CONFIG_SOUNDMODEM_HAPN4800=y
+CONFIG_SOUNDMODEM_PSK4800=y
+CONFIG_SOUNDMODEM_FSK9600=y
+# CONFIG_YAM is not set
+
+#
+# IrDA (infrared) support
+#
+CONFIG_IRDA=m
+CONFIG_IRLAN=m
+CONFIG_IRNET=m
+CONFIG_IRCOMM=m
+CONFIG_IRDA_ULTRA=y
+CONFIG_IRDA_CACHE_LAST_LSAP=y
+CONFIG_IRDA_FAST_RR=y
+# CONFIG_IRDA_DEBUG is not set
+
+#
+# Infrared-port device drivers
+#
+CONFIG_IRTTY_SIR=m
+CONFIG_IRPORT_SIR=m
+CONFIG_DONGLE=y
+CONFIG_ESI_DONGLE=m
+CONFIG_ACTISYS_DONGLE=m
+CONFIG_TEKRAM_DONGLE=m
+CONFIG_GIRBIL_DONGLE=m
+CONFIG_LITELINK_DONGLE=m
+CONFIG_MCP2120_DONGLE=m
+CONFIG_OLD_BELKIN_DONGLE=m
+CONFIG_ACT200L_DONGLE=m
+CONFIG_MA600_DONGLE=m
+CONFIG_USB_IRDA=m
+CONFIG_NSC_FIR=m
+CONFIG_WINBOND_FIR=m
+CONFIG_TOSHIBA_OLD=m
+CONFIG_TOSHIBA_FIR=m
+CONFIG_SMC_IRCC_FIR=m
+CONFIG_ALI_FIR=m
+CONFIG_VLSI_FIR=m
+
+#
+# ISDN subsystem
+#
+CONFIG_ISDN=m
+CONFIG_ISDN_BOOL=y
+CONFIG_ISDN_PPP=y
+CONFIG_ISDN_PPP_VJ=y
+CONFIG_ISDN_MPP=y
+CONFIG_ISDN_PPP_BSDCOMP=m
+CONFIG_ISDN_AUDIO=y
+CONFIG_ISDN_TTY_FAX=y
+
+#
+# ISDN feature submodules
+#
+CONFIG_ISDN_DRV_LOOP=m
+# CONFIG_ISDN_DIVERSION is not set
+
+#
+# Passive ISDN cards
+#
+CONFIG_ISDN_DRV_HISAX=m
+CONFIG_ISDN_HISAX=y
+CONFIG_HISAX_EURO=y
+CONFIG_DE_AOC=y
+# CONFIG_HISAX_NO_SENDCOMPLETE is not set
+# CONFIG_HISAX_NO_LLC is not set
+# CONFIG_HISAX_NO_KEYPAD is not set
+CONFIG_HISAX_1TR6=y
+CONFIG_HISAX_NI1=y
+CONFIG_HISAX_MAX_CARDS=8
+CONFIG_HISAX_16_0=y
+CONFIG_HISAX_16_3=y
+CONFIG_HISAX_AVM_A1=y
+CONFIG_HISAX_IX1MICROR2=y
+CONFIG_HISAX_ASUSCOM=y
+CONFIG_HISAX_TELEINT=y
+CONFIG_HISAX_HFCS=y
+CONFIG_HISAX_SPORTSTER=y
+CONFIG_HISAX_MIC=y
+CONFIG_HISAX_ISURF=y
+CONFIG_HISAX_HSTSAPHIR=y
+CONFIG_HISAX_TELESPCI=y
+CONFIG_HISAX_S0BOX=y
+CONFIG_HISAX_FRITZPCI=y
+CONFIG_HISAX_AVM_A1_PCMCIA=y
+CONFIG_HISAX_ELSA=y
+CONFIG_HISAX_DIEHLDIVA=y
+CONFIG_HISAX_SEDLBAUER=y
+CONFIG_HISAX_NETJET=y
+CONFIG_HISAX_NETJET_U=y
+CONFIG_HISAX_NICCY=y
+CONFIG_HISAX_BKM_A4T=y
+CONFIG_HISAX_SCT_QUADRO=y
+CONFIG_HISAX_GAZEL=y
+CONFIG_HISAX_HFC_PCI=y
+CONFIG_HISAX_W6692=y
+CONFIG_HISAX_HFC_SX=y
+CONFIG_HISAX_ENTERNOW_PCI=y
+CONFIG_HISAX_DEBUG=y
+CONFIG_HISAX_SEDLBAUER_CS=m
+CONFIG_HISAX_ELSA_CS=m
+CONFIG_HISAX_AVM_A1_CS=m
+CONFIG_HISAX_ST5481=m
+CONFIG_HISAX_FRITZ_PCIPNP=m
+CONFIG_USB_AUERISDN=m
+
+#
+# Active ISDN cards
+#
+CONFIG_ISDN_DRV_ICN=m
+CONFIG_ISDN_DRV_PCBIT=m
+# CONFIG_ISDN_DRV_SC is not set
+# CONFIG_ISDN_DRV_ACT2000 is not set
+CONFIG_ISDN_DRV_EICON=y
+CONFIG_ISDN_DRV_EICON_DIVAS=m
+# CONFIG_ISDN_DRV_EICON_OLD is not set
+CONFIG_ISDN_DRV_TPAM=m
+CONFIG_ISDN_CAPI=m
+CONFIG_ISDN_DRV_AVMB1_VERBOSE_REASON=y
+CONFIG_ISDN_CAPI_MIDDLEWARE=y
+CONFIG_ISDN_CAPI_CAPI20=m
+CONFIG_ISDN_CAPI_CAPIFS_BOOL=y
+CONFIG_ISDN_CAPI_CAPIFS=m
+CONFIG_ISDN_CAPI_CAPIDRV=m
+CONFIG_ISDN_DRV_AVMB1_B1ISA=m
+CONFIG_ISDN_DRV_AVMB1_B1PCI=m
+CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y
+CONFIG_ISDN_DRV_AVMB1_T1ISA=m
+CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m
+CONFIG_ISDN_DRV_AVMB1_AVM_CS=m
+CONFIG_ISDN_DRV_AVMB1_T1PCI=m
+CONFIG_ISDN_DRV_AVMB1_C4=m
+CONFIG_HYSDN=m
+CONFIG_HYSDN_CAPI=y
+CONFIG_KALLSYMS=y
+
+#
+# Old CD-ROM drivers (not SCSI, not IDE)
+#
+# CONFIG_CD_NO_IDESCSI is not set
+
+#
+# Input core support
+#
+CONFIG_INPUT=m
+CONFIG_INPUT_KEYBDEV=m
+CONFIG_INPUT_MOUSEDEV=m
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+CONFIG_INPUT_JOYDEV=m
+CONFIG_INPUT_EVDEV=m
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_ECC=m
+CONFIG_VT_CONSOLE=y
+CONFIG_SERIAL=y
+CONFIG_SERIAL_CONSOLE=y
+CONFIG_SERIAL_EXTENDED=y
+CONFIG_SERIAL_MANY_PORTS=y
+CONFIG_SERIAL_SHARE_IRQ=y
+# CONFIG_SERIAL_DETECT_IRQ is not set
+CONFIG_SERIAL_MULTIPORT=y
+# CONFIG_HUB6 is not set
+CONFIG_SERIAL_NONSTANDARD=y
+CONFIG_COMPUTONE=m
+CONFIG_ROCKETPORT=m
+CONFIG_CYCLADES=m
+# CONFIG_CYZ_INTR is not set
+CONFIG_DIGIEPCA=m
+CONFIG_ESPSERIAL=m
+CONFIG_MOXA_INTELLIO=m
+CONFIG_MOXA_SMARTIO=m
+CONFIG_ISI=m
+CONFIG_SYNCLINK=m
+# CONFIG_SYNCLINKMP is not set
+CONFIG_N_HDLC=m
+CONFIG_RISCOM8=m
+CONFIG_SPECIALIX=m
+CONFIG_SPECIALIX_RTSCTS=y
+CONFIG_SX=m
+# CONFIG_RIO is not set
+CONFIG_STALDRV=y
+CONFIG_STALLION=m
+CONFIG_ISTALLION=m
+CONFIG_UNIX98_PTYS=y
+CONFIG_UNIX98_PTY_COUNT=2048
+CONFIG_PRINTER=m
+CONFIG_LP_CONSOLE=y
+CONFIG_PPDEV=m
+CONFIG_TIPAR=m
+
+#
+# I2C support
+#
+CONFIG_I2C=m
+CONFIG_I2C_ALGOBIT=m
+CONFIG_I2C_PHILIPSPAR=m
+CONFIG_I2C_ELV=m
+CONFIG_I2C_VELLEMAN=m
+# CONFIG_SCx200_I2C is not set
+# CONFIG_SCx200_ACB is not set
+CONFIG_I2C_ALGOPCF=m
+CONFIG_I2C_ELEKTOR=m
+CONFIG_I2C_MAINBOARD=y
+CONFIG_I2C_ALI1535=m
+CONFIG_I2C_ALI15X3=m
+CONFIG_I2C_HYDRA=m
+CONFIG_I2C_AMD756=m
+# CONFIG_I2C_TSUNAMI is not set
+CONFIG_I2C_I801=m
+# CONFIG_I2C_I810 is not set
+CONFIG_I2C_PIIX4=m
+CONFIG_I2C_SIS5595=m
+CONFIG_I2C_VIA=m
+CONFIG_I2C_VIAPRO=m
+CONFIG_I2C_VOODOO3=m
+CONFIG_I2C_ISA=m
+CONFIG_I2C_CHARDEV=m
+CONFIG_I2C_PROC=m
+
+#
+# Hardware sensors support
+#
+CONFIG_SENSORS=y
+CONFIG_SENSORS_ADM1021=m
+CONFIG_SENSORS_ADM1024=m
+CONFIG_SENSORS_ADM1025=m
+CONFIG_SENSORS_ADM9240=m
+CONFIG_SENSORS_DS1621=m
+CONFIG_SENSORS_FSCPOS=m
+CONFIG_SENSORS_FSCSCY=m
+CONFIG_SENSORS_GL518SM=m
+CONFIG_SENSORS_GL520SM=m
+CONFIG_SENSORS_MAXILIFE=m
+CONFIG_SENSORS_IT87=m
+CONFIG_SENSORS_MTP008=m
+CONFIG_SENSORS_LM75=m
+CONFIG_SENSORS_LM78=m
+CONFIG_SENSORS_LM80=m
+CONFIG_SENSORS_LM87=m
+CONFIG_SENSORS_LM92=m
+CONFIG_SENSORS_SIS5595=m
+CONFIG_SENSORS_SMSC47M1=m
+CONFIG_SENSORS_THMC50=m
+CONFIG_SENSORS_VIA686A=m
+CONFIG_SENSORS_VT1211=m
+CONFIG_SENSORS_VT8231=m
+CONFIG_SENSORS_W83781D=m
+CONFIG_SENSORS_OTHER=y
+CONFIG_SENSORS_BT869=m
+CONFIG_SENSORS_DDCMON=m
+CONFIG_SENSORS_EEPROM=m
+CONFIG_SENSORS_MATORB=m
+CONFIG_SENSORS_PCF8574=m
+CONFIG_SENSORS_PCF8591=m
+
+#
+# Mice
+#
+CONFIG_BUSMOUSE=m
+CONFIG_ATIXL_BUSMOUSE=m
+CONFIG_LOGIBUSMOUSE=m
+CONFIG_MS_BUSMOUSE=m
+CONFIG_MOUSE=y
+CONFIG_PSMOUSE=y
+CONFIG_82C710_MOUSE=m
+CONFIG_PC110_PAD=m
+CONFIG_MK712_MOUSE=m
+
+#
+# Joysticks
+#
+CONFIG_INPUT_GAMEPORT=m
+CONFIG_INPUT_NS558=m
+CONFIG_INPUT_LIGHTNING=m
+CONFIG_INPUT_PCIGAME=m
+CONFIG_INPUT_CS461X=m
+CONFIG_INPUT_EMU10K1=m
+CONFIG_INPUT_SERIO=m
+CONFIG_INPUT_SERPORT=m
+CONFIG_INPUT_ANALOG=m
+CONFIG_INPUT_A3D=m
+CONFIG_INPUT_ADI=m
+CONFIG_INPUT_COBRA=m
+CONFIG_INPUT_GF2K=m
+CONFIG_INPUT_GRIP=m
+CONFIG_INPUT_INTERACT=m
+CONFIG_INPUT_TMDC=m
+CONFIG_INPUT_SIDEWINDER=m
+CONFIG_INPUT_IFORCE_USB=m
+CONFIG_INPUT_IFORCE_232=m
+CONFIG_INPUT_WARRIOR=m
+CONFIG_INPUT_MAGELLAN=m
+CONFIG_INPUT_SPACEORB=m
+CONFIG_INPUT_SPACEBALL=m
+CONFIG_INPUT_STINGER=m
+CONFIG_INPUT_DB9=m
+CONFIG_INPUT_GAMECON=m
+CONFIG_INPUT_TURBOGRAFX=m
+# CONFIG_QIC02_TAPE is not set
+CONFIG_IPMI_HANDLER=m
+# CONFIG_IPMI_PANIC_EVENT is not set
+CONFIG_IPMI_DEVICE_INTERFACE=m
+CONFIG_IPMI_KCS=m
+CONFIG_IPMI_WATCHDOG=m
+
+#
+# Watchdog Cards
+#
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+CONFIG_ACQUIRE_WDT=m
+CONFIG_ADVANTECH_WDT=m
+CONFIG_ALIM7101_WDT=m
+CONFIG_SC520_WDT=m
+CONFIG_PCWATCHDOG=m
+CONFIG_EUROTECH_WDT=m
+CONFIG_IB700_WDT=m
+CONFIG_WAFER_WDT=m
+CONFIG_I810_TCO=m
+# CONFIG_MIXCOMWD is not set
+# CONFIG_60XX_WDT is not set
+CONFIG_SC1200_WDT=m
+# CONFIG_SCx200_WDT is not set
+CONFIG_SOFT_WATCHDOG=m
+CONFIG_W83877F_WDT=m
+CONFIG_WDT=m
+CONFIG_WDTPCI=m
+# CONFIG_WDT_501 is not set
+CONFIG_MACHZ_WDT=m
+CONFIG_AMD7XX_TCO=m
+# CONFIG_SCx200_GPIO is not set
+CONFIG_AMD_RNG=m
+CONFIG_INTEL_RNG=m
+CONFIG_AMD_PM768=m
+CONFIG_NVRAM=m
+CONFIG_RTC=y
+CONFIG_DTLK=m
+CONFIG_R3964=m
+# CONFIG_APPLICOM is not set
+CONFIG_SONYPI=m
+
+#
+# Ftape, the floppy tape device driver
+#
+CONFIG_FTAPE=m
+CONFIG_ZFTAPE=m
+CONFIG_ZFT_DFLT_BLK_SZ=10240
+CONFIG_ZFT_COMPRESSOR=m
+CONFIG_FT_NR_BUFFERS=3
+# CONFIG_FT_PROC_FS is not set
+CONFIG_FT_NORMAL_DEBUG=y
+# CONFIG_FT_FULL_DEBUG is not set
+# CONFIG_FT_NO_TRACE is not set
+# CONFIG_FT_NO_TRACE_AT_ALL is not set
+CONFIG_FT_STD_FDC=y
+# CONFIG_FT_MACH2 is not set
+# CONFIG_FT_PROBE_FC10 is not set
+# CONFIG_FT_ALT_FDC is not set
+CONFIG_FT_FDC_THR=8
+CONFIG_FT_FDC_MAX_RATE=2000
+CONFIG_FT_ALPHA_CLOCK=0
+CONFIG_AGP=m
+CONFIG_AGP_INTEL=y
+CONFIG_AGP_I810=y
+CONFIG_AGP_VIA=y
+CONFIG_AGP_AMD=y
+CONFIG_AGP_AMD_8151=y
+CONFIG_AGP_SIS=y
+CONFIG_AGP_ALI=y
+CONFIG_AGP_SWORKS=y
+CONFIG_DRM=y
+# CONFIG_DRM_OLD is not set
+CONFIG_DRM_NEW=y
+CONFIG_DRM_TDFX=m
+CONFIG_DRM_R128=m
+CONFIG_DRM_RADEON=m
+CONFIG_DRM_I810=m
+# CONFIG_DRM_I810_XFREE_41 is not set
+CONFIG_DRM_I830=m
+CONFIG_DRM_MGA=m
+# CONFIG_DRM_SIS is not set
+
+#
+# PCMCIA character devices
+#
+CONFIG_PCMCIA_SERIAL_CS=m
+CONFIG_SYNCLINK_CS=m
+CONFIG_MWAVE=m
+CONFIG_BATTERY_GERICOM=m
+
+#
+# Multimedia devices
+#
+CONFIG_VIDEO_DEV=m
+
+#
+# Video For Linux
+#
+CONFIG_VIDEO_PROC_FS=y
+CONFIG_I2C_PARPORT=m
+CONFIG_VIDEO_BT848=m
+CONFIG_VIDEO_PMS=m
+CONFIG_VIDEO_BWQCAM=m
+CONFIG_VIDEO_CQCAM=m
+CONFIG_VIDEO_W9966=m
+CONFIG_VIDEO_CPIA=m
+CONFIG_VIDEO_CPIA_PP=m
+CONFIG_VIDEO_CPIA_USB=m
+CONFIG_VIDEO_SAA5249=m
+CONFIG_TUNER_3036=m
+CONFIG_VIDEO_STRADIS=m
+CONFIG_VIDEO_ZORAN=m
+CONFIG_VIDEO_ZORAN_BUZ=m
+CONFIG_VIDEO_ZORAN_DC10=m
+CONFIG_VIDEO_ZORAN_LML33=m
+CONFIG_VIDEO_ZR36120=m
+CONFIG_VIDEO_MEYE=m
+
+#
+# Radio Adapters
+#
+CONFIG_RADIO_CADET=m
+CONFIG_RADIO_RTRACK=m
+CONFIG_RADIO_RTRACK2=m
+CONFIG_RADIO_AZTECH=m
+CONFIG_RADIO_GEMTEK=m
+CONFIG_RADIO_GEMTEK_PCI=m
+CONFIG_RADIO_MAXIRADIO=m
+CONFIG_RADIO_MAESTRO=m
+CONFIG_RADIO_MIROPCM20=m
+CONFIG_RADIO_MIROPCM20_RDS=m
+CONFIG_RADIO_SF16FMI=m
+CONFIG_RADIO_SF16FMR2=m
+CONFIG_RADIO_TERRATEC=m
+CONFIG_RADIO_TRUST=m
+CONFIG_RADIO_TYPHOON=m
+CONFIG_RADIO_TYPHOON_PROC_FS=y
+CONFIG_RADIO_ZOLTRIX=m
+
+#
+# Crypto Hardware support
+#
+CONFIG_CRYPTO=m
+CONFIG_CRYPTO_BROADCOM=m
+
+#
+# File systems
+#
+CONFIG_QUOTA=y
+# CONFIG_QFMT_V1 is not set
+CONFIG_QFMT_V2=y
+# CONFIG_QIFACE_COMPAT is not set
+CONFIG_AUTOFS_FS=m
+CONFIG_AUTOFS4_FS=m
+CONFIG_REISERFS_FS=m
+# CONFIG_REISERFS_CHECK is not set
+CONFIG_REISERFS_PROC_INFO=y
+# CONFIG_ADFS_FS is not set
+CONFIG_AFS_FS=m
+# CONFIG_ADFS_FS_RW is not set
+# CONFIG_AFFS_FS is not set
+CONFIG_HFS_FS=m
+CONFIG_BEFS_FS=m
+# CONFIG_BEFS_DEBUG is not set
+CONFIG_BFS_FS=m
+CONFIG_EXT3_FS=m
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_XATTR_SHARING=y
+CONFIG_EXT3_FS_XATTR_USER=y
+CONFIG_JBD=m
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FAT_FS=m
+CONFIG_MSDOS_FS=m
+CONFIG_UMSDOS_FS=m
+CONFIG_VFAT_FS=m
+# CONFIG_EFS_FS is not set
+# CONFIG_JFFS_FS is not set
+# CONFIG_JFFS2_FS is not set
+CONFIG_CRAMFS=m
+CONFIG_TMPFS=y
+CONFIG_RAMFS=y
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_JFS_FS=m
+# CONFIG_JFS_DEBUG is not set
+# CONFIG_JFS_STATISTICS is not set
+CONFIG_MINIX_FS=m
+CONFIG_VXFS_FS=m
+# CONFIG_NTFS_FS is not set
+# CONFIG_NTFS_RW is not set
+# CONFIG_HPFS_FS is not set
+CONFIG_PROC_FS=y
+# CONFIG_DEVFS_FS is not set
+# CONFIG_DEVFS_MOUNT is not set
+# CONFIG_DEVFS_DEBUG is not set
+CONFIG_DEVPTS_FS=y
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_QNX4FS_RW is not set
+CONFIG_ROMFS_FS=m
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_XATTR_SHARING=y
+CONFIG_EXT2_FS_XATTR_USER=y
+CONFIG_SYSV_FS=m
+CONFIG_UDF_FS=m
+CONFIG_UDF_RW=y
+CONFIG_UFS_FS=m
+# CONFIG_UFS_FS_WRITE is not set
+
+#
+# Network File Systems
+#
+CONFIG_CODA_FS=m
+CONFIG_INTERMEZZO_FS=m
+CONFIG_NFS_FS=m
+CONFIG_NFS_V3=y
+# CONFIG_ROOT_NFS is not set
+CONFIG_NFSD=m
+CONFIG_NFSD_V3=y
+# CONFIG_NFSD_TCP is not set
+CONFIG_SUNRPC=m
+CONFIG_LOCKD=m
+CONFIG_LOCKD_V4=y
+CONFIG_SMB_FS=m
+# CONFIG_SMB_NLS_DEFAULT is not set
+CONFIG_NCP_FS=m
+CONFIG_NCPFS_PACKET_SIGNING=y
+CONFIG_NCPFS_IOCTL_LOCKING=y
+CONFIG_NCPFS_STRONG=y
+CONFIG_NCPFS_NFS_NS=y
+CONFIG_NCPFS_OS2_NS=y
+CONFIG_NCPFS_SMALLDOS=y
+CONFIG_NCPFS_NLS=y
+CONFIG_NCPFS_EXTRAS=y
+CONFIG_ZISOFS_FS=y
+CONFIG_FS_MBCACHE=y
+
+#
+# Partition Types
+#
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_ACORN_PARTITION is not set
+CONFIG_OSF_PARTITION=y
+# CONFIG_AMIGA_PARTITION is not set
+# CONFIG_ATARI_PARTITION is not set
+CONFIG_MAC_PARTITION=y
+CONFIG_MSDOS_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+# CONFIG_LDM_PARTITION is not set
+CONFIG_SGI_PARTITION=y
+# CONFIG_ULTRIX_PARTITION is not set
+CONFIG_SUN_PARTITION=y
+# CONFIG_EFI_PARTITION is not set
+CONFIG_SMB_NLS=y
+CONFIG_NLS=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_737=m
+CONFIG_NLS_CODEPAGE_775=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_CODEPAGE_852=m
+CONFIG_NLS_CODEPAGE_855=m
+CONFIG_NLS_CODEPAGE_857=m
+CONFIG_NLS_CODEPAGE_860=m
+CONFIG_NLS_CODEPAGE_861=m
+CONFIG_NLS_CODEPAGE_862=m
+CONFIG_NLS_CODEPAGE_863=m
+CONFIG_NLS_CODEPAGE_864=m
+CONFIG_NLS_CODEPAGE_865=m
+CONFIG_NLS_CODEPAGE_866=m
+CONFIG_NLS_CODEPAGE_869=m
+CONFIG_NLS_CODEPAGE_936=m
+CONFIG_NLS_CODEPAGE_950=m
+CONFIG_NLS_CODEPAGE_932=m
+CONFIG_NLS_CODEPAGE_949=m
+CONFIG_NLS_CODEPAGE_874=m
+CONFIG_NLS_ISO8859_8=m
+CONFIG_NLS_CODEPAGE_1250=m
+CONFIG_NLS_CODEPAGE_1251=m
+CONFIG_NLS_ISO8859_1=m
+CONFIG_NLS_ISO8859_2=m
+CONFIG_NLS_ISO8859_3=m
+CONFIG_NLS_ISO8859_4=m
+CONFIG_NLS_ISO8859_5=m
+CONFIG_NLS_ISO8859_6=m
+CONFIG_NLS_ISO8859_7=m
+CONFIG_NLS_ISO8859_9=m
+CONFIG_NLS_ISO8859_13=m
+CONFIG_NLS_ISO8859_14=m
+CONFIG_NLS_ISO8859_15=m
+CONFIG_NLS_KOI8_R=m
+CONFIG_NLS_KOI8_U=m
+CONFIG_NLS_UTF8=m
+
+#
+# Console drivers
+#
+CONFIG_VGA_CONSOLE=y
+CONFIG_VIDEO_SELECT=y
+# CONFIG_VIDEO_IGNORE_BAD_MODE is not set
+CONFIG_MDA_CONSOLE=m
+
+#
+# Frame-buffer support
+#
+CONFIG_FB=y
+CONFIG_DUMMY_CONSOLE=y
+CONFIG_FB_RIVA=m
+CONFIG_FB_CLGEN=m
+CONFIG_FB_PM2=m
+# CONFIG_FB_PM2_FIFO_DISCONNECT is not set
+CONFIG_FB_PM2_PCI=y
+CONFIG_FB_PM3=m
+# CONFIG_FB_CYBER2000 is not set
+CONFIG_FB_VESA=y
+CONFIG_FB_VGA16=m
+CONFIG_FB_HGA=m
+CONFIG_VIDEO_SELECT=y
+CONFIG_FB_MATROX=m
+CONFIG_FB_MATROX_MILLENIUM=y
+CONFIG_FB_MATROX_MYSTIQUE=y
+# CONFIG_FB_MATROX_G450 is not set
+CONFIG_FB_MATROX_G100A=y
+CONFIG_FB_MATROX_G100=y
+CONFIG_FB_MATROX_I2C=m
+CONFIG_FB_MATROX_MAVEN=m
+# CONFIG_FB_MATROX_PROC is not set
+CONFIG_FB_MATROX_MULTIHEAD=y
+CONFIG_FB_ATY=m
+CONFIG_FB_ATY_GX=y
+CONFIG_FB_ATY_CT=y
+CONFIG_FB_ATY_CT_VAIO_LCD=y
+CONFIG_FB_RADEON=m
+CONFIG_FB_ATY128=m
+CONFIG_FB_SIS=m
+CONFIG_FB_SIS_300=y
+CONFIG_FB_SIS_315=y
+CONFIG_FB_NEOMAGIC=m
+CONFIG_FB_3DFX=m
+CONFIG_FB_VOODOO1=m
+# CONFIG_FB_TRIDENT is not set
+# CONFIG_FB_VIRTUAL is not set
+# CONFIG_FBCON_ADVANCED is not set
+CONFIG_FBCON_MFB=m
+CONFIG_FBCON_CFB8=y
+CONFIG_FBCON_CFB16=y
+CONFIG_FBCON_CFB24=y
+CONFIG_FBCON_CFB32=y
+CONFIG_FBCON_VGA_PLANES=m
+CONFIG_FBCON_HGA=m
+# CONFIG_FBCON_FONTWIDTH8_ONLY is not set
+# CONFIG_FBCON_FONTS is not set
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+
+#
+# Sound
+#
+CONFIG_SOUND=m
+CONFIG_SOUND_ALI5455=m
+CONFIG_SOUND_BT878=m
+CONFIG_SOUND_CMPCI=m
+CONFIG_SOUND_CMPCI_FM=y
+CONFIG_SOUND_CMPCI_FMIO=388
+CONFIG_SOUND_CMPCI_FMIO=388
+CONFIG_SOUND_CMPCI_MIDI=y
+CONFIG_SOUND_CMPCI_MPUIO=330
+CONFIG_SOUND_CMPCI_JOYSTICK=y
+CONFIG_SOUND_CMPCI_CM8738=y
+# CONFIG_SOUND_CMPCI_SPDIFINVERSE is not set
+CONFIG_SOUND_CMPCI_SPDIFLOOP=y
+CONFIG_SOUND_CMPCI_SPEAKERS=2
+CONFIG_SOUND_EMU10K1=m
+CONFIG_MIDI_EMU10K1=y
+CONFIG_SOUND_AUDIGY=m
+CONFIG_SOUND_FUSION=m
+CONFIG_SOUND_CS4281=m
+CONFIG_SOUND_ES1370=m
+CONFIG_SOUND_ES1371=m
+CONFIG_SOUND_ESSSOLO1=m
+CONFIG_SOUND_MAESTRO=m
+CONFIG_SOUND_MAESTRO3=m
+CONFIG_SOUND_FORTE=m
+CONFIG_SOUND_ICH=m
+CONFIG_SOUND_RME96XX=m
+CONFIG_SOUND_SONICVIBES=m
+CONFIG_SOUND_TRIDENT=m
+CONFIG_SOUND_MSNDCLAS=m
+# CONFIG_MSNDCLAS_HAVE_BOOT is not set
+CONFIG_MSNDCLAS_INIT_FILE="/etc/sound/msndinit.bin"
+CONFIG_MSNDCLAS_PERM_FILE="/etc/sound/msndperm.bin"
+CONFIG_SOUND_MSNDPIN=m
+# CONFIG_MSNDPIN_HAVE_BOOT is not set
+CONFIG_MSNDPIN_INIT_FILE="/etc/sound/pndspini.bin"
+CONFIG_MSNDPIN_PERM_FILE="/etc/sound/pndsperm.bin"
+CONFIG_SOUND_VIA82CXXX=m
+CONFIG_MIDI_VIA82CXXX=y
+CONFIG_SOUND_OSS=m
+# CONFIG_SOUND_TRACEINIT is not set
+CONFIG_SOUND_DMAP=y
+CONFIG_SOUND_AD1816=m
+CONFIG_SOUND_AD1889=m
+CONFIG_SOUND_SGALAXY=m
+CONFIG_SOUND_ADLIB=m
+CONFIG_SOUND_ACI_MIXER=m
+CONFIG_SOUND_CS4232=m
+CONFIG_SOUND_SSCAPE=m
+CONFIG_SOUND_GUS=m
+CONFIG_SOUND_GUS16=y
+CONFIG_SOUND_GUSMAX=y
+CONFIG_SOUND_VMIDI=m
+CONFIG_SOUND_TRIX=m
+CONFIG_SOUND_MSS=m
+CONFIG_SOUND_MPU401=m
+CONFIG_SOUND_NM256=m
+CONFIG_SOUND_MAD16=m
+CONFIG_MAD16_OLDCARD=y
+CONFIG_SOUND_PAS=m
+# CONFIG_PAS_JOYSTICK is not set
+CONFIG_SOUND_PSS=m
+# CONFIG_PSS_MIXER is not set
+# CONFIG_PSS_HAVE_BOOT is not set
+CONFIG_SOUND_SB=m
+CONFIG_SOUND_AWE32_SYNTH=m
+CONFIG_SOUND_KAHLUA=m
+CONFIG_SOUND_WAVEFRONT=m
+CONFIG_SOUND_MAUI=m
+CONFIG_SOUND_YM3812=m
+CONFIG_SOUND_OPL3SA1=m
+CONFIG_SOUND_OPL3SA2=m
+CONFIG_SOUND_YMFPCI=m
+CONFIG_SOUND_YMFPCI_LEGACY=y
+CONFIG_SOUND_UART6850=m
+CONFIG_SOUND_AEDSP16=m
+CONFIG_SC6600=y
+CONFIG_SC6600_JOY=y
+CONFIG_SC6600_CDROM=4
+CONFIG_SC6600_CDROMBASE=0
+CONFIG_AEDSP16_SBPRO=y
+CONFIG_AEDSP16_MPU401=y
+CONFIG_SOUND_TVMIXER=m
+
+#
+# USB support
+#
+CONFIG_USB=m
+# CONFIG_USB_DEBUG is not set
+CONFIG_USB_DEVICEFS=y
+# CONFIG_USB_BANDWIDTH is not set
+CONFIG_USB_EHCI_HCD=m
+CONFIG_USB_UHCI=m
+CONFIG_USB_UHCI_ALT=m
+CONFIG_USB_OHCI=m
+CONFIG_USB_AUDIO=m
+# CONFIG_USB_EMI26 is not set
+CONFIG_USB_MIDI=m
+CONFIG_USB_STORAGE=m
+# CONFIG_USB_STORAGE_DEBUG is not set
+CONFIG_USB_STORAGE_DATAFAB=y
+CONFIG_USB_STORAGE_FREECOM=y
+CONFIG_USB_STORAGE_ISD200=y
+CONFIG_USB_STORAGE_DPCM=y
+CONFIG_USB_STORAGE_HP8200e=y
+CONFIG_USB_STORAGE_SDDR09=y
+CONFIG_USB_STORAGE_SDDR55=y
+CONFIG_USB_STORAGE_JUMPSHOT=y
+CONFIG_USB_ACM=m
+CONFIG_USB_PRINTER=m
+CONFIG_USB_HID=m
+CONFIG_USB_HIDINPUT=y
+CONFIG_USB_HIDDEV=y
+# CONFIG_USB_KBD is not set
+# CONFIG_USB_MOUSE is not set
+CONFIG_USB_AIPTEK=m
+CONFIG_USB_WACOM=m
+CONFIG_USB_KBTAB=m
+CONFIG_USB_POWERMATE=m
+# CONFIG_USB_DC2XX is not set
+CONFIG_USB_MDC800=m
+CONFIG_USB_SCANNER=m
+CONFIG_USB_MICROTEK=m
+CONFIG_USB_HPUSBSCSI=m
+CONFIG_USB_IBMCAM=m
+CONFIG_USB_KONICAWC=m
+CONFIG_USB_OV511=m
+CONFIG_USB_PWC=m
+CONFIG_USB_SE401=m
+CONFIG_USB_STV680=m
+CONFIG_USB_VICAM=m
+CONFIG_USB_DSBR=m
+CONFIG_USB_DABUSB=m
+CONFIG_USB_PEGASUS=m
+CONFIG_USB_RTL8150=m
+CONFIG_USB_KAWETH=m
+CONFIG_USB_CATC=m
+CONFIG_USB_CDCETHER=m
+CONFIG_USB_USBNET=m
+CONFIG_USB_USS720=m
+
+#
+# USB Serial Converter support
+#
+CONFIG_USB_SERIAL=m
+# CONFIG_USB_SERIAL_DEBUG is not set
+CONFIG_USB_SERIAL_GENERIC=y
+CONFIG_USB_SERIAL_BELKIN=m
+CONFIG_USB_SERIAL_WHITEHEAT=m
+CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m
+CONFIG_USB_SERIAL_EMPEG=m
+CONFIG_USB_SERIAL_FTDI_SIO=m
+CONFIG_USB_SERIAL_VISOR=m
+CONFIG_USB_SERIAL_IPAQ=m
+CONFIG_USB_SERIAL_IR=m
+CONFIG_USB_SERIAL_EDGEPORT=m
+CONFIG_USB_SERIAL_EDGEPORT_TI=m
+CONFIG_USB_SERIAL_KEYSPAN_PDA=m
+CONFIG_USB_SERIAL_KEYSPAN=m
+# CONFIG_USB_SERIAL_KEYSPAN_USA28 is not set
+# CONFIG_USB_SERIAL_KEYSPAN_USA28X is not set
+CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y
+# CONFIG_USB_SERIAL_KEYSPAN_USA19 is not set
+# CONFIG_USB_SERIAL_KEYSPAN_USA18X is not set
+CONFIG_USB_SERIAL_KEYSPAN_USA19W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y
+CONFIG_USB_SERIAL_KEYSPAN_MPR=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y
+CONFIG_USB_SERIAL_MCT_U232=m
+CONFIG_USB_SERIAL_KLSI=m
+CONFIG_USB_SERIAL_KOBIL_SCT=m
+CONFIG_USB_SERIAL_PL2303=m
+CONFIG_USB_SERIAL_CYBERJACK=m
+CONFIG_USB_SERIAL_XIRCOM=m
+CONFIG_USB_SERIAL_OMNINET=m
+CONFIG_USB_RIO500=m
+CONFIG_USB_AUERSWALD=m
+CONFIG_USB_TIGL=m
+CONFIG_USB_BRLVGER=m
+CONFIG_USB_LCD=m
+
+#
+# Additional device driver support
+#
+CONFIG_NET_BROADCOM=m
+CONFIG_CIPE=m
+CONFIG_CRYPTO_AEP=m
+CONFIG_MEGARAC=m
+CONFIG_FC_QLA2200=m
+CONFIG_FC_QLA2300=m
+CONFIG_SCSI_ISCSI=m
+
+#
+# Bluetooth support
+#
+CONFIG_BLUEZ=m
+CONFIG_BLUEZ_L2CAP=m
+CONFIG_BLUEZ_SCO=m
+CONFIG_BLUEZ_RFCOMM=m
+CONFIG_BLUEZ_RFCOMM_TTY=y
+CONFIG_BLUEZ_BNEP=m
+CONFIG_BLUEZ_BNEP_MC_FILTER=y
+CONFIG_BLUEZ_BNEP_PROTO_FILTER=y
+
+#
+# Bluetooth device drivers
+#
+CONFIG_BLUEZ_HCIUSB=m
+CONFIG_BLUEZ_USB_SCO=y
+CONFIG_BLUEZ_USB_ZERO_PACKET=y
+CONFIG_BLUEZ_HCIUART=m
+CONFIG_BLUEZ_HCIUART_H4=y
+CONFIG_BLUEZ_HCIUART_BCSP=y
+CONFIG_BLUEZ_HCIUART_BCSP_TXCRC=y
+CONFIG_BLUEZ_HCIDTL1=m
+CONFIG_BLUEZ_HCIBT3C=m
+CONFIG_BLUEZ_HCIBLUECARD=m
+CONFIG_BLUEZ_HCIBTUART=m
+CONFIG_BLUEZ_HCIVHCI=m
+
+#
+# Profiling support
+#
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=m
+
+#
+# Kernel hacking
+#
+CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_STACKOVERFLOW=y
+# CONFIG_DEBUG_HIGHMEM is not set
+# CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_IOVIRT is not set
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_FRAME_POINTER is not set
+# CONFIG_MCL_COREDUMP is not set
+
+#
+# Library routines
+#
+CONFIG_ZLIB_INFLATE=y
+CONFIG_ZLIB_DEFLATE=m
--- /dev/null
+#
+# Automatically generated by make menuconfig: don't edit
+#
+CONFIG_X86=y
+# CONFIG_SBUS is not set
+CONFIG_UID16=y
+
+#
+# Code maturity level options
+#
+CONFIG_EXPERIMENTAL=y
+
+#
+# Loadable module support
+#
+CONFIG_MODULES=y
+CONFIG_MODVERSIONS=y
+CONFIG_KMOD=y
+
+#
+# Processor type and features
+#
+CONFIG_LOLAT=y
+# CONFIG_M386 is not set
+# CONFIG_M486 is not set
+# CONFIG_M586 is not set
+# CONFIG_M586TSC is not set
+# CONFIG_M586MMX is not set
+CONFIG_M686=y
+# CONFIG_MPENTIUMIII is not set
+# CONFIG_MPENTIUM4 is not set
+# CONFIG_MK6 is not set
+# CONFIG_MK7 is not set
+# CONFIG_MELAN is not set
+# CONFIG_MCRUSOE is not set
+# CONFIG_MWINCHIPC6 is not set
+# CONFIG_MWINCHIP2 is not set
+# CONFIG_MWINCHIP3D is not set
+# CONFIG_MCYRIXIII is not set
+CONFIG_X86_WP_WORKS_OK=y
+CONFIG_X86_INVLPG=y
+CONFIG_X86_CMPXCHG=y
+CONFIG_X86_XADD=y
+CONFIG_X86_BSWAP=y
+CONFIG_X86_POPAD_OK=y
+# CONFIG_RWSEM_GENERIC_SPINLOCK is not set
+CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_HAS_TSC=y
+CONFIG_X86_GOOD_APIC=y
+CONFIG_X86_PGE=y
+CONFIG_X86_USE_PPRO_CHECKSUM=y
+CONFIG_X86_PPRO_FENCE=y
+CONFIG_X86_F00F_WORKS_OK=y
+CONFIG_X86_MCE=y
+
+#
+# CPU Frequency scaling
+#
+# CONFIG_CPU_FREQ is not set
+CONFIG_TOSHIBA=m
+CONFIG_I8K=m
+CONFIG_MICROCODE=m
+CONFIG_X86_MSR=m
+CONFIG_X86_CPUID=m
+# CONFIG_E820_PROC is not set
+CONFIG_EDD=m
+# CONFIG_NOHIGHMEM is not set
+CONFIG_HIGHMEM4G=y
+# CONFIG_HIGHMEM64G is not set
+CONFIG_HIGHMEM=y
+CONFIG_HIGHPTE=y
+CONFIG_HIGHIO=y
+# CONFIG_MATH_EMULATION is not set
+CONFIG_MTRR=y
+CONFIG_SMP=y
+# CONFIG_X86_NUMA is not set
+# CONFIG_X86_TSC_DISABLE is not set
+CONFIG_X86_TSC=y
+CONFIG_HAVE_DEC_LOCK=y
+
+#
+# General setup
+#
+CONFIG_NET=y
+CONFIG_X86_IO_APIC=y
+CONFIG_X86_LOCAL_APIC=y
+CONFIG_PCI=y
+# CONFIG_PCI_GOBIOS is not set
+# CONFIG_PCI_GODIRECT is not set
+CONFIG_PCI_GOANY=y
+CONFIG_PCI_BIOS=y
+CONFIG_PCI_DIRECT=y
+CONFIG_ISA=y
+CONFIG_PCI_NAMES=y
+CONFIG_EISA=y
+# CONFIG_MCA is not set
+CONFIG_HOTPLUG=y
+
+#
+# PCMCIA/CardBus support
+#
+CONFIG_PCMCIA=m
+CONFIG_CARDBUS=y
+CONFIG_TCIC=y
+CONFIG_I82092=y
+CONFIG_I82365=y
+
+#
+# PCI Hotplug Support
+#
+CONFIG_HOTPLUG_PCI=y
+# CONFIG_HOTPLUG_PCI_ACPI is not set
+CONFIG_HOTPLUG_PCI_COMPAQ=m
+# CONFIG_HOTPLUG_PCI_COMPAQ_NVRAM is not set
+CONFIG_HOTPLUG_PCI_IBM=m
+# CONFIG_HOTPLUG_PCI_H2999 is not set
+CONFIG_SYSVIPC=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_SYSCTL=y
+CONFIG_KCORE_ELF=y
+# CONFIG_KCORE_AOUT is not set
+CONFIG_BINFMT_AOUT=m
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_MISC=m
+CONFIG_PM=y
+# CONFIG_ACPI is not set
+CONFIG_APM=y
+# CONFIG_APM_IGNORE_USER_SUSPEND is not set
+# CONFIG_APM_DO_ENABLE is not set
+CONFIG_APM_CPU_IDLE=y
+# CONFIG_APM_DISPLAY_BLANK is not set
+CONFIG_APM_RTC_IS_GMT=y
+# CONFIG_APM_ALLOW_INTS is not set
+# CONFIG_APM_REAL_MODE_POWER_OFF is not set
+
+#
+# Memory Technology Devices (MTD)
+#
+# CONFIG_MTD is not set
+
+#
+# Parallel port support
+#
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_PARPORT_PC_CML1=m
+CONFIG_PARPORT_SERIAL=m
+# CONFIG_PARPORT_PC_FIFO is not set
+# CONFIG_PARPORT_PC_SUPERIO is not set
+CONFIG_PARPORT_PC_PCMCIA=m
+# CONFIG_PARPORT_AMIGA is not set
+# CONFIG_PARPORT_MFC3 is not set
+# CONFIG_PARPORT_ATARI is not set
+# CONFIG_PARPORT_GSC is not set
+# CONFIG_PARPORT_SUNBPP is not set
+# CONFIG_PARPORT_OTHER is not set
+CONFIG_PARPORT_1284=y
+
+#
+# Plug and Play configuration
+#
+CONFIG_PNP=y
+CONFIG_ISAPNP=y
+
+#
+# Block devices
+#
+CONFIG_BLK_DEV_FD=y
+CONFIG_BLK_DEV_XD=m
+CONFIG_PARIDE=m
+CONFIG_PARIDE_PARPORT=m
+CONFIG_PARIDE_PD=m
+CONFIG_PARIDE_PCD=m
+CONFIG_PARIDE_PF=m
+CONFIG_PARIDE_PT=m
+CONFIG_PARIDE_PG=m
+CONFIG_PARIDE_ATEN=m
+CONFIG_PARIDE_BPCK=m
+CONFIG_PARIDE_BPCK6=m
+CONFIG_PARIDE_COMM=m
+CONFIG_PARIDE_DSTR=m
+CONFIG_PARIDE_FIT2=m
+CONFIG_PARIDE_FIT3=m
+CONFIG_PARIDE_EPAT=m
+CONFIG_PARIDE_EPATC8=y
+CONFIG_PARIDE_EPIA=m
+CONFIG_PARIDE_FRIQ=m
+CONFIG_PARIDE_FRPW=m
+CONFIG_PARIDE_KBIC=m
+CONFIG_PARIDE_KTTI=m
+CONFIG_PARIDE_ON20=m
+CONFIG_PARIDE_ON26=m
+CONFIG_BLK_CPQ_DA=m
+CONFIG_BLK_CPQ_CISS_DA=m
+CONFIG_CISS_SCSI_TAPE=y
+CONFIG_BLK_DEV_DAC960=m
+CONFIG_BLK_DEV_UMEM=m
+CONFIG_BLK_DEV_LOOP=m
+CONFIG_BLK_DEV_NBD=m
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=4096
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_BLK_STATS=y
+
+#
+# Multi-device support (RAID and LVM)
+#
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_LINEAR=m
+CONFIG_MD_RAID0=m
+CONFIG_MD_RAID1=m
+CONFIG_MD_RAID5=m
+CONFIG_MD_MULTIPATH=m
+CONFIG_BLK_DEV_LVM=m
+
+#
+# Cryptography support (CryptoAPI)
+#
+CONFIG_CRYPTO=m
+CONFIG_CIPHERS=m
+CONFIG_CIPHER_AES=m
+CONFIG_CIPHER_IDENTITY=m
+CONFIG_CRYPTODEV=m
+CONFIG_CRYPTOLOOP=m
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_NETLINK_DEV=y
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_DEBUG is not set
+CONFIG_FILTER=y
+CONFIG_UNIX=y
+CONFIG_INET=y
+CONFIG_TUX=m
+CONFIG_TUX_EXTCGI=y
+# CONFIG_TUX_EXTENDED_LOG is not set
+# CONFIG_TUX_DEBUG is not set
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_FWMARK=y
+CONFIG_IP_ROUTE_NAT=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_TOS=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_ROUTE_LARGE_TABLES=y
+# CONFIG_IP_PNP is not set
+CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE=m
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+# CONFIG_ARPD is not set
+# CONFIG_INET_ECN is not set
+CONFIG_SYN_COOKIES=y
+
+#
+# IP: Netfilter Configuration
+#
+CONFIG_IP_NF_CONNTRACK=m
+CONFIG_IP_NF_FTP=m
+CONFIG_IP_NF_AMANDA=m
+CONFIG_IP_NF_TFTP=m
+CONFIG_IP_NF_IRC=m
+CONFIG_IP_NF_QUEUE=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_LIMIT=m
+CONFIG_IP_NF_MATCH_MAC=m
+CONFIG_IP_NF_MATCH_PKTTYPE=m
+CONFIG_IP_NF_MATCH_MARK=m
+CONFIG_IP_NF_MATCH_MULTIPORT=m
+CONFIG_IP_NF_MATCH_TOS=m
+CONFIG_IP_NF_MATCH_ECN=m
+CONFIG_IP_NF_MATCH_DSCP=m
+CONFIG_IP_NF_MATCH_AH_ESP=m
+CONFIG_IP_NF_MATCH_LENGTH=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_MATCH_TCPMSS=m
+CONFIG_IP_NF_MATCH_HELPER=m
+CONFIG_IP_NF_MATCH_STATE=m
+CONFIG_IP_NF_MATCH_CONNTRACK=m
+CONFIG_IP_NF_MATCH_UNCLEAN=m
+CONFIG_IP_NF_MATCH_OWNER=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_TARGET_MIRROR=m
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_NF_NAT_AMANDA=m
+CONFIG_IP_NF_NAT_LOCAL=y
+CONFIG_IP_NF_NAT_SNMP_BASIC=m
+CONFIG_IP_NF_NAT_IRC=m
+CONFIG_IP_NF_NAT_FTP=m
+CONFIG_IP_NF_NAT_TFTP=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_IP_NF_TARGET_TOS=m
+CONFIG_IP_NF_TARGET_ECN=m
+CONFIG_IP_NF_TARGET_DSCP=m
+CONFIG_IP_NF_TARGET_MARK=m
+CONFIG_IP_NF_TARGET_LOG=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_IP_NF_TARGET_TCPMSS=m
+CONFIG_IP_NF_ARPTABLES=m
+CONFIG_IP_NF_ARPFILTER=m
+CONFIG_IP_NF_COMPAT_IPCHAINS=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IP_NF_COMPAT_IPFWADM=m
+CONFIG_IP_NF_NAT_NEEDED=y
+CONFIG_IPV6=m
+
+#
+# IPv6: Netfilter Configuration
+#
+# CONFIG_IP6_NF_QUEUE is not set
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_MATCH_LIMIT=m
+CONFIG_IP6_NF_MATCH_MAC=m
+CONFIG_IP6_NF_MATCH_RT=m
+CONFIG_IP6_NF_MATCH_OPTS=m
+CONFIG_IP6_NF_MATCH_FRAG=m
+CONFIG_IP6_NF_MATCH_HL=m
+CONFIG_IP6_NF_MATCH_MULTIPORT=m
+CONFIG_IP6_NF_MATCH_OWNER=m
+CONFIG_IP6_NF_MATCH_MARK=m
+CONFIG_IP6_NF_MATCH_IPV6HEADER=m
+CONFIG_IP6_NF_MATCH_AHESP=m
+CONFIG_IP6_NF_MATCH_LENGTH=m
+CONFIG_IP6_NF_MATCH_EUI64=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_LOG=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_TARGET_MARK=m
+# CONFIG_KHTTPD is not set
+CONFIG_ATM=y
+CONFIG_ATM_CLIP=y
+# CONFIG_ATM_CLIP_NO_ICMP is not set
+CONFIG_ATM_LANE=m
+CONFIG_ATM_MPOA=m
+CONFIG_ATM_BR2684=m
+CONFIG_ATM_BR2684_IPFILTER=y
+CONFIG_VLAN_8021Q=m
+CONFIG_IPX=m
+# CONFIG_IPX_INTERN is not set
+CONFIG_ATALK=m
+
+#
+# Appletalk devices
+#
+CONFIG_DEV_APPLETALK=y
+CONFIG_LTPC=m
+CONFIG_COPS=m
+CONFIG_COPS_DAYNA=y
+CONFIG_COPS_TANGENT=y
+CONFIG_IPDDP=m
+CONFIG_IPDDP_ENCAP=y
+CONFIG_IPDDP_DECAP=y
+CONFIG_DECNET=m
+CONFIG_DECNET_SIOCGIFCONF=y
+CONFIG_DECNET_ROUTER=y
+CONFIG_DECNET_ROUTE_FWMARK=y
+CONFIG_BRIDGE=m
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_LLC is not set
+CONFIG_NET_DIVERT=y
+# CONFIG_ECONET is not set
+CONFIG_WAN_ROUTER=m
+# CONFIG_NET_FASTROUTE is not set
+# CONFIG_NET_HW_FLOWCONTROL is not set
+
+#
+# QoS and/or fair queueing
+#
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_CSZ=m
+# CONFIG_NET_SCH_ATM is not set
+CONFIG_NET_SCH_PRIO=m
+CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_SCH_TEQL=m
+CONFIG_NET_SCH_TBF=m
+CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_DSMARK=m
+CONFIG_NET_SCH_INGRESS=m
+CONFIG_NET_QOS=y
+CONFIG_NET_ESTIMATOR=y
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_TCINDEX=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_ROUTE=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_RSVP6=m
+CONFIG_NET_CLS_POLICE=y
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+
+#
+# Telephony Support
+#
+CONFIG_PHONE=m
+CONFIG_PHONE_IXJ=m
+CONFIG_PHONE_IXJ_PCMCIA=m
+
+#
+# ATA/IDE/MFM/RLL support
+#
+CONFIG_IDE=y
+
+#
+# IDE, ATA and ATAPI Block devices
+#
+CONFIG_BLK_DEV_IDE=y
+# CONFIG_BLK_DEV_HD_IDE is not set
+# CONFIG_BLK_DEV_HD is not set
+CONFIG_BLK_DEV_IDEDISK=y
+CONFIG_IDEDISK_MULTI_MODE=y
+# CONFIG_IDEDISK_STROKE is not set
+CONFIG_BLK_DEV_IDECS=m
+CONFIG_BLK_DEV_IDECD=m
+CONFIG_BLK_DEV_IDETAPE=m
+CONFIG_BLK_DEV_IDEFLOPPY=y
+CONFIG_BLK_DEV_IDESCSI=m
+# CONFIG_IDE_TASK_IOCTL is not set
+CONFIG_BLK_DEV_CMD640=y
+# CONFIG_BLK_DEV_CMD640_ENHANCED is not set
+CONFIG_BLK_DEV_ISAPNP=y
+CONFIG_BLK_DEV_IDEPCI=y
+CONFIG_BLK_DEV_GENERIC=y
+CONFIG_IDEPCI_SHARE_IRQ=y
+CONFIG_BLK_DEV_IDEDMA_PCI=y
+# CONFIG_BLK_DEV_OFFBOARD is not set
+# CONFIG_BLK_DEV_IDEDMA_FORCED is not set
+CONFIG_IDEDMA_PCI_AUTO=y
+# CONFIG_IDEDMA_ONLYDISK is not set
+CONFIG_BLK_DEV_IDEDMA=y
+# CONFIG_IDEDMA_PCI_WIP is not set
+CONFIG_BLK_DEV_ADMA100=y
+CONFIG_BLK_DEV_AEC62XX=y
+CONFIG_BLK_DEV_ALI15X3=y
+# CONFIG_WDC_ALI15X3 is not set
+CONFIG_BLK_DEV_AMD74XX=y
+# CONFIG_AMD74XX_OVERRIDE is not set
+CONFIG_BLK_DEV_CMD64X=y
+CONFIG_BLK_DEV_TRIFLEX=y
+CONFIG_BLK_DEV_CY82C693=y
+CONFIG_BLK_DEV_CS5530=y
+CONFIG_BLK_DEV_HPT34X=y
+# CONFIG_HPT34X_AUTODMA is not set
+CONFIG_BLK_DEV_HPT366=y
+CONFIG_BLK_DEV_PIIX=y
+# CONFIG_BLK_DEV_NS87415 is not set
+# CONFIG_BLK_DEV_OPTI621 is not set
+CONFIG_BLK_DEV_PDC202XX_OLD=y
+# CONFIG_PDC202XX_BURST is not set
+CONFIG_BLK_DEV_PDC202XX_NEW=y
+CONFIG_PDC202XX_FORCE=y
+CONFIG_BLK_DEV_RZ1000=y
+# CONFIG_BLK_DEV_SC1200 is not set
+CONFIG_BLK_DEV_SVWKS=y
+CONFIG_BLK_DEV_SIIMAGE=y
+CONFIG_BLK_DEV_SIS5513=y
+CONFIG_BLK_DEV_SLC90E66=y
+# CONFIG_BLK_DEV_TRM290 is not set
+CONFIG_BLK_DEV_VIA82CXXX=y
+# CONFIG_IDE_CHIPSETS is not set
+CONFIG_IDEDMA_AUTO=y
+# CONFIG_IDEDMA_IVB is not set
+# CONFIG_DMA_NONPCI is not set
+CONFIG_BLK_DEV_PDC202XX=y
+CONFIG_BLK_DEV_IDE_MODES=y
+CONFIG_BLK_DEV_ATARAID=m
+CONFIG_BLK_DEV_ATARAID_PDC=m
+CONFIG_BLK_DEV_ATARAID_HPT=m
+CONFIG_BLK_DEV_ATARAID_SII=m
+
+#
+# SCSI support
+#
+CONFIG_SCSI=m
+CONFIG_BLK_DEV_SD=m
+CONFIG_SD_EXTRA_DEVS=40
+CONFIG_CHR_DEV_ST=m
+CONFIG_CHR_DEV_OSST=m
+CONFIG_BLK_DEV_SR=m
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_SR_EXTRA_DEVS=4
+CONFIG_CHR_DEV_SG=m
+# CONFIG_SCSI_DEBUG_QUEUES is not set
+# CONFIG_SCSI_MULTI_LUN is not set
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_LOGGING=y
+
+#
+# SCSI low-level drivers
+#
+CONFIG_BLK_DEV_3W_XXXX_RAID=m
+CONFIG_SCSI_7000FASST=m
+CONFIG_SCSI_ACARD=m
+CONFIG_SCSI_AHA152X=m
+CONFIG_SCSI_AHA1542=m
+CONFIG_SCSI_AHA1740=m
+CONFIG_SCSI_AACRAID=m
+CONFIG_SCSI_AIC7XXX=m
+CONFIG_AIC7XXX_CMDS_PER_DEVICE=253
+CONFIG_AIC7XXX_RESET_DELAY_MS=15000
+# CONFIG_AIC7XXX_PROBE_EISA_VL is not set
+# CONFIG_AIC7XXX_BUILD_FIRMWARE is not set
+CONFIG_SCSI_AIC79XX=m
+CONFIG_AIC79XX_CMDS_PER_DEVICE=253
+CONFIG_AIC79XX_RESET_DELAY_MS=15000
+# CONFIG_AIC79XX_BUILD_FIRMWARE is not set
+CONFIG_AIC79XX_ENABLE_RD_STRM=y
+# CONFIG_AIC79XX_DEBUG_ENABLE is not set
+CONFIG_AIC79XX_DEBUG_MASK=0
+CONFIG_SCSI_AIC7XXX_OLD=m
+CONFIG_AIC7XXX_OLD_TCQ_ON_BY_DEFAULT=y
+CONFIG_AIC7XXX_OLD_CMDS_PER_DEVICE=32
+CONFIG_AIC7XXX_OLD_PROC_STATS=y
+CONFIG_SCSI_DPT_I2O=m
+CONFIG_SCSI_ADVANSYS=m
+CONFIG_SCSI_IN2000=m
+CONFIG_SCSI_AM53C974=m
+CONFIG_SCSI_MEGARAID=m
+CONFIG_SCSI_BUSLOGIC=m
+# CONFIG_SCSI_OMIT_FLASHPOINT is not set
+CONFIG_SCSI_CPQFCTS=m
+CONFIG_SCSI_DMX3191D=m
+CONFIG_SCSI_DTC3280=m
+CONFIG_SCSI_EATA=m
+CONFIG_SCSI_EATA_TAGGED_QUEUE=y
+# CONFIG_SCSI_EATA_LINKED_COMMANDS is not set
+CONFIG_SCSI_EATA_MAX_TAGS=16
+CONFIG_SCSI_EATA_DMA=m
+CONFIG_SCSI_EATA_PIO=m
+CONFIG_SCSI_FUTURE_DOMAIN=m
+CONFIG_SCSI_GDTH=m
+CONFIG_SCSI_GENERIC_NCR5380=m
+# CONFIG_SCSI_GENERIC_NCR53C400 is not set
+CONFIG_SCSI_G_NCR5380_PORT=y
+# CONFIG_SCSI_G_NCR5380_MEM is not set
+CONFIG_SCSI_IPS=m
+CONFIG_SCSI_INITIO=m
+CONFIG_SCSI_INIA100=m
+CONFIG_SCSI_PPA=m
+CONFIG_SCSI_IMM=m
+# CONFIG_SCSI_IZIP_EPP16 is not set
+# CONFIG_SCSI_IZIP_SLOW_CTR is not set
+CONFIG_SCSI_NCR53C406A=m
+CONFIG_SCSI_NCR53C7xx=m
+# CONFIG_SCSI_NCR53C7xx_sync is not set
+CONFIG_SCSI_NCR53C7xx_FAST=y
+CONFIG_SCSI_NCR53C7xx_DISCONNECT=y
+CONFIG_SCSI_SYM53C8XX_2=m
+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1
+CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16
+CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64
+# CONFIG_SCSI_SYM53C8XX_IOMAPPED is not set
+CONFIG_SCSI_NCR53C8XX=m
+CONFIG_SCSI_SYM53C8XX=m
+CONFIG_SCSI_NCR53C8XX_DEFAULT_TAGS=8
+CONFIG_SCSI_NCR53C8XX_MAX_TAGS=32
+CONFIG_SCSI_NCR53C8XX_SYNC=40
+# CONFIG_SCSI_NCR53C8XX_PROFILE is not set
+# CONFIG_SCSI_NCR53C8XX_IOMAPPED is not set
+# CONFIG_SCSI_NCR53C8XX_PQS_PDS is not set
+# CONFIG_SCSI_NCR53C8XX_SYMBIOS_COMPAT is not set
+CONFIG_SCSI_PAS16=m
+CONFIG_SCSI_PCI2000=m
+CONFIG_SCSI_PCI2220I=m
+CONFIG_SCSI_PSI240I=m
+CONFIG_SCSI_QLOGIC_FAS=m
+CONFIG_SCSI_QLOGIC_ISP=m
+CONFIG_SCSI_QLOGIC_FC=m
+# CONFIG_SCSI_QLOGIC_FC_FIRMWARE is not set
+CONFIG_SCSI_QLOGIC_1280=m
+CONFIG_SCSI_NEWISP=m
+CONFIG_SCSI_SEAGATE=m
+CONFIG_SCSI_SIM710=m
+CONFIG_SCSI_SYM53C416=m
+CONFIG_SCSI_DC390T=m
+# CONFIG_SCSI_DC390T_NOGENSUPP is not set
+CONFIG_SCSI_T128=m
+CONFIG_SCSI_U14_34F=m
+# CONFIG_SCSI_U14_34F_LINKED_COMMANDS is not set
+CONFIG_SCSI_U14_34F_MAX_TAGS=8
+CONFIG_SCSI_ULTRASTOR=m
+CONFIG_SCSI_NSP32=m
+CONFIG_SCSI_DEBUG=m
+
+#
+# PCMCIA SCSI adapter support
+#
+CONFIG_SCSI_PCMCIA=y
+CONFIG_PCMCIA_AHA152X=m
+CONFIG_PCMCIA_FDOMAIN=m
+CONFIG_PCMCIA_NINJA_SCSI=m
+CONFIG_PCMCIA_QLOGIC=m
+
+#
+# Fusion MPT device support
+#
+CONFIG_FUSION=m
+# CONFIG_FUSION_BOOT is not set
+CONFIG_FUSION_MAX_SGE=40
+# CONFIG_FUSION_ISENSE is not set
+CONFIG_FUSION_CTL=m
+CONFIG_FUSION_LAN=m
+CONFIG_NET_FC=y
+
+#
+# IEEE 1394 (FireWire) support (EXPERIMENTAL)
+#
+CONFIG_IEEE1394=m
+# CONFIG_IEEE1394_PCILYNX is not set
+CONFIG_IEEE1394_OHCI1394=m
+CONFIG_IEEE1394_VIDEO1394=m
+CONFIG_IEEE1394_SBP2=m
+CONFIG_IEEE1394_SBP2_PHYS_DMA=y
+CONFIG_IEEE1394_ETH1394=m
+CONFIG_IEEE1394_DV1394=m
+CONFIG_IEEE1394_RAWIO=m
+CONFIG_IEEE1394_CMP=m
+CONFIG_IEEE1394_AMDTP=m
+# CONFIG_IEEE1394_VERBOSEDEBUG is not set
+
+#
+# I2O device support
+#
+CONFIG_I2O=m
+CONFIG_I2O_PCI=m
+CONFIG_I2O_BLOCK=m
+CONFIG_I2O_LAN=m
+CONFIG_I2O_SCSI=m
+CONFIG_I2O_PROC=m
+
+#
+# Network device support
+#
+CONFIG_NETDEVICES=y
+
+#
+# ARCnet devices
+#
+# CONFIG_ARCNET is not set
+CONFIG_DUMMY=m
+CONFIG_BONDING=m
+CONFIG_EQUALIZER=m
+CONFIG_TUN=m
+CONFIG_ETHERTAP=m
+CONFIG_NET_SB1000=m
+
+#
+# Ethernet (10 or 100Mbit)
+#
+CONFIG_NET_ETHERNET=y
+# CONFIG_SUNLANCE is not set
+CONFIG_HAPPYMEAL=m
+# CONFIG_SUNBMAC is not set
+# CONFIG_SUNQE is not set
+CONFIG_SUNGEM=m
+CONFIG_NET_VENDOR_3COM=y
+CONFIG_EL1=m
+CONFIG_EL2=m
+CONFIG_ELPLUS=m
+CONFIG_EL16=m
+CONFIG_EL3=m
+CONFIG_3C515=m
+# CONFIG_ELMC is not set
+# CONFIG_ELMC_II is not set
+CONFIG_VORTEX=m
+CONFIG_TYPHOON=m
+CONFIG_LANCE=m
+CONFIG_NET_VENDOR_SMC=y
+CONFIG_WD80x3=m
+# CONFIG_ULTRAMCA is not set
+CONFIG_ULTRA=m
+CONFIG_ULTRA32=m
+CONFIG_SMC9194=m
+CONFIG_NET_VENDOR_RACAL=y
+CONFIG_NI5010=m
+CONFIG_NI52=m
+CONFIG_NI65=m
+CONFIG_AT1700=m
+CONFIG_DEPCA=m
+CONFIG_HP100=m
+CONFIG_NET_ISA=y
+CONFIG_E2100=m
+# CONFIG_EWRK3 is not set
+CONFIG_EEXPRESS=m
+CONFIG_EEXPRESS_PRO=m
+CONFIG_HPLAN_PLUS=m
+CONFIG_HPLAN=m
+CONFIG_LP486E=m
+CONFIG_ETH16I=m
+CONFIG_NE2000=m
+CONFIG_NET_PCI=y
+CONFIG_PCNET32=m
+CONFIG_AMD8111_ETH=m
+CONFIG_ADAPTEC_STARFIRE=m
+CONFIG_AC3200=m
+CONFIG_APRICOT=m
+CONFIG_CS89x0=m
+CONFIG_TULIP=m
+# CONFIG_TULIP_MWI is not set
+CONFIG_TULIP_MMIO=y
+CONFIG_DE4X5=m
+CONFIG_DGRS=m
+CONFIG_DM9102=m
+CONFIG_EEPRO100=m
+# CONFIG_EEPRO100_PIO is not set
+CONFIG_E100=m
+CONFIG_LNE390=m
+CONFIG_FEALNX=m
+CONFIG_NATSEMI=m
+CONFIG_NE2K_PCI=m
+CONFIG_NE3210=m
+CONFIG_ES3210=m
+CONFIG_8139CP=m
+CONFIG_8139TOO=m
+# CONFIG_8139TOO_PIO is not set
+# CONFIG_8139TOO_TUNE_TWISTER is not set
+CONFIG_8139TOO_8129=y
+# CONFIG_8139_OLD_RX_RESET is not set
+CONFIG_SIS900=m
+CONFIG_EPIC100=m
+CONFIG_SUNDANCE=m
+# CONFIG_SUNDANCE_MMIO is not set
+CONFIG_TLAN=m
+CONFIG_TC35815=m
+CONFIG_VIA_RHINE=m
+# CONFIG_VIA_RHINE_MMIO is not set
+CONFIG_WINBOND_840=m
+CONFIG_NET_POCKET=y
+CONFIG_ATP=m
+CONFIG_DE600=m
+CONFIG_DE620=m
+
+#
+# Ethernet (1000 Mbit)
+#
+CONFIG_ACENIC=m
+# CONFIG_ACENIC_OMIT_TIGON_I is not set
+CONFIG_DL2K=m
+CONFIG_E1000=m
+# CONFIG_MYRI_SBUS is not set
+CONFIG_NS83820=m
+CONFIG_HAMACHI=m
+CONFIG_YELLOWFIN=m
+CONFIG_R8169=m
+CONFIG_SK98LIN=m
+CONFIG_TIGON3=m
+CONFIG_FDDI=y
+CONFIG_DEFXX=m
+CONFIG_SKFP=m
+CONFIG_NETCONSOLE=m
+# CONFIG_HIPPI is not set
+CONFIG_PLIP=m
+CONFIG_PPP=m
+CONFIG_PPP_MULTILINK=y
+CONFIG_PPP_FILTER=y
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+CONFIG_PPP_DEFLATE=m
+# CONFIG_PPP_BSDCOMP is not set
+CONFIG_PPPOE=m
+CONFIG_PPPOATM=m
+CONFIG_SLIP=m
+CONFIG_SLIP_COMPRESSED=y
+CONFIG_SLIP_SMART=y
+CONFIG_SLIP_MODE_SLIP6=y
+
+#
+# Wireless LAN (non-hamradio)
+#
+CONFIG_NET_RADIO=y
+CONFIG_STRIP=m
+CONFIG_WAVELAN=m
+CONFIG_ARLAN=m
+CONFIG_AIRONET4500=m
+CONFIG_AIRONET4500_NONCS=m
+CONFIG_AIRONET4500_PNP=y
+CONFIG_AIRONET4500_PCI=y
+CONFIG_AIRONET4500_ISA=y
+CONFIG_AIRONET4500_I365=y
+CONFIG_AIRONET4500_PROC=m
+CONFIG_AIRO=m
+CONFIG_HERMES=m
+CONFIG_PLX_HERMES=m
+CONFIG_PCI_HERMES=m
+CONFIG_PCMCIA_HERMES=m
+CONFIG_AIRO_CS=m
+CONFIG_NET_WIRELESS=y
+CONFIG_PCMCIA_HERMES_OLD=m
+
+#
+# Token Ring devices
+#
+CONFIG_TR=y
+CONFIG_IBMTR=m
+CONFIG_IBMOL=m
+CONFIG_IBMLS=m
+CONFIG_3C359=m
+CONFIG_TMS380TR=m
+CONFIG_TMSPCI=m
+CONFIG_TMSISA=m
+CONFIG_ABYSS=m
+# CONFIG_MADGEMC is not set
+CONFIG_SMCTR=m
+CONFIG_NET_FC=y
+CONFIG_IPHASE5526=m
+CONFIG_RCPCI=m
+CONFIG_SHAPER=m
+
+#
+# Wan interfaces
+#
+CONFIG_WAN=y
+CONFIG_HOSTESS_SV11=m
+CONFIG_COSA=m
+# CONFIG_COMX is not set
+# CONFIG_DSCC4 is not set
+# CONFIG_LANMEDIA is not set
+CONFIG_ATI_XX20=m
+CONFIG_SEALEVEL_4021=m
+# CONFIG_SYNCLINK_SYNCPPP is not set
+# CONFIG_HDLC is not set
+CONFIG_DLCI=m
+CONFIG_DLCI_COUNT=24
+CONFIG_DLCI_MAX=8
+CONFIG_SDLA=m
+CONFIG_WAN_ROUTER_DRIVERS=y
+CONFIG_VENDOR_SANGOMA=m
+CONFIG_WANPIPE_CHDLC=y
+CONFIG_WANPIPE_FR=y
+CONFIG_WANPIPE_X25=y
+CONFIG_WANPIPE_PPP=y
+CONFIG_WANPIPE_MULTPPP=y
+CONFIG_CYCLADES_SYNC=m
+CONFIG_CYCLOMX_X25=y
+# CONFIG_LAPBETHER is not set
+# CONFIG_X25_ASY is not set
+CONFIG_SBNI=m
+CONFIG_SBNI_MULTILINE=y
+
+#
+# PCMCIA network device support
+#
+CONFIG_NET_PCMCIA=y
+CONFIG_PCMCIA_3C589=m
+CONFIG_PCMCIA_3C574=m
+CONFIG_PCMCIA_FMVJ18X=m
+CONFIG_PCMCIA_PCNET=m
+CONFIG_PCMCIA_AXNET=m
+CONFIG_PCMCIA_NMCLAN=m
+CONFIG_PCMCIA_SMC91C92=m
+CONFIG_PCMCIA_XIRC2PS=m
+# CONFIG_ARCNET_COM20020_CS is not set
+CONFIG_PCMCIA_IBMTR=m
+CONFIG_PCMCIA_XIRCOM=m
+CONFIG_PCMCIA_XIRTULIP=m
+CONFIG_NET_PCMCIA_RADIO=y
+CONFIG_PCMCIA_RAYCS=m
+CONFIG_PCMCIA_NETWAVE=m
+CONFIG_PCMCIA_WAVELAN=m
+CONFIG_PCMCIA_WVLAN=m
+CONFIG_AIRONET4500_CS=m
+
+#
+# ATM drivers
+#
+CONFIG_ATM_TCP=m
+CONFIG_ATM_LANAI=m
+CONFIG_ATM_ENI=m
+# CONFIG_ATM_ENI_DEBUG is not set
+# CONFIG_ATM_ENI_TUNE_BURST is not set
+CONFIG_ATM_FIRESTREAM=m
+CONFIG_ATM_ZATM=m
+# CONFIG_ATM_ZATM_DEBUG is not set
+CONFIG_ATM_ZATM_EXACT_TS=y
+CONFIG_ATM_NICSTAR=m
+CONFIG_ATM_NICSTAR_USE_SUNI=y
+CONFIG_ATM_NICSTAR_USE_IDT77105=y
+CONFIG_ATM_IDT77252=m
+# CONFIG_ATM_IDT77252_DEBUG is not set
+# CONFIG_ATM_IDT77252_RCV_ALL is not set
+CONFIG_ATM_IDT77252_USE_SUNI=y
+CONFIG_ATM_AMBASSADOR=m
+# CONFIG_ATM_AMBASSADOR_DEBUG is not set
+CONFIG_ATM_HORIZON=m
+# CONFIG_ATM_HORIZON_DEBUG is not set
+CONFIG_ATM_IA=m
+# CONFIG_ATM_IA_DEBUG is not set
+CONFIG_ATM_FORE200E_MAYBE=m
+CONFIG_ATM_FORE200E_PCA=y
+CONFIG_ATM_FORE200E_PCA_DEFAULT_FW=y
+CONFIG_ATM_FORE200E_TX_RETRY=16
+CONFIG_ATM_FORE200E_DEBUG=0
+CONFIG_ATM_FORE200E=m
+
+#
+# Amateur Radio support
+#
+CONFIG_HAMRADIO=y
+CONFIG_AX25=m
+# CONFIG_AX25_DAMA_SLAVE is not set
+CONFIG_NETROM=m
+CONFIG_ROSE=m
+
+#
+# AX.25 network device drivers
+#
+# CONFIG_MKISS is not set
+# CONFIG_6PACK is not set
+# CONFIG_BPQETHER is not set
+# CONFIG_DMASCC is not set
+# CONFIG_SCC is not set
+# CONFIG_BAYCOM_SER_FDX is not set
+# CONFIG_BAYCOM_SER_HDX is not set
+# CONFIG_BAYCOM_PAR is not set
+# CONFIG_BAYCOM_EPP is not set
+CONFIG_SOUNDMODEM=m
+CONFIG_SOUNDMODEM_SBC=y
+CONFIG_SOUNDMODEM_WSS=y
+CONFIG_SOUNDMODEM_AFSK1200=y
+CONFIG_SOUNDMODEM_AFSK2400_7=y
+CONFIG_SOUNDMODEM_AFSK2400_8=y
+CONFIG_SOUNDMODEM_AFSK2666=y
+CONFIG_SOUNDMODEM_HAPN4800=y
+CONFIG_SOUNDMODEM_PSK4800=y
+CONFIG_SOUNDMODEM_FSK9600=y
+# CONFIG_YAM is not set
+
+#
+# IrDA (infrared) support
+#
+CONFIG_IRDA=m
+CONFIG_IRLAN=m
+CONFIG_IRNET=m
+CONFIG_IRCOMM=m
+CONFIG_IRDA_ULTRA=y
+CONFIG_IRDA_CACHE_LAST_LSAP=y
+CONFIG_IRDA_FAST_RR=y
+# CONFIG_IRDA_DEBUG is not set
+
+#
+# Infrared-port device drivers
+#
+CONFIG_IRTTY_SIR=m
+CONFIG_IRPORT_SIR=m
+CONFIG_DONGLE=y
+CONFIG_ESI_DONGLE=m
+CONFIG_ACTISYS_DONGLE=m
+CONFIG_TEKRAM_DONGLE=m
+CONFIG_GIRBIL_DONGLE=m
+CONFIG_LITELINK_DONGLE=m
+CONFIG_MCP2120_DONGLE=m
+CONFIG_OLD_BELKIN_DONGLE=m
+CONFIG_ACT200L_DONGLE=m
+CONFIG_MA600_DONGLE=m
+CONFIG_USB_IRDA=m
+CONFIG_NSC_FIR=m
+CONFIG_WINBOND_FIR=m
+CONFIG_TOSHIBA_OLD=m
+CONFIG_TOSHIBA_FIR=m
+CONFIG_SMC_IRCC_FIR=m
+CONFIG_ALI_FIR=m
+CONFIG_VLSI_FIR=m
+
+#
+# ISDN subsystem
+#
+CONFIG_ISDN=m
+CONFIG_ISDN_BOOL=y
+CONFIG_ISDN_PPP=y
+CONFIG_ISDN_PPP_VJ=y
+CONFIG_ISDN_MPP=y
+CONFIG_ISDN_PPP_BSDCOMP=m
+CONFIG_ISDN_AUDIO=y
+CONFIG_ISDN_TTY_FAX=y
+
+#
+# ISDN feature submodules
+#
+CONFIG_ISDN_DRV_LOOP=m
+# CONFIG_ISDN_DIVERSION is not set
+
+#
+# Passive ISDN cards
+#
+CONFIG_ISDN_DRV_HISAX=m
+CONFIG_ISDN_HISAX=y
+CONFIG_HISAX_EURO=y
+CONFIG_DE_AOC=y
+# CONFIG_HISAX_NO_SENDCOMPLETE is not set
+# CONFIG_HISAX_NO_LLC is not set
+# CONFIG_HISAX_NO_KEYPAD is not set
+CONFIG_HISAX_1TR6=y
+CONFIG_HISAX_NI1=y
+CONFIG_HISAX_MAX_CARDS=8
+CONFIG_HISAX_16_0=y
+CONFIG_HISAX_16_3=y
+CONFIG_HISAX_AVM_A1=y
+CONFIG_HISAX_IX1MICROR2=y
+CONFIG_HISAX_ASUSCOM=y
+CONFIG_HISAX_TELEINT=y
+CONFIG_HISAX_HFCS=y
+CONFIG_HISAX_SPORTSTER=y
+CONFIG_HISAX_MIC=y
+CONFIG_HISAX_ISURF=y
+CONFIG_HISAX_HSTSAPHIR=y
+CONFIG_HISAX_TELESPCI=y
+CONFIG_HISAX_S0BOX=y
+CONFIG_HISAX_FRITZPCI=y
+CONFIG_HISAX_AVM_A1_PCMCIA=y
+CONFIG_HISAX_ELSA=y
+CONFIG_HISAX_DIEHLDIVA=y
+CONFIG_HISAX_SEDLBAUER=y
+CONFIG_HISAX_NETJET=y
+CONFIG_HISAX_NETJET_U=y
+CONFIG_HISAX_NICCY=y
+CONFIG_HISAX_BKM_A4T=y
+CONFIG_HISAX_SCT_QUADRO=y
+CONFIG_HISAX_GAZEL=y
+CONFIG_HISAX_HFC_PCI=y
+CONFIG_HISAX_W6692=y
+CONFIG_HISAX_HFC_SX=y
+CONFIG_HISAX_ENTERNOW_PCI=y
+CONFIG_HISAX_DEBUG=y
+CONFIG_HISAX_SEDLBAUER_CS=m
+CONFIG_HISAX_ELSA_CS=m
+CONFIG_HISAX_AVM_A1_CS=m
+CONFIG_HISAX_ST5481=m
+CONFIG_HISAX_FRITZ_PCIPNP=m
+CONFIG_USB_AUERISDN=m
+
+#
+# Active ISDN cards
+#
+CONFIG_ISDN_DRV_ICN=m
+CONFIG_ISDN_DRV_PCBIT=m
+# CONFIG_ISDN_DRV_SC is not set
+# CONFIG_ISDN_DRV_ACT2000 is not set
+CONFIG_ISDN_DRV_EICON=y
+CONFIG_ISDN_DRV_EICON_DIVAS=m
+# CONFIG_ISDN_DRV_EICON_OLD is not set
+CONFIG_ISDN_DRV_TPAM=m
+CONFIG_ISDN_CAPI=m
+CONFIG_ISDN_DRV_AVMB1_VERBOSE_REASON=y
+CONFIG_ISDN_CAPI_MIDDLEWARE=y
+CONFIG_ISDN_CAPI_CAPI20=m
+CONFIG_ISDN_CAPI_CAPIFS_BOOL=y
+CONFIG_ISDN_CAPI_CAPIFS=m
+CONFIG_ISDN_CAPI_CAPIDRV=m
+CONFIG_ISDN_DRV_AVMB1_B1ISA=m
+CONFIG_ISDN_DRV_AVMB1_B1PCI=m
+CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y
+CONFIG_ISDN_DRV_AVMB1_T1ISA=m
+CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m
+CONFIG_ISDN_DRV_AVMB1_AVM_CS=m
+CONFIG_ISDN_DRV_AVMB1_T1PCI=m
+CONFIG_ISDN_DRV_AVMB1_C4=m
+CONFIG_HYSDN=m
+CONFIG_HYSDN_CAPI=y
+CONFIG_KALLSYMS=y
+
+#
+# Old CD-ROM drivers (not SCSI, not IDE)
+#
+# CONFIG_CD_NO_IDESCSI is not set
+
+#
+# Input core support
+#
+CONFIG_INPUT=m
+CONFIG_INPUT_KEYBDEV=m
+CONFIG_INPUT_MOUSEDEV=m
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+CONFIG_INPUT_JOYDEV=m
+CONFIG_INPUT_EVDEV=m
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_ECC=m
+CONFIG_VT_CONSOLE=y
+CONFIG_SERIAL=y
+CONFIG_SERIAL_CONSOLE=y
+CONFIG_SERIAL_EXTENDED=y
+CONFIG_SERIAL_MANY_PORTS=y
+CONFIG_SERIAL_SHARE_IRQ=y
+# CONFIG_SERIAL_DETECT_IRQ is not set
+CONFIG_SERIAL_MULTIPORT=y
+# CONFIG_HUB6 is not set
+CONFIG_SERIAL_NONSTANDARD=y
+CONFIG_COMPUTONE=m
+CONFIG_ROCKETPORT=m
+CONFIG_CYCLADES=m
+# CONFIG_CYZ_INTR is not set
+CONFIG_DIGIEPCA=m
+CONFIG_ESPSERIAL=m
+CONFIG_MOXA_INTELLIO=m
+CONFIG_MOXA_SMARTIO=m
+CONFIG_ISI=m
+CONFIG_SYNCLINK=m
+# CONFIG_SYNCLINKMP is not set
+CONFIG_N_HDLC=m
+CONFIG_RISCOM8=m
+CONFIG_SPECIALIX=m
+CONFIG_SPECIALIX_RTSCTS=y
+CONFIG_SX=m
+# CONFIG_RIO is not set
+CONFIG_STALDRV=y
+CONFIG_STALLION=m
+CONFIG_ISTALLION=m
+CONFIG_UNIX98_PTYS=y
+CONFIG_UNIX98_PTY_COUNT=2048
+CONFIG_PRINTER=m
+CONFIG_LP_CONSOLE=y
+CONFIG_PPDEV=m
+CONFIG_TIPAR=m
+
+#
+# I2C support
+#
+CONFIG_I2C=m
+CONFIG_I2C_ALGOBIT=m
+CONFIG_I2C_PHILIPSPAR=m
+CONFIG_I2C_ELV=m
+CONFIG_I2C_VELLEMAN=m
+# CONFIG_SCx200_I2C is not set
+# CONFIG_SCx200_ACB is not set
+CONFIG_I2C_ALGOPCF=m
+CONFIG_I2C_ELEKTOR=m
+CONFIG_I2C_MAINBOARD=y
+CONFIG_I2C_ALI1535=m
+CONFIG_I2C_ALI15X3=m
+CONFIG_I2C_HYDRA=m
+CONFIG_I2C_AMD756=m
+# CONFIG_I2C_TSUNAMI is not set
+CONFIG_I2C_I801=m
+# CONFIG_I2C_I810 is not set
+CONFIG_I2C_PIIX4=m
+CONFIG_I2C_SIS5595=m
+CONFIG_I2C_VIA=m
+CONFIG_I2C_VIAPRO=m
+CONFIG_I2C_VOODOO3=m
+CONFIG_I2C_ISA=m
+CONFIG_I2C_CHARDEV=m
+CONFIG_I2C_PROC=m
+
+#
+# Hardware sensors support
+#
+CONFIG_SENSORS=y
+CONFIG_SENSORS_ADM1021=m
+CONFIG_SENSORS_ADM1024=m
+CONFIG_SENSORS_ADM1025=m
+CONFIG_SENSORS_ADM9240=m
+CONFIG_SENSORS_DS1621=m
+CONFIG_SENSORS_FSCPOS=m
+CONFIG_SENSORS_FSCSCY=m
+CONFIG_SENSORS_GL518SM=m
+CONFIG_SENSORS_GL520SM=m
+CONFIG_SENSORS_MAXILIFE=m
+CONFIG_SENSORS_IT87=m
+CONFIG_SENSORS_MTP008=m
+CONFIG_SENSORS_LM75=m
+CONFIG_SENSORS_LM78=m
+CONFIG_SENSORS_LM80=m
+CONFIG_SENSORS_LM87=m
+CONFIG_SENSORS_LM92=m
+CONFIG_SENSORS_SIS5595=m
+CONFIG_SENSORS_SMSC47M1=m
+CONFIG_SENSORS_THMC50=m
+CONFIG_SENSORS_VIA686A=m
+CONFIG_SENSORS_VT1211=m
+CONFIG_SENSORS_VT8231=m
+CONFIG_SENSORS_W83781D=m
+CONFIG_SENSORS_OTHER=y
+CONFIG_SENSORS_BT869=m
+CONFIG_SENSORS_DDCMON=m
+CONFIG_SENSORS_EEPROM=m
+CONFIG_SENSORS_MATORB=m
+CONFIG_SENSORS_PCF8574=m
+CONFIG_SENSORS_PCF8591=m
+
+#
+# Mice
+#
+CONFIG_BUSMOUSE=m
+CONFIG_ATIXL_BUSMOUSE=m
+CONFIG_LOGIBUSMOUSE=m
+CONFIG_MS_BUSMOUSE=m
+CONFIG_MOUSE=y
+CONFIG_PSMOUSE=y
+CONFIG_82C710_MOUSE=m
+CONFIG_PC110_PAD=m
+CONFIG_MK712_MOUSE=m
+
+#
+# Joysticks
+#
+CONFIG_INPUT_GAMEPORT=m
+CONFIG_INPUT_NS558=m
+CONFIG_INPUT_LIGHTNING=m
+CONFIG_INPUT_PCIGAME=m
+CONFIG_INPUT_CS461X=m
+CONFIG_INPUT_EMU10K1=m
+CONFIG_INPUT_SERIO=m
+CONFIG_INPUT_SERPORT=m
+CONFIG_INPUT_ANALOG=m
+CONFIG_INPUT_A3D=m
+CONFIG_INPUT_ADI=m
+CONFIG_INPUT_COBRA=m
+CONFIG_INPUT_GF2K=m
+CONFIG_INPUT_GRIP=m
+CONFIG_INPUT_INTERACT=m
+CONFIG_INPUT_TMDC=m
+CONFIG_INPUT_SIDEWINDER=m
+CONFIG_INPUT_IFORCE_USB=m
+CONFIG_INPUT_IFORCE_232=m
+CONFIG_INPUT_WARRIOR=m
+CONFIG_INPUT_MAGELLAN=m
+CONFIG_INPUT_SPACEORB=m
+CONFIG_INPUT_SPACEBALL=m
+CONFIG_INPUT_STINGER=m
+CONFIG_INPUT_DB9=m
+CONFIG_INPUT_GAMECON=m
+CONFIG_INPUT_TURBOGRAFX=m
+# CONFIG_QIC02_TAPE is not set
+CONFIG_IPMI_HANDLER=m
+# CONFIG_IPMI_PANIC_EVENT is not set
+CONFIG_IPMI_DEVICE_INTERFACE=m
+CONFIG_IPMI_KCS=m
+CONFIG_IPMI_WATCHDOG=m
+
+#
+# Watchdog Cards
+#
+CONFIG_WATCHDOG=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+CONFIG_ACQUIRE_WDT=m
+CONFIG_ADVANTECH_WDT=m
+CONFIG_ALIM7101_WDT=m
+CONFIG_SC520_WDT=m
+CONFIG_PCWATCHDOG=m
+CONFIG_EUROTECH_WDT=m
+CONFIG_IB700_WDT=m
+CONFIG_WAFER_WDT=m
+CONFIG_I810_TCO=m
+# CONFIG_MIXCOMWD is not set
+# CONFIG_60XX_WDT is not set
+CONFIG_SC1200_WDT=m
+# CONFIG_SCx200_WDT is not set
+CONFIG_SOFT_WATCHDOG=m
+CONFIG_W83877F_WDT=m
+CONFIG_WDT=m
+CONFIG_WDTPCI=m
+# CONFIG_WDT_501 is not set
+CONFIG_MACHZ_WDT=m
+CONFIG_AMD7XX_TCO=m
+# CONFIG_SCx200_GPIO is not set
+CONFIG_AMD_RNG=m
+CONFIG_INTEL_RNG=m
+CONFIG_AMD_PM768=m
+CONFIG_NVRAM=m
+CONFIG_RTC=y
+CONFIG_DTLK=m
+CONFIG_R3964=m
+# CONFIG_APPLICOM is not set
+CONFIG_SONYPI=m
+
+#
+# Ftape, the floppy tape device driver
+#
+CONFIG_FTAPE=m
+CONFIG_ZFTAPE=m
+CONFIG_ZFT_DFLT_BLK_SZ=10240
+CONFIG_ZFT_COMPRESSOR=m
+CONFIG_FT_NR_BUFFERS=3
+# CONFIG_FT_PROC_FS is not set
+CONFIG_FT_NORMAL_DEBUG=y
+# CONFIG_FT_FULL_DEBUG is not set
+# CONFIG_FT_NO_TRACE is not set
+# CONFIG_FT_NO_TRACE_AT_ALL is not set
+CONFIG_FT_STD_FDC=y
+# CONFIG_FT_MACH2 is not set
+# CONFIG_FT_PROBE_FC10 is not set
+# CONFIG_FT_ALT_FDC is not set
+CONFIG_FT_FDC_THR=8
+CONFIG_FT_FDC_MAX_RATE=2000
+CONFIG_FT_ALPHA_CLOCK=0
+CONFIG_AGP=m
+CONFIG_AGP_INTEL=y
+CONFIG_AGP_I810=y
+CONFIG_AGP_VIA=y
+CONFIG_AGP_AMD=y
+CONFIG_AGP_AMD_8151=y
+CONFIG_AGP_SIS=y
+CONFIG_AGP_ALI=y
+CONFIG_AGP_SWORKS=y
+CONFIG_DRM=y
+# CONFIG_DRM_OLD is not set
+CONFIG_DRM_NEW=y
+CONFIG_DRM_TDFX=m
+CONFIG_DRM_R128=m
+CONFIG_DRM_RADEON=m
+CONFIG_DRM_I810=m
+# CONFIG_DRM_I810_XFREE_41 is not set
+CONFIG_DRM_I830=m
+CONFIG_DRM_MGA=m
+# CONFIG_DRM_SIS is not set
+
+#
+# PCMCIA character devices
+#
+CONFIG_PCMCIA_SERIAL_CS=m
+CONFIG_SYNCLINK_CS=m
+CONFIG_MWAVE=m
+CONFIG_BATTERY_GERICOM=m
+
+#
+# Multimedia devices
+#
+CONFIG_VIDEO_DEV=m
+
+#
+# Video For Linux
+#
+CONFIG_VIDEO_PROC_FS=y
+CONFIG_I2C_PARPORT=m
+CONFIG_VIDEO_BT848=m
+CONFIG_VIDEO_PMS=m
+CONFIG_VIDEO_BWQCAM=m
+CONFIG_VIDEO_CQCAM=m
+CONFIG_VIDEO_W9966=m
+CONFIG_VIDEO_CPIA=m
+CONFIG_VIDEO_CPIA_PP=m
+CONFIG_VIDEO_CPIA_USB=m
+CONFIG_VIDEO_SAA5249=m
+CONFIG_TUNER_3036=m
+CONFIG_VIDEO_STRADIS=m
+CONFIG_VIDEO_ZORAN=m
+CONFIG_VIDEO_ZORAN_BUZ=m
+CONFIG_VIDEO_ZORAN_DC10=m
+CONFIG_VIDEO_ZORAN_LML33=m
+CONFIG_VIDEO_ZR36120=m
+CONFIG_VIDEO_MEYE=m
+
+#
+# Radio Adapters
+#
+CONFIG_RADIO_CADET=m
+CONFIG_RADIO_RTRACK=m
+CONFIG_RADIO_RTRACK2=m
+CONFIG_RADIO_AZTECH=m
+CONFIG_RADIO_GEMTEK=m
+CONFIG_RADIO_GEMTEK_PCI=m
+CONFIG_RADIO_MAXIRADIO=m
+CONFIG_RADIO_MAESTRO=m
+CONFIG_RADIO_MIROPCM20=m
+CONFIG_RADIO_MIROPCM20_RDS=m
+CONFIG_RADIO_SF16FMI=m
+CONFIG_RADIO_SF16FMR2=m
+CONFIG_RADIO_TERRATEC=m
+CONFIG_RADIO_TRUST=m
+CONFIG_RADIO_TYPHOON=m
+CONFIG_RADIO_TYPHOON_PROC_FS=y
+CONFIG_RADIO_ZOLTRIX=m
+
+#
+# Crypto Hardware support
+#
+CONFIG_CRYPTO=m
+CONFIG_CRYPTO_BROADCOM=m
+
+#
+# File systems
+#
+CONFIG_QUOTA=y
+# CONFIG_QFMT_V1 is not set
+CONFIG_QFMT_V2=y
+# CONFIG_QIFACE_COMPAT is not set
+CONFIG_AUTOFS_FS=m
+CONFIG_AUTOFS4_FS=m
+CONFIG_REISERFS_FS=m
+# CONFIG_REISERFS_CHECK is not set
+CONFIG_REISERFS_PROC_INFO=y
+# CONFIG_ADFS_FS is not set
+CONFIG_AFS_FS=m
+# CONFIG_ADFS_FS_RW is not set
+# CONFIG_AFFS_FS is not set
+CONFIG_HFS_FS=m
+CONFIG_BEFS_FS=m
+# CONFIG_BEFS_DEBUG is not set
+CONFIG_BFS_FS=m
+CONFIG_EXT3_FS=m
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_XATTR_SHARING=y
+CONFIG_EXT3_FS_XATTR_USER=y
+CONFIG_JBD=m
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FAT_FS=m
+CONFIG_MSDOS_FS=m
+CONFIG_UMSDOS_FS=m
+CONFIG_VFAT_FS=m
+# CONFIG_EFS_FS is not set
+# CONFIG_JFFS_FS is not set
+# CONFIG_JFFS2_FS is not set
+CONFIG_CRAMFS=m
+CONFIG_TMPFS=y
+CONFIG_RAMFS=y
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_JFS_FS=m
+# CONFIG_JFS_DEBUG is not set
+# CONFIG_JFS_STATISTICS is not set
+CONFIG_MINIX_FS=m
+CONFIG_VXFS_FS=m
+# CONFIG_NTFS_FS is not set
+# CONFIG_NTFS_RW is not set
+# CONFIG_HPFS_FS is not set
+CONFIG_PROC_FS=y
+# CONFIG_DEVFS_FS is not set
+# CONFIG_DEVFS_MOUNT is not set
+# CONFIG_DEVFS_DEBUG is not set
+CONFIG_DEVPTS_FS=y
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_QNX4FS_RW is not set
+CONFIG_ROMFS_FS=m
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_XATTR_SHARING=y
+CONFIG_EXT2_FS_XATTR_USER=y
+CONFIG_SYSV_FS=m
+CONFIG_UDF_FS=m
+CONFIG_UDF_RW=y
+CONFIG_UFS_FS=m
+# CONFIG_UFS_FS_WRITE is not set
+
+#
+# Network File Systems
+#
+CONFIG_CODA_FS=m
+CONFIG_INTERMEZZO_FS=m
+CONFIG_NFS_FS=m
+CONFIG_NFS_V3=y
+# CONFIG_ROOT_NFS is not set
+CONFIG_NFSD=m
+CONFIG_NFSD_V3=y
+# CONFIG_NFSD_TCP is not set
+CONFIG_SUNRPC=m
+CONFIG_LOCKD=m
+CONFIG_LOCKD_V4=y
+CONFIG_SMB_FS=m
+# CONFIG_SMB_NLS_DEFAULT is not set
+CONFIG_NCP_FS=m
+CONFIG_NCPFS_PACKET_SIGNING=y
+CONFIG_NCPFS_IOCTL_LOCKING=y
+CONFIG_NCPFS_STRONG=y
+CONFIG_NCPFS_NFS_NS=y
+CONFIG_NCPFS_OS2_NS=y
+CONFIG_NCPFS_SMALLDOS=y
+CONFIG_NCPFS_NLS=y
+CONFIG_NCPFS_EXTRAS=y
+CONFIG_ZISOFS_FS=y
+CONFIG_FS_MBCACHE=y
+
+#
+# Partition Types
+#
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_ACORN_PARTITION is not set
+CONFIG_OSF_PARTITION=y
+# CONFIG_AMIGA_PARTITION is not set
+# CONFIG_ATARI_PARTITION is not set
+CONFIG_MAC_PARTITION=y
+CONFIG_MSDOS_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+# CONFIG_LDM_PARTITION is not set
+CONFIG_SGI_PARTITION=y
+# CONFIG_ULTRIX_PARTITION is not set
+CONFIG_SUN_PARTITION=y
+# CONFIG_EFI_PARTITION is not set
+CONFIG_SMB_NLS=y
+CONFIG_NLS=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_737=m
+CONFIG_NLS_CODEPAGE_775=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_CODEPAGE_852=m
+CONFIG_NLS_CODEPAGE_855=m
+CONFIG_NLS_CODEPAGE_857=m
+CONFIG_NLS_CODEPAGE_860=m
+CONFIG_NLS_CODEPAGE_861=m
+CONFIG_NLS_CODEPAGE_862=m
+CONFIG_NLS_CODEPAGE_863=m
+CONFIG_NLS_CODEPAGE_864=m
+CONFIG_NLS_CODEPAGE_865=m
+CONFIG_NLS_CODEPAGE_866=m
+CONFIG_NLS_CODEPAGE_869=m
+CONFIG_NLS_CODEPAGE_936=m
+CONFIG_NLS_CODEPAGE_950=m
+CONFIG_NLS_CODEPAGE_932=m
+CONFIG_NLS_CODEPAGE_949=m
+CONFIG_NLS_CODEPAGE_874=m
+CONFIG_NLS_ISO8859_8=m
+CONFIG_NLS_CODEPAGE_1250=m
+CONFIG_NLS_CODEPAGE_1251=m
+CONFIG_NLS_ISO8859_1=m
+CONFIG_NLS_ISO8859_2=m
+CONFIG_NLS_ISO8859_3=m
+CONFIG_NLS_ISO8859_4=m
+CONFIG_NLS_ISO8859_5=m
+CONFIG_NLS_ISO8859_6=m
+CONFIG_NLS_ISO8859_7=m
+CONFIG_NLS_ISO8859_9=m
+CONFIG_NLS_ISO8859_13=m
+CONFIG_NLS_ISO8859_14=m
+CONFIG_NLS_ISO8859_15=m
+CONFIG_NLS_KOI8_R=m
+CONFIG_NLS_KOI8_U=m
+CONFIG_NLS_UTF8=m
+
+#
+# Console drivers
+#
+CONFIG_VGA_CONSOLE=y
+CONFIG_VIDEO_SELECT=y
+# CONFIG_VIDEO_IGNORE_BAD_MODE is not set
+CONFIG_MDA_CONSOLE=m
+
+#
+# Frame-buffer support
+#
+CONFIG_FB=y
+CONFIG_DUMMY_CONSOLE=y
+CONFIG_FB_RIVA=m
+CONFIG_FB_CLGEN=m
+CONFIG_FB_PM2=m
+# CONFIG_FB_PM2_FIFO_DISCONNECT is not set
+CONFIG_FB_PM2_PCI=y
+CONFIG_FB_PM3=m
+# CONFIG_FB_CYBER2000 is not set
+CONFIG_FB_VESA=y
+CONFIG_FB_VGA16=m
+CONFIG_FB_HGA=m
+CONFIG_VIDEO_SELECT=y
+CONFIG_FB_MATROX=m
+CONFIG_FB_MATROX_MILLENIUM=y
+CONFIG_FB_MATROX_MYSTIQUE=y
+# CONFIG_FB_MATROX_G450 is not set
+CONFIG_FB_MATROX_G100A=y
+CONFIG_FB_MATROX_G100=y
+CONFIG_FB_MATROX_I2C=m
+CONFIG_FB_MATROX_MAVEN=m
+# CONFIG_FB_MATROX_PROC is not set
+CONFIG_FB_MATROX_MULTIHEAD=y
+CONFIG_FB_ATY=m
+CONFIG_FB_ATY_GX=y
+CONFIG_FB_ATY_CT=y
+CONFIG_FB_ATY_CT_VAIO_LCD=y
+CONFIG_FB_RADEON=m
+CONFIG_FB_ATY128=m
+CONFIG_FB_SIS=m
+CONFIG_FB_SIS_300=y
+CONFIG_FB_SIS_315=y
+CONFIG_FB_NEOMAGIC=m
+CONFIG_FB_3DFX=m
+CONFIG_FB_VOODOO1=m
+# CONFIG_FB_TRIDENT is not set
+# CONFIG_FB_VIRTUAL is not set
+# CONFIG_FBCON_ADVANCED is not set
+CONFIG_FBCON_MFB=m
+CONFIG_FBCON_CFB8=y
+CONFIG_FBCON_CFB16=y
+CONFIG_FBCON_CFB24=y
+CONFIG_FBCON_CFB32=y
+CONFIG_FBCON_VGA_PLANES=m
+CONFIG_FBCON_HGA=m
+# CONFIG_FBCON_FONTWIDTH8_ONLY is not set
+# CONFIG_FBCON_FONTS is not set
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+
+#
+# Sound
+#
+CONFIG_SOUND=m
+CONFIG_SOUND_ALI5455=m
+CONFIG_SOUND_BT878=m
+CONFIG_SOUND_CMPCI=m
+CONFIG_SOUND_CMPCI_FM=y
+CONFIG_SOUND_CMPCI_FMIO=388
+CONFIG_SOUND_CMPCI_FMIO=388
+CONFIG_SOUND_CMPCI_MIDI=y
+CONFIG_SOUND_CMPCI_MPUIO=330
+CONFIG_SOUND_CMPCI_JOYSTICK=y
+CONFIG_SOUND_CMPCI_CM8738=y
+# CONFIG_SOUND_CMPCI_SPDIFINVERSE is not set
+CONFIG_SOUND_CMPCI_SPDIFLOOP=y
+CONFIG_SOUND_CMPCI_SPEAKERS=2
+CONFIG_SOUND_EMU10K1=m
+CONFIG_MIDI_EMU10K1=y
+CONFIG_SOUND_AUDIGY=m
+CONFIG_SOUND_FUSION=m
+CONFIG_SOUND_CS4281=m
+CONFIG_SOUND_ES1370=m
+CONFIG_SOUND_ES1371=m
+CONFIG_SOUND_ESSSOLO1=m
+CONFIG_SOUND_MAESTRO=m
+CONFIG_SOUND_MAESTRO3=m
+CONFIG_SOUND_FORTE=m
+CONFIG_SOUND_ICH=m
+CONFIG_SOUND_RME96XX=m
+CONFIG_SOUND_SONICVIBES=m
+CONFIG_SOUND_TRIDENT=m
+CONFIG_SOUND_MSNDCLAS=m
+# CONFIG_MSNDCLAS_HAVE_BOOT is not set
+CONFIG_MSNDCLAS_INIT_FILE="/etc/sound/msndinit.bin"
+CONFIG_MSNDCLAS_PERM_FILE="/etc/sound/msndperm.bin"
+CONFIG_SOUND_MSNDPIN=m
+# CONFIG_MSNDPIN_HAVE_BOOT is not set
+CONFIG_MSNDPIN_INIT_FILE="/etc/sound/pndspini.bin"
+CONFIG_MSNDPIN_PERM_FILE="/etc/sound/pndsperm.bin"
+CONFIG_SOUND_VIA82CXXX=m
+CONFIG_MIDI_VIA82CXXX=y
+CONFIG_SOUND_OSS=m
+# CONFIG_SOUND_TRACEINIT is not set
+CONFIG_SOUND_DMAP=y
+CONFIG_SOUND_AD1816=m
+CONFIG_SOUND_AD1889=m
+CONFIG_SOUND_SGALAXY=m
+CONFIG_SOUND_ADLIB=m
+CONFIG_SOUND_ACI_MIXER=m
+CONFIG_SOUND_CS4232=m
+CONFIG_SOUND_SSCAPE=m
+CONFIG_SOUND_GUS=m
+CONFIG_SOUND_GUS16=y
+CONFIG_SOUND_GUSMAX=y
+CONFIG_SOUND_VMIDI=m
+CONFIG_SOUND_TRIX=m
+CONFIG_SOUND_MSS=m
+CONFIG_SOUND_MPU401=m
+CONFIG_SOUND_NM256=m
+CONFIG_SOUND_MAD16=m
+CONFIG_MAD16_OLDCARD=y
+CONFIG_SOUND_PAS=m
+# CONFIG_PAS_JOYSTICK is not set
+CONFIG_SOUND_PSS=m
+# CONFIG_PSS_MIXER is not set
+# CONFIG_PSS_HAVE_BOOT is not set
+CONFIG_SOUND_SB=m
+CONFIG_SOUND_AWE32_SYNTH=m
+CONFIG_SOUND_KAHLUA=m
+CONFIG_SOUND_WAVEFRONT=m
+CONFIG_SOUND_MAUI=m
+CONFIG_SOUND_YM3812=m
+CONFIG_SOUND_OPL3SA1=m
+CONFIG_SOUND_OPL3SA2=m
+CONFIG_SOUND_YMFPCI=m
+CONFIG_SOUND_YMFPCI_LEGACY=y
+CONFIG_SOUND_UART6850=m
+CONFIG_SOUND_AEDSP16=m
+CONFIG_SC6600=y
+CONFIG_SC6600_JOY=y
+CONFIG_SC6600_CDROM=4
+CONFIG_SC6600_CDROMBASE=0
+CONFIG_AEDSP16_SBPRO=y
+CONFIG_AEDSP16_MPU401=y
+CONFIG_SOUND_TVMIXER=m
+
+#
+# USB support
+#
+CONFIG_USB=m
+# CONFIG_USB_DEBUG is not set
+CONFIG_USB_DEVICEFS=y
+# CONFIG_USB_BANDWIDTH is not set
+CONFIG_USB_EHCI_HCD=m
+CONFIG_USB_UHCI=m
+CONFIG_USB_UHCI_ALT=m
+CONFIG_USB_OHCI=m
+CONFIG_USB_AUDIO=m
+# CONFIG_USB_EMI26 is not set
+CONFIG_USB_MIDI=m
+CONFIG_USB_STORAGE=m
+# CONFIG_USB_STORAGE_DEBUG is not set
+CONFIG_USB_STORAGE_DATAFAB=y
+CONFIG_USB_STORAGE_FREECOM=y
+CONFIG_USB_STORAGE_ISD200=y
+CONFIG_USB_STORAGE_DPCM=y
+CONFIG_USB_STORAGE_HP8200e=y
+CONFIG_USB_STORAGE_SDDR09=y
+CONFIG_USB_STORAGE_SDDR55=y
+CONFIG_USB_STORAGE_JUMPSHOT=y
+CONFIG_USB_ACM=m
+CONFIG_USB_PRINTER=m
+CONFIG_USB_HID=m
+CONFIG_USB_HIDINPUT=y
+CONFIG_USB_HIDDEV=y
+# CONFIG_USB_KBD is not set
+# CONFIG_USB_MOUSE is not set
+CONFIG_USB_AIPTEK=m
+CONFIG_USB_WACOM=m
+CONFIG_USB_KBTAB=m
+CONFIG_USB_POWERMATE=m
+# CONFIG_USB_DC2XX is not set
+CONFIG_USB_MDC800=m
+CONFIG_USB_SCANNER=m
+CONFIG_USB_MICROTEK=m
+CONFIG_USB_HPUSBSCSI=m
+CONFIG_USB_IBMCAM=m
+CONFIG_USB_KONICAWC=m
+CONFIG_USB_OV511=m
+CONFIG_USB_PWC=m
+CONFIG_USB_SE401=m
+CONFIG_USB_STV680=m
+CONFIG_USB_VICAM=m
+CONFIG_USB_DSBR=m
+CONFIG_USB_DABUSB=m
+CONFIG_USB_PEGASUS=m
+CONFIG_USB_RTL8150=m
+CONFIG_USB_KAWETH=m
+CONFIG_USB_CATC=m
+CONFIG_USB_CDCETHER=m
+CONFIG_USB_USBNET=m
+CONFIG_USB_USS720=m
+
+#
+# USB Serial Converter support
+#
+CONFIG_USB_SERIAL=m
+# CONFIG_USB_SERIAL_DEBUG is not set
+CONFIG_USB_SERIAL_GENERIC=y
+CONFIG_USB_SERIAL_BELKIN=m
+CONFIG_USB_SERIAL_WHITEHEAT=m
+CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m
+CONFIG_USB_SERIAL_EMPEG=m
+CONFIG_USB_SERIAL_FTDI_SIO=m
+CONFIG_USB_SERIAL_VISOR=m
+CONFIG_USB_SERIAL_IPAQ=m
+CONFIG_USB_SERIAL_IR=m
+CONFIG_USB_SERIAL_EDGEPORT=m
+CONFIG_USB_SERIAL_EDGEPORT_TI=m
+CONFIG_USB_SERIAL_KEYSPAN_PDA=m
+CONFIG_USB_SERIAL_KEYSPAN=m
+# CONFIG_USB_SERIAL_KEYSPAN_USA28 is not set
+# CONFIG_USB_SERIAL_KEYSPAN_USA28X is not set
+CONFIG_USB_SERIAL_KEYSPAN_USA28XA=y
+CONFIG_USB_SERIAL_KEYSPAN_USA28XB=y
+# CONFIG_USB_SERIAL_KEYSPAN_USA19 is not set
+# CONFIG_USB_SERIAL_KEYSPAN_USA18X is not set
+CONFIG_USB_SERIAL_KEYSPAN_USA19W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QW=y
+CONFIG_USB_SERIAL_KEYSPAN_USA19QI=y
+CONFIG_USB_SERIAL_KEYSPAN_MPR=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49W=y
+CONFIG_USB_SERIAL_KEYSPAN_USA49WLC=y
+CONFIG_USB_SERIAL_MCT_U232=m
+CONFIG_USB_SERIAL_KLSI=m
+CONFIG_USB_SERIAL_KOBIL_SCT=m
+CONFIG_USB_SERIAL_PL2303=m
+CONFIG_USB_SERIAL_CYBERJACK=m
+CONFIG_USB_SERIAL_XIRCOM=m
+CONFIG_USB_SERIAL_OMNINET=m
+CONFIG_USB_RIO500=m
+CONFIG_USB_AUERSWALD=m
+CONFIG_USB_TIGL=m
+CONFIG_USB_BRLVGER=m
+CONFIG_USB_LCD=m
+
+#
+# Additional device driver support
+#
+CONFIG_NET_BROADCOM=m
+CONFIG_CIPE=m
+CONFIG_CRYPTO_AEP=m
+CONFIG_MEGARAC=m
+CONFIG_FC_QLA2200=m
+CONFIG_FC_QLA2300=m
+CONFIG_SCSI_ISCSI=m
+
+#
+# Bluetooth support
+#
+CONFIG_BLUEZ=m
+CONFIG_BLUEZ_L2CAP=m
+CONFIG_BLUEZ_SCO=m
+CONFIG_BLUEZ_RFCOMM=m
+CONFIG_BLUEZ_RFCOMM_TTY=y
+CONFIG_BLUEZ_BNEP=m
+CONFIG_BLUEZ_BNEP_MC_FILTER=y
+CONFIG_BLUEZ_BNEP_PROTO_FILTER=y
+
+#
+# Bluetooth device drivers
+#
+CONFIG_BLUEZ_HCIUSB=m
+CONFIG_BLUEZ_USB_SCO=y
+CONFIG_BLUEZ_USB_ZERO_PACKET=y
+CONFIG_BLUEZ_HCIUART=m
+CONFIG_BLUEZ_HCIUART_H4=y
+CONFIG_BLUEZ_HCIUART_BCSP=y
+CONFIG_BLUEZ_HCIUART_BCSP_TXCRC=y
+CONFIG_BLUEZ_HCIDTL1=m
+CONFIG_BLUEZ_HCIBT3C=m
+CONFIG_BLUEZ_HCIBLUECARD=m
+CONFIG_BLUEZ_HCIBTUART=m
+CONFIG_BLUEZ_HCIVHCI=m
+
+#
+# Profiling support
+#
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=m
+
+#
+# Kernel hacking
+#
+CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_STACKOVERFLOW=y
+# CONFIG_DEBUG_HIGHMEM is not set
+# CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_IOVIRT is not set
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_FRAME_POINTER is not set
+# CONFIG_MCL_COREDUMP is not set
+
+#
+# Library routines
+#
+CONFIG_ZLIB_INFLATE=y
+CONFIG_ZLIB_DEFLATE=m
--- /dev/null
+Name: kernel
+Summary: The Linux Kernel
+Version: @KERNEL_VERSION@
+Release: @KERNEL_RELEASE@
+License: GPL
+Group: System Environment/Kernel
+Vendor: Cluster File Systems, Inc.
+URL: http://www.kernel.org/
+Buildroot: /var/tmp/%{name}-%{PACKAGE_VERSION}-root
+
+Source0: @LUSTRE_SOURCE@
+Source1: @KERNEL_SOURCE@
+
+%define __spec_install_post /usr/lib/rpm/brp-compress || :
+%define debug_package %{nil}
+
+%description
+The Linux Kernel, the operating system core itself.
+
+%package -n lustre-lite-utils
+Summary: Lustre utils for Linux
+Group: Applications/System
+
+%description -n lustre-lite-utils
+The Lustre Lite file system utilities.
+
+#%package -n lustre-doc
+#Summary: Sample Lustre configurations and documentation
+#Group: Documentation
+
+#%description -n lustre-doc
+#The Lustre book, sample configurations, and other documentation for
+#Lustre.
+
+%package -n lustre-ldap
+Summary: LDAP schema files for Lustre
+Group: System Environment/Daemons
+
+%description -n lustre-ldap
+LDAP schema files for Lustre.
+
+%prep
+%setup -n lustre-kernel-%{version} -q -c
+[ -d lustre ] || ln -sf lustre* lustre
+
+%build
+# if RPM_BUILD_NCPUS unset, set it
+if [ -z "$RPM_BUILD_NCPUS" ] ; then
+ RPM_BUILD_NCPUS=$(egrep -c "^cpu[0-9]+" /proc/stat || :)
+ if [ $RPM_BUILD_NCPUS -eq 0 ] ; then
+ RPM_BUILD_NCPUS=1
+ fi
+ if [ $RPM_BUILD_NCPUS -gt 8 ] ; then
+ RPM_BUILD_NCPUS=8
+ fi
+fi
+
+pushd lustre >/dev/null
+./scripts/lmake \
+ --phase build \
+ --target @LUSTRE_TARGET@ \
+ --extraversion %{release} \
+ --kerneldir $RPM_SOURCE_DIR \
+ -j $RPM_BUILD_NCPUS \
+ -- @CONFIGURE_FLAGS@
+popd >/dev/null
+
+%install
+rm -rf $RPM_BUILD_ROOT
+mkdir -p $RPM_BUILD_ROOT
+pushd lustre >/dev/null
+./scripts/lmake \
+ --phase install \
+ --target @LUSTRE_TARGET@ \
+ --extraversion %{release} \
+ --destdir $RPM_BUILD_ROOT
+popd >/dev/null
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%files
+%doc lustre/linux/COPYING lustre/linux/CREDITS lustre/linux/README
+%doc lustre/linux/REPORTING-BUGS
+%defattr(-, root, root)
+%dir /lib/modules
+/lib/modules/*
+/boot/*
+
+%files -n lustre-lite-utils
+%doc lustre/COPYING lustre/BUGS lustre/ChangeLog lustre/README lustre/doc/lustre.pdf
+%defattr(-, root, root)
+%{_sbindir}/*
+%{_bindir}/*
+%{_libdir}/lustre/python
+%{_sysconfdir}/init.d/lustre
+/usr/include/lustre
+/lib/lib*.a
+
+#%files -n lustre-doc
+#%defattr(-, root, root)
+#/usr/share/doc/lustre/COPYING
+#/usr/share/doc/lustre/lustre.pdf
+#/usr/share/doc/lustre/COPYING
+
+/usr/lib/lustre/examples
+
+%files -n lustre-ldap
+%defattr(-, root, root)
+/etc/openldap/slapd-lustre.conf
+/etc/openldap/schema/lustre.schema
+/usr/lib/lustre/lustre2ldif.xsl
+/usr/lib/lustre/top.ldif
--- /dev/null
+ fs/ext3/Makefile | 2
+ fs/ext3/dir.c | 302 +++++++++
+ fs/ext3/file.c | 3
+ fs/ext3/hash.c | 215 ++++++
+ fs/ext3/namei.c | 1421 ++++++++++++++++++++++++++++++++++++++++-----
+ fs/ext3/super.c | 7
+ include/linux/ext3_fs.h | 85 ++
+ include/linux/ext3_fs_sb.h | 2
+ include/linux/ext3_jbd.h | 2
+ include/linux/rbtree.h | 2
+ lib/rbtree.c | 42 +
+ 11 files changed, 1922 insertions(+), 161 deletions(-)
+
+--- linux-2.4.22-ac1/fs/ext3/dir.c~ext3-htree-2.4.22-rh 2001-11-10 01:25:04.000000000 +0300
++++ linux-2.4.22-ac1-alexey/fs/ext3/dir.c 2003-09-25 14:58:30.000000000 +0400
+@@ -21,12 +21,16 @@
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
++#include <linux/slab.h>
++#include <linux/rbtree.h>
+
+ static unsigned char ext3_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+ };
+
+ static int ext3_readdir(struct file *, void *, filldir_t);
++static int ext3_dx_readdir(struct file * filp,
++ void * dirent, filldir_t filldir);
+
+ struct file_operations ext3_dir_operations = {
+ read: generic_read_dir,
+@@ -35,6 +39,17 @@ struct file_operations ext3_dir_operatio
+ fsync: ext3_sync_file, /* BKL held */
+ };
+
++
++static unsigned char get_dtype(struct super_block *sb, int filetype)
++{
++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
++ (filetype >= EXT3_FT_MAX))
++ return DT_UNKNOWN;
++
++ return (ext3_filetype_table[filetype]);
++}
++
++
+ int ext3_check_dir_entry (const char * function, struct inode * dir,
+ struct ext3_dir_entry_2 * de,
+ struct buffer_head * bh,
+@@ -79,6 +94,16 @@ static int ext3_readdir(struct file * fi
+
+ sb = inode->i_sb;
+
++ if (is_dx(inode)) {
++ err = ext3_dx_readdir(filp, dirent, filldir);
++ if (err != ERR_BAD_DX_DIR)
++ return err;
++ /*
++ * We don't set the inode dirty flag since it's not
++ * critical that it get flushed back to the disk.
++ */
++ EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL;
++ }
+ stored = 0;
+ bh = NULL;
+ offset = filp->f_pos & (sb->s_blocksize - 1);
+@@ -162,18 +187,12 @@ revalidate:
+ * during the copy operation.
+ */
+ unsigned long version = filp->f_version;
+- unsigned char d_type = DT_UNKNOWN;
+
+- if (EXT3_HAS_INCOMPAT_FEATURE(sb,
+- EXT3_FEATURE_INCOMPAT_FILETYPE)
+- && de->file_type < EXT3_FT_MAX)
+- d_type =
+- ext3_filetype_table[de->file_type];
+ error = filldir(dirent, de->name,
+ de->name_len,
+ filp->f_pos,
+ le32_to_cpu(de->inode),
+- d_type);
++ get_dtype(sb, de->file_type));
+ if (error)
+ break;
+ if (version != filp->f_version)
+@@ -188,3 +207,272 @@ revalidate:
+ UPDATE_ATIME(inode);
+ return 0;
+ }
++
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * These functions convert from the major/minor hash to an f_pos
++ * value.
++ *
++ * Currently we only use major hash number. This is unfortunate, but
++ * on 32-bit machines, the same VFS interface is used for lseek and
++ * llseek, so if we use the 64 bit offset, then the 32-bit versions of
++ * lseek/telldir/seekdir will blow out spectacularly, and from within
++ * the ext2 low-level routine, we don't know if we're being called by
++ * a 64-bit version of the system call or the 32-bit version of the
++ * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
++ * cookie. Sigh.
++ */
++#define hash2pos(major, minor) (major >> 1)
++#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
++#define pos2min_hash(pos) (0)
++
++/*
++ * This structure holds the nodes of the red-black tree used to store
++ * the directory entry in hash order.
++ */
++struct fname {
++ __u32 hash;
++ __u32 minor_hash;
++ rb_node_t rb_hash;
++ struct fname *next;
++ __u32 inode;
++ __u8 name_len;
++ __u8 file_type;
++ char name[0];
++};
++
++/*
++ * This function implements a non-recursive way of freeing all of the
++ * nodes in the red-black tree.
++ */
++static void free_rb_tree_fname(rb_root_t *root)
++{
++ rb_node_t *n = root->rb_node;
++ rb_node_t *parent;
++ struct fname *fname;
++
++ while (n) {
++ /* Do the node's children first */
++ if ((n)->rb_left) {
++ n = n->rb_left;
++ continue;
++ }
++ if (n->rb_right) {
++ n = n->rb_right;
++ continue;
++ }
++ /*
++ * The node has no children; free it, and then zero
++ * out parent's link to it. Finally go to the
++ * beginning of the loop and try to free the parent
++ * node.
++ */
++ parent = n->rb_parent;
++ fname = rb_entry(n, struct fname, rb_hash);
++ kfree(fname);
++ if (!parent)
++ root->rb_node = 0;
++ else if (parent->rb_left == n)
++ parent->rb_left = 0;
++ else if (parent->rb_right == n)
++ parent->rb_right = 0;
++ n = parent;
++ }
++ root->rb_node = 0;
++}
++
++
++struct dir_private_info *create_dir_info(loff_t pos)
++{
++ struct dir_private_info *p;
++
++ p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
++ if (!p)
++ return NULL;
++ p->root.rb_node = 0;
++ p->curr_node = 0;
++ p->extra_fname = 0;
++ p->last_pos = 0;
++ p->curr_hash = pos2maj_hash(pos);
++ p->curr_minor_hash = pos2min_hash(pos);
++ p->next_hash = 0;
++ return p;
++}
++
++void ext3_htree_free_dir_info(struct dir_private_info *p)
++{
++ free_rb_tree_fname(&p->root);
++ kfree(p);
++}
++
++/*
++ * Given a directory entry, enter it into the fname rb tree.
++ */
++int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
++ __u32 minor_hash,
++ struct ext3_dir_entry_2 *dirent)
++{
++ rb_node_t **p, *parent = NULL;
++ struct fname * fname, *new_fn;
++ struct dir_private_info *info;
++ int len;
++
++ info = (struct dir_private_info *) dir_file->private_data;
++ p = &info->root.rb_node;
++
++ /* Create and allocate the fname structure */
++ len = sizeof(struct fname) + dirent->name_len + 1;
++ new_fn = kmalloc(len, GFP_KERNEL);
++ if (!new_fn)
++ return -ENOMEM;
++ memset(new_fn, 0, len);
++ new_fn->hash = hash;
++ new_fn->minor_hash = minor_hash;
++ new_fn->inode = le32_to_cpu(dirent->inode);
++ new_fn->name_len = dirent->name_len;
++ new_fn->file_type = dirent->file_type;
++ memcpy(new_fn->name, dirent->name, dirent->name_len);
++ new_fn->name[dirent->name_len] = 0;
++
++ while (*p) {
++ parent = *p;
++ fname = rb_entry(parent, struct fname, rb_hash);
++
++ /*
++ * If the hash and minor hash match up, then we put
++ * them on a linked list. This rarely happens...
++ */
++ if ((new_fn->hash == fname->hash) &&
++ (new_fn->minor_hash == fname->minor_hash)) {
++ new_fn->next = fname->next;
++ fname->next = new_fn;
++ return 0;
++ }
++
++ if (new_fn->hash < fname->hash)
++ p = &(*p)->rb_left;
++ else if (new_fn->hash > fname->hash)
++ p = &(*p)->rb_right;
++ else if (new_fn->minor_hash < fname->minor_hash)
++ p = &(*p)->rb_left;
++ else /* if (new_fn->minor_hash > fname->minor_hash) */
++ p = &(*p)->rb_right;
++ }
++
++ rb_link_node(&new_fn->rb_hash, parent, p);
++ rb_insert_color(&new_fn->rb_hash, &info->root);
++ return 0;
++}
++
++
++
++/*
++ * This is a helper function for ext3_dx_readdir. It calls filldir
++ * for all entries on the fname linked list. (Normally there is only
++ * one entry on the linked list, unless there are 62 bit hash collisions.)
++ */
++static int call_filldir(struct file * filp, void * dirent,
++ filldir_t filldir, struct fname *fname)
++{
++ struct dir_private_info *info = filp->private_data;
++ loff_t curr_pos;
++ struct inode *inode = filp->f_dentry->d_inode;
++ struct super_block * sb;
++ int error;
++
++ sb = inode->i_sb;
++
++ if (!fname) {
++ printk("call_filldir: called with null fname?!?\n");
++ return 0;
++ }
++ curr_pos = hash2pos(fname->hash, fname->minor_hash);
++ while (fname) {
++ error = filldir(dirent, fname->name,
++ fname->name_len, curr_pos,
++ fname->inode,
++ get_dtype(sb, fname->file_type));
++ if (error) {
++ filp->f_pos = curr_pos;
++ info->extra_fname = fname->next;
++ return error;
++ }
++ fname = fname->next;
++ }
++ return 0;
++}
++
++static int ext3_dx_readdir(struct file * filp,
++ void * dirent, filldir_t filldir)
++{
++ struct dir_private_info *info = filp->private_data;
++ struct inode *inode = filp->f_dentry->d_inode;
++ struct fname *fname;
++ int ret;
++
++ if (!info) {
++ info = create_dir_info(filp->f_pos);
++ if (!info)
++ return -ENOMEM;
++ filp->private_data = info;
++ }
++
++ /* Some one has messed with f_pos; reset the world */
++ if (info->last_pos != filp->f_pos) {
++ free_rb_tree_fname(&info->root);
++ info->curr_node = 0;
++ info->extra_fname = 0;
++ info->curr_hash = pos2maj_hash(filp->f_pos);
++ info->curr_minor_hash = pos2min_hash(filp->f_pos);
++ }
++
++ /*
++ * If there are any leftover names on the hash collision
++ * chain, return them first.
++ */
++ if (info->extra_fname &&
++ call_filldir(filp, dirent, filldir, info->extra_fname))
++ goto finished;
++
++ if (!info->curr_node)
++ info->curr_node = rb_get_first(&info->root);
++
++ while (1) {
++ /*
++ * Fill the rbtree if we have no more entries,
++ * or the inode has changed since we last read in the
++ * cached entries.
++ */
++ if ((!info->curr_node) ||
++ (filp->f_version != inode->i_version)) {
++ info->curr_node = 0;
++ free_rb_tree_fname(&info->root);
++ filp->f_version = inode->i_version;
++ ret = ext3_htree_fill_tree(filp, info->curr_hash,
++ info->curr_minor_hash,
++ &info->next_hash);
++ if (ret < 0)
++ return ret;
++ if (ret == 0)
++ break;
++ info->curr_node = rb_get_first(&info->root);
++ }
++
++ fname = rb_entry(info->curr_node, struct fname, rb_hash);
++ info->curr_hash = fname->hash;
++ info->curr_minor_hash = fname->minor_hash;
++ if (call_filldir(filp, dirent, filldir, fname))
++ break;
++
++ info->curr_node = rb_get_next(info->curr_node);
++ if (!info->curr_node) {
++ info->curr_hash = info->next_hash;
++ info->curr_minor_hash = 0;
++ }
++ }
++finished:
++ info->last_pos = filp->f_pos;
++ UPDATE_ATIME(inode);
++ return 0;
++}
++#endif
+--- linux-2.4.22-ac1/fs/ext3/file.c~ext3-htree-2.4.22-rh 2003-08-25 15:44:43.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/file.c 2003-09-25 14:55:12.000000000 +0400
+@@ -35,6 +35,9 @@ static int ext3_release_file (struct ino
+ {
+ if (filp->f_mode & FMODE_WRITE)
+ ext3_discard_prealloc (inode);
++ if (is_dx(inode) && filp->private_data)
++ ext3_htree_free_dir_info(filp->private_data);
++
+ return 0;
+ }
+
+--- /dev/null 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.22-ac1-alexey/fs/ext3/hash.c 2003-09-25 14:55:12.000000000 +0400
+@@ -0,0 +1,215 @@
++/*
++ * linux/fs/ext3/hash.c
++ *
++ * Copyright (C) 2002 by Theodore Ts'o
++ *
++ * This file is released under the GPL v2.
++ *
++ * This file may be redistributed under the terms of the GNU Public
++ * License.
++ */
++
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/sched.h>
++#include <linux/ext3_fs.h>
++
++#define DELTA 0x9E3779B9
++
++static void TEA_transform(__u32 buf[4], __u32 const in[])
++{
++ __u32 sum = 0;
++ __u32 b0 = buf[0], b1 = buf[1];
++ __u32 a = in[0], b = in[1], c = in[2], d = in[3];
++ int n = 16;
++
++ do {
++ sum += DELTA;
++ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
++ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
++ } while(--n);
++
++ buf[0] += b0;
++ buf[1] += b1;
++}
++
++/* F, G and H are basic MD4 functions: selection, majority, parity */
++#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
++#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
++#define H(x, y, z) ((x) ^ (y) ^ (z))
++
++/*
++ * The generic round function. The application is so specific that
++ * we don't bother protecting all the arguments with parens, as is generally
++ * good macro practice, in favor of extra legibility.
++ * Rotation is separate from addition to prevent recomputation
++ */
++#define ROUND(f, a, b, c, d, x, s) \
++ (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s)))
++#define K1 0
++#define K2 013240474631UL
++#define K3 015666365641UL
++
++/*
++ * Basic cut-down MD4 transform. Returns only 32 bits of result.
++ */
++static void halfMD4Transform (__u32 buf[4], __u32 const in[])
++{
++ __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3];
++
++ /* Round 1 */
++ ROUND(F, a, b, c, d, in[0] + K1, 3);
++ ROUND(F, d, a, b, c, in[1] + K1, 7);
++ ROUND(F, c, d, a, b, in[2] + K1, 11);
++ ROUND(F, b, c, d, a, in[3] + K1, 19);
++ ROUND(F, a, b, c, d, in[4] + K1, 3);
++ ROUND(F, d, a, b, c, in[5] + K1, 7);
++ ROUND(F, c, d, a, b, in[6] + K1, 11);
++ ROUND(F, b, c, d, a, in[7] + K1, 19);
++
++ /* Round 2 */
++ ROUND(G, a, b, c, d, in[1] + K2, 3);
++ ROUND(G, d, a, b, c, in[3] + K2, 5);
++ ROUND(G, c, d, a, b, in[5] + K2, 9);
++ ROUND(G, b, c, d, a, in[7] + K2, 13);
++ ROUND(G, a, b, c, d, in[0] + K2, 3);
++ ROUND(G, d, a, b, c, in[2] + K2, 5);
++ ROUND(G, c, d, a, b, in[4] + K2, 9);
++ ROUND(G, b, c, d, a, in[6] + K2, 13);
++
++ /* Round 3 */
++ ROUND(H, a, b, c, d, in[3] + K3, 3);
++ ROUND(H, d, a, b, c, in[7] + K3, 9);
++ ROUND(H, c, d, a, b, in[2] + K3, 11);
++ ROUND(H, b, c, d, a, in[6] + K3, 15);
++ ROUND(H, a, b, c, d, in[1] + K3, 3);
++ ROUND(H, d, a, b, c, in[5] + K3, 9);
++ ROUND(H, c, d, a, b, in[0] + K3, 11);
++ ROUND(H, b, c, d, a, in[4] + K3, 15);
++
++ buf[0] += a;
++ buf[1] += b;
++ buf[2] += c;
++ buf[3] += d;
++}
++
++#undef ROUND
++#undef F
++#undef G
++#undef H
++#undef K1
++#undef K2
++#undef K3
++
++/* The old legacy hash */
++static __u32 dx_hack_hash (const char *name, int len)
++{
++ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
++ while (len--) {
++ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
++
++ if (hash & 0x80000000) hash -= 0x7fffffff;
++ hash1 = hash0;
++ hash0 = hash;
++ }
++ return (hash0 << 1);
++}
++
++static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
++{
++ __u32 pad, val;
++ int i;
++
++ pad = (__u32)len | ((__u32)len << 8);
++ pad |= pad << 16;
++
++ val = pad;
++ if (len > num*4)
++ len = num * 4;
++ for (i=0; i < len; i++) {
++ if ((i % 4) == 0)
++ val = pad;
++ val = msg[i] + (val << 8);
++ if ((i % 4) == 3) {
++ *buf++ = val;
++ val = pad;
++ num--;
++ }
++ }
++ if (--num >= 0)
++ *buf++ = val;
++ while (--num >= 0)
++ *buf++ = pad;
++}
++
++/*
++ * Returns the hash of a filename. If len is 0 and name is NULL, then
++ * this function can be used to test whether or not a hash version is
++ * supported.
++ *
++ * The seed is an 4 longword (32 bits) "secret" which can be used to
++ * uniquify a hash. If the seed is all zero's, then some default seed
++ * may be used.
++ *
++ * A particular hash version specifies whether or not the seed is
++ * represented, and whether or not the returned hash is 32 bits or 64
++ * bits. 32 bit hashes will return 0 for the minor hash.
++ */
++int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
++{
++ __u32 hash;
++ __u32 minor_hash = 0;
++ const char *p;
++ int i;
++ __u32 in[8], buf[4];
++
++ /* Initialize the default seed for the hash checksum functions */
++ buf[0] = 0x67452301;
++ buf[1] = 0xefcdab89;
++ buf[2] = 0x98badcfe;
++ buf[3] = 0x10325476;
++
++ /* Check to see if the seed is all zero's */
++ if (hinfo->seed) {
++ for (i=0; i < 4; i++) {
++ if (hinfo->seed[i])
++ break;
++ }
++ if (i < 4)
++ memcpy(buf, hinfo->seed, sizeof(buf));
++ }
++
++ switch (hinfo->hash_version) {
++ case DX_HASH_LEGACY:
++ hash = dx_hack_hash(name, len);
++ break;
++ case DX_HASH_HALF_MD4:
++ p = name;
++ while (len > 0) {
++ str2hashbuf(p, len, in, 8);
++ halfMD4Transform(buf, in);
++ len -= 32;
++ p += 32;
++ }
++ minor_hash = buf[2];
++ hash = buf[1];
++ break;
++ case DX_HASH_TEA:
++ p = name;
++ while (len > 0) {
++ str2hashbuf(p, len, in, 4);
++ TEA_transform(buf, in);
++ len -= 16;
++ p += 16;
++ }
++ hash = buf[0];
++ minor_hash = buf[1];
++ break;
++ default:
++ hinfo->hash = 0;
++ return -1;
++ }
++ hinfo->hash = hash & ~1;
++ hinfo->minor_hash = minor_hash;
++ return 0;
++}
+--- linux-2.4.22-ac1/fs/ext3/Makefile~ext3-htree-2.4.22-rh 2003-09-25 14:39:01.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/Makefile 2003-09-25 14:55:12.000000000 +0400
+@@ -12,7 +12,7 @@ O_TARGET := ext3.o
+ export-objs := super.o inode.o
+
+ obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+- ioctl.o namei.o super.o symlink.o
++ ioctl.o namei.o super.o symlink.o hash.o
+ obj-m := $(O_TARGET)
+
+ include $(TOPDIR)/Rules.make
+--- linux-2.4.22-ac1/fs/ext3/namei.c~ext3-htree-2.4.22-rh 2003-09-25 14:16:29.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/namei.c 2003-09-25 14:58:37.000000000 +0400
+@@ -16,6 +16,12 @@
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ * Directory entry file type support and forward compatibility hooks
+ * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
++ * Hash Tree Directory indexing (c)
++ * Daniel Phillips, 2001
++ * Hash Tree Directory indexing porting
++ * Christopher Li, 2002
++ * Hash Tree Directory indexing cleanup
++ * Theodore Ts'o, 2002
+ */
+
+ #include <linux/fs.h>
+@@ -38,6 +44,642 @@
+ #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
+
++static struct buffer_head *ext3_append(handle_t *handle,
++ struct inode *inode,
++ u32 *block, int *err)
++{
++ struct buffer_head *bh;
++
++ *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
++
++ if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
++ inode->i_size += inode->i_sb->s_blocksize;
++ EXT3_I(inode)->i_disksize = inode->i_size;
++ ext3_journal_get_write_access(handle,bh);
++ }
++ return bh;
++}
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++#ifndef swap
++#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
++#endif
++
++typedef struct { u32 v; } le_u32;
++typedef struct { u16 v; } le_u16;
++
++#ifdef DX_DEBUG
++#define dxtrace(command) command
++#else
++#define dxtrace(command)
++#endif
++
++struct fake_dirent
++{
++ /*le*/u32 inode;
++ /*le*/u16 rec_len;
++ u8 name_len;
++ u8 file_type;
++};
++
++struct dx_countlimit
++{
++ le_u16 limit;
++ le_u16 count;
++};
++
++struct dx_entry
++{
++ le_u32 hash;
++ le_u32 block;
++};
++
++/*
++ * dx_root_info is laid out so that if it should somehow get overlaid by a
++ * dirent the two low bits of the hash version will be zero. Therefore, the
++ * hash version mod 4 should never be 0. Sincerely, the paranoia department.
++ */
++
++struct dx_root
++{
++ struct fake_dirent dot;
++ char dot_name[4];
++ struct fake_dirent dotdot;
++ char dotdot_name[4];
++ struct dx_root_info
++ {
++ le_u32 reserved_zero;
++ u8 hash_version;
++ u8 info_length; /* 8 */
++ u8 indirect_levels;
++ u8 unused_flags;
++ }
++ info;
++ struct dx_entry entries[0];
++};
++
++struct dx_node
++{
++ struct fake_dirent fake;
++ struct dx_entry entries[0];
++};
++
++
++struct dx_frame
++{
++ struct buffer_head *bh;
++ struct dx_entry *entries;
++ struct dx_entry *at;
++};
++
++struct dx_map_entry
++{
++ u32 hash;
++ u32 offs;
++};
++
++#ifdef CONFIG_EXT3_INDEX
++static inline unsigned dx_get_block (struct dx_entry *entry);
++static void dx_set_block (struct dx_entry *entry, unsigned value);
++static inline unsigned dx_get_hash (struct dx_entry *entry);
++static void dx_set_hash (struct dx_entry *entry, unsigned value);
++static unsigned dx_get_count (struct dx_entry *entries);
++static unsigned dx_get_limit (struct dx_entry *entries);
++static void dx_set_count (struct dx_entry *entries, unsigned value);
++static void dx_set_limit (struct dx_entry *entries, unsigned value);
++static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
++static unsigned dx_node_limit (struct inode *dir);
++static struct dx_frame *dx_probe(struct dentry *dentry,
++ struct inode *dir,
++ struct dx_hash_info *hinfo,
++ struct dx_frame *frame,
++ int *err);
++static void dx_release (struct dx_frame *frames);
++static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
++ struct dx_hash_info *hinfo, struct dx_map_entry map[]);
++static void dx_sort_map(struct dx_map_entry *map, unsigned count);
++static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
++ struct dx_map_entry *offsets, int count);
++static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
++static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
++static int ext3_htree_next_block(struct inode *dir, __u32 hash,
++ struct dx_frame *frame,
++ struct dx_frame *frames, int *err,
++ __u32 *start_hash);
++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
++ struct ext3_dir_entry_2 **res_dir, int *err);
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode);
++
++/*
++ * Future: use high four bits of block for coalesce-on-delete flags
++ * Mask them off for now.
++ */
++
++static inline unsigned dx_get_block (struct dx_entry *entry)
++{
++ return le32_to_cpu(entry->block.v) & 0x00ffffff;
++}
++
++static inline void dx_set_block (struct dx_entry *entry, unsigned value)
++{
++ entry->block.v = cpu_to_le32(value);
++}
++
++static inline unsigned dx_get_hash (struct dx_entry *entry)
++{
++ return le32_to_cpu(entry->hash.v);
++}
++
++static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
++{
++ entry->hash.v = cpu_to_le32(value);
++}
++
++static inline unsigned dx_get_count (struct dx_entry *entries)
++{
++ return le16_to_cpu(((struct dx_countlimit *) entries)->count.v);
++}
++
++static inline unsigned dx_get_limit (struct dx_entry *entries)
++{
++ return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v);
++}
++
++static inline void dx_set_count (struct dx_entry *entries, unsigned value)
++{
++ ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value);
++}
++
++static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
++{
++ ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value);
++}
++
++static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
++{
++ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
++ EXT3_DIR_REC_LEN(2) - infosize;
++ return 0? 20: entry_space / sizeof(struct dx_entry);
++}
++
++static inline unsigned dx_node_limit (struct inode *dir)
++{
++ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
++ return 0? 22: entry_space / sizeof(struct dx_entry);
++}
++
++/*
++ * Debug
++ */
++#ifdef DX_DEBUG
++struct stats
++{
++ unsigned names;
++ unsigned space;
++ unsigned bcount;
++};
++
++static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
++ int size, int show_names)
++{
++ unsigned names = 0, space = 0;
++ char *base = (char *) de;
++ struct dx_hash_info h = *hinfo;
++
++ printk("names: ");
++ while ((char *) de < base + size)
++ {
++ if (de->inode)
++ {
++ if (show_names)
++ {
++ int len = de->name_len;
++ char *name = de->name;
++ while (len--) printk("%c", *name++);
++ ext3fs_dirhash(de->name, de->name_len, &h);
++ printk(":%x.%u ", h.hash,
++ ((char *) de - base));
++ }
++ space += EXT3_DIR_REC_LEN(de->name_len);
++ names++;
++ }
++ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++ }
++ printk("(%i)\n", names);
++ return (struct stats) { names, space, 1 };
++}
++
++struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
++ struct dx_entry *entries, int levels)
++{
++ unsigned blocksize = dir->i_sb->s_blocksize;
++ unsigned count = dx_get_count (entries), names = 0, space = 0, i;
++ unsigned bcount = 0;
++ struct buffer_head *bh;
++ int err;
++ printk("%i indexed blocks...\n", count);
++ for (i = 0; i < count; i++, entries++)
++ {
++ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
++ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
++ struct stats stats;
++ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
++ if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
++ stats = levels?
++ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
++ dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
++ names += stats.names;
++ space += stats.space;
++ bcount += stats.bcount;
++ brelse (bh);
++ }
++ if (bcount)
++ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
++ names, space/bcount,(space/bcount)*100/blocksize);
++ return (struct stats) { names, space, bcount};
++}
++#endif /* DX_DEBUG */
++
++/*
++ * Probe for a directory leaf block to search.
++ *
++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
++ * error in the directory index, and the caller should fall back to
++ * searching the directory normally. The callers of dx_probe **MUST**
++ * check for this error code, and make sure it never gets reflected
++ * back to userspace.
++ */
++static struct dx_frame *
++dx_probe(struct dentry *dentry, struct inode *dir,
++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
++{
++ unsigned count, indirect;
++ struct dx_entry *at, *entries, *p, *q, *m;
++ struct dx_root *root;
++ struct buffer_head *bh;
++ struct dx_frame *frame = frame_in;
++ u32 hash;
++
++ frame->bh = NULL;
++ if (dentry)
++ dir = dentry->d_parent->d_inode;
++ if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
++ goto fail;
++ root = (struct dx_root *) bh->b_data;
++ if (root->info.hash_version != DX_HASH_TEA &&
++ root->info.hash_version != DX_HASH_HALF_MD4 &&
++ root->info.hash_version != DX_HASH_LEGACY) {
++ ext3_warning(dir->i_sb, __FUNCTION__,
++ "Unrecognised inode hash code %d",
++ root->info.hash_version);
++ brelse(bh);
++ *err = ERR_BAD_DX_DIR;
++ goto fail;
++ }
++ hinfo->hash_version = root->info.hash_version;
++ hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed;
++ if (dentry)
++ ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
++ hash = hinfo->hash;
++
++ if (root->info.unused_flags & 1) {
++ ext3_warning(dir->i_sb, __FUNCTION__,
++ "Unimplemented inode hash flags: %#06x",
++ root->info.unused_flags);
++ brelse(bh);
++ *err = ERR_BAD_DX_DIR;
++ goto fail;
++ }
++
++ if ((indirect = root->info.indirect_levels) > 1) {
++ ext3_warning(dir->i_sb, __FUNCTION__,
++ "Unimplemented inode hash depth: %#06x",
++ root->info.indirect_levels);
++ brelse(bh);
++ *err = ERR_BAD_DX_DIR;
++ goto fail;
++ }
++
++ entries = (struct dx_entry *) (((char *)&root->info) +
++ root->info.info_length);
++ assert(dx_get_limit(entries) == dx_root_limit(dir,
++ root->info.info_length));
++ dxtrace (printk("Look up %x", hash));
++ while (1)
++ {
++ count = dx_get_count(entries);
++ assert (count && count <= dx_get_limit(entries));
++ p = entries + 1;
++ q = entries + count - 1;
++ while (p <= q)
++ {
++ m = p + (q - p)/2;
++ dxtrace(printk("."));
++ if (dx_get_hash(m) > hash)
++ q = m - 1;
++ else
++ p = m + 1;
++ }
++
++ if (0) // linear search cross check
++ {
++ unsigned n = count - 1;
++ at = entries;
++ while (n--)
++ {
++ dxtrace(printk(","));
++ if (dx_get_hash(++at) > hash)
++ {
++ at--;
++ break;
++ }
++ }
++ assert (at == p - 1);
++ }
++
++ at = p - 1;
++ dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
++ frame->bh = bh;
++ frame->entries = entries;
++ frame->at = at;
++ if (!indirect--) return frame;
++ if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
++ goto fail2;
++ at = entries = ((struct dx_node *) bh->b_data)->entries;
++ assert (dx_get_limit(entries) == dx_node_limit (dir));
++ frame++;
++ }
++fail2:
++ while (frame >= frame_in) {
++ brelse(frame->bh);
++ frame--;
++ }
++fail:
++ return NULL;
++}
++
++static void dx_release (struct dx_frame *frames)
++{
++ if (frames[0].bh == NULL)
++ return;
++
++ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
++ brelse(frames[1].bh);
++ brelse(frames[0].bh);
++}
++
++/*
++ * This function increments the frame pointer to search the next leaf
++ * block, and reads in the necessary intervening nodes if the search
++ * should be necessary. Whether or not the search is necessary is
++ * controlled by the hash parameter. If the hash value is even, then
++ * the search is only continued if the next block starts with that
++ * hash value. This is used if we are searching for a specific file.
++ *
++ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
++ *
++ * This function returns 1 if the caller should continue to search,
++ * or 0 if it should not. If there is an error reading one of the
++ * index blocks, it will return -1.
++ *
++ * If start_hash is non-null, it will be filled in with the starting
++ * hash of the next page.
++ */
++static int ext3_htree_next_block(struct inode *dir, __u32 hash,
++ struct dx_frame *frame,
++ struct dx_frame *frames, int *err,
++ __u32 *start_hash)
++{
++ struct dx_frame *p;
++ struct buffer_head *bh;
++ int num_frames = 0;
++ __u32 bhash;
++
++ *err = ENOENT;
++ p = frame;
++ /*
++ * Find the next leaf page by incrementing the frame pointer.
++ * If we run out of entries in the interior node, loop around and
++ * increment pointer in the parent node. When we break out of
++ * this loop, num_frames indicates the number of interior
++ * nodes that need to be read.
++ */
++ while (1) {
++ if (++(p->at) < p->entries + dx_get_count(p->entries))
++ break;
++ if (p == frames)
++ return 0;
++ num_frames++;
++ p--;
++ }
++
++ /*
++ * If the hash is 1, then continue only if the next page has a
++ * continuation hash of any value. This is used for readdir
++ * handling. Otherwise, check to see if the hash matches the
++ * desired continuation hash. If it doesn't, return since
++ * there's no point to read in the successive index pages.
++ */
++ bhash = dx_get_hash(p->at);
++ if (start_hash)
++ *start_hash = bhash;
++ if ((hash & 1) == 0) {
++ if ((bhash & ~1) != hash)
++ return 0;
++ }
++ /*
++ * If the hash is HASH_NB_ALWAYS, we always go to the next
++ * block so no check is necessary
++ */
++ while (num_frames--) {
++ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
++ 0, err)))
++ return -1; /* Failure */
++ p++;
++ brelse (p->bh);
++ p->bh = bh;
++ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
++ }
++ return 1;
++}
++
++
++/*
++ * p is at least 6 bytes before the end of page
++ */
++static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p)
++{
++ return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
++}
++
++/*
++ * This function fills a red-black tree with information from a
++ * directory. We start scanning the directory in hash order, starting
++ * at start_hash and start_minor_hash.
++ *
++ * This function returns the number of entries inserted into the tree,
++ * or a negative error code.
++ */
++int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
++ __u32 start_minor_hash, __u32 *next_hash)
++{
++ struct dx_hash_info hinfo;
++ struct buffer_head *bh;
++ struct ext3_dir_entry_2 *de, *top;
++ static struct dx_frame frames[2], *frame;
++ struct inode *dir;
++ int block, err;
++ int count = 0;
++ int ret;
++ __u32 hashval;
++
++ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
++ start_minor_hash));
++ dir = dir_file->f_dentry->d_inode;
++ hinfo.hash = start_hash;
++ hinfo.minor_hash = 0;
++ frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
++ if (!frame)
++ return err;
++
++ /* Add '.' and '..' from the htree header */
++ if (!start_hash && !start_minor_hash) {
++ de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
++ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
++ goto errout;
++ de = ext3_next_entry(de);
++ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
++ goto errout;
++ count += 2;
++ }
++
++ while (1) {
++ block = dx_get_block(frame->at);
++ dxtrace(printk("Reading block %d\n", block));
++ if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
++ goto errout;
++
++ de = (struct ext3_dir_entry_2 *) bh->b_data;
++ top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize -
++ EXT3_DIR_REC_LEN(0));
++ for (; de < top; de = ext3_next_entry(de)) {
++ ext3fs_dirhash(de->name, de->name_len, &hinfo);
++ if ((hinfo.hash < start_hash) ||
++ ((hinfo.hash == start_hash) &&
++ (hinfo.minor_hash < start_minor_hash)))
++ continue;
++ if ((err = ext3_htree_store_dirent(dir_file,
++ hinfo.hash, hinfo.minor_hash, de)) != 0)
++ goto errout;
++ count++;
++ }
++ brelse (bh);
++ hashval = ~1;
++ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
++ frame, frames, &err, &hashval);
++ if (next_hash)
++ *next_hash = hashval;
++ if (ret == -1)
++ goto errout;
++ /*
++ * Stop if: (a) there are no more entries, or
++ * (b) we have inserted at least one entry and the
++ * next hash value is not a continuation
++ */
++ if ((ret == 0) ||
++ (count && ((hashval & 1) == 0)))
++ break;
++ }
++ dx_release(frames);
++ dxtrace(printk("Fill tree: returned %d entries\n", count));
++ return count;
++errout:
++ dx_release(frames);
++ return (err);
++}
++
++
++/*
++ * Directory block splitting, compacting
++ */
++
++static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
++ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
++{
++ int count = 0;
++ char *base = (char *) de;
++ struct dx_hash_info h = *hinfo;
++
++ while ((char *) de < base + size)
++ {
++ if (de->name_len && de->inode) {
++ ext3fs_dirhash(de->name, de->name_len, &h);
++ map_tail--;
++ map_tail->hash = h.hash;
++ map_tail->offs = (u32) ((char *) de - base);
++ count++;
++ }
++ /* XXX: do we need to check rec_len == 0 case? -Chris */
++ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++ }
++ return count;
++}
++
++static void dx_sort_map (struct dx_map_entry *map, unsigned count)
++{
++ struct dx_map_entry *p, *q, *top = map + count - 1;
++ int more;
++ /* Combsort until bubble sort doesn't suck */
++ while (count > 2)
++ {
++ count = count*10/13;
++ if (count - 9 < 2) /* 9, 10 -> 11 */
++ count = 11;
++ for (p = top, q = p - count; q >= map; p--, q--)
++ if (p->hash < q->hash)
++ swap(*p, *q);
++ }
++ /* Garden variety bubble sort */
++ do {
++ more = 0;
++ q = top;
++ while (q-- > map)
++ {
++ if (q[1].hash >= q[0].hash)
++ continue;
++ swap(*(q+1), *q);
++ more = 1;
++ }
++ } while(more);
++}
++
++static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
++{
++ struct dx_entry *entries = frame->entries;
++ struct dx_entry *old = frame->at, *new = old + 1;
++ int count = dx_get_count(entries);
++
++ assert(count < dx_get_limit(entries));
++ assert(old < entries + count);
++ memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
++ dx_set_hash(new, hash);
++ dx_set_block(new, block);
++ dx_set_count(entries, count + 1);
++}
++#endif
++
++
++static void ext3_update_dx_flag(struct inode *inode)
++{
++ if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
++ EXT3_FEATURE_COMPAT_DIR_INDEX))
++ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
++}
++
+ /*
+ * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
+ *
+@@ -94,6 +736,7 @@ static int inline search_dirblock(struct
+ return 0;
+ }
+
++
+ /*
+ * ext3_find_entry()
+ *
+@@ -105,6 +748,8 @@ static int inline search_dirblock(struct
+ * The returned buffer_head has ->b_count elevated. The caller is expected
+ * to brelse() it when appropriate.
+ */
++
++
+ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
+ struct ext3_dir_entry_2 ** res_dir)
+ {
+@@ -119,12 +764,32 @@ static struct buffer_head * ext3_find_en
+ int num = 0;
+ int nblocks, i, err;
+ struct inode *dir = dentry->d_parent->d_inode;
++ int namelen;
++ const u8 *name;
++ unsigned blocksize;
+
+ *res_dir = NULL;
+ sb = dir->i_sb;
+-
++ blocksize = sb->s_blocksize;
++ namelen = dentry->d_name.len;
++ name = dentry->d_name.name;
++ if (namelen > EXT3_NAME_LEN)
++ return NULL;
++#ifdef CONFIG_EXT3_INDEX
++ if (is_dx(dir)) {
++ bh = ext3_dx_find_entry(dentry, res_dir, &err);
++ /*
++ * On success, or if the error was file not found,
++ * return. Otherwise, fall back to doing a search the
++ * old fashioned way.
++ */
++ if (bh || (err != ERR_BAD_DX_DIR))
++ return bh;
++ dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
++ }
++#endif
+ nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+- start = dir->u.ext3_i.i_dir_start_lookup;
++ start = EXT3_I(dir)->i_dir_start_lookup;
+ if (start >= nblocks)
+ start = 0;
+ block = start;
+@@ -166,7 +831,7 @@ restart:
+ i = search_dirblock(bh, dir, dentry,
+ block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
+ if (i == 1) {
+- dir->u.ext3_i.i_dir_start_lookup = block;
++ EXT3_I(dir)->i_dir_start_lookup = block;
+ ret = bh;
+ goto cleanup_and_exit;
+ } else {
+@@ -197,6 +862,66 @@ cleanup_and_exit:
+ return ret;
+ }
+
++#ifdef CONFIG_EXT3_INDEX
++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
++ struct ext3_dir_entry_2 **res_dir, int *err)
++{
++ struct super_block * sb;
++ struct dx_hash_info hinfo;
++ u32 hash;
++ struct dx_frame frames[2], *frame;
++ struct ext3_dir_entry_2 *de, *top;
++ struct buffer_head *bh;
++ unsigned long block;
++ int retval;
++ int namelen = dentry->d_name.len;
++ const u8 *name = dentry->d_name.name;
++ struct inode *dir = dentry->d_parent->d_inode;
++
++ sb = dir->i_sb;
++ if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err)))
++ return NULL;
++ hash = hinfo.hash;
++ do {
++ block = dx_get_block(frame->at);
++ if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
++ goto errout;
++ de = (struct ext3_dir_entry_2 *) bh->b_data;
++ top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
++ EXT3_DIR_REC_LEN(0));
++ for (; de < top; de = ext3_next_entry(de))
++ if (ext3_match (namelen, name, de)) {
++ if (!ext3_check_dir_entry("ext3_find_entry",
++ dir, de, bh,
++ (block<<EXT3_BLOCK_SIZE_BITS(sb))
++ +((char *)de - bh->b_data))) {
++ brelse (bh);
++ goto errout;
++ }
++ *res_dir = de;
++ dx_release (frames);
++ return bh;
++ }
++ brelse (bh);
++ /* Check to see if we should continue to search */
++ retval = ext3_htree_next_block(dir, hash, frame,
++ frames, err, 0);
++ if (retval == -1) {
++ ext3_warning(sb, __FUNCTION__,
++ "error reading index page in directory #%lu",
++ dir->i_ino);
++ goto errout;
++ }
++ } while (retval == 1);
++
++ *err = -ENOENT;
++errout:
++ dxtrace(printk("%s not found\n", name));
++ dx_release (frames);
++ return NULL;
++}
++#endif
++
+ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
+ {
+ struct inode * inode;
+@@ -213,8 +938,9 @@ static struct dentry *ext3_lookup(struct
+ brelse (bh);
+ inode = iget(dir->i_sb, ino);
+
+- if (!inode)
++ if (!inode) {
+ return ERR_PTR(-EACCES);
++ }
+ }
+ d_add(dentry, inode);
+ return NULL;
+@@ -238,6 +964,301 @@ static inline void ext3_set_de_type(stru
+ de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ }
+
++#ifdef CONFIG_EXT3_INDEX
++static struct ext3_dir_entry_2 *
++dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
++{
++ unsigned rec_len = 0;
++
++ while (count--) {
++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
++ rec_len = EXT3_DIR_REC_LEN(de->name_len);
++ memcpy (to, de, rec_len);
++ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len;
++ de->inode = 0;
++ map++;
++ to += rec_len;
++ }
++ return (struct ext3_dir_entry_2 *) (to - rec_len);
++}
++
++static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
++{
++ struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
++ unsigned rec_len = 0;
++
++ prev = to = de;
++ while ((char*)de < base + size) {
++ next = (struct ext3_dir_entry_2 *) ((char *) de +
++ le16_to_cpu(de->rec_len));
++ if (de->inode && de->name_len) {
++ rec_len = EXT3_DIR_REC_LEN(de->name_len);
++ if (de > to)
++ memmove(to, de, rec_len);
++ to->rec_len = rec_len;
++ prev = to;
++ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
++ }
++ de = next;
++ }
++ return prev;
++}
++
++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
++ struct buffer_head **bh,struct dx_frame *frame,
++ struct dx_hash_info *hinfo, int *error)
++{
++ unsigned blocksize = dir->i_sb->s_blocksize;
++ unsigned count, continued;
++ struct buffer_head *bh2;
++ u32 newblock;
++ u32 hash2;
++ struct dx_map_entry *map;
++ char *data1 = (*bh)->b_data, *data2;
++ unsigned split;
++ struct ext3_dir_entry_2 *de = NULL, *de2;
++ int err;
++
++ bh2 = ext3_append (handle, dir, &newblock, error);
++ if (!(bh2)) {
++ brelse(*bh);
++ *bh = NULL;
++ goto errout;
++ }
++
++ BUFFER_TRACE(*bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, *bh);
++ if (err) {
++ journal_error:
++ brelse(*bh);
++ brelse(bh2);
++ *bh = NULL;
++ ext3_std_error(dir->i_sb, err);
++ goto errout;
++ }
++ BUFFER_TRACE(frame->bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, frame->bh);
++ if (err)
++ goto journal_error;
++
++ data2 = bh2->b_data;
++
++ /* create map in the end of data2 block */
++ map = (struct dx_map_entry *) (data2 + blocksize);
++ count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
++ blocksize, hinfo, map);
++ map -= count;
++ split = count/2; // need to adjust to actual middle
++ dx_sort_map (map, count);
++ hash2 = map[split].hash;
++ continued = hash2 == map[split - 1].hash;
++ dxtrace(printk("Split block %i at %x, %i/%i\n",
++ dx_get_block(frame->at), hash2, split, count-split));
++
++ /* Fancy dance to stay within two buffers */
++ de2 = dx_move_dirents(data1, data2, map + split, count - split);
++ de = dx_pack_dirents(data1,blocksize);
++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
++ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
++
++ /* Which block gets the new entry? */
++ if (hinfo->hash >= hash2)
++ {
++ swap(*bh, bh2);
++ de = de2;
++ }
++ dx_insert_block (frame, hash2 + continued, newblock);
++ err = ext3_journal_dirty_metadata (handle, bh2);
++ if (err)
++ goto journal_error;
++ err = ext3_journal_dirty_metadata (handle, frame->bh);
++ if (err)
++ goto journal_error;
++ brelse (bh2);
++ dxtrace(dx_show_index ("frame", frame->entries));
++errout:
++ return de;
++}
++#endif
++
++
++/*
++ * Add a new entry into a directory (leaf) block. If de is non-NULL,
++ * it points to a directory entry which is guaranteed to be large
++ * enough for new directory entry. If de is NULL, then
++ * add_dirent_to_buf will attempt to search the directory block for
++ * space. It will return -ENOSPC if no space is available, and -EIO
++ * and -EEXIST if directory entry already exists.
++ *
++ * NOTE! bh is NOT released in the case where ENOSPC is returned. In
++ * all other cases bh is released.
++ */
++static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
++ struct inode *inode, struct ext3_dir_entry_2 *de,
++ struct buffer_head * bh)
++{
++ struct inode *dir = dentry->d_parent->d_inode;
++ const char *name = dentry->d_name.name;
++ int namelen = dentry->d_name.len;
++ unsigned long offset = 0;
++ unsigned short reclen;
++ int nlen, rlen, err;
++ char *top;
++
++ reclen = EXT3_DIR_REC_LEN(namelen);
++ if (!de) {
++ de = (struct ext3_dir_entry_2 *)bh->b_data;
++ top = bh->b_data + dir->i_sb->s_blocksize - reclen;
++ while ((char *) de <= top) {
++ if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
++ bh, offset)) {
++ brelse (bh);
++ return -EIO;
++ }
++ if (ext3_match (namelen, name, de)) {
++ brelse (bh);
++ return -EEXIST;
++ }
++ nlen = EXT3_DIR_REC_LEN(de->name_len);
++ rlen = le16_to_cpu(de->rec_len);
++ if ((de->inode? rlen - nlen: rlen) >= reclen)
++ break;
++ de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
++ offset += rlen;
++ }
++ if ((char *) de > top)
++ return -ENOSPC;
++ }
++ BUFFER_TRACE(bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err) {
++ ext3_std_error(dir->i_sb, err);
++ brelse(bh);
++ return err;
++ }
++
++ /* By now the buffer is marked for journaling */
++ nlen = EXT3_DIR_REC_LEN(de->name_len);
++ rlen = le16_to_cpu(de->rec_len);
++ if (de->inode) {
++ struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
++ de1->rec_len = cpu_to_le16(rlen - nlen);
++ de->rec_len = cpu_to_le16(nlen);
++ de = de1;
++ }
++ de->file_type = EXT3_FT_UNKNOWN;
++ if (inode) {
++ de->inode = cpu_to_le32(inode->i_ino);
++ ext3_set_de_type(dir->i_sb, de, inode->i_mode);
++ } else
++ de->inode = 0;
++ de->name_len = namelen;
++ memcpy (de->name, name, namelen);
++ /*
++ * XXX shouldn't update any times until successful
++ * completion of syscall, but too many callers depend
++ * on this.
++ *
++ * XXX similarly, too many callers depend on
++ * ext3_new_inode() setting the times, but error
++ * recovery deletes the inode, so the worst that can
++ * happen is that the times are slightly out of date
++ * and/or different from the directory change time.
++ */
++ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
++ ext3_update_dx_flag(dir);
++ dir->i_version = ++event;
++ ext3_mark_inode_dirty(handle, dir);
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, bh);
++ if (err)
++ ext3_std_error(dir->i_sb, err);
++ brelse(bh);
++ return 0;
++}
++
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * This converts a one block unindexed directory to a 3 block indexed
++ * directory, and adds the dentry to the indexed directory.
++ */
++static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
++ struct inode *inode, struct buffer_head *bh)
++{
++ struct inode *dir = dentry->d_parent->d_inode;
++ const char *name = dentry->d_name.name;
++ int namelen = dentry->d_name.len;
++ struct buffer_head *bh2;
++ struct dx_root *root;
++ struct dx_frame frames[2], *frame;
++ struct dx_entry *entries;
++ struct ext3_dir_entry_2 *de, *de2;
++ char *data1, *top;
++ unsigned len;
++ int retval;
++ unsigned blocksize;
++ struct dx_hash_info hinfo;
++ u32 block;
++
++ blocksize = dir->i_sb->s_blocksize;
++ dxtrace(printk("Creating index\n"));
++ retval = ext3_journal_get_write_access(handle, bh);
++ if (retval) {
++ ext3_std_error(dir->i_sb, retval);
++ brelse(bh);
++ return retval;
++ }
++ root = (struct dx_root *) bh->b_data;
++
++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
++ bh2 = ext3_append (handle, dir, &block, &retval);
++ if (!(bh2)) {
++ brelse(bh);
++ return retval;
++ }
++ data1 = bh2->b_data;
++
++ /* The 0th block becomes the root, move the dirents out */
++ de = &root->dotdot;
++ de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len);
++ len = ((char *) root) + blocksize - (char *) de;
++ memcpy (data1, de, len);
++ de = (struct ext3_dir_entry_2 *) data1;
++ top = data1 + len;
++ while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top)
++ de = de2;
++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
++ /* Initialize the root; the dot dirents already exist */
++ de = (struct ext3_dir_entry_2 *) (&root->dotdot);
++ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2));
++ memset (&root->info, 0, sizeof(root->info));
++ root->info.info_length = sizeof(root->info);
++ root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version;
++ entries = root->entries;
++ dx_set_block (entries, 1);
++ dx_set_count (entries, 1);
++ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
++
++ /* Initialize as for dx_probe */
++ hinfo.hash_version = root->info.hash_version;
++ hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed;
++ ext3fs_dirhash(name, namelen, &hinfo);
++ frame = frames;
++ frame->entries = entries;
++ frame->at = entries;
++ frame->bh = bh;
++ bh = bh2;
++ de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
++ dx_release (frames);
++ if (!(de))
++ return retval;
++
++ return add_dirent_to_buf(handle, dentry, inode, de, bh);
++}
++#endif
++
+ /*
+ * ext3_add_entry()
+ *
+@@ -248,127 +1269,198 @@ static inline void ext3_set_de_type(stru
+ * may not sleep between calling this and putting something into
+ * the entry, as someone else might have used it while you slept.
+ */
+-
+-/*
+- * AKPM: the journalling code here looks wrong on the error paths
+- */
+ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+ {
+ struct inode *dir = dentry->d_parent->d_inode;
+- const char *name = dentry->d_name.name;
+- int namelen = dentry->d_name.len;
+ unsigned long offset;
+- unsigned short rec_len;
+ struct buffer_head * bh;
+- struct ext3_dir_entry_2 * de, * de1;
++ struct ext3_dir_entry_2 *de;
+ struct super_block * sb;
+ int retval;
++#ifdef CONFIG_EXT3_INDEX
++ int dx_fallback=0;
++#endif
++ unsigned blocksize;
++ unsigned nlen, rlen;
++ u32 block, blocks;
+
+ sb = dir->i_sb;
+-
+- if (!namelen)
++ blocksize = sb->s_blocksize;
++ if (!dentry->d_name.len)
+ return -EINVAL;
+- bh = ext3_bread (handle, dir, 0, 0, &retval);
++#ifdef CONFIG_EXT3_INDEX
++ if (is_dx(dir)) {
++ retval = ext3_dx_add_entry(handle, dentry, inode);
++ if (!retval || (retval != ERR_BAD_DX_DIR))
++ return retval;
++ EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
++ dx_fallback++;
++ ext3_mark_inode_dirty(handle, dir);
++ }
++#endif
++ blocks = dir->i_size >> sb->s_blocksize_bits;
++ for (block = 0, offset = 0; block < blocks; block++) {
++ bh = ext3_bread(handle, dir, block, 0, &retval);
++ if(!bh)
++ return retval;
++ retval = add_dirent_to_buf(handle, dentry, inode, 0, bh);
++ if (retval != -ENOSPC)
++ return retval;
++
++#ifdef CONFIG_EXT3_INDEX
++ if (blocks == 1 && !dx_fallback &&
++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
++ return make_indexed_dir(handle, dentry, inode, bh);
++#endif
++ brelse(bh);
++ }
++ bh = ext3_append(handle, dir, &block, &retval);
+ if (!bh)
+ return retval;
+- rec_len = EXT3_DIR_REC_LEN(namelen);
+- offset = 0;
+ de = (struct ext3_dir_entry_2 *) bh->b_data;
+- while (1) {
+- if ((char *)de >= sb->s_blocksize + bh->b_data) {
+- brelse (bh);
+- bh = NULL;
+- bh = ext3_bread (handle, dir,
+- offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval);
+- if (!bh)
+- return retval;
+- if (dir->i_size <= offset) {
+- if (dir->i_size == 0) {
+- brelse(bh);
+- return -ENOENT;
+- }
++ de->inode = 0;
++ de->rec_len = cpu_to_le16(rlen = blocksize);
++ nlen = 0;
++ return add_dirent_to_buf(handle, dentry, inode, de, bh);
++}
+
+- ext3_debug ("creating next block\n");
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * Returns 0 for success, or a negative error value
++ */
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode)
++{
++ struct dx_frame frames[2], *frame;
++ struct dx_entry *entries, *at;
++ struct dx_hash_info hinfo;
++ struct buffer_head * bh;
++ struct inode *dir = dentry->d_parent->d_inode;
++ struct super_block * sb = dir->i_sb;
++ struct ext3_dir_entry_2 *de;
++ int err;
+
+- BUFFER_TRACE(bh, "get_write_access");
+- ext3_journal_get_write_access(handle, bh);
+- de = (struct ext3_dir_entry_2 *) bh->b_data;
+- de->inode = 0;
+- de->rec_len = le16_to_cpu(sb->s_blocksize);
+- dir->u.ext3_i.i_disksize =
+- dir->i_size = offset + sb->s_blocksize;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+- ext3_mark_inode_dirty(handle, dir);
+- } else {
++ frame = dx_probe(dentry, 0, &hinfo, frames, &err);
++ if (!frame)
++ return err;
++ entries = frame->entries;
++ at = frame->at;
+
+- ext3_debug ("skipping to next block\n");
++ if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
++ goto cleanup;
+
+- de = (struct ext3_dir_entry_2 *) bh->b_data;
+- }
+- }
+- if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh,
+- offset)) {
+- brelse (bh);
+- return -ENOENT;
+- }
+- if (ext3_match (namelen, name, de)) {
+- brelse (bh);
+- return -EEXIST;
++ BUFFER_TRACE(bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err)
++ goto journal_error;
++
++ err = add_dirent_to_buf(handle, dentry, inode, 0, bh);
++ if (err != -ENOSPC) {
++ bh = 0;
++ goto cleanup;
++ }
++
++ /* Block full, should compress but for now just split */
++ dxtrace(printk("using %u of %u node entries\n",
++ dx_get_count(entries), dx_get_limit(entries)));
++ /* Need to split index? */
++ if (dx_get_count(entries) == dx_get_limit(entries)) {
++ u32 newblock;
++ unsigned icount = dx_get_count(entries);
++ int levels = frame - frames;
++ struct dx_entry *entries2;
++ struct dx_node *node2;
++ struct buffer_head *bh2;
++
++ if (levels && (dx_get_count(frames->entries) ==
++ dx_get_limit(frames->entries))) {
++ ext3_warning(sb, __FUNCTION__,
++ "Directory index full!\n");
++ err = -ENOSPC;
++ goto cleanup;
+ }
+- if ((le32_to_cpu(de->inode) == 0 &&
+- le16_to_cpu(de->rec_len) >= rec_len) ||
+- (le16_to_cpu(de->rec_len) >=
+- EXT3_DIR_REC_LEN(de->name_len) + rec_len)) {
+- BUFFER_TRACE(bh, "get_write_access");
+- ext3_journal_get_write_access(handle, bh);
+- /* By now the buffer is marked for journaling */
+- offset += le16_to_cpu(de->rec_len);
+- if (le32_to_cpu(de->inode)) {
+- de1 = (struct ext3_dir_entry_2 *) ((char *) de +
+- EXT3_DIR_REC_LEN(de->name_len));
+- de1->rec_len =
+- cpu_to_le16(le16_to_cpu(de->rec_len) -
+- EXT3_DIR_REC_LEN(de->name_len));
+- de->rec_len = cpu_to_le16(
+- EXT3_DIR_REC_LEN(de->name_len));
+- de = de1;
++ bh2 = ext3_append (handle, dir, &newblock, &err);
++ if (!(bh2))
++ goto cleanup;
++ node2 = (struct dx_node *)(bh2->b_data);
++ entries2 = node2->entries;
++ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
++ node2->fake.inode = 0;
++ BUFFER_TRACE(frame->bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, frame->bh);
++ if (err)
++ goto journal_error;
++ if (levels) {
++ unsigned icount1 = icount/2, icount2 = icount - icount1;
++ unsigned hash2 = dx_get_hash(entries + icount1);
++ dxtrace(printk("Split index %i/%i\n", icount1, icount2));
++
++ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
++ err = ext3_journal_get_write_access(handle,
++ frames[0].bh);
++ if (err)
++ goto journal_error;
++
++ memcpy ((char *) entries2, (char *) (entries + icount1),
++ icount2 * sizeof(struct dx_entry));
++ dx_set_count (entries, icount1);
++ dx_set_count (entries2, icount2);
++ dx_set_limit (entries2, dx_node_limit(dir));
++
++ /* Which index block gets the new entry? */
++ if (at - entries >= icount1) {
++ frame->at = at = at - entries - icount1 + entries2;
++ frame->entries = entries = entries2;
++ swap(frame->bh, bh2);
+ }
+- de->file_type = EXT3_FT_UNKNOWN;
+- if (inode) {
+- de->inode = cpu_to_le32(inode->i_ino);
+- ext3_set_de_type(dir->i_sb, de, inode->i_mode);
+- } else
+- de->inode = 0;
+- de->name_len = namelen;
+- memcpy (de->name, name, namelen);
+- /*
+- * XXX shouldn't update any times until successful
+- * completion of syscall, but too many callers depend
+- * on this.
+- *
+- * XXX similarly, too many callers depend on
+- * ext3_new_inode() setting the times, but error
+- * recovery deletes the inode, so the worst that can
+- * happen is that the times are slightly out of date
+- * and/or different from the directory change time.
+- */
+- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+- dir->i_version = ++event;
+- ext3_mark_inode_dirty(handle, dir);
+- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+- ext3_journal_dirty_metadata(handle, bh);
+- brelse(bh);
+- return 0;
++ dx_insert_block (frames + 0, hash2, newblock);
++ dxtrace(dx_show_index ("node", frames[1].entries));
++ dxtrace(dx_show_index ("node",
++ ((struct dx_node *) bh2->b_data)->entries));
++ err = ext3_journal_dirty_metadata(handle, bh2);
++ if (err)
++ goto journal_error;
++ brelse (bh2);
++ } else {
++ dxtrace(printk("Creating second level index...\n"));
++ memcpy((char *) entries2, (char *) entries,
++ icount * sizeof(struct dx_entry));
++ dx_set_limit(entries2, dx_node_limit(dir));
++
++ /* Set up root */
++ dx_set_count(entries, 1);
++ dx_set_block(entries + 0, newblock);
++ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
++
++ /* Add new access path frame */
++ frame = frames + 1;
++ frame->at = at = at - entries + entries2;
++ frame->entries = entries = entries2;
++ frame->bh = bh2;
++ err = ext3_journal_get_write_access(handle,
++ frame->bh);
++ if (err)
++ goto journal_error;
+ }
+- offset += le16_to_cpu(de->rec_len);
+- de = (struct ext3_dir_entry_2 *)
+- ((char *) de + le16_to_cpu(de->rec_len));
++ ext3_journal_dirty_metadata(handle, frames[0].bh);
+ }
+- brelse (bh);
+- return -ENOSPC;
++ de = do_split(handle, dir, &bh, frame, &hinfo, &err);
++ if (!de)
++ goto cleanup;
++ err = add_dirent_to_buf(handle, dentry, inode, de, bh);
++ bh = 0;
++ goto cleanup;
++
++journal_error:
++ ext3_std_error(dir->i_sb, err);
++cleanup:
++ if (bh)
++ brelse(bh);
++ dx_release(frames);
++ return err;
+ }
++#endif
+
+ /*
+ * ext3_delete_entry deletes a directory entry by merging it with the
+@@ -455,9 +1547,11 @@ static int ext3_create (struct inode * d
+ struct inode * inode;
+ int err;
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -481,9 +1575,11 @@ static int ext3_mknod (struct inode * di
+ struct inode *inode;
+ int err;
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -509,9 +1605,11 @@ static int ext3_mkdir(struct inode * dir
+ if (dir->i_nlink >= EXT3_LINK_MAX)
+ return -EMLINK;
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -523,7 +1621,7 @@ static int ext3_mkdir(struct inode * dir
+
+ inode->i_op = &ext3_dir_inode_operations;
+ inode->i_fop = &ext3_dir_operations;
+- inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize;
++ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+ inode->i_blocks = 0;
+ dir_block = ext3_bread (handle, inode, 0, 1, &err);
+ if (!dir_block) {
+@@ -556,21 +1654,19 @@ static int ext3_mkdir(struct inode * dir
+ inode->i_mode |= S_ISGID;
+ ext3_mark_inode_dirty(handle, inode);
+ err = ext3_add_entry (handle, dentry, inode);
+- if (err)
+- goto out_no_entry;
++ if (err) {
++ inode->i_nlink = 0;
++ ext3_mark_inode_dirty(handle, inode);
++ iput (inode);
++ goto out_stop;
++ }
+ dir->i_nlink++;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+ d_instantiate(dentry, inode);
+ out_stop:
+ ext3_journal_stop(handle, dir);
+ return err;
+-
+-out_no_entry:
+- inode->i_nlink = 0;
+- ext3_mark_inode_dirty(handle, inode);
+- iput (inode);
+- goto out_stop;
+ }
+
+ /*
+@@ -657,7 +1753,7 @@ int ext3_orphan_add(handle_t *handle, st
+ int err = 0, rc;
+
+ lock_super(sb);
+- if (!list_empty(&inode->u.ext3_i.i_orphan))
++ if (!list_empty(&EXT3_I(inode)->i_orphan))
+ goto out_unlock;
+
+ /* Orphan handling is only valid for files with data blocks
+@@ -698,7 +1794,7 @@ int ext3_orphan_add(handle_t *handle, st
+ * This is safe: on error we're going to ignore the orphan list
+ * anyway on the next recovery. */
+ if (!err)
+- list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan);
++ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
+
+ jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
+ jbd_debug(4, "orphan inode %ld will point to %d\n",
+@@ -716,25 +1812,26 @@ out_unlock:
+ int ext3_orphan_del(handle_t *handle, struct inode *inode)
+ {
+ struct list_head *prev;
++ struct ext3_inode_info *ei = EXT3_I(inode);
+ struct ext3_sb_info *sbi;
+ unsigned long ino_next;
+ struct ext3_iloc iloc;
+ int err = 0;
+
+ lock_super(inode->i_sb);
+- if (list_empty(&inode->u.ext3_i.i_orphan)) {
++ if (list_empty(&ei->i_orphan)) {
+ unlock_super(inode->i_sb);
+ return 0;
+ }
+
+ ino_next = NEXT_ORPHAN(inode);
+- prev = inode->u.ext3_i.i_orphan.prev;
++ prev = ei->i_orphan.prev;
+ sbi = EXT3_SB(inode->i_sb);
+
+ jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+
+- list_del(&inode->u.ext3_i.i_orphan);
+- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
++ list_del(&ei->i_orphan);
++ INIT_LIST_HEAD(&ei->i_orphan);
+
+ /* If we're on an error path, we may not have a valid
+ * transaction handle with which to update the orphan list on
+@@ -795,8 +1892,9 @@ static int ext3_rmdir (struct inode * di
+ handle_t *handle;
+
+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+- if (IS_ERR(handle))
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ retval = -ENOENT;
+ bh = ext3_find_entry (dentry, &de);
+@@ -834,7 +1932,7 @@ static int ext3_rmdir (struct inode * di
+ dir->i_nlink--;
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ ext3_mark_inode_dirty(handle, inode);
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+
+ end_rmdir:
+@@ -852,8 +1950,9 @@ static int ext3_unlink(struct inode * di
+ handle_t *handle;
+
+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+- if (IS_ERR(handle))
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -880,7 +1979,7 @@ static int ext3_unlink(struct inode * di
+ if (retval)
+ goto end_unlink;
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+ inode->i_nlink--;
+ if (!inode->i_nlink)
+@@ -906,9 +2005,11 @@ static int ext3_symlink (struct inode *
+ if (l > dir->i_sb->s_blocksize)
+ return -ENAMETOOLONG;
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -918,7 +2019,7 @@ static int ext3_symlink (struct inode *
+ if (IS_ERR(inode))
+ goto out_stop;
+
+- if (l > sizeof (inode->u.ext3_i.i_data)) {
++ if (l > sizeof (EXT3_I(inode)->i_data)) {
+ inode->i_op = &page_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ext3_aops;
+ /*
+@@ -927,24 +2028,22 @@ static int ext3_symlink (struct inode *
+ * i_size in generic_commit_write().
+ */
+ err = block_symlink(inode, symname, l);
+- if (err)
+- goto out_no_entry;
++ if (err) {
++ ext3_dec_count(handle, inode);
++ ext3_mark_inode_dirty(handle, inode);
++ iput (inode);
++ goto out_stop;
++ }
+ } else {
+ inode->i_op = &ext3_fast_symlink_inode_operations;
+- memcpy((char*)&inode->u.ext3_i.i_data,symname,l);
++ memcpy((char*)&EXT3_I(inode)->i_data,symname,l);
+ inode->i_size = l-1;
+ }
+- inode->u.ext3_i.i_disksize = inode->i_size;
++ EXT3_I(inode)->i_disksize = inode->i_size;
+ err = ext3_add_nondir(handle, dentry, inode);
+ out_stop:
+ ext3_journal_stop(handle, dir);
+ return err;
+-
+-out_no_entry:
+- ext3_dec_count(handle, inode);
+- ext3_mark_inode_dirty(handle, inode);
+- iput (inode);
+- goto out_stop;
+ }
+
+ static int ext3_link (struct dentry * old_dentry,
+@@ -957,12 +2056,15 @@ static int ext3_link (struct dentry * ol
+ if (S_ISDIR(inode->i_mode))
+ return -EPERM;
+
+- if (inode->i_nlink >= EXT3_LINK_MAX)
++ if (inode->i_nlink >= EXT3_LINK_MAX) {
+ return -EMLINK;
++ }
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -995,9 +2097,11 @@ static int ext3_rename (struct inode * o
+
+ old_bh = new_bh = dir_bh = NULL;
+
+- handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
+ handle->h_sync = 1;
+@@ -1070,14 +2174,33 @@ static int ext3_rename (struct inode * o
+ /*
+ * ok, that's it
+ */
+- ext3_delete_entry(handle, old_dir, old_de, old_bh);
++ retval = ext3_delete_entry(handle, old_dir, old_de, old_bh);
++ if (retval == -ENOENT) {
++ /*
++ * old_de could have moved out from under us.
++ */
++ struct buffer_head *old_bh2;
++ struct ext3_dir_entry_2 *old_de2;
++
++ old_bh2 = ext3_find_entry(old_dentry, &old_de2);
++ if (old_bh2) {
++ retval = ext3_delete_entry(handle, old_dir,
++ old_de2, old_bh2);
++ brelse(old_bh2);
++ }
++ }
++ if (retval) {
++ ext3_warning(old_dir->i_sb, "ext3_rename",
++ "Deleting old file (%lu), %d, error=%d",
++ old_dir->i_ino, old_dir->i_nlink, retval);
++ }
+
+ if (new_inode) {
+ new_inode->i_nlink--;
+ new_inode->i_ctime = CURRENT_TIME;
+ }
+ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+- old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(old_dir);
+ if (dir_bh) {
+ BUFFER_TRACE(dir_bh, "get_write_access");
+ ext3_journal_get_write_access(handle, dir_bh);
+@@ -1089,7 +2212,7 @@ static int ext3_rename (struct inode * o
+ new_inode->i_nlink--;
+ } else {
+ new_dir->i_nlink++;
+- new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(new_dir);
+ ext3_mark_inode_dirty(handle, new_dir);
+ }
+ }
+--- linux-2.4.22-ac1/fs/ext3/super.c~ext3-htree-2.4.22-rh 2003-09-25 14:39:01.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/super.c 2003-09-25 14:55:12.000000000 +0400
+@@ -714,6 +714,7 @@ static int ext3_setup_super(struct super
+ es->s_mtime = cpu_to_le32(CURRENT_TIME);
+ ext3_update_dynamic_rev(sb);
+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
++
+ ext3_commit_super (sb, es, 1);
+ if (test_opt (sb, DEBUG))
+ printk (KERN_INFO
+@@ -724,6 +725,7 @@ static int ext3_setup_super(struct super
+ EXT3_BLOCKS_PER_GROUP(sb),
+ EXT3_INODES_PER_GROUP(sb),
+ sbi->s_mount_opt);
++
+ printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ",
+ bdevname(sb->s_dev));
+ if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
+@@ -897,6 +899,7 @@ static loff_t ext3_max_size(int bits)
+ return res;
+ }
+
++
+ struct super_block * ext3_read_super (struct super_block * sb, void * data,
+ int silent)
+ {
+@@ -1073,6 +1076,9 @@ struct super_block * ext3_read_super (st
+ sbi->s_mount_state = le16_to_cpu(es->s_state);
+ sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb));
+ sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb));
++ for (i=0; i < 4; i++)
++ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
++ sbi->s_def_hash_version = es->s_def_hash_version;
+
+ if (sbi->s_blocks_per_group > blocksize * 8) {
+ printk (KERN_ERR
+@@ -1846,6 +1852,7 @@ static void __exit exit_ext3_fs(void)
+ unregister_filesystem(&ext3_fs_type);
+ }
+
++EXPORT_SYMBOL(ext3_force_commit);
+ EXPORT_SYMBOL(ext3_bread);
+
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+--- linux-2.4.22-ac1/include/linux/ext3_fs.h~ext3-htree-2.4.22-rh 2003-09-25 14:16:29.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/linux/ext3_fs.h 2003-09-25 14:58:30.000000000 +0400
+@@ -40,6 +40,11 @@
+ #define EXT3FS_VERSION "2.4-0.9.19"
+
+ /*
++ * Always enable hashed directories
++ */
++#define CONFIG_EXT3_INDEX
++
++/*
+ * Debug code
+ */
+ #ifdef EXT3FS_DEBUG
+@@ -440,8 +445,11 @@ struct ext3_super_block {
+ /*E0*/ __u32 s_journal_inum; /* inode number of journal file */
+ __u32 s_journal_dev; /* device number of journal file */
+ __u32 s_last_orphan; /* start of list of inodes to delete */
+-
+-/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */
++ __u32 s_hash_seed[4]; /* HTREE hash seed */
++ __u8 s_def_hash_version; /* Default hash version to use */
++ __u8 s_reserved_char_pad;
++ __u16 s_reserved_word_pad;
++ __u32 s_reserved[192]; /* Padding to the end of the block */
+ };
+
+ #ifdef __KERNEL__
+@@ -578,9 +586,46 @@ struct ext3_dir_entry_2 {
+ #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1)
+ #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \
+ ~EXT3_DIR_ROUND)
++/*
++ * Hash Tree Directory indexing
++ * (c) Daniel Phillips, 2001
++ */
++
++#ifdef CONFIG_EXT3_INDEX
++ #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
++ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
++ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
++#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
++#else
++ #define is_dx(dir) 0
++#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
++#endif
++
++/* Legal values for the dx_root hash_version field: */
++
++#define DX_HASH_LEGACY 0
++#define DX_HASH_HALF_MD4 1
++#define DX_HASH_TEA 2
++
++/* hash info structure used by the directory hash */
++struct dx_hash_info
++{
++ u32 hash;
++ u32 minor_hash;
++ int hash_version;
++ u32 *seed;
++};
+
+ #ifdef __KERNEL__
+ /*
++ * Control parameters used by ext3_htree_next_block
++ */
++#define HASH_NB_ALWAYS 1
++
++
++/*
+ * Describe an inode's exact location on disk and in memory
+ */
+ struct ext3_iloc
+@@ -590,6 +635,27 @@ struct ext3_iloc
+ unsigned long block_group;
+ };
+
++
++/*
++ * This structure is stuffed into the struct file's private_data field
++ * for directories. It is where we put information so that we can do
++ * readdir operations in hash tree order.
++ */
++struct dir_private_info {
++ rb_root_t root;
++ rb_node_t *curr_node;
++ struct fname *extra_fname;
++ loff_t last_pos;
++ __u32 curr_hash;
++ __u32 curr_minor_hash;
++ __u32 next_hash;
++};
++
++/*
++ * Special error return code only used by dx_probe() and its callers.
++ */
++#define ERR_BAD_DX_DIR -75000
++
+ /*
+ * Function prototypes
+ */
+@@ -617,11 +683,20 @@ extern struct ext3_group_desc * ext3_get
+
+ /* dir.c */
+ extern int ext3_check_dir_entry(const char *, struct inode *,
+- struct ext3_dir_entry_2 *, struct buffer_head *,
+- unsigned long);
++ struct ext3_dir_entry_2 *,
++ struct buffer_head *, unsigned long);
++extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
++ __u32 minor_hash,
++ struct ext3_dir_entry_2 *dirent);
++extern void ext3_htree_free_dir_info(struct dir_private_info *p);
++
+ /* fsync.c */
+ extern int ext3_sync_file (struct file *, struct dentry *, int);
+
++/* hash.c */
++extern int ext3fs_dirhash(const char *name, int len, struct
++ dx_hash_info *hinfo);
++
+ /* ialloc.c */
+ extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int);
+ extern void ext3_free_inode (handle_t *, struct inode *);
+@@ -655,6 +730,8 @@ extern int ext3_ioctl (struct inode *, s
+ /* namei.c */
+ extern int ext3_orphan_add(handle_t *, struct inode *);
+ extern int ext3_orphan_del(handle_t *, struct inode *);
++extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
++ __u32 start_minor_hash, __u32 *next_hash);
+
+ /* super.c */
+ extern void ext3_error (struct super_block *, const char *, const char *, ...)
+--- linux-2.4.22-ac1/include/linux/ext3_fs_sb.h~ext3-htree-2.4.22-rh 2003-09-25 14:16:34.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/linux/ext3_fs_sb.h 2003-09-25 14:55:12.000000000 +0400
+@@ -62,6 +62,8 @@ struct ext3_sb_info {
+ int s_inode_size;
+ int s_first_ino;
+ u32 s_next_generation;
++ u32 s_hash_seed[4];
++ int s_def_hash_version;
+
+ unsigned long s_dir_count;
+ u8 *s_debts;
+--- linux-2.4.22-ac1/include/linux/ext3_jbd.h~ext3-htree-2.4.22-rh 2003-06-13 18:51:38.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/linux/ext3_jbd.h 2003-09-25 14:55:12.000000000 +0400
+@@ -63,6 +63,8 @@ extern int ext3_writepage_trans_blocks(s
+
+ #define EXT3_RESERVE_TRANS_BLOCKS 12U
+
++#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8
++
+ int
+ ext3_mark_iloc_dirty(handle_t *handle,
+ struct inode *inode,
+--- linux-2.4.22-ac1/include/linux/rbtree.h~ext3-htree-2.4.22-rh 2001-11-22 22:46:18.000000000 +0300
++++ linux-2.4.22-ac1-alexey/include/linux/rbtree.h 2003-09-25 14:55:12.000000000 +0400
+@@ -120,6 +120,8 @@ rb_root_t;
+
+ extern void rb_insert_color(rb_node_t *, rb_root_t *);
+ extern void rb_erase(rb_node_t *, rb_root_t *);
++extern rb_node_t *rb_get_first(rb_root_t *root);
++extern rb_node_t *rb_get_next(rb_node_t *n);
+
+ static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link)
+ {
+--- linux-2.4.22-ac1/lib/rbtree.c~ext3-htree-2.4.22-rh 2002-08-03 04:39:46.000000000 +0400
++++ linux-2.4.22-ac1-alexey/lib/rbtree.c 2003-09-25 14:55:12.000000000 +0400
+@@ -17,6 +17,8 @@
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ linux/lib/rbtree.c
++
++ rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002
+ */
+
+ #include <linux/rbtree.h>
+@@ -294,3 +296,43 @@ void rb_erase(rb_node_t * node, rb_root_
+ __rb_erase_color(child, parent, root);
+ }
+ EXPORT_SYMBOL(rb_erase);
++
++/*
++ * This function returns the first node (in sort order) of the tree.
++ */
++rb_node_t *rb_get_first(rb_root_t *root)
++{
++ rb_node_t *n;
++
++ n = root->rb_node;
++ if (!n)
++ return 0;
++ while (n->rb_left)
++ n = n->rb_left;
++ return n;
++}
++EXPORT_SYMBOL(rb_get_first);
++
++/*
++ * Given a node, this function will return the next node in the tree.
++ */
++rb_node_t *rb_get_next(rb_node_t *n)
++{
++ rb_node_t *parent;
++
++ if (n->rb_right) {
++ n = n->rb_right;
++ while (n->rb_left)
++ n = n->rb_left;
++ return n;
++ } else {
++ while ((parent = n->rb_parent)) {
++ if (n == parent->rb_left)
++ return parent;
++ n = parent;
++ }
++ return 0;
++ }
++}
++EXPORT_SYMBOL(rb_get_next);
++
+
+_
--- /dev/null
+ fs/ext3/Makefile | 2
+ fs/ext3/dir.c | 302 +++++++++
+ fs/ext3/file.c | 3
+ fs/ext3/hash.c | 215 ++++++
+ fs/ext3/namei.c | 1420 ++++++++++++++++++++++++++++++++++++++++-----
+ fs/ext3/super.c | 7
+ include/linux/ext3_fs.h | 85 ++
+ include/linux/ext3_fs_sb.h | 2
+ include/linux/ext3_jbd.h | 2
+ include/linux/rbtree.h | 2
+ lib/rbtree.c | 42 +
+ 11 files changed, 1921 insertions(+), 161 deletions(-)
+
+--- linux-2.4.18-chaos-pdirops/fs/ext3/dir.c~ext3-htree 2001-11-10 01:25:04.000000000 +0300
++++ linux-2.4.18-chaos-pdirops-alexey/fs/ext3/dir.c 2003-09-23 12:12:04.000000000 +0400
+@@ -21,12 +21,16 @@
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
++#include <linux/slab.h>
++#include <linux/rbtree.h>
+
+ static unsigned char ext3_filetype_table[] = {
+ DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+ };
+
+ static int ext3_readdir(struct file *, void *, filldir_t);
++static int ext3_dx_readdir(struct file * filp,
++ void * dirent, filldir_t filldir);
+
+ struct file_operations ext3_dir_operations = {
+ read: generic_read_dir,
+@@ -35,6 +39,17 @@ struct file_operations ext3_dir_operatio
+ fsync: ext3_sync_file, /* BKL held */
+ };
+
++
++static unsigned char get_dtype(struct super_block *sb, int filetype)
++{
++ if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_FILETYPE) ||
++ (filetype >= EXT3_FT_MAX))
++ return DT_UNKNOWN;
++
++ return (ext3_filetype_table[filetype]);
++}
++
++
+ int ext3_check_dir_entry (const char * function, struct inode * dir,
+ struct ext3_dir_entry_2 * de,
+ struct buffer_head * bh,
+@@ -79,6 +94,16 @@ static int ext3_readdir(struct file * fi
+
+ sb = inode->i_sb;
+
++ if (is_dx(inode)) {
++ err = ext3_dx_readdir(filp, dirent, filldir);
++ if (err != ERR_BAD_DX_DIR)
++ return err;
++ /*
++ * We don't set the inode dirty flag since it's not
++ * critical that it get flushed back to the disk.
++ */
++ EXT3_I(filp->f_dentry->d_inode)->i_flags &= ~EXT3_INDEX_FL;
++ }
+ stored = 0;
+ bh = NULL;
+ offset = filp->f_pos & (sb->s_blocksize - 1);
+@@ -162,18 +187,12 @@ revalidate:
+ * during the copy operation.
+ */
+ unsigned long version = filp->f_version;
+- unsigned char d_type = DT_UNKNOWN;
+
+- if (EXT3_HAS_INCOMPAT_FEATURE(sb,
+- EXT3_FEATURE_INCOMPAT_FILETYPE)
+- && de->file_type < EXT3_FT_MAX)
+- d_type =
+- ext3_filetype_table[de->file_type];
+ error = filldir(dirent, de->name,
+ de->name_len,
+ filp->f_pos,
+ le32_to_cpu(de->inode),
+- d_type);
++ get_dtype(sb, de->file_type));
+ if (error)
+ break;
+ if (version != filp->f_version)
+@@ -188,3 +207,272 @@ revalidate:
+ UPDATE_ATIME(inode);
+ return 0;
+ }
++
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * These functions convert from the major/minor hash to an f_pos
++ * value.
++ *
++ * Currently we only use major hash number.  This is unfortunate, but
++ * on 32-bit machines, the same VFS interface is used for lseek and
++ * llseek, so if we use the 64 bit offset, then the 32-bit versions of
++ * lseek/telldir/seekdir will blow out spectacularly, and from within
++ * the ext2 low-level routine, we don't know if we're being called by
++ * a 64-bit version of the system call or the 32-bit version of the
++ * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
++ * cookie. Sigh.
++ */
++#define hash2pos(major, minor) (major >> 1)
++#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
++#define pos2min_hash(pos) (0)
++
++/*
++ * This structure holds the nodes of the red-black tree used to store
++ * the directory entry in hash order.
++ */
++struct fname {
++ __u32 hash;
++ __u32 minor_hash;
++ rb_node_t rb_hash;
++ struct fname *next;
++ __u32 inode;
++ __u8 name_len;
++ __u8 file_type;
++ char name[0];
++};
++
++/*
++ * This function implements a non-recursive way of freeing all of the
++ * nodes in the red-black tree.
++ */
++static void free_rb_tree_fname(rb_root_t *root)
++{
++ rb_node_t *n = root->rb_node;
++ rb_node_t *parent;
++ struct fname *fname;
++
++ while (n) {
++ /* Do the node's children first */
++ if ((n)->rb_left) {
++ n = n->rb_left;
++ continue;
++ }
++ if (n->rb_right) {
++ n = n->rb_right;
++ continue;
++ }
++ /*
++ * The node has no children; free it, and then zero
++ * out parent's link to it. Finally go to the
++ * beginning of the loop and try to free the parent
++ * node.
++ */
++ parent = n->rb_parent;
++ fname = rb_entry(n, struct fname, rb_hash);
++ kfree(fname);
++ if (!parent)
++ root->rb_node = 0;
++ else if (parent->rb_left == n)
++ parent->rb_left = 0;
++ else if (parent->rb_right == n)
++ parent->rb_right = 0;
++ n = parent;
++ }
++ root->rb_node = 0;
++}
++
++
++struct dir_private_info *create_dir_info(loff_t pos)
++{
++ struct dir_private_info *p;
++
++ p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
++ if (!p)
++ return NULL;
++ p->root.rb_node = 0;
++ p->curr_node = 0;
++ p->extra_fname = 0;
++ p->last_pos = 0;
++ p->curr_hash = pos2maj_hash(pos);
++ p->curr_minor_hash = pos2min_hash(pos);
++ p->next_hash = 0;
++ return p;
++}
++
++void ext3_htree_free_dir_info(struct dir_private_info *p)
++{
++ free_rb_tree_fname(&p->root);
++ kfree(p);
++}
++
++/*
++ * Given a directory entry, enter it into the fname rb tree.
++ */
++int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
++ __u32 minor_hash,
++ struct ext3_dir_entry_2 *dirent)
++{
++ rb_node_t **p, *parent = NULL;
++ struct fname * fname, *new_fn;
++ struct dir_private_info *info;
++ int len;
++
++ info = (struct dir_private_info *) dir_file->private_data;
++ p = &info->root.rb_node;
++
++ /* Create and allocate the fname structure */
++ len = sizeof(struct fname) + dirent->name_len + 1;
++ new_fn = kmalloc(len, GFP_KERNEL);
++ if (!new_fn)
++ return -ENOMEM;
++ memset(new_fn, 0, len);
++ new_fn->hash = hash;
++ new_fn->minor_hash = minor_hash;
++ new_fn->inode = le32_to_cpu(dirent->inode);
++ new_fn->name_len = dirent->name_len;
++ new_fn->file_type = dirent->file_type;
++ memcpy(new_fn->name, dirent->name, dirent->name_len);
++ new_fn->name[dirent->name_len] = 0;
++
++ while (*p) {
++ parent = *p;
++ fname = rb_entry(parent, struct fname, rb_hash);
++
++ /*
++ * If the hash and minor hash match up, then we put
++ * them on a linked list. This rarely happens...
++ */
++ if ((new_fn->hash == fname->hash) &&
++ (new_fn->minor_hash == fname->minor_hash)) {
++ new_fn->next = fname->next;
++ fname->next = new_fn;
++ return 0;
++ }
++
++ if (new_fn->hash < fname->hash)
++ p = &(*p)->rb_left;
++ else if (new_fn->hash > fname->hash)
++ p = &(*p)->rb_right;
++ else if (new_fn->minor_hash < fname->minor_hash)
++ p = &(*p)->rb_left;
++ else /* if (new_fn->minor_hash > fname->minor_hash) */
++ p = &(*p)->rb_right;
++ }
++
++ rb_link_node(&new_fn->rb_hash, parent, p);
++ rb_insert_color(&new_fn->rb_hash, &info->root);
++ return 0;
++}
++
++
++
++/*
++ * This is a helper function for ext3_dx_readdir. It calls filldir
++ * for all entries on the fname linked list.  (Normally there is only
++ * one entry on the linked list, unless there are 62 bit hash collisions.)
++ */
++static int call_filldir(struct file * filp, void * dirent,
++ filldir_t filldir, struct fname *fname)
++{
++ struct dir_private_info *info = filp->private_data;
++ loff_t curr_pos;
++ struct inode *inode = filp->f_dentry->d_inode;
++ struct super_block * sb;
++ int error;
++
++ sb = inode->i_sb;
++
++ if (!fname) {
++ printk("call_filldir: called with null fname?!?\n");
++ return 0;
++ }
++ curr_pos = hash2pos(fname->hash, fname->minor_hash);
++ while (fname) {
++ error = filldir(dirent, fname->name,
++ fname->name_len, curr_pos,
++ fname->inode,
++ get_dtype(sb, fname->file_type));
++ if (error) {
++ filp->f_pos = curr_pos;
++ info->extra_fname = fname->next;
++ return error;
++ }
++ fname = fname->next;
++ }
++ return 0;
++}
++
++static int ext3_dx_readdir(struct file * filp,
++ void * dirent, filldir_t filldir)
++{
++ struct dir_private_info *info = filp->private_data;
++ struct inode *inode = filp->f_dentry->d_inode;
++ struct fname *fname;
++ int ret;
++
++ if (!info) {
++ info = create_dir_info(filp->f_pos);
++ if (!info)
++ return -ENOMEM;
++ filp->private_data = info;
++ }
++
++	/* Someone has messed with f_pos; reset the world */
++ if (info->last_pos != filp->f_pos) {
++ free_rb_tree_fname(&info->root);
++ info->curr_node = 0;
++ info->extra_fname = 0;
++ info->curr_hash = pos2maj_hash(filp->f_pos);
++ info->curr_minor_hash = pos2min_hash(filp->f_pos);
++ }
++
++ /*
++ * If there are any leftover names on the hash collision
++ * chain, return them first.
++ */
++ if (info->extra_fname &&
++ call_filldir(filp, dirent, filldir, info->extra_fname))
++ goto finished;
++
++ if (!info->curr_node)
++ info->curr_node = rb_get_first(&info->root);
++
++ while (1) {
++ /*
++ * Fill the rbtree if we have no more entries,
++ * or the inode has changed since we last read in the
++ * cached entries.
++ */
++ if ((!info->curr_node) ||
++ (filp->f_version != inode->i_version)) {
++ info->curr_node = 0;
++ free_rb_tree_fname(&info->root);
++ filp->f_version = inode->i_version;
++ ret = ext3_htree_fill_tree(filp, info->curr_hash,
++ info->curr_minor_hash,
++ &info->next_hash);
++ if (ret < 0)
++ return ret;
++ if (ret == 0)
++ break;
++ info->curr_node = rb_get_first(&info->root);
++ }
++
++ fname = rb_entry(info->curr_node, struct fname, rb_hash);
++ info->curr_hash = fname->hash;
++ info->curr_minor_hash = fname->minor_hash;
++ if (call_filldir(filp, dirent, filldir, fname))
++ break;
++
++ info->curr_node = rb_get_next(info->curr_node);
++ if (!info->curr_node) {
++ info->curr_hash = info->next_hash;
++ info->curr_minor_hash = 0;
++ }
++ }
++finished:
++ info->last_pos = filp->f_pos;
++ UPDATE_ATIME(inode);
++ return 0;
++}
++#endif
+--- linux-2.4.18-chaos-pdirops/fs/ext3/file.c~ext3-htree 2003-07-28 17:52:04.000000000 +0400
++++ linux-2.4.18-chaos-pdirops-alexey/fs/ext3/file.c 2003-09-23 12:11:54.000000000 +0400
+@@ -35,6 +35,9 @@ static int ext3_release_file (struct ino
+ {
+ if (filp->f_mode & FMODE_WRITE)
+ ext3_discard_prealloc (inode);
++ if (is_dx(inode) && filp->private_data)
++ ext3_htree_free_dir_info(filp->private_data);
++
+ return 0;
+ }
+
+--- /dev/null 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.18-chaos-pdirops-alexey/fs/ext3/hash.c 2003-09-23 12:11:54.000000000 +0400
+@@ -0,0 +1,215 @@
++/*
++ * linux/fs/ext3/hash.c
++ *
++ * Copyright (C) 2002 by Theodore Ts'o
++ *
++ * This file is released under the GPL v2.
++ *
++ * This file may be redistributed under the terms of the GNU Public
++ * License.
++ */
++
++#include <linux/fs.h>
++#include <linux/jbd.h>
++#include <linux/sched.h>
++#include <linux/ext3_fs.h>
++
++#define DELTA 0x9E3779B9
++
++static void TEA_transform(__u32 buf[4], __u32 const in[])
++{
++ __u32 sum = 0;
++ __u32 b0 = buf[0], b1 = buf[1];
++ __u32 a = in[0], b = in[1], c = in[2], d = in[3];
++ int n = 16;
++
++ do {
++ sum += DELTA;
++ b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b);
++ b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d);
++ } while(--n);
++
++ buf[0] += b0;
++ buf[1] += b1;
++}
++
++/* F, G and H are basic MD4 functions: selection, majority, parity */
++#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
++#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
++#define H(x, y, z) ((x) ^ (y) ^ (z))
++
++/*
++ * The generic round function. The application is so specific that
++ * we don't bother protecting all the arguments with parens, as is generally
++ * good macro practice, in favor of extra legibility.
++ * Rotation is separate from addition to prevent recomputation
++ */
++#define ROUND(f, a, b, c, d, x, s) \
++ (a += f(b, c, d) + x, a = (a << s) | (a >> (32-s)))
++#define K1 0
++#define K2 013240474631UL
++#define K3 015666365641UL
++
++/*
++ * Basic cut-down MD4 transform. Returns only 32 bits of result.
++ */
++static void halfMD4Transform (__u32 buf[4], __u32 const in[])
++{
++ __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3];
++
++ /* Round 1 */
++ ROUND(F, a, b, c, d, in[0] + K1, 3);
++ ROUND(F, d, a, b, c, in[1] + K1, 7);
++ ROUND(F, c, d, a, b, in[2] + K1, 11);
++ ROUND(F, b, c, d, a, in[3] + K1, 19);
++ ROUND(F, a, b, c, d, in[4] + K1, 3);
++ ROUND(F, d, a, b, c, in[5] + K1, 7);
++ ROUND(F, c, d, a, b, in[6] + K1, 11);
++ ROUND(F, b, c, d, a, in[7] + K1, 19);
++
++ /* Round 2 */
++ ROUND(G, a, b, c, d, in[1] + K2, 3);
++ ROUND(G, d, a, b, c, in[3] + K2, 5);
++ ROUND(G, c, d, a, b, in[5] + K2, 9);
++ ROUND(G, b, c, d, a, in[7] + K2, 13);
++ ROUND(G, a, b, c, d, in[0] + K2, 3);
++ ROUND(G, d, a, b, c, in[2] + K2, 5);
++ ROUND(G, c, d, a, b, in[4] + K2, 9);
++ ROUND(G, b, c, d, a, in[6] + K2, 13);
++
++ /* Round 3 */
++ ROUND(H, a, b, c, d, in[3] + K3, 3);
++ ROUND(H, d, a, b, c, in[7] + K3, 9);
++ ROUND(H, c, d, a, b, in[2] + K3, 11);
++ ROUND(H, b, c, d, a, in[6] + K3, 15);
++ ROUND(H, a, b, c, d, in[1] + K3, 3);
++ ROUND(H, d, a, b, c, in[5] + K3, 9);
++ ROUND(H, c, d, a, b, in[0] + K3, 11);
++ ROUND(H, b, c, d, a, in[4] + K3, 15);
++
++ buf[0] += a;
++ buf[1] += b;
++ buf[2] += c;
++ buf[3] += d;
++}
++
++#undef ROUND
++#undef F
++#undef G
++#undef H
++#undef K1
++#undef K2
++#undef K3
++
++/* The old legacy hash */
++static __u32 dx_hack_hash (const char *name, int len)
++{
++ __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9;
++ while (len--) {
++ __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373));
++
++ if (hash & 0x80000000) hash -= 0x7fffffff;
++ hash1 = hash0;
++ hash0 = hash;
++ }
++ return (hash0 << 1);
++}
++
++static void str2hashbuf(const char *msg, int len, __u32 *buf, int num)
++{
++ __u32 pad, val;
++ int i;
++
++ pad = (__u32)len | ((__u32)len << 8);
++ pad |= pad << 16;
++
++ val = pad;
++ if (len > num*4)
++ len = num * 4;
++ for (i=0; i < len; i++) {
++ if ((i % 4) == 0)
++ val = pad;
++ val = msg[i] + (val << 8);
++ if ((i % 4) == 3) {
++ *buf++ = val;
++ val = pad;
++ num--;
++ }
++ }
++ if (--num >= 0)
++ *buf++ = val;
++ while (--num >= 0)
++ *buf++ = pad;
++}
++
++/*
++ * Returns the hash of a filename. If len is 0 and name is NULL, then
++ * this function can be used to test whether or not a hash version is
++ * supported.
++ *
++ * The seed is an 4 longword (32 bits) "secret" which can be used to
++ * uniquify a hash. If the seed is all zero's, then some default seed
++ * may be used.
++ *
++ * A particular hash version specifies whether or not the seed is
++ * represented, and whether or not the returned hash is 32 bits or 64
++ * bits. 32 bit hashes will return 0 for the minor hash.
++ */
++int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo)
++{
++ __u32 hash;
++ __u32 minor_hash = 0;
++ const char *p;
++ int i;
++ __u32 in[8], buf[4];
++
++ /* Initialize the default seed for the hash checksum functions */
++ buf[0] = 0x67452301;
++ buf[1] = 0xefcdab89;
++ buf[2] = 0x98badcfe;
++ buf[3] = 0x10325476;
++
++ /* Check to see if the seed is all zero's */
++ if (hinfo->seed) {
++ for (i=0; i < 4; i++) {
++ if (hinfo->seed[i])
++ break;
++ }
++ if (i < 4)
++ memcpy(buf, hinfo->seed, sizeof(buf));
++ }
++
++ switch (hinfo->hash_version) {
++ case DX_HASH_LEGACY:
++ hash = dx_hack_hash(name, len);
++ break;
++ case DX_HASH_HALF_MD4:
++ p = name;
++ while (len > 0) {
++ str2hashbuf(p, len, in, 8);
++ halfMD4Transform(buf, in);
++ len -= 32;
++ p += 32;
++ }
++ minor_hash = buf[2];
++ hash = buf[1];
++ break;
++ case DX_HASH_TEA:
++ p = name;
++ while (len > 0) {
++ str2hashbuf(p, len, in, 4);
++ TEA_transform(buf, in);
++ len -= 16;
++ p += 16;
++ }
++ hash = buf[0];
++ minor_hash = buf[1];
++ break;
++ default:
++ hinfo->hash = 0;
++ return -1;
++ }
++ hinfo->hash = hash & ~1;
++ hinfo->minor_hash = minor_hash;
++ return 0;
++}
+--- linux-2.4.18-chaos-pdirops/fs/ext3/Makefile~ext3-htree 2003-09-23 11:52:24.000000000 +0400
++++ linux-2.4.18-chaos-pdirops-alexey/fs/ext3/Makefile 2003-09-23 12:11:54.000000000 +0400
+@@ -12,7 +12,7 @@ O_TARGET := ext3.o
+ export-objs := super.o inode.o
+
+ obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+- ioctl.o namei.o super.o symlink.o
++ ioctl.o namei.o super.o symlink.o hash.o
+ obj-m := $(O_TARGET)
+
+ include $(TOPDIR)/Rules.make
+--- linux-2.4.18-chaos-pdirops/fs/ext3/namei.c~ext3-htree 2003-09-23 11:52:34.000000000 +0400
++++ linux-2.4.18-chaos-pdirops-alexey/fs/ext3/namei.c 2003-09-23 12:12:09.000000000 +0400
+@@ -16,6 +16,12 @@
+ * David S. Miller (davem@caip.rutgers.edu), 1995
+ * Directory entry file type support and forward compatibility hooks
+ * for B-tree directories by Theodore Ts'o (tytso@mit.edu), 1998
++ * Hash Tree Directory indexing (c)
++ * Daniel Phillips, 2001
++ * Hash Tree Directory indexing porting
++ * Christopher Li, 2002
++ * Hash Tree Directory indexing cleanup
++ * Theodore Ts'o, 2002
+ */
+
+ #include <linux/fs.h>
+@@ -38,6 +44,642 @@
+ #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+ #define NAMEI_RA_INDEX(c,b) (((c) * NAMEI_RA_BLOCKS) + (b))
+
++static struct buffer_head *ext3_append(handle_t *handle,
++ struct inode *inode,
++ u32 *block, int *err)
++{
++ struct buffer_head *bh;
++
++ *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
++
++ if ((bh = ext3_bread(handle, inode, *block, 1, err))) {
++ inode->i_size += inode->i_sb->s_blocksize;
++ EXT3_I(inode)->i_disksize = inode->i_size;
++ ext3_journal_get_write_access(handle,bh);
++ }
++ return bh;
++}
++
++#ifndef assert
++#define assert(test) J_ASSERT(test)
++#endif
++
++#ifndef swap
++#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0)
++#endif
++
++typedef struct { u32 v; } le_u32;
++typedef struct { u16 v; } le_u16;
++
++#ifdef DX_DEBUG
++#define dxtrace(command) command
++#else
++#define dxtrace(command)
++#endif
++
++struct fake_dirent
++{
++ /*le*/u32 inode;
++ /*le*/u16 rec_len;
++ u8 name_len;
++ u8 file_type;
++};
++
++struct dx_countlimit
++{
++ le_u16 limit;
++ le_u16 count;
++};
++
++struct dx_entry
++{
++ le_u32 hash;
++ le_u32 block;
++};
++
++/*
++ * dx_root_info is laid out so that if it should somehow get overlaid by a
++ * dirent the two low bits of the hash version will be zero. Therefore, the
++ * hash version mod 4 should never be 0. Sincerely, the paranoia department.
++ */
++
++struct dx_root
++{
++ struct fake_dirent dot;
++ char dot_name[4];
++ struct fake_dirent dotdot;
++ char dotdot_name[4];
++ struct dx_root_info
++ {
++ le_u32 reserved_zero;
++ u8 hash_version;
++ u8 info_length; /* 8 */
++ u8 indirect_levels;
++ u8 unused_flags;
++ }
++ info;
++ struct dx_entry entries[0];
++};
++
++struct dx_node
++{
++ struct fake_dirent fake;
++ struct dx_entry entries[0];
++};
++
++
++struct dx_frame
++{
++ struct buffer_head *bh;
++ struct dx_entry *entries;
++ struct dx_entry *at;
++};
++
++struct dx_map_entry
++{
++ u32 hash;
++ u32 offs;
++};
++
++#ifdef CONFIG_EXT3_INDEX
++static inline unsigned dx_get_block (struct dx_entry *entry);
++static void dx_set_block (struct dx_entry *entry, unsigned value);
++static inline unsigned dx_get_hash (struct dx_entry *entry);
++static void dx_set_hash (struct dx_entry *entry, unsigned value);
++static unsigned dx_get_count (struct dx_entry *entries);
++static unsigned dx_get_limit (struct dx_entry *entries);
++static void dx_set_count (struct dx_entry *entries, unsigned value);
++static void dx_set_limit (struct dx_entry *entries, unsigned value);
++static unsigned dx_root_limit (struct inode *dir, unsigned infosize);
++static unsigned dx_node_limit (struct inode *dir);
++static struct dx_frame *dx_probe(struct dentry *dentry,
++ struct inode *dir,
++ struct dx_hash_info *hinfo,
++ struct dx_frame *frame,
++ int *err);
++static void dx_release (struct dx_frame *frames);
++static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
++ struct dx_hash_info *hinfo, struct dx_map_entry map[]);
++static void dx_sort_map(struct dx_map_entry *map, unsigned count);
++static struct ext3_dir_entry_2 *dx_move_dirents (char *from, char *to,
++ struct dx_map_entry *offsets, int count);
++static struct ext3_dir_entry_2* dx_pack_dirents (char *base, int size);
++static void dx_insert_block (struct dx_frame *frame, u32 hash, u32 block);
++static int ext3_htree_next_block(struct inode *dir, __u32 hash,
++ struct dx_frame *frame,
++ struct dx_frame *frames, int *err,
++ __u32 *start_hash);
++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
++ struct ext3_dir_entry_2 **res_dir, int *err);
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode);
++
++/*
++ * Future: use high four bits of block for coalesce-on-delete flags
++ * Mask them off for now.
++ */
++
++static inline unsigned dx_get_block (struct dx_entry *entry)
++{
++ return le32_to_cpu(entry->block.v) & 0x00ffffff;
++}
++
++static inline void dx_set_block (struct dx_entry *entry, unsigned value)
++{
++ entry->block.v = cpu_to_le32(value);
++}
++
++static inline unsigned dx_get_hash (struct dx_entry *entry)
++{
++ return le32_to_cpu(entry->hash.v);
++}
++
++static inline void dx_set_hash (struct dx_entry *entry, unsigned value)
++{
++ entry->hash.v = cpu_to_le32(value);
++}
++
++static inline unsigned dx_get_count (struct dx_entry *entries)
++{
++ return le16_to_cpu(((struct dx_countlimit *) entries)->count.v);
++}
++
++static inline unsigned dx_get_limit (struct dx_entry *entries)
++{
++ return le16_to_cpu(((struct dx_countlimit *) entries)->limit.v);
++}
++
++static inline void dx_set_count (struct dx_entry *entries, unsigned value)
++{
++ ((struct dx_countlimit *) entries)->count.v = cpu_to_le16(value);
++}
++
++static inline void dx_set_limit (struct dx_entry *entries, unsigned value)
++{
++ ((struct dx_countlimit *) entries)->limit.v = cpu_to_le16(value);
++}
++
++static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
++{
++ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(1) -
++ EXT3_DIR_REC_LEN(2) - infosize;
++ return 0? 20: entry_space / sizeof(struct dx_entry);
++}
++
++static inline unsigned dx_node_limit (struct inode *dir)
++{
++ unsigned entry_space = dir->i_sb->s_blocksize - EXT3_DIR_REC_LEN(0);
++ return 0? 22: entry_space / sizeof(struct dx_entry);
++}
++
++/*
++ * Debug
++ */
++#ifdef DX_DEBUG
++struct stats
++{
++ unsigned names;
++ unsigned space;
++ unsigned bcount;
++};
++
++static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext3_dir_entry_2 *de,
++ int size, int show_names)
++{
++ unsigned names = 0, space = 0;
++ char *base = (char *) de;
++ struct dx_hash_info h = *hinfo;
++
++ printk("names: ");
++ while ((char *) de < base + size)
++ {
++ if (de->inode)
++ {
++ if (show_names)
++ {
++ int len = de->name_len;
++ char *name = de->name;
++ while (len--) printk("%c", *name++);
++ ext3fs_dirhash(de->name, de->name_len, &h);
++ printk(":%x.%u ", h.hash,
++ ((char *) de - base));
++ }
++ space += EXT3_DIR_REC_LEN(de->name_len);
++ names++;
++ }
++ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++ }
++ printk("(%i)\n", names);
++ return (struct stats) { names, space, 1 };
++}
++
++struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
++ struct dx_entry *entries, int levels)
++{
++ unsigned blocksize = dir->i_sb->s_blocksize;
++ unsigned count = dx_get_count (entries), names = 0, space = 0, i;
++ unsigned bcount = 0;
++ struct buffer_head *bh;
++ int err;
++ printk("%i indexed blocks...\n", count);
++ for (i = 0; i < count; i++, entries++)
++ {
++ u32 block = dx_get_block(entries), hash = i? dx_get_hash(entries): 0;
++ u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash;
++ struct stats stats;
++ printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range);
++ if (!(bh = ext3_bread (NULL,dir, block, 0,&err))) continue;
++ stats = levels?
++ dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1):
++ dx_show_leaf(hinfo, (struct ext3_dir_entry_2 *) bh->b_data, blocksize, 0);
++ names += stats.names;
++ space += stats.space;
++ bcount += stats.bcount;
++ brelse (bh);
++ }
++ if (bcount)
++ printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ",
++ names, space/bcount,(space/bcount)*100/blocksize);
++ return (struct stats) { names, space, bcount};
++}
++#endif /* DX_DEBUG */
++
++/*
++ * Probe for a directory leaf block to search.
++ *
++ * dx_probe can return ERR_BAD_DX_DIR, which means there was a format
++ * error in the directory index, and the caller should fall back to
++ * searching the directory normally. The callers of dx_probe **MUST**
++ * check for this error code, and make sure it never gets reflected
++ * back to userspace.
++ */
++static struct dx_frame *
++dx_probe(struct dentry *dentry, struct inode *dir,
++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err)
++{
++ unsigned count, indirect;
++ struct dx_entry *at, *entries, *p, *q, *m;
++ struct dx_root *root;
++ struct buffer_head *bh;
++ struct dx_frame *frame = frame_in;
++ u32 hash;
++
++ frame->bh = NULL;
++ if (dentry)
++ dir = dentry->d_parent->d_inode;
++ if (!(bh = ext3_bread (NULL,dir, 0, 0, err)))
++ goto fail;
++ root = (struct dx_root *) bh->b_data;
++ if (root->info.hash_version != DX_HASH_TEA &&
++ root->info.hash_version != DX_HASH_HALF_MD4 &&
++ root->info.hash_version != DX_HASH_LEGACY) {
++ ext3_warning(dir->i_sb, __FUNCTION__,
++ "Unrecognised inode hash code %d",
++ root->info.hash_version);
++ brelse(bh);
++ *err = ERR_BAD_DX_DIR;
++ goto fail;
++ }
++ hinfo->hash_version = root->info.hash_version;
++ hinfo->seed = dir->i_sb->u.ext3_sb.s_hash_seed;
++ if (dentry)
++ ext3fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo);
++ hash = hinfo->hash;
++
++ if (root->info.unused_flags & 1) {
++ ext3_warning(dir->i_sb, __FUNCTION__,
++ "Unimplemented inode hash flags: %#06x",
++ root->info.unused_flags);
++ brelse(bh);
++ *err = ERR_BAD_DX_DIR;
++ goto fail;
++ }
++
++ if ((indirect = root->info.indirect_levels) > 1) {
++ ext3_warning(dir->i_sb, __FUNCTION__,
++ "Unimplemented inode hash depth: %#06x",
++ root->info.indirect_levels);
++ brelse(bh);
++ *err = ERR_BAD_DX_DIR;
++ goto fail;
++ }
++
++ entries = (struct dx_entry *) (((char *)&root->info) +
++ root->info.info_length);
++ assert(dx_get_limit(entries) == dx_root_limit(dir,
++ root->info.info_length));
++ dxtrace (printk("Look up %x", hash));
++ while (1)
++ {
++ count = dx_get_count(entries);
++ assert (count && count <= dx_get_limit(entries));
++ p = entries + 1;
++ q = entries + count - 1;
++ while (p <= q)
++ {
++ m = p + (q - p)/2;
++ dxtrace(printk("."));
++ if (dx_get_hash(m) > hash)
++ q = m - 1;
++ else
++ p = m + 1;
++ }
++
++ if (0) // linear search cross check
++ {
++ unsigned n = count - 1;
++ at = entries;
++ while (n--)
++ {
++ dxtrace(printk(","));
++ if (dx_get_hash(++at) > hash)
++ {
++ at--;
++ break;
++ }
++ }
++ assert (at == p - 1);
++ }
++
++ at = p - 1;
++ dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at)));
++ frame->bh = bh;
++ frame->entries = entries;
++ frame->at = at;
++ if (!indirect--) return frame;
++ if (!(bh = ext3_bread (NULL,dir, dx_get_block(at), 0, err)))
++ goto fail2;
++ at = entries = ((struct dx_node *) bh->b_data)->entries;
++ assert (dx_get_limit(entries) == dx_node_limit (dir));
++ frame++;
++ }
++fail2:
++ while (frame >= frame_in) {
++ brelse(frame->bh);
++ frame--;
++ }
++fail:
++ return NULL;
++}
++
++static void dx_release (struct dx_frame *frames)
++{
++ if (frames[0].bh == NULL)
++ return;
++
++ if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels)
++ brelse(frames[1].bh);
++ brelse(frames[0].bh);
++}
++
++/*
++ * This function increments the frame pointer to search the next leaf
++ * block, and reads in the necessary intervening nodes if the search
++ * should be necessary. Whether or not the search is necessary is
++ * controlled by the hash parameter. If the hash value is even, then
++ * the search is only continued if the next block starts with that
++ * hash value. This is used if we are searching for a specific file.
++ *
++ * If the hash value is HASH_NB_ALWAYS, then always go to the next block.
++ *
++ * This function returns 1 if the caller should continue to search,
++ * or 0 if it should not. If there is an error reading one of the
++ * index blocks, it will return -1.
++ *
++ * If start_hash is non-null, it will be filled in with the starting
++ * hash of the next page.
++ */
++static int ext3_htree_next_block(struct inode *dir, __u32 hash,
++ struct dx_frame *frame,
++ struct dx_frame *frames, int *err,
++ __u32 *start_hash)
++{
++ struct dx_frame *p;
++ struct buffer_head *bh;
++ int num_frames = 0;
++ __u32 bhash;
++
++ *err = ENOENT;
++ p = frame;
++ /*
++ * Find the next leaf page by incrementing the frame pointer.
++ * If we run out of entries in the interior node, loop around and
++ * increment pointer in the parent node. When we break out of
++ * this loop, num_frames indicates the number of interior
++ * nodes that need to be read.
++ */
++ while (1) {
++ if (++(p->at) < p->entries + dx_get_count(p->entries))
++ break;
++ if (p == frames)
++ return 0;
++ num_frames++;
++ p--;
++ }
++
++ /*
++ * If the hash is 1, then continue only if the next page has a
++ * continuation hash of any value. This is used for readdir
++ * handling. Otherwise, check to see if the hash matches the
++ * desired continuation hash. If it doesn't, return since
++ * there's no point to read in the successive index pages.
++ */
++ bhash = dx_get_hash(p->at);
++ if (start_hash)
++ *start_hash = bhash;
++ if ((hash & 1) == 0) {
++ if ((bhash & ~1) != hash)
++ return 0;
++ }
++ /*
++ * If the hash is HASH_NB_ALWAYS, we always go to the next
++ * block so no check is necessary
++ */
++ while (num_frames--) {
++ if (!(bh = ext3_bread(NULL, dir, dx_get_block(p->at),
++ 0, err)))
++ return -1; /* Failure */
++ p++;
++ brelse (p->bh);
++ p->bh = bh;
++ p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
++ }
++ return 1;
++}
++
++
++/*
++ * p is at least 6 bytes before the end of page
++ */
++static inline struct ext3_dir_entry_2 *ext3_next_entry(struct ext3_dir_entry_2 *p)
++{
++ return (struct ext3_dir_entry_2 *)((char*)p + le16_to_cpu(p->rec_len));
++}
++
++/*
++ * This function fills a red-black tree with information from a
++ * directory. We start scanning the directory in hash order, starting
++ * at start_hash and start_minor_hash.
++ *
++ * This function returns the number of entries inserted into the tree,
++ * or a negative error code.
++ */
++int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
++ __u32 start_minor_hash, __u32 *next_hash)
++{
++ struct dx_hash_info hinfo;
++ struct buffer_head *bh;
++ struct ext3_dir_entry_2 *de, *top;
++ static struct dx_frame frames[2], *frame;
++ struct inode *dir;
++ int block, err;
++ int count = 0;
++ int ret;
++ __u32 hashval;
++
++ dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash,
++ start_minor_hash));
++ dir = dir_file->f_dentry->d_inode;
++ hinfo.hash = start_hash;
++ hinfo.minor_hash = 0;
++ frame = dx_probe(0, dir_file->f_dentry->d_inode, &hinfo, frames, &err);
++ if (!frame)
++ return err;
++
++ /* Add '.' and '..' from the htree header */
++ if (!start_hash && !start_minor_hash) {
++ de = (struct ext3_dir_entry_2 *) frames[0].bh->b_data;
++ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
++ goto errout;
++ de = ext3_next_entry(de);
++ if ((err = ext3_htree_store_dirent(dir_file, 0, 0, de)) != 0)
++ goto errout;
++ count += 2;
++ }
++
++ while (1) {
++ block = dx_get_block(frame->at);
++ dxtrace(printk("Reading block %d\n", block));
++ if (!(bh = ext3_bread (NULL, dir, block, 0, &err)))
++ goto errout;
++
++ de = (struct ext3_dir_entry_2 *) bh->b_data;
++ top = (struct ext3_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize -
++ EXT3_DIR_REC_LEN(0));
++ for (; de < top; de = ext3_next_entry(de)) {
++ ext3fs_dirhash(de->name, de->name_len, &hinfo);
++ if ((hinfo.hash < start_hash) ||
++ ((hinfo.hash == start_hash) &&
++ (hinfo.minor_hash < start_minor_hash)))
++ continue;
++ if ((err = ext3_htree_store_dirent(dir_file,
++ hinfo.hash, hinfo.minor_hash, de)) != 0)
++ goto errout;
++ count++;
++ }
++ brelse (bh);
++ hashval = ~1;
++ ret = ext3_htree_next_block(dir, HASH_NB_ALWAYS,
++ frame, frames, &err, &hashval);
++ if (next_hash)
++ *next_hash = hashval;
++ if (ret == -1)
++ goto errout;
++ /*
++ * Stop if: (a) there are no more entries, or
++ * (b) we have inserted at least one entry and the
++ * next hash value is not a continuation
++ */
++ if ((ret == 0) ||
++ (count && ((hashval & 1) == 0)))
++ break;
++ }
++ dx_release(frames);
++ dxtrace(printk("Fill tree: returned %d entries\n", count));
++ return count;
++errout:
++ dx_release(frames);
++ return (err);
++}
++
++
++/*
++ * Directory block splitting, compacting
++ */
++
++static int dx_make_map (struct ext3_dir_entry_2 *de, int size,
++ struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
++{
++ int count = 0;
++ char *base = (char *) de;
++ struct dx_hash_info h = *hinfo;
++
++ while ((char *) de < base + size)
++ {
++ if (de->name_len && de->inode) {
++ ext3fs_dirhash(de->name, de->name_len, &h);
++ map_tail--;
++ map_tail->hash = h.hash;
++ map_tail->offs = (u32) ((char *) de - base);
++ count++;
++ }
++ /* XXX: do we need to check rec_len == 0 case? -Chris */
++ de = (struct ext3_dir_entry_2 *) ((char *) de + le16_to_cpu(de->rec_len));
++ }
++ return count;
++}
++
++static void dx_sort_map (struct dx_map_entry *map, unsigned count)
++{
++ struct dx_map_entry *p, *q, *top = map + count - 1;
++ int more;
++ /* Combsort until bubble sort doesn't suck */
++ while (count > 2)
++ {
++ count = count*10/13;
++ if (count - 9 < 2) /* 9, 10 -> 11 */
++ count = 11;
++ for (p = top, q = p - count; q >= map; p--, q--)
++ if (p->hash < q->hash)
++ swap(*p, *q);
++ }
++ /* Garden variety bubble sort */
++ do {
++ more = 0;
++ q = top;
++ while (q-- > map)
++ {
++ if (q[1].hash >= q[0].hash)
++ continue;
++ swap(*(q+1), *q);
++ more = 1;
++ }
++ } while(more);
++}
++
++static void dx_insert_block(struct dx_frame *frame, u32 hash, u32 block)
++{
++ struct dx_entry *entries = frame->entries;
++ struct dx_entry *old = frame->at, *new = old + 1;
++ int count = dx_get_count(entries);
++
++ assert(count < dx_get_limit(entries));
++ assert(old < entries + count);
++ memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
++ dx_set_hash(new, hash);
++ dx_set_block(new, block);
++ dx_set_count(entries, count + 1);
++}
++#endif
++
++
++static void ext3_update_dx_flag(struct inode *inode)
++{
++ if (!EXT3_HAS_COMPAT_FEATURE(inode->i_sb,
++ EXT3_FEATURE_COMPAT_DIR_INDEX))
++ EXT3_I(inode)->i_flags &= ~EXT3_INDEX_FL;
++}
++
+ /*
+ * NOTE! unlike strncmp, ext3_match returns 1 for success, 0 for failure.
+ *
+@@ -94,6 +736,7 @@ static int inline search_dirblock(struct
+ return 0;
+ }
+
++
+ /*
+ * ext3_find_entry()
+ *
+@@ -105,6 +748,8 @@ static int inline search_dirblock(struct
+ * The returned buffer_head has ->b_count elevated. The caller is expected
+ * to brelse() it when appropriate.
+ */
++
++
+ static struct buffer_head * ext3_find_entry (struct dentry *dentry,
+ struct ext3_dir_entry_2 ** res_dir)
+ {
+@@ -119,12 +764,32 @@ static struct buffer_head * ext3_find_en
+ int num = 0;
+ int nblocks, i, err;
+ struct inode *dir = dentry->d_parent->d_inode;
++ int namelen;
++ const u8 *name;
++ unsigned blocksize;
+
+ *res_dir = NULL;
+ sb = dir->i_sb;
+-
++ blocksize = sb->s_blocksize;
++ namelen = dentry->d_name.len;
++ name = dentry->d_name.name;
++ if (namelen > EXT3_NAME_LEN)
++ return NULL;
++#ifdef CONFIG_EXT3_INDEX
++ if (is_dx(dir)) {
++ bh = ext3_dx_find_entry(dentry, res_dir, &err);
++ /*
++ * On success, or if the error was file not found,
++ * return. Otherwise, fall back to doing a search the
++ * old fashioned way.
++ */
++ if (bh || (err != ERR_BAD_DX_DIR))
++ return bh;
++ dxtrace(printk("ext3_find_entry: dx failed, falling back\n"));
++ }
++#endif
+ nblocks = dir->i_size >> EXT3_BLOCK_SIZE_BITS(sb);
+- start = dir->u.ext3_i.i_dir_start_lookup;
++ start = EXT3_I(dir)->i_dir_start_lookup;
+ if (start >= nblocks)
+ start = 0;
+ block = start;
+@@ -166,7 +831,7 @@ restart:
+ i = search_dirblock(bh, dir, dentry,
+ block << EXT3_BLOCK_SIZE_BITS(sb), res_dir);
+ if (i == 1) {
+- dir->u.ext3_i.i_dir_start_lookup = block;
++ EXT3_I(dir)->i_dir_start_lookup = block;
+ ret = bh;
+ goto cleanup_and_exit;
+ } else {
+@@ -197,6 +862,66 @@ cleanup_and_exit:
+ return ret;
+ }
+
++#ifdef CONFIG_EXT3_INDEX
++static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry,
++ struct ext3_dir_entry_2 **res_dir, int *err)
++{
++ struct super_block * sb;
++ struct dx_hash_info hinfo;
++ u32 hash;
++ struct dx_frame frames[2], *frame;
++ struct ext3_dir_entry_2 *de, *top;
++ struct buffer_head *bh;
++ unsigned long block;
++ int retval;
++ int namelen = dentry->d_name.len;
++ const u8 *name = dentry->d_name.name;
++ struct inode *dir = dentry->d_parent->d_inode;
++
++ sb = dir->i_sb;
++ if (!(frame = dx_probe (dentry, 0, &hinfo, frames, err)))
++ return NULL;
++ hash = hinfo.hash;
++ do {
++ block = dx_get_block(frame->at);
++ if (!(bh = ext3_bread (NULL,dir, block, 0, err)))
++ goto errout;
++ de = (struct ext3_dir_entry_2 *) bh->b_data;
++ top = (struct ext3_dir_entry_2 *) ((char *) de + sb->s_blocksize -
++ EXT3_DIR_REC_LEN(0));
++ for (; de < top; de = ext3_next_entry(de))
++ if (ext3_match (namelen, name, de)) {
++ if (!ext3_check_dir_entry("ext3_find_entry",
++ dir, de, bh,
++ (block<<EXT3_BLOCK_SIZE_BITS(sb))
++ +((char *)de - bh->b_data))) {
++ brelse (bh);
++ goto errout;
++ }
++ *res_dir = de;
++ dx_release (frames);
++ return bh;
++ }
++ brelse (bh);
++ /* Check to see if we should continue to search */
++ retval = ext3_htree_next_block(dir, hash, frame,
++ frames, err, 0);
++ if (retval == -1) {
++ ext3_warning(sb, __FUNCTION__,
++ "error reading index page in directory #%lu",
++ dir->i_ino);
++ goto errout;
++ }
++ } while (retval == 1);
++
++ *err = -ENOENT;
++errout:
++ dxtrace(printk("%s not found\n", name));
++ dx_release (frames);
++ return NULL;
++}
++#endif
++
+ static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry)
+ {
+ struct inode * inode;
+@@ -213,8 +938,9 @@ static struct dentry *ext3_lookup(struct
+ brelse (bh);
+ inode = iget(dir->i_sb, ino);
+
+- if (!inode)
++ if (!inode) {
+ return ERR_PTR(-EACCES);
++ }
+ }
+ d_add(dentry, inode);
+ return NULL;
+@@ -238,6 +964,301 @@ static inline void ext3_set_de_type(stru
+ de->file_type = ext3_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
+ }
+
++#ifdef CONFIG_EXT3_INDEX
++static struct ext3_dir_entry_2 *
++dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
++{
++ unsigned rec_len = 0;
++
++ while (count--) {
++ struct ext3_dir_entry_2 *de = (struct ext3_dir_entry_2 *) (from + map->offs);
++ rec_len = EXT3_DIR_REC_LEN(de->name_len);
++ memcpy (to, de, rec_len);
++ ((struct ext3_dir_entry_2 *) to)->rec_len = rec_len;
++ de->inode = 0;
++ map++;
++ to += rec_len;
++ }
++ return (struct ext3_dir_entry_2 *) (to - rec_len);
++}
++
++static struct ext3_dir_entry_2* dx_pack_dirents(char *base, int size)
++{
++ struct ext3_dir_entry_2 *next, *to, *prev, *de = (struct ext3_dir_entry_2 *) base;
++ unsigned rec_len = 0;
++
++ prev = to = de;
++ while ((char*)de < base + size) {
++ next = (struct ext3_dir_entry_2 *) ((char *) de +
++ le16_to_cpu(de->rec_len));
++ if (de->inode && de->name_len) {
++ rec_len = EXT3_DIR_REC_LEN(de->name_len);
++ if (de > to)
++ memmove(to, de, rec_len);
++ to->rec_len = rec_len;
++ prev = to;
++ to = (struct ext3_dir_entry_2 *) (((char *) to) + rec_len);
++ }
++ de = next;
++ }
++ return prev;
++}
++
++static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
++ struct buffer_head **bh,struct dx_frame *frame,
++ struct dx_hash_info *hinfo, int *error)
++{
++ unsigned blocksize = dir->i_sb->s_blocksize;
++ unsigned count, continued;
++ struct buffer_head *bh2;
++ u32 newblock;
++ u32 hash2;
++ struct dx_map_entry *map;
++ char *data1 = (*bh)->b_data, *data2;
++ unsigned split;
++ struct ext3_dir_entry_2 *de = NULL, *de2;
++ int err;
++
++ bh2 = ext3_append (handle, dir, &newblock, error);
++ if (!(bh2)) {
++ brelse(*bh);
++ *bh = NULL;
++ goto errout;
++ }
++
++ BUFFER_TRACE(*bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, *bh);
++ if (err) {
++ journal_error:
++ brelse(*bh);
++ brelse(bh2);
++ *bh = NULL;
++ ext3_std_error(dir->i_sb, err);
++ goto errout;
++ }
++ BUFFER_TRACE(frame->bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, frame->bh);
++ if (err)
++ goto journal_error;
++
++ data2 = bh2->b_data;
++
++ /* create map in the end of data2 block */
++ map = (struct dx_map_entry *) (data2 + blocksize);
++ count = dx_make_map ((struct ext3_dir_entry_2 *) data1,
++ blocksize, hinfo, map);
++ map -= count;
++ split = count/2; // need to adjust to actual middle
++ dx_sort_map (map, count);
++ hash2 = map[split].hash;
++ continued = hash2 == map[split - 1].hash;
++ dxtrace(printk("Split block %i at %x, %i/%i\n",
++ dx_get_block(frame->at), hash2, split, count-split));
++
++ /* Fancy dance to stay within two buffers */
++ de2 = dx_move_dirents(data1, data2, map + split, count - split);
++ de = dx_pack_dirents(data1,blocksize);
++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
++ de2->rec_len = cpu_to_le16(data2 + blocksize - (char *) de2);
++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data1, blocksize, 1));
++ dxtrace(dx_show_leaf (hinfo, (struct ext3_dir_entry_2 *) data2, blocksize, 1));
++
++ /* Which block gets the new entry? */
++ if (hinfo->hash >= hash2)
++ {
++ swap(*bh, bh2);
++ de = de2;
++ }
++ dx_insert_block (frame, hash2 + continued, newblock);
++ err = ext3_journal_dirty_metadata (handle, bh2);
++ if (err)
++ goto journal_error;
++ err = ext3_journal_dirty_metadata (handle, frame->bh);
++ if (err)
++ goto journal_error;
++ brelse (bh2);
++ dxtrace(dx_show_index ("frame", frame->entries));
++errout:
++ return de;
++}
++#endif
++
++
++/*
++ * Add a new entry into a directory (leaf) block. If de is non-NULL,
++ * it points to a directory entry which is guaranteed to be large
++ * enough for the new directory entry. If de is NULL, then
++ * add_dirent_to_buf will attempt to search the directory block for
++ * space. It will return -ENOSPC if no space is available, and -EIO
++ * and -EEXIST if directory entry already exists.
++ *
++ * NOTE! bh is NOT released in the case where ENOSPC is returned. In
++ * all other cases bh is released.
++ */
++static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
++ struct inode *inode, struct ext3_dir_entry_2 *de,
++ struct buffer_head * bh)
++{
++ struct inode *dir = dentry->d_parent->d_inode;
++ const char *name = dentry->d_name.name;
++ int namelen = dentry->d_name.len;
++ unsigned long offset = 0;
++ unsigned short reclen;
++ int nlen, rlen, err;
++ char *top;
++
++ reclen = EXT3_DIR_REC_LEN(namelen);
++ if (!de) {
++ de = (struct ext3_dir_entry_2 *)bh->b_data;
++ top = bh->b_data + dir->i_sb->s_blocksize - reclen;
++ while ((char *) de <= top) {
++ if (!ext3_check_dir_entry("ext3_add_entry", dir, de,
++ bh, offset)) {
++ brelse (bh);
++ return -EIO;
++ }
++ if (ext3_match (namelen, name, de)) {
++ brelse (bh);
++ return -EEXIST;
++ }
++ nlen = EXT3_DIR_REC_LEN(de->name_len);
++ rlen = le16_to_cpu(de->rec_len);
++ if ((de->inode? rlen - nlen: rlen) >= reclen)
++ break;
++ de = (struct ext3_dir_entry_2 *)((char *)de + rlen);
++ offset += rlen;
++ }
++ if ((char *) de > top)
++ return -ENOSPC;
++ }
++ BUFFER_TRACE(bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err) {
++ ext3_std_error(dir->i_sb, err);
++ brelse(bh);
++ return err;
++ }
++
++ /* By now the buffer is marked for journaling */
++ nlen = EXT3_DIR_REC_LEN(de->name_len);
++ rlen = le16_to_cpu(de->rec_len);
++ if (de->inode) {
++ struct ext3_dir_entry_2 *de1 = (struct ext3_dir_entry_2 *)((char *)de + nlen);
++ de1->rec_len = cpu_to_le16(rlen - nlen);
++ de->rec_len = cpu_to_le16(nlen);
++ de = de1;
++ }
++ de->file_type = EXT3_FT_UNKNOWN;
++ if (inode) {
++ de->inode = cpu_to_le32(inode->i_ino);
++ ext3_set_de_type(dir->i_sb, de, inode->i_mode);
++ } else
++ de->inode = 0;
++ de->name_len = namelen;
++ memcpy (de->name, name, namelen);
++ /*
++ * XXX shouldn't update any times until successful
++ * completion of syscall, but too many callers depend
++ * on this.
++ *
++ * XXX similarly, too many callers depend on
++ * ext3_new_inode() setting the times, but error
++ * recovery deletes the inode, so the worst that can
++ * happen is that the times are slightly out of date
++ * and/or different from the directory change time.
++ */
++ dir->i_mtime = dir->i_ctime = CURRENT_TIME;
++ ext3_update_dx_flag(dir);
++ dir->i_version = ++event;
++ ext3_mark_inode_dirty(handle, dir);
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, bh);
++ if (err)
++ ext3_std_error(dir->i_sb, err);
++ brelse(bh);
++ return 0;
++}
++
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * This converts a one block unindexed directory to a 3 block indexed
++ * directory, and adds the dentry to the indexed directory.
++ */
++static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
++ struct inode *inode, struct buffer_head *bh)
++{
++ struct inode *dir = dentry->d_parent->d_inode;
++ const char *name = dentry->d_name.name;
++ int namelen = dentry->d_name.len;
++ struct buffer_head *bh2;
++ struct dx_root *root;
++ struct dx_frame frames[2], *frame;
++ struct dx_entry *entries;
++ struct ext3_dir_entry_2 *de, *de2;
++ char *data1, *top;
++ unsigned len;
++ int retval;
++ unsigned blocksize;
++ struct dx_hash_info hinfo;
++ u32 block;
++
++ blocksize = dir->i_sb->s_blocksize;
++ dxtrace(printk("Creating index\n"));
++ retval = ext3_journal_get_write_access(handle, bh);
++ if (retval) {
++ ext3_std_error(dir->i_sb, retval);
++ brelse(bh);
++ return retval;
++ }
++ root = (struct dx_root *) bh->b_data;
++
++ EXT3_I(dir)->i_flags |= EXT3_INDEX_FL;
++ bh2 = ext3_append (handle, dir, &block, &retval);
++ if (!(bh2)) {
++ brelse(bh);
++ return retval;
++ }
++ data1 = bh2->b_data;
++
++ /* The 0th block becomes the root, move the dirents out */
++ de = &root->dotdot;
++ de = (struct ext3_dir_entry_2 *) ((char *)de + de->rec_len);
++ len = ((char *) root) + blocksize - (char *) de;
++ memcpy (data1, de, len);
++ de = (struct ext3_dir_entry_2 *) data1;
++ top = data1 + len;
++ while (((char *) de2=(char*)de+le16_to_cpu(de->rec_len)) < top)
++ de = de2;
++ de->rec_len = cpu_to_le16(data1 + blocksize - (char *) de);
++ /* Initialize the root; the dot dirents already exist */
++ de = (struct ext3_dir_entry_2 *) (&root->dotdot);
++ de->rec_len = cpu_to_le16(blocksize - EXT3_DIR_REC_LEN(2));
++ memset (&root->info, 0, sizeof(root->info));
++ root->info.info_length = sizeof(root->info);
++ root->info.hash_version = dir->i_sb->u.ext3_sb.s_def_hash_version;
++ entries = root->entries;
++ dx_set_block (entries, 1);
++ dx_set_count (entries, 1);
++ dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info)));
++
++ /* Initialize as for dx_probe */
++ hinfo.hash_version = root->info.hash_version;
++ hinfo.seed = dir->i_sb->u.ext3_sb.s_hash_seed;
++ ext3fs_dirhash(name, namelen, &hinfo);
++ frame = frames;
++ frame->entries = entries;
++ frame->at = entries;
++ frame->bh = bh;
++ bh = bh2;
++ de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
++ dx_release (frames);
++ if (!(de))
++ return retval;
++
++ return add_dirent_to_buf(handle, dentry, inode, de, bh);
++}
++#endif
++
+ /*
+ * ext3_add_entry()
+ *
+@@ -248,127 +1268,198 @@ static inline void ext3_set_de_type(stru
+ * may not sleep between calling this and putting something into
+ * the entry, as someone else might have used it while you slept.
+ */
+-
+-/*
+- * AKPM: the journalling code here looks wrong on the error paths
+- */
+ static int ext3_add_entry (handle_t *handle, struct dentry *dentry,
+ struct inode *inode)
+ {
+ struct inode *dir = dentry->d_parent->d_inode;
+- const char *name = dentry->d_name.name;
+- int namelen = dentry->d_name.len;
+ unsigned long offset;
+- unsigned short rec_len;
+ struct buffer_head * bh;
+- struct ext3_dir_entry_2 * de, * de1;
++ struct ext3_dir_entry_2 *de;
+ struct super_block * sb;
+ int retval;
++#ifdef CONFIG_EXT3_INDEX
++ int dx_fallback=0;
++#endif
++ unsigned blocksize;
++ unsigned nlen, rlen;
++ u32 block, blocks;
+
+ sb = dir->i_sb;
+-
+- if (!namelen)
++ blocksize = sb->s_blocksize;
++ if (!dentry->d_name.len)
+ return -EINVAL;
+- bh = ext3_bread (handle, dir, 0, 0, &retval);
++#ifdef CONFIG_EXT3_INDEX
++ if (is_dx(dir)) {
++ retval = ext3_dx_add_entry(handle, dentry, inode);
++ if (!retval || (retval != ERR_BAD_DX_DIR))
++ return retval;
++ EXT3_I(dir)->i_flags &= ~EXT3_INDEX_FL;
++ dx_fallback++;
++ ext3_mark_inode_dirty(handle, dir);
++ }
++#endif
++ blocks = dir->i_size >> sb->s_blocksize_bits;
++ for (block = 0, offset = 0; block < blocks; block++) {
++ bh = ext3_bread(handle, dir, block, 0, &retval);
++ if(!bh)
++ return retval;
++ retval = add_dirent_to_buf(handle, dentry, inode, 0, bh);
++ if (retval != -ENOSPC)
++ return retval;
++
++#ifdef CONFIG_EXT3_INDEX
++ if (blocks == 1 && !dx_fallback &&
++ EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX))
++ return make_indexed_dir(handle, dentry, inode, bh);
++#endif
++ brelse(bh);
++ }
++ bh = ext3_append(handle, dir, &block, &retval);
+ if (!bh)
+ return retval;
+- rec_len = EXT3_DIR_REC_LEN(namelen);
+- offset = 0;
+ de = (struct ext3_dir_entry_2 *) bh->b_data;
+- while (1) {
+- if ((char *)de >= sb->s_blocksize + bh->b_data) {
+- brelse (bh);
+- bh = NULL;
+- bh = ext3_bread (handle, dir,
+- offset >> EXT3_BLOCK_SIZE_BITS(sb), 1, &retval);
+- if (!bh)
+- return retval;
+- if (dir->i_size <= offset) {
+- if (dir->i_size == 0) {
+- brelse(bh);
+- return -ENOENT;
+- }
++ de->inode = 0;
++ de->rec_len = cpu_to_le16(rlen = blocksize);
++ nlen = 0;
++ return add_dirent_to_buf(handle, dentry, inode, de, bh);
++}
+
+- ext3_debug ("creating next block\n");
++#ifdef CONFIG_EXT3_INDEX
++/*
++ * Returns 0 for success, or a negative error value
++ */
++static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry,
++ struct inode *inode)
++{
++ struct dx_frame frames[2], *frame;
++ struct dx_entry *entries, *at;
++ struct dx_hash_info hinfo;
++ struct buffer_head * bh;
++ struct inode *dir = dentry->d_parent->d_inode;
++ struct super_block * sb = dir->i_sb;
++ struct ext3_dir_entry_2 *de;
++ int err;
+
+- BUFFER_TRACE(bh, "get_write_access");
+- ext3_journal_get_write_access(handle, bh);
+- de = (struct ext3_dir_entry_2 *) bh->b_data;
+- de->inode = 0;
+- de->rec_len = le16_to_cpu(sb->s_blocksize);
+- dir->u.ext3_i.i_disksize =
+- dir->i_size = offset + sb->s_blocksize;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+- ext3_mark_inode_dirty(handle, dir);
+- } else {
++ frame = dx_probe(dentry, 0, &hinfo, frames, &err);
++ if (!frame)
++ return err;
++ entries = frame->entries;
++ at = frame->at;
+
+- ext3_debug ("skipping to next block\n");
++ if (!(bh = ext3_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
++ goto cleanup;
+
+- de = (struct ext3_dir_entry_2 *) bh->b_data;
+- }
+- }
+- if (!ext3_check_dir_entry ("ext3_add_entry", dir, de, bh,
+- offset)) {
+- brelse (bh);
+- return -ENOENT;
+- }
+- if (ext3_match (namelen, name, de)) {
+- brelse (bh);
+- return -EEXIST;
++ BUFFER_TRACE(bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err)
++ goto journal_error;
++
++ err = add_dirent_to_buf(handle, dentry, inode, 0, bh);
++ if (err != -ENOSPC) {
++ bh = 0;
++ goto cleanup;
++ }
++
++ /* Block full, should compress but for now just split */
++ dxtrace(printk("using %u of %u node entries\n",
++ dx_get_count(entries), dx_get_limit(entries)));
++ /* Need to split index? */
++ if (dx_get_count(entries) == dx_get_limit(entries)) {
++ u32 newblock;
++ unsigned icount = dx_get_count(entries);
++ int levels = frame - frames;
++ struct dx_entry *entries2;
++ struct dx_node *node2;
++ struct buffer_head *bh2;
++
++ if (levels && (dx_get_count(frames->entries) ==
++ dx_get_limit(frames->entries))) {
++ ext3_warning(sb, __FUNCTION__,
++ "Directory index full!\n");
++ err = -ENOSPC;
++ goto cleanup;
+ }
+- if ((le32_to_cpu(de->inode) == 0 &&
+- le16_to_cpu(de->rec_len) >= rec_len) ||
+- (le16_to_cpu(de->rec_len) >=
+- EXT3_DIR_REC_LEN(de->name_len) + rec_len)) {
+- BUFFER_TRACE(bh, "get_write_access");
+- ext3_journal_get_write_access(handle, bh);
+- /* By now the buffer is marked for journaling */
+- offset += le16_to_cpu(de->rec_len);
+- if (le32_to_cpu(de->inode)) {
+- de1 = (struct ext3_dir_entry_2 *) ((char *) de +
+- EXT3_DIR_REC_LEN(de->name_len));
+- de1->rec_len =
+- cpu_to_le16(le16_to_cpu(de->rec_len) -
+- EXT3_DIR_REC_LEN(de->name_len));
+- de->rec_len = cpu_to_le16(
+- EXT3_DIR_REC_LEN(de->name_len));
+- de = de1;
++ bh2 = ext3_append (handle, dir, &newblock, &err);
++ if (!(bh2))
++ goto cleanup;
++ node2 = (struct dx_node *)(bh2->b_data);
++ entries2 = node2->entries;
++ node2->fake.rec_len = cpu_to_le16(sb->s_blocksize);
++ node2->fake.inode = 0;
++ BUFFER_TRACE(frame->bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, frame->bh);
++ if (err)
++ goto journal_error;
++ if (levels) {
++ unsigned icount1 = icount/2, icount2 = icount - icount1;
++ unsigned hash2 = dx_get_hash(entries + icount1);
++ dxtrace(printk("Split index %i/%i\n", icount1, icount2));
++
++			BUFFER_TRACE(frames[0].bh, "get_write_access"); /* index root */
++ err = ext3_journal_get_write_access(handle,
++ frames[0].bh);
++ if (err)
++ goto journal_error;
++
++ memcpy ((char *) entries2, (char *) (entries + icount1),
++ icount2 * sizeof(struct dx_entry));
++ dx_set_count (entries, icount1);
++ dx_set_count (entries2, icount2);
++ dx_set_limit (entries2, dx_node_limit(dir));
++
++ /* Which index block gets the new entry? */
++ if (at - entries >= icount1) {
++ frame->at = at = at - entries - icount1 + entries2;
++ frame->entries = entries = entries2;
++ swap(frame->bh, bh2);
+ }
+- de->file_type = EXT3_FT_UNKNOWN;
+- if (inode) {
+- de->inode = cpu_to_le32(inode->i_ino);
+- ext3_set_de_type(dir->i_sb, de, inode->i_mode);
+- } else
+- de->inode = 0;
+- de->name_len = namelen;
+- memcpy (de->name, name, namelen);
+- /*
+- * XXX shouldn't update any times until successful
+- * completion of syscall, but too many callers depend
+- * on this.
+- *
+- * XXX similarly, too many callers depend on
+- * ext3_new_inode() setting the times, but error
+- * recovery deletes the inode, so the worst that can
+- * happen is that the times are slightly out of date
+- * and/or different from the directory change time.
+- */
+- dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
+- dir->i_version = ++event;
+- ext3_mark_inode_dirty(handle, dir);
+- BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
+- ext3_journal_dirty_metadata(handle, bh);
+- brelse(bh);
+- return 0;
++ dx_insert_block (frames + 0, hash2, newblock);
++ dxtrace(dx_show_index ("node", frames[1].entries));
++ dxtrace(dx_show_index ("node",
++ ((struct dx_node *) bh2->b_data)->entries));
++ err = ext3_journal_dirty_metadata(handle, bh2);
++ if (err)
++ goto journal_error;
++ brelse (bh2);
++ } else {
++ dxtrace(printk("Creating second level index...\n"));
++ memcpy((char *) entries2, (char *) entries,
++ icount * sizeof(struct dx_entry));
++ dx_set_limit(entries2, dx_node_limit(dir));
++
++ /* Set up root */
++ dx_set_count(entries, 1);
++ dx_set_block(entries + 0, newblock);
++ ((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1;
++
++ /* Add new access path frame */
++ frame = frames + 1;
++ frame->at = at = at - entries + entries2;
++ frame->entries = entries = entries2;
++ frame->bh = bh2;
++ err = ext3_journal_get_write_access(handle,
++ frame->bh);
++ if (err)
++ goto journal_error;
+ }
+- offset += le16_to_cpu(de->rec_len);
+- de = (struct ext3_dir_entry_2 *)
+- ((char *) de + le16_to_cpu(de->rec_len));
++ ext3_journal_dirty_metadata(handle, frames[0].bh);
+ }
+- brelse (bh);
+- return -ENOSPC;
++ de = do_split(handle, dir, &bh, frame, &hinfo, &err);
++ if (!de)
++ goto cleanup;
++ err = add_dirent_to_buf(handle, dentry, inode, de, bh);
++ bh = 0;
++ goto cleanup;
++
++journal_error:
++ ext3_std_error(dir->i_sb, err);
++cleanup:
++ if (bh)
++ brelse(bh);
++ dx_release(frames);
++ return err;
+ }
++#endif
+
+ /*
+ * ext3_delete_entry deletes a directory entry by merging it with the
+@@ -452,9 +1543,11 @@ static int ext3_create (struct inode * d
+ struct inode * inode;
+ int err;
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -479,9 +1572,11 @@ static int ext3_mknod (struct inode * di
+ struct inode *inode;
+ int err;
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -508,9 +1603,11 @@ static int ext3_mkdir(struct inode * dir
+ if (dir->i_nlink >= EXT3_LINK_MAX)
+ return -EMLINK;
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 3);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -522,7 +1619,7 @@ static int ext3_mkdir(struct inode * dir
+
+ inode->i_op = &ext3_dir_inode_operations;
+ inode->i_fop = &ext3_dir_operations;
+- inode->i_size = inode->u.ext3_i.i_disksize = inode->i_sb->s_blocksize;
++ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+ inode->i_blocks = 0;
+ dir_block = ext3_bread (handle, inode, 0, 1, &err);
+ if (!dir_block) {
+@@ -555,21 +1652,19 @@ static int ext3_mkdir(struct inode * dir
+ inode->i_mode |= S_ISGID;
+ ext3_mark_inode_dirty(handle, inode);
+ err = ext3_add_entry (handle, dentry, inode);
+- if (err)
+- goto out_no_entry;
++ if (err) {
++ inode->i_nlink = 0;
++ ext3_mark_inode_dirty(handle, inode);
++ iput (inode);
++ goto out_stop;
++ }
+ dir->i_nlink++;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+ d_instantiate(dentry, inode);
+ out_stop:
+ ext3_journal_stop(handle, dir);
+ return err;
+-
+-out_no_entry:
+- inode->i_nlink = 0;
+- ext3_mark_inode_dirty(handle, inode);
+- iput (inode);
+- goto out_stop;
+ }
+
+ /*
+@@ -656,7 +1751,7 @@ int ext3_orphan_add(handle_t *handle, st
+ int err = 0, rc;
+
+ lock_super(sb);
+- if (!list_empty(&inode->u.ext3_i.i_orphan))
++ if (!list_empty(&EXT3_I(inode)->i_orphan))
+ goto out_unlock;
+
+ /* Orphan handling is only valid for files with data blocks
+@@ -697,7 +1792,7 @@ int ext3_orphan_add(handle_t *handle, st
+ * This is safe: on error we're going to ignore the orphan list
+ * anyway on the next recovery. */
+ if (!err)
+- list_add(&inode->u.ext3_i.i_orphan, &EXT3_SB(sb)->s_orphan);
++ list_add(&EXT3_I(inode)->i_orphan, &EXT3_SB(sb)->s_orphan);
+
+ jbd_debug(4, "superblock will point to %ld\n", inode->i_ino);
+ jbd_debug(4, "orphan inode %ld will point to %d\n",
+@@ -715,25 +1810,26 @@ out_unlock:
+ int ext3_orphan_del(handle_t *handle, struct inode *inode)
+ {
+ struct list_head *prev;
++ struct ext3_inode_info *ei = EXT3_I(inode);
+ struct ext3_sb_info *sbi;
+ ino_t ino_next;
+ struct ext3_iloc iloc;
+ int err = 0;
+
+ lock_super(inode->i_sb);
+- if (list_empty(&inode->u.ext3_i.i_orphan)) {
++ if (list_empty(&ei->i_orphan)) {
+ unlock_super(inode->i_sb);
+ return 0;
+ }
+
+ ino_next = NEXT_ORPHAN(inode);
+- prev = inode->u.ext3_i.i_orphan.prev;
++ prev = ei->i_orphan.prev;
+ sbi = EXT3_SB(inode->i_sb);
+
+ jbd_debug(4, "remove inode %ld from orphan list\n", inode->i_ino);
+
+- list_del(&inode->u.ext3_i.i_orphan);
+- INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
++ list_del(&ei->i_orphan);
++ INIT_LIST_HEAD(&ei->i_orphan);
+
+ /* If we're on an error path, we may not have a valid
+ * transaction handle with which to update the orphan list on
+@@ -794,8 +1890,9 @@ static int ext3_rmdir (struct inode * di
+ handle_t *handle;
+
+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+- if (IS_ERR(handle))
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ retval = -ENOENT;
+ bh = ext3_find_entry (dentry, &de);
+@@ -833,7 +1930,7 @@ static int ext3_rmdir (struct inode * di
+ dir->i_nlink--;
+ inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+ ext3_mark_inode_dirty(handle, inode);
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+
+ end_rmdir:
+@@ -851,8 +1948,9 @@ static int ext3_unlink(struct inode * di
+ handle_t *handle;
+
+ handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+- if (IS_ERR(handle))
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -879,7 +1977,7 @@ static int ext3_unlink(struct inode * di
+ if (retval)
+ goto end_unlink;
+ dir->i_ctime = dir->i_mtime = CURRENT_TIME;
+- dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(dir);
+ ext3_mark_inode_dirty(handle, dir);
+ inode->i_nlink--;
+ if (!inode->i_nlink)
+@@ -905,9 +2003,11 @@ static int ext3_symlink (struct inode *
+ if (l > dir->i_sb->s_blocksize)
+ return -ENAMETOOLONG;
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS + 5);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -917,7 +2017,7 @@ static int ext3_symlink (struct inode *
+ if (IS_ERR(inode))
+ goto out_stop;
+
+- if (l > sizeof (inode->u.ext3_i.i_data)) {
++ if (l > sizeof (EXT3_I(inode)->i_data)) {
+ inode->i_op = &page_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ext3_aops;
+ /*
+@@ -926,25 +2026,23 @@ static int ext3_symlink (struct inode *
+ * i_size in generic_commit_write().
+ */
+ err = block_symlink(inode, symname, l);
+- if (err)
+- goto out_no_entry;
++ if (err) {
++ ext3_dec_count(handle, inode);
++ ext3_mark_inode_dirty(handle, inode);
++ iput (inode);
++ goto out_stop;
++ }
+ } else {
+ inode->i_op = &ext3_fast_symlink_inode_operations;
+- memcpy((char*)&inode->u.ext3_i.i_data,symname,l);
++ memcpy((char*)&EXT3_I(inode)->i_data,symname,l);
+ inode->i_size = l-1;
+ }
+- inode->u.ext3_i.i_disksize = inode->i_size;
++ EXT3_I(inode)->i_disksize = inode->i_size;
+ err = ext3_add_nondir(handle, dentry, inode);
+ ext3_mark_inode_dirty(handle, inode);
+ out_stop:
+ ext3_journal_stop(handle, dir);
+ return err;
+-
+-out_no_entry:
+- ext3_dec_count(handle, inode);
+- ext3_mark_inode_dirty(handle, inode);
+- iput (inode);
+- goto out_stop;
+ }
+
+ static int ext3_link (struct dentry * old_dentry,
+@@ -957,12 +2055,15 @@ static int ext3_link (struct dentry * ol
+ if (S_ISDIR(inode->i_mode))
+ return -EPERM;
+
+- if (inode->i_nlink >= EXT3_LINK_MAX)
++ if (inode->i_nlink >= EXT3_LINK_MAX) {
+ return -EMLINK;
++ }
+
+- handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+@@ -996,9 +2097,11 @@ static int ext3_rename (struct inode * o
+
+ old_bh = new_bh = dir_bh = NULL;
+
+- handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS + 2);
+- if (IS_ERR(handle))
++ handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS +
++ EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
++ if (IS_ERR(handle)) {
+ return PTR_ERR(handle);
++ }
+
+ if (IS_SYNC(old_dir) || IS_SYNC(new_dir))
+ handle->h_sync = 1;
+@@ -1071,14 +2174,33 @@ static int ext3_rename (struct inode * o
+ /*
+ * ok, that's it
+ */
+- ext3_delete_entry(handle, old_dir, old_de, old_bh);
++ retval = ext3_delete_entry(handle, old_dir, old_de, old_bh);
++ if (retval == -ENOENT) {
++ /*
++ * old_de could have moved out from under us.
++ */
++ struct buffer_head *old_bh2;
++ struct ext3_dir_entry_2 *old_de2;
++
++ old_bh2 = ext3_find_entry(old_dentry, &old_de2);
++ if (old_bh2) {
++ retval = ext3_delete_entry(handle, old_dir,
++ old_de2, old_bh2);
++ brelse(old_bh2);
++ }
++ }
++ if (retval) {
++ ext3_warning(old_dir->i_sb, "ext3_rename",
++ "Deleting old file (%lu), %d, error=%d",
++ old_dir->i_ino, old_dir->i_nlink, retval);
++ }
+
+ if (new_inode) {
+ new_inode->i_nlink--;
+ new_inode->i_ctime = CURRENT_TIME;
+ }
+ old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME;
+- old_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(old_dir);
+ if (dir_bh) {
+ BUFFER_TRACE(dir_bh, "get_write_access");
+ ext3_journal_get_write_access(handle, dir_bh);
+@@ -1090,7 +2212,7 @@ static int ext3_rename (struct inode * o
+ new_inode->i_nlink--;
+ } else {
+ new_dir->i_nlink++;
+- new_dir->u.ext3_i.i_flags &= ~EXT3_INDEX_FL;
++ ext3_update_dx_flag(new_dir);
+ ext3_mark_inode_dirty(handle, new_dir);
+ }
+ }
+--- linux-2.4.18-chaos-pdirops/fs/ext3/super.c~ext3-htree 2003-09-23 11:52:24.000000000 +0400
++++ linux-2.4.18-chaos-pdirops-alexey/fs/ext3/super.c 2003-09-23 12:11:54.000000000 +0400
+@@ -705,6 +705,7 @@ static int ext3_setup_super(struct super
+ es->s_mtime = cpu_to_le32(CURRENT_TIME);
+ ext3_update_dynamic_rev(sb);
+ EXT3_SET_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
++
+ ext3_commit_super (sb, es, 1);
+ if (test_opt (sb, DEBUG))
+ printk (KERN_INFO
+@@ -715,6 +716,7 @@ static int ext3_setup_super(struct super
+ EXT3_BLOCKS_PER_GROUP(sb),
+ EXT3_INODES_PER_GROUP(sb),
+ sbi->s_mount_opt);
++
+ printk(KERN_INFO "EXT3 FS " EXT3FS_VERSION ", " EXT3FS_DATE " on %s, ",
+ bdevname(sb->s_dev));
+ if (EXT3_SB(sb)->s_journal->j_inode == NULL) {
+@@ -889,6 +891,7 @@ static loff_t ext3_max_size(int bits)
+ return res;
+ }
+
++
+ struct super_block * ext3_read_super (struct super_block * sb, void * data,
+ int silent)
+ {
+@@ -1065,6 +1068,9 @@ struct super_block * ext3_read_super (st
+ sbi->s_mount_state = le16_to_cpu(es->s_state);
+ sbi->s_addr_per_block_bits = log2(EXT3_ADDR_PER_BLOCK(sb));
+ sbi->s_desc_per_block_bits = log2(EXT3_DESC_PER_BLOCK(sb));
++ for (i=0; i < 4; i++)
++ sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
++ sbi->s_def_hash_version = es->s_def_hash_version;
+
+ if (sbi->s_blocks_per_group > blocksize * 8) {
+ printk (KERN_ERR
+@@ -1747,6 +1753,7 @@ static void __exit exit_ext3_fs(void)
+ unregister_filesystem(&ext3_fs_type);
+ }
+
++EXPORT_SYMBOL(ext3_force_commit);
+ EXPORT_SYMBOL(ext3_bread);
+
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+--- linux-2.4.18-chaos-pdirops/include/linux/ext3_fs.h~ext3-htree 2003-07-28 17:52:17.000000000 +0400
++++ linux-2.4.18-chaos-pdirops-alexey/include/linux/ext3_fs.h 2003-09-23 12:12:04.000000000 +0400
+@@ -40,6 +40,11 @@
+ #define EXT3FS_VERSION "2.4-0.9.18"
+
+ /*
++ * Always enable hashed directories
++ */
++#define CONFIG_EXT3_INDEX
++
++/*
+ * Debug code
+ */
+ #ifdef EXT3FS_DEBUG
+@@ -437,8 +442,11 @@ struct ext3_super_block {
+ /*E0*/ __u32 s_journal_inum; /* inode number of journal file */
+ __u32 s_journal_dev; /* device number of journal file */
+ __u32 s_last_orphan; /* start of list of inodes to delete */
+-
+-/*EC*/ __u32 s_reserved[197]; /* Padding to the end of the block */
++ __u32 s_hash_seed[4]; /* HTREE hash seed */
++ __u8 s_def_hash_version; /* Default hash version to use */
++ __u8 s_reserved_char_pad;
++ __u16 s_reserved_word_pad;
++ __u32 s_reserved[192]; /* Padding to the end of the block */
+ };
+
+ #ifdef __KERNEL__
+@@ -575,9 +583,46 @@ struct ext3_dir_entry_2 {
+ #define EXT3_DIR_ROUND (EXT3_DIR_PAD - 1)
+ #define EXT3_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT3_DIR_ROUND) & \
+ ~EXT3_DIR_ROUND)
++/*
++ * Hash Tree Directory indexing
++ * (c) Daniel Phillips, 2001
++ */
++
++#ifdef CONFIG_EXT3_INDEX
++ #define is_dx(dir) (EXT3_HAS_COMPAT_FEATURE(dir->i_sb, \
++ EXT3_FEATURE_COMPAT_DIR_INDEX) && \
++ (EXT3_I(dir)->i_flags & EXT3_INDEX_FL))
++#define EXT3_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
++#else
++ #define is_dx(dir) 0
++#define EXT3_DIR_LINK_MAX(dir) ((dir)->i_nlink >= EXT3_LINK_MAX)
++#define EXT3_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2)
++#endif
++
++/* Legal values for the dx_root hash_version field: */
++
++#define DX_HASH_LEGACY 0
++#define DX_HASH_HALF_MD4 1
++#define DX_HASH_TEA 2
++
++/* hash info structure used by the directory hash */
++struct dx_hash_info
++{
++ u32 hash;
++ u32 minor_hash;
++ int hash_version;
++ u32 *seed;
++};
+
+ #ifdef __KERNEL__
+ /*
++ * Control parameters used by ext3_htree_next_block
++ */
++#define HASH_NB_ALWAYS 1
++
++
++/*
+ * Describe an inode's exact location on disk and in memory
+ */
+ struct ext3_iloc
+@@ -587,6 +632,27 @@ struct ext3_iloc
+ unsigned long block_group;
+ };
+
++
++/*
++ * This structure is stuffed into the struct file's private_data field
++ * for directories. It is where we put information so that we can do
++ * readdir operations in hash tree order.
++ */
++struct dir_private_info {
++ rb_root_t root;
++ rb_node_t *curr_node;
++ struct fname *extra_fname;
++ loff_t last_pos;
++ __u32 curr_hash;
++ __u32 curr_minor_hash;
++ __u32 next_hash;
++};
++
++/*
++ * Special error return code only used by dx_probe() and its callers.
++ */
++#define ERR_BAD_DX_DIR -75000
++
+ /*
+ * Function prototypes
+ */
+@@ -614,11 +680,20 @@ extern struct ext3_group_desc * ext3_get
+
+ /* dir.c */
+ extern int ext3_check_dir_entry(const char *, struct inode *,
+- struct ext3_dir_entry_2 *, struct buffer_head *,
+- unsigned long);
++ struct ext3_dir_entry_2 *,
++ struct buffer_head *, unsigned long);
++extern int ext3_htree_store_dirent(struct file *dir_file, __u32 hash,
++ __u32 minor_hash,
++ struct ext3_dir_entry_2 *dirent);
++extern void ext3_htree_free_dir_info(struct dir_private_info *p);
++
+ /* fsync.c */
+ extern int ext3_sync_file (struct file *, struct dentry *, int);
+
++/* hash.c */
++extern int ext3fs_dirhash(const char *name, int len, struct
++ dx_hash_info *hinfo);
++
+ /* ialloc.c */
+ extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int);
+ extern void ext3_free_inode (handle_t *, struct inode *);
+@@ -650,6 +725,8 @@ extern int ext3_ioctl (struct inode *, s
+ /* namei.c */
+ extern int ext3_orphan_add(handle_t *, struct inode *);
+ extern int ext3_orphan_del(handle_t *, struct inode *);
++extern int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash,
++ __u32 start_minor_hash, __u32 *next_hash);
+
+ /* super.c */
+ extern void ext3_error (struct super_block *, const char *, const char *, ...)
+--- linux-2.4.18-chaos-pdirops/include/linux/ext3_fs_sb.h~ext3-htree 2003-07-28 17:52:17.000000000 +0400
++++ linux-2.4.18-chaos-pdirops-alexey/include/linux/ext3_fs_sb.h 2003-09-23 12:11:54.000000000 +0400
+@@ -62,6 +62,8 @@ struct ext3_sb_info {
+ int s_inode_size;
+ int s_first_ino;
+ u32 s_next_generation;
++ u32 s_hash_seed[4];
++ int s_def_hash_version;
+
+ /* Journaling */
+ struct inode * s_journal_inode;
+--- linux-2.4.18-chaos-pdirops/include/linux/ext3_jbd.h~ext3-htree 2001-12-21 20:42:03.000000000 +0300
++++ linux-2.4.18-chaos-pdirops-alexey/include/linux/ext3_jbd.h 2003-09-23 12:11:54.000000000 +0400
+@@ -63,6 +63,8 @@ extern int ext3_writepage_trans_blocks(s
+
+ #define EXT3_RESERVE_TRANS_BLOCKS 12
+
++#define EXT3_INDEX_EXTRA_TRANS_BLOCKS 8
++
+ int
+ ext3_mark_iloc_dirty(handle_t *handle,
+ struct inode *inode,
+--- linux-2.4.18-chaos-pdirops/include/linux/rbtree.h~ext3-htree 2001-11-22 22:46:18.000000000 +0300
++++ linux-2.4.18-chaos-pdirops-alexey/include/linux/rbtree.h 2003-09-23 12:11:54.000000000 +0400
+@@ -120,6 +120,8 @@ rb_root_t;
+
+ extern void rb_insert_color(rb_node_t *, rb_root_t *);
+ extern void rb_erase(rb_node_t *, rb_root_t *);
++extern rb_node_t *rb_get_first(rb_root_t *root);
++extern rb_node_t *rb_get_next(rb_node_t *n);
+
+ static inline void rb_link_node(rb_node_t * node, rb_node_t * parent, rb_node_t ** rb_link)
+ {
+--- linux-2.4.18-chaos-pdirops/lib/rbtree.c~ext3-htree 2003-09-23 11:52:34.000000000 +0400
++++ linux-2.4.18-chaos-pdirops-alexey/lib/rbtree.c 2003-09-23 12:11:54.000000000 +0400
+@@ -17,6 +17,8 @@
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ linux/lib/rbtree.c
++
++ rb_get_first and rb_get_next written by Theodore Ts'o, 9/8/2002
+ */
+
+ #include <linux/rbtree.h>
+@@ -295,3 +297,43 @@ void rb_erase(rb_node_t * node, rb_root_
+ __rb_erase_color(child, parent, root);
+ }
+ EXPORT_SYMBOL(rb_erase);
++
++/*
++ * This function returns the first node (in sort order) of the tree.
++ */
++rb_node_t *rb_get_first(rb_root_t *root)
++{
++ rb_node_t *n;
++
++ n = root->rb_node;
++ if (!n)
++ return 0;
++ while (n->rb_left)
++ n = n->rb_left;
++ return n;
++}
++EXPORT_SYMBOL(rb_get_first);
++
++/*
++ * Given a node, this function will return the next node in the tree.
++ */
++rb_node_t *rb_get_next(rb_node_t *n)
++{
++ rb_node_t *parent;
++
++ if (n->rb_right) {
++ n = n->rb_right;
++ while (n->rb_left)
++ n = n->rb_left;
++ return n;
++ } else {
++ while ((parent = n->rb_parent)) {
++ if (n == parent->rb_left)
++ return parent;
++ n = parent;
++ }
++ return 0;
++ }
++}
++EXPORT_SYMBOL(rb_get_next);
++
+
+_
--- /dev/null
+ fs/ext3/namei.c | 15 +++++++--------
+ fs/ext3/namei.c.orig | 21 +++++++++++++++------
+ fs/ext3/super.c | 1 +
+ include/linux/ext3_fs_sb.h | 1 +
+ include/linux/ext3_fs_sb.h.orig | 2 ++
+ 5 files changed, 26 insertions(+), 14 deletions(-)
+
+--- linux-2.4.22-ac1/fs/ext3/namei.c~ext3-orphan_lock-2.4.22-rh 2003-09-26 00:24:09.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/namei.c 2003-09-26 00:26:36.000000000 +0400
+@@ -1748,8 +1748,8 @@ int ext3_orphan_add(handle_t *handle, st
+ struct super_block *sb = inode->i_sb;
+ struct ext3_iloc iloc;
+ int err = 0, rc;
+-
+- lock_super(sb);
++
++ down(&EXT3_SB(sb)->s_orphan_lock);
+ if (!list_empty(&EXT3_I(inode)->i_orphan))
+ goto out_unlock;
+
+@@ -1797,7 +1797,7 @@ int ext3_orphan_add(handle_t *handle, st
+ jbd_debug(4, "orphan inode %ld will point to %d\n",
+ inode->i_ino, NEXT_ORPHAN(inode));
+ out_unlock:
+- unlock_super(sb);
++ up(&EXT3_SB(sb)->s_orphan_lock);
+ ext3_std_error(inode->i_sb, err);
+ return err;
+ }
+@@ -1810,20 +1810,19 @@ int ext3_orphan_del(handle_t *handle, st
+ {
+ struct list_head *prev;
+ struct ext3_inode_info *ei = EXT3_I(inode);
+- struct ext3_sb_info *sbi;
++ struct ext3_sb_info *sbi = EXT3_SB(inode->i_sb);
+ unsigned long ino_next;
+ struct ext3_iloc iloc;
+ int err = 0;
+
+- lock_super(inode->i_sb);
++ down(&sbi->s_orphan_lock);
+ if (list_empty(&ei->i_orphan)) {
+- unlock_super(inode->i_sb);
++ up(&sbi->s_orphan_lock);
+ return 0;
+ }
+
+ ino_next = NEXT_ORPHAN(inode);
+ prev = ei->i_orphan.prev;
+- sbi = EXT3_SB(inode->i_sb);
+
+ jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);
+
+@@ -1872,7 +1871,7 @@ int ext3_orphan_del(handle_t *handle, st
+ out_err:
+ ext3_std_error(inode->i_sb, err);
+ out:
+- unlock_super(inode->i_sb);
++ up(&sbi->s_orphan_lock);
+ return err;
+
+ out_brelse:
+--- linux-2.4.22-ac1/fs/ext3/namei.c.orig~ext3-orphan_lock-2.4.22-rh 2003-09-25 14:58:37.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/namei.c.orig 2003-09-26 00:24:09.000000000 +0400
+@@ -29,6 +29,7 @@
+ #include <linux/sched.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/fcntl.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+@@ -1614,7 +1615,7 @@ static int ext3_mkdir(struct inode * dir
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext3_new_inode (handle, dir, S_IFDIR);
++ inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_stop;
+@@ -1622,7 +1623,6 @@ static int ext3_mkdir(struct inode * dir
+ inode->i_op = &ext3_dir_inode_operations;
+ inode->i_fop = &ext3_dir_operations;
+ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+- inode->i_blocks = 0;
+ dir_block = ext3_bread (handle, inode, 0, 1, &err);
+ if (!dir_block) {
+ inode->i_nlink--; /* is this nlink == 0? */
+@@ -1649,9 +1649,6 @@ static int ext3_mkdir(struct inode * dir
+ BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
+ ext3_journal_dirty_metadata(handle, dir_block);
+ brelse (dir_block);
+- inode->i_mode = S_IFDIR | mode;
+- if (dir->i_mode & S_ISGID)
+- inode->i_mode |= S_ISGID;
+ ext3_mark_inode_dirty(handle, inode);
+ err = ext3_add_entry (handle, dentry, inode);
+ if (err) {
+@@ -2020,7 +2017,7 @@ static int ext3_symlink (struct inode *
+ goto out_stop;
+
+ if (l > sizeof (EXT3_I(inode)->i_data)) {
+- inode->i_op = &page_symlink_inode_operations;
++ inode->i_op = &ext3_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ext3_aops;
+ /*
+ * block_symlink() calls back into ext3_prepare/commit_write.
+@@ -2245,4 +2242,16 @@ struct inode_operations ext3_dir_inode_o
+ rmdir: ext3_rmdir, /* BKL held */
+ mknod: ext3_mknod, /* BKL held */
+ rename: ext3_rename, /* BKL held */
++ setxattr: ext3_setxattr, /* BKL held */
++ getxattr: ext3_getxattr, /* BKL held */
++ listxattr: ext3_listxattr, /* BKL held */
++ removexattr: ext3_removexattr, /* BKL held */
+ };
++
++struct inode_operations ext3_special_inode_operations = {
++ setxattr: ext3_setxattr, /* BKL held */
++ getxattr: ext3_getxattr, /* BKL held */
++ listxattr: ext3_listxattr, /* BKL held */
++ removexattr: ext3_removexattr, /* BKL held */
++};
++
+--- linux-2.4.22-ac1/fs/ext3/super.c~ext3-orphan_lock-2.4.22-rh 2003-09-26 00:24:09.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/super.c 2003-09-26 00:25:22.000000000 +0400
+@@ -1164,6 +1164,7 @@ struct super_block * ext3_read_super (st
+ sb->s_op = &ext3_sops;
+ sb->dq_op = &ext3_qops;
+ INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
++ sema_init(&sbi->s_orphan_lock, 1);
+
+ sb->s_root = 0;
+
+--- linux-2.4.22-ac1/include/linux/ext3_fs_sb.h~ext3-orphan_lock-2.4.22-rh 2003-09-26 00:24:08.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/linux/ext3_fs_sb.h 2003-09-26 00:25:22.000000000 +0400
+@@ -72,6 +72,7 @@ struct ext3_sb_info {
+ struct inode * s_journal_inode;
+ struct journal_s * s_journal;
+ struct list_head s_orphan;
++ struct semaphore s_orphan_lock;
+ unsigned long s_commit_interval;
+ struct block_device *journal_bdev;
+ #ifdef CONFIG_JBD_DEBUG
+--- linux-2.4.22-ac1/include/linux/ext3_fs_sb.h.orig~ext3-orphan_lock-2.4.22-rh 2003-09-25 14:16:34.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/linux/ext3_fs_sb.h.orig 2003-09-26 00:24:08.000000000 +0400
+@@ -62,6 +62,8 @@ struct ext3_sb_info {
+ int s_inode_size;
+ int s_first_ino;
+ u32 s_next_generation;
++ u32 s_hash_seed[4];
++ int s_def_hash_version;
+
+ unsigned long s_dir_count;
+ u8 *s_debts;
+
+_
--- /dev/null
+ fs/ext3/ialloc.c | 35 ++++++++++++++++++++++++++++++++++-
+ fs/ext3/ioctl.c | 25 +++++++++++++++++++++++++
+ fs/ext3/namei.c | 21 +++++++++++++++++----
+ include/linux/dcache.h | 5 +++++
+ include/linux/ext3_fs.h | 5 ++++-
+ 5 files changed, 85 insertions(+), 6 deletions(-)
+
+--- linux-2.6.0-test5/fs/ext3/ialloc.c~ext3-wantedi-2.6 2003-06-24 18:05:25.000000000 +0400
++++ linux-2.6.0-test5-alexey/fs/ext3/ialloc.c 2003-09-28 13:40:23.000000000 +0400
+@@ -420,7 +420,8 @@ static int find_group_other(struct super
+ * For other inodes, search forward from the parent directory's block
+ * group to find a free inode.
+ */
+-struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
++struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode,
++ unsigned long goal)
+ {
+ struct super_block *sb;
+ struct buffer_head *bitmap_bh = NULL;
+@@ -448,6 +449,38 @@ struct inode *ext3_new_inode(handle_t *h
+
+ es = EXT3_SB(sb)->s_es;
+ sbi = EXT3_SB(sb);
++ if (goal) {
++ group = (goal - 1) / EXT3_INODES_PER_GROUP(sb);
++ ino = (goal - 1) % EXT3_INODES_PER_GROUP(sb);
++ gdp = ext3_get_group_desc(sb, group, &bh2);
++
++ err = -EIO;
++ bitmap_bh = read_inode_bitmap (sb, group);
++ if (!bitmap_bh)
++ goto fail;
++
++ BUFFER_TRACE(bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, bitmap_bh);
++ if (err) goto fail;
++
++ if (ext3_set_bit_atomic(sb_bgl_lock(sbi, group),
++ ino, bitmap_bh->b_data)) {
++ printk(KERN_ERR "goal inode %lu unavailable\n", goal);
++ /* Oh well, we tried. */
++ goto continue_allocation;
++ }
++
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, bitmap_bh);
++ if (err) goto fail;
++
++ /* We've shortcircuited the allocation system successfully,
++ * now finish filling in the inode.
++ */
++ goto got;
++ }
++
++continue_allocation:
+ if (S_ISDIR(mode)) {
+ if (test_opt (sb, OLDALLOC))
+ group = find_group_dir(sb, dir);
+--- linux-2.6.0-test5/fs/ext3/ioctl.c~ext3-wantedi-2.6 2003-06-24 18:05:25.000000000 +0400
++++ linux-2.6.0-test5-alexey/fs/ext3/ioctl.c 2003-09-28 13:45:54.000000000 +0400
+@@ -24,6 +24,31 @@ int ext3_ioctl (struct inode * inode, st
+ ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+
+ switch (cmd) {
++ case EXT3_IOC_CREATE_INUM: {
++ char name[32];
++ struct dentry *dchild, *dparent;
++ int rc = 0;
++
++ dparent = list_entry(inode->i_dentry.next, struct dentry,
++ d_alias);
++ snprintf(name, sizeof name, "%lu", arg);
++ dchild = lookup_one_len(name, dparent, strlen(name));
++ if (dchild->d_inode) {
++ printk(KERN_ERR "%*s/%lu already exists (ino %lu)\n",
++ dparent->d_name.len, dparent->d_name.name, arg,
++ dchild->d_inode->i_ino);
++ rc = -EEXIST;
++ } else {
++ dchild->d_fsdata = (void *)arg;
++ rc = vfs_create(inode, dchild, 0644, NULL);
++ if (rc)
++ printk(KERN_ERR "vfs_create: %d\n", rc);
++ else if (dchild->d_inode->i_ino != arg)
++ rc = -EEXIST;
++ }
++ dput(dchild);
++ return rc;
++ }
+ case EXT3_IOC_GETFLAGS:
+ flags = ei->i_flags & EXT3_FL_USER_VISIBLE;
+ return put_user(flags, (int *) arg);
+--- linux-2.6.0-test5/fs/ext3/namei.c~ext3-wantedi-2.6 2003-09-19 18:01:48.000000000 +0400
++++ linux-2.6.0-test5-alexey/fs/ext3/namei.c 2003-09-28 13:40:23.000000000 +0400
+@@ -1617,6 +1617,19 @@ static int ext3_add_nondir(handle_t *han
+ return err;
+ }
+
++static struct inode * ext3_new_inode_wantedi(handle_t *handle, struct inode *dir,
++ int mode, struct dentry *dentry)
++{
++ unsigned long inum = 0;
++
++ if (dentry->d_fsdata != NULL) {
++ struct dentry_params *param =
++ (struct dentry_params *) dentry->d_fsdata;
++ inum = param->p_inum;
++ }
++ return ext3_new_inode(handle, dir, mode, inum);
++}
++
+ /*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+@@ -1640,7 +1653,7 @@ static int ext3_create (struct inode * d
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext3_new_inode (handle, dir, mode);
++ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry);
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ inode->i_op = &ext3_file_inode_operations;
+@@ -1670,7 +1683,7 @@ static int ext3_mknod (struct inode * di
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext3_new_inode (handle, dir, mode);
++ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry);
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ init_special_inode(inode, inode->i_mode, rdev);
+@@ -1702,7 +1715,7 @@ static int ext3_mkdir(struct inode * dir
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
++ inode = ext3_new_inode_wantedi (handle, dir, S_IFDIR | mode, dentry);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_stop;
+@@ -2094,7 +2107,7 @@ static int ext3_symlink (struct inode *
+ if (IS_DIRSYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
++ inode = ext3_new_inode_wantedi (handle, dir, S_IFLNK|S_IRWXUGO, dentry);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_stop;
+--- linux-2.6.0-test5/include/linux/dcache.h~ext3-wantedi-2.6 2003-09-28 13:40:22.000000000 +0400
++++ linux-2.6.0-test5-alexey/include/linux/dcache.h 2003-09-28 13:40:23.000000000 +0400
+@@ -25,6 +25,11 @@ struct vfsmount;
+
+ #define IS_ROOT(x) ((x) == (x)->d_parent)
+
++struct dentry_params {
++ unsigned long p_inum;
++ void *p_ptr;
++};
++
+ /*
+ * "quick string" -- eases parameter passing, but more importantly
+ * saves "metadata" about the string (ie length and the hash).
+--- linux-2.6.0-test5/include/linux/ext3_fs.h~ext3-wantedi-2.6 2003-09-19 18:01:10.000000000 +0400
++++ linux-2.6.0-test5-alexey/include/linux/ext3_fs.h 2003-09-28 13:40:23.000000000 +0400
+@@ -203,6 +203,7 @@ struct ext3_group_desc
+ #define EXT3_IOC_SETFLAGS _IOW('f', 2, long)
+ #define EXT3_IOC_GETVERSION _IOR('f', 3, long)
+ #define EXT3_IOC_SETVERSION _IOW('f', 4, long)
++/* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
+ #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long)
+ #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long)
+ #ifdef CONFIG_JBD_DEBUG
+@@ -707,7 +708,8 @@ extern int ext3fs_dirhash(const char *na
+ dx_hash_info *hinfo);
+
+ /* ialloc.c */
+-extern struct inode * ext3_new_inode (handle_t *, struct inode *, int);
++extern struct inode * ext3_new_inode (handle_t *, struct inode *, int,
++ unsigned long);
+ extern void ext3_free_inode (handle_t *, struct inode *);
+ extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
+ extern unsigned long ext3_count_free_inodes (struct super_block *);
+@@ -792,4 +794,5 @@ extern struct inode_operations ext3_fast
+
+ #endif /* __KERNEL__ */
+
++#define EXT3_IOC_CREATE_INUM _IOW('f', 5, long)
+ #endif /* _LINUX_EXT3_FS_H */
+
+_
--- /dev/null
+ fs/ext3/ialloc.c | 41 +++++++++++++++++++++++++++++++++++++++--
+ fs/ext3/inode.c | 2 +-
+ fs/ext3/ioctl.c | 25 +++++++++++++++++++++++++
+ fs/ext3/namei.c | 21 +++++++++++++++++----
+ include/linux/dcache.h | 5 +++++
+ include/linux/ext3_fs.h | 5 ++++-
+ 6 files changed, 91 insertions(+), 8 deletions(-)
+
+--- linux-2.4.22-ac1/fs/ext3/ialloc.c~extN-wantedi-2.4.22-rh 2003-09-26 00:57:29.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/ialloc.c 2003-09-26 01:00:17.000000000 +0400
+@@ -524,7 +524,8 @@ static int find_group_other(struct super
+ * group to find a free inode.
+ */
+ struct inode * ext3_new_inode (handle_t *handle,
+- const struct inode * dir, int mode)
++ const struct inode * dir, int mode,
++ unsigned long goal)
+ {
+ struct super_block * sb;
+ struct buffer_head * bh;
+@@ -549,7 +550,41 @@ struct inode * ext3_new_inode (handle_t
+ init_rwsem(&inode->u.ext3_i.truncate_sem);
+
+ lock_super (sb);
+- es = sb->u.ext3_sb.s_es;
++ es = EXT3_SB(sb)->s_es;
++
++ if (goal) {
++ group = (goal - 1) / EXT3_INODES_PER_GROUP(sb);
++ ino = (goal - 1) % EXT3_INODES_PER_GROUP(sb);
++ gdp = ext3_get_group_desc(sb, group, &bh2);
++
++ bitmap_nr = load_inode_bitmap (sb, group);
++ if (bitmap_nr < 0) {
++ err = bitmap_nr;
++ goto fail;
++ }
++
++ bh = EXT3_SB(sb)->s_inode_bitmap[bitmap_nr];
++
++ BUFFER_TRACE(bh, "get_write_access");
++ err = ext3_journal_get_write_access(handle, bh);
++ if (err) goto fail;
++
++ if (ext3_set_bit(ino, bh->b_data)) {
++ printk(KERN_ERR "goal inode %lu unavailable\n", goal);
++ /* Oh well, we tried. */
++ goto repeat;
++ }
++
++ BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
++ err = ext3_journal_dirty_metadata(handle, bh);
++ if (err) goto fail;
++
++ /* We've shortcircuited the allocation system successfully,
++ * now finish filling in the inode.
++ */
++ goto have_bit_and_group;
++ }
++
+ repeat:
+ if (S_ISDIR(mode)) {
+ if (test_opt (sb, OLDALLOC))
+@@ -606,6 +641,8 @@ repeat:
+ }
+ goto repeat;
+ }
++
++have_bit_and_group:
+ ino += group * EXT3_INODES_PER_GROUP(sb) + 1;
+ if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
+ ext3_error (sb, "ext3_new_inode",
+--- linux-2.4.22-ac1/fs/ext3/inode.c~extN-wantedi-2.4.22-rh 2003-09-26 00:57:29.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/inode.c 2003-09-26 00:57:29.000000000 +0400
+@@ -2614,7 +2614,7 @@ void ext3_truncate_thread(struct inode *
+ if (IS_ERR(handle))
+ goto out_truncate;
+
+- new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode);
++ new_inode = ext3_new_inode(handle, old_inode, old_inode->i_mode, 0);
+ if (IS_ERR(new_inode)) {
+ ext3_debug("truncate inode %lu directly (no new inodes)\n",
+ old_inode->i_ino);
+--- linux-2.4.22-ac1/fs/ext3/ioctl.c~extN-wantedi-2.4.22-rh 2003-09-25 14:16:23.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/ioctl.c 2003-09-26 00:57:29.000000000 +0400
+@@ -23,6 +23,31 @@ int ext3_ioctl (struct inode * inode, st
+ ext3_debug ("cmd = %u, arg = %lu\n", cmd, arg);
+
+ switch (cmd) {
++ case EXT3_IOC_CREATE_INUM: {
++ char name[32];
++ struct dentry *dchild, *dparent;
++ int rc = 0;
++
++ dparent = list_entry(inode->i_dentry.next, struct dentry,
++ d_alias);
++ snprintf(name, sizeof name, "%lu", arg);
++ dchild = lookup_one_len(name, dparent, strlen(name));
++ if (dchild->d_inode) {
++ printk(KERN_ERR "%*s/%lu already exists (ino %lu)\n",
++ dparent->d_name.len, dparent->d_name.name, arg,
++ dchild->d_inode->i_ino);
++ rc = -EEXIST;
++ } else {
++ dchild->d_fsdata = (void *)arg;
++ rc = vfs_create(inode, dchild, 0644);
++ if (rc)
++ printk(KERN_ERR "vfs_create: %d\n", rc);
++ else if (dchild->d_inode->i_ino != arg)
++ rc = -EEXIST;
++ }
++ dput(dchild);
++ return rc;
++ }
+ case EXT3_IOC_GETFLAGS:
+ flags = inode->u.ext3_i.i_flags & EXT3_FL_USER_VISIBLE;
+ return put_user(flags, (int *) arg);
+--- linux-2.4.22-ac1/fs/ext3/namei.c~extN-wantedi-2.4.22-rh 2003-09-26 00:57:28.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/namei.c 2003-09-26 00:57:29.000000000 +0400
+@@ -1534,6 +1534,19 @@ static int ext3_add_nondir(handle_t *han
+ return err;
+ }
+
++static struct inode * ext3_new_inode_wantedi(handle_t *handle, struct inode *dir,
++ int mode, struct dentry *dentry)
++{
++ unsigned long inum = 0;
++
++ if (dentry->d_fsdata != NULL) {
++ struct dentry_params *param =
++ (struct dentry_params *) dentry->d_fsdata;
++ inum = param->p_inum;
++ }
++ return ext3_new_inode(handle, dir, mode, inum);
++}
++
+ /*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
+@@ -1557,7 +1570,7 @@ static int ext3_create (struct inode * d
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext3_new_inode (handle, dir, mode);
++ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry);
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ inode->i_op = &ext3_file_inode_operations;
+@@ -1585,7 +1598,7 @@ static int ext3_mknod (struct inode * di
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext3_new_inode (handle, dir, mode);
++ inode = ext3_new_inode_wantedi (handle, dir, mode, dentry);
+ err = PTR_ERR(inode);
+ if (!IS_ERR(inode)) {
+ init_special_inode(inode, mode, rdev);
+@@ -1615,7 +1628,7 @@ static int ext3_mkdir(struct inode * dir
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
++ inode = ext3_new_inode_wantedi (handle, dir, S_IFDIR | mode, dentry);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_stop;
+@@ -2010,7 +2023,7 @@ static int ext3_symlink (struct inode *
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext3_new_inode (handle, dir, S_IFLNK|S_IRWXUGO);
++ inode = ext3_new_inode_wantedi (handle, dir, S_IFLNK|S_IRWXUGO, dentry);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_stop;
+--- linux-2.4.22-ac1/include/linux/dcache.h~extN-wantedi-2.4.22-rh 2003-09-26 00:57:27.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/linux/dcache.h 2003-09-26 00:57:29.000000000 +0400
+@@ -63,6 +63,11 @@ static inline void intent_init(struct lo
+
+ #define IS_ROOT(x) ((x) == (x)->d_parent)
+
++struct dentry_params {
++ unsigned long p_inum;
++ void *p_ptr;
++};
++
+ /*
+ * "quick string" -- eases parameter passing, but more importantly
+ * saves "metadata" about the string (ie length and the hash).
+--- linux-2.4.22-ac1/include/linux/ext3_fs.h~extN-wantedi-2.4.22-rh 2003-09-26 00:57:29.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/linux/ext3_fs.h 2003-09-26 00:57:29.000000000 +0400
+@@ -203,6 +203,7 @@ struct ext3_group_desc
+ #define EXT3_IOC_SETFLAGS _IOW('f', 2, long)
+ #define EXT3_IOC_GETVERSION _IOR('f', 3, long)
+ #define EXT3_IOC_SETVERSION _IOW('f', 4, long)
++/* EXT3_IOC_CREATE_INUM at bottom of file (visible to kernel and user). */
+ #define EXT3_IOC_GETVERSION_OLD _IOR('v', 1, long)
+ #define EXT3_IOC_SETVERSION_OLD _IOW('v', 2, long)
+ #ifdef CONFIG_JBD_DEBUG
+@@ -676,7 +677,8 @@ extern int ext3fs_dirhash(const char *na
+ dx_hash_info *hinfo);
+
+ /* ialloc.c */
+-extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int);
++extern struct inode * ext3_new_inode (handle_t *, const struct inode *, int,
++ unsigned long);
+ extern void ext3_free_inode (handle_t *, struct inode *);
+ extern struct inode * ext3_orphan_get (struct super_block *, unsigned long);
+ extern unsigned long ext3_count_free_inodes (struct super_block *);
+@@ -769,4 +771,5 @@ extern struct inode_operations ext3_fast
+
+ #endif /* __KERNEL__ */
+
++#define EXT3_IOC_CREATE_INUM _IOW('f', 5, long)
+ #endif /* _LINUX_EXT3_FS_H */
+
+_
--- /dev/null
+ fs/Makefile | 2 +-
+ fs/inode.c | 4 +++-
+ mm/page_alloc.c | 1 +
+ 3 files changed, 5 insertions(+), 2 deletions(-)
+
+--- linux-2.4.22-ac1/fs/inode.c~iod-stock-exports-2.4.22-rh 2003-09-25 14:45:32.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/inode.c 2003-09-25 14:49:41.000000000 +0400
+@@ -5,6 +5,7 @@
+ */
+
+ #include <linux/config.h>
++#include <linux/module.h>
+ #include <linux/fs.h>
+ #include <linux/string.h>
+ #include <linux/mm.h>
+@@ -68,7 +69,8 @@ static LIST_HEAD(anon_hash_chain); /* fo
+ * NOTE! You also have to own the lock if you change
+ * the i_state of an inode while it is in use..
+ */
+-static spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
++spinlock_t inode_lock = SPIN_LOCK_UNLOCKED;
++EXPORT_SYMBOL(inode_lock);
+
+ /*
+ * Statistics gathering..
+--- linux-2.4.22-ac1/fs/Makefile~iod-stock-exports-2.4.22-rh 2003-09-25 14:16:28.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/Makefile 2003-09-25 14:50:00.000000000 +0400
+@@ -7,7 +7,7 @@
+
+ O_TARGET := fs.o
+
+-export-objs := filesystems.o open.o dcache.o buffer.o dquot.o dcookies.o
++export-objs := filesystems.o open.o dcache.o buffer.o dquot.o dcookies.o inode.o
+ mod-subdirs := nls xfs
+
+ obj-y := open.o read_write.o devices.o file_table.o buffer.o \
+--- linux-2.4.22-ac1/mm/page_alloc.c~iod-stock-exports-2.4.22-rh 2003-09-25 14:16:28.000000000 +0400
++++ linux-2.4.22-ac1-alexey/mm/page_alloc.c 2003-09-25 14:49:41.000000000 +0400
+@@ -28,6 +28,7 @@ int nr_inactive_pages;
+ LIST_HEAD(inactive_list);
+ LIST_HEAD(active_list);
+ pg_data_t *pgdat_list;
++EXPORT_SYMBOL(pgdat_list);
+
+ /*
+ *
+
+_
--- /dev/null
+ include/linux/socket.h | 4 ++++
+ net/netsyms.c | 1 +
+ net/socket.c | 2 +-
+ 3 files changed, 6 insertions(+), 1 deletion(-)
+
+--- linux-2.4.22-ac1/include/linux/socket.h~socket-exports-2.4.22-rh 2003-06-13 18:51:39.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/linux/socket.h 2003-09-26 00:49:43.000000000 +0400
+@@ -275,6 +275,10 @@ extern void memcpy_tokerneliovec(struct
+ extern int move_addr_to_user(void *kaddr, int klen, void *uaddr, int *ulen);
+ extern int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr);
+ extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
++struct socket;
++extern int sock_map_fd(struct socket *sock);
++extern struct socket *sockfd_lookup(int fd, int *err);
++
+ #endif
+ #endif /* not kernel and not glibc */
+ #endif /* _LINUX_SOCKET_H */
+--- linux-2.4.22-ac1/net/netsyms.c~socket-exports-2.4.22-rh 2003-09-26 00:49:19.000000000 +0400
++++ linux-2.4.22-ac1-alexey/net/netsyms.c 2003-09-26 00:50:20.000000000 +0400
+@@ -163,6 +163,7 @@ EXPORT_SYMBOL(put_cmsg);
+ EXPORT_SYMBOL(sock_kmalloc);
+ EXPORT_SYMBOL(sock_kfree_s);
+ EXPORT_SYMBOL(sockfd_lookup);
++EXPORT_SYMBOL(sock_map_fd);
+
+ #ifdef CONFIG_FILTER
+ EXPORT_SYMBOL(sk_run_filter);
+--- linux-2.4.22-ac1/net/socket.c~socket-exports-2.4.22-rh 2003-08-25 15:44:44.000000000 +0400
++++ linux-2.4.22-ac1-alexey/net/socket.c 2003-09-26 00:49:43.000000000 +0400
+@@ -325,7 +325,7 @@ static struct dentry_operations sockfs_d
+ * but we take care of internal coherence yet.
+ */
+
+-static int sock_map_fd(struct socket *sock)
++int sock_map_fd(struct socket *sock)
+ {
+ int fd;
+ struct qstr this;
+
+_
--- /dev/null
+ include/linux/skbuff.h | 30 +++++
+ include/net/tcp.h | 5
+ net/core/skbuff.c | 25 ++++
+ net/ipv4/tcp.c | 252 ++++++++++++++++++++++++++++++++++++++++++++++++-
+ net/netsyms.c | 2
+ 5 files changed, 311 insertions(+), 3 deletions(-)
+
+--- linux-2.4.22-ac1/include/linux/skbuff.h~tcp-zero-copy-2.4.22-rh 2003-08-25 15:44:44.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/linux/skbuff.h 2003-09-26 00:38:48.000000000 +0400
+@@ -116,6 +116,30 @@ struct skb_frag_struct
+ __u16 size;
+ };
+
++/* Support for callback when skb data has been released */
++typedef struct zccd /* Zero Copy Callback Descriptor */
++{ /* (embed as first member of custom struct) */
++ atomic_t zccd_count; /* reference count */
++ void (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
++} zccd_t;
++
++static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
++{
++ atomic_set (&d->zccd_count, 1);
++ d->zccd_destructor = callback;
++}
++
++static inline void zccd_get (zccd_t *d) /* take a reference */
++{
++ atomic_inc (&d->zccd_count);
++}
++
++static inline void zccd_put (zccd_t *d) /* release a reference */
++{
++ if (atomic_dec_and_test (&d->zccd_count))
++ (d->zccd_destructor)(d);
++}
++
+ /* This data is invariant across clones and lives at
+ * the end of the header data, ie. at skb->end.
+ */
+@@ -123,6 +147,12 @@ struct skb_shared_info {
+ atomic_t dataref;
+ unsigned int nr_frags;
+ struct sk_buff *frag_list;
++ zccd_t *zccd; /* zero copy descriptor */
++ zccd_t *zccd2; /* 2nd zero copy descriptor */
++ /* NB we expect zero-copy data to be at least 1 packet, so
++ * having 2 zccds means we don't unnecessarily split the packet
++ * where consecutive zero-copy sends abut.
++ */
+ skb_frag_t frags[MAX_SKB_FRAGS];
+ };
+
+--- linux-2.4.22-ac1/include/net/tcp.h~tcp-zero-copy-2.4.22-rh 2003-08-25 15:44:44.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/net/tcp.h 2003-09-26 00:38:48.000000000 +0400
+@@ -643,6 +643,8 @@ extern int tcp_v4_tw_remember_stam
+
+ extern int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
+ extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
++extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
++ int flags, zccd_t *zccd);
+
+ extern int tcp_ioctl(struct sock *sk,
+ int cmd,
+@@ -737,6 +739,9 @@ extern int tcp_recvmsg(struct sock *sk
+ struct msghdr *msg,
+ int len, int nonblock,
+ int flags, int *addr_len);
++extern int tcp_recvpackets(struct sock *sk,
++ struct sk_buff_head *packets,
++ int len, int nonblock);
+
+ extern int tcp_listen_start(struct sock *sk);
+
+--- linux-2.4.22-ac1/net/core/skbuff.c~tcp-zero-copy-2.4.22-rh 2003-08-25 15:44:44.000000000 +0400
++++ linux-2.4.22-ac1-alexey/net/core/skbuff.c 2003-09-26 00:38:48.000000000 +0400
+@@ -208,6 +208,8 @@ struct sk_buff *alloc_skb(unsigned int s
+ atomic_set(&(skb_shinfo(skb)->dataref), 1);
+ skb_shinfo(skb)->nr_frags = 0;
+ skb_shinfo(skb)->frag_list = NULL;
++ skb_shinfo(skb)->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */
++ skb_shinfo(skb)->zccd2 = NULL;
+ return skb;
+
+ nodata:
+@@ -277,6 +279,10 @@ static void skb_release_data(struct sk_b
+ {
+ if (!skb->cloned ||
+ atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
++ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
++ zccd_put (skb_shinfo(skb)->zccd); /* release hold */
++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
++ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
+ if (skb_shinfo(skb)->nr_frags) {
+ int i;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+@@ -535,6 +541,8 @@ int skb_linearize(struct sk_buff *skb, i
+ atomic_set(&(skb_shinfo(skb)->dataref), 1);
+ skb_shinfo(skb)->nr_frags = 0;
+ skb_shinfo(skb)->frag_list = NULL;
++ skb_shinfo(skb)->zccd = NULL; /* copied data => no user zero copy descriptor */
++ skb_shinfo(skb)->zccd2 = NULL;
+
+ /* We are no longer a clone, even if we were. */
+ skb->cloned = 0;
+@@ -581,6 +589,14 @@ struct sk_buff *pskb_copy(struct sk_buff
+ n->data_len = skb->data_len;
+ n->len = skb->len;
+
++ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */
++ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
++ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
++
++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? */
++ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
++ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
++
+ if (skb_shinfo(skb)->nr_frags) {
+ int i;
+
+@@ -623,6 +639,8 @@ int pskb_expand_head(struct sk_buff *skb
+ u8 *data;
+ int size = nhead + (skb->end - skb->head) + ntail;
+ long off;
++ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */
++ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
+
+ if (skb_shared(skb))
+ BUG();
+@@ -644,6 +662,11 @@ int pskb_expand_head(struct sk_buff *skb
+ if (skb_shinfo(skb)->frag_list)
+ skb_clone_fraglist(skb);
+
++ if (zccd != NULL) /* user zero copy descriptor? */
++ zccd_get (zccd); /* extra ref (pages are shared) */
++ if (zccd2 != NULL) /* 2nd user zero copy descriptor? */
++ zccd_get (zccd2); /* extra ref (pages are shared) */
++
+ skb_release_data(skb);
+
+ off = (data+nhead) - skb->head;
+@@ -658,6 +681,8 @@ int pskb_expand_head(struct sk_buff *skb
+ skb->nh.raw += off;
+ skb->cloned = 0;
+ atomic_set(&skb_shinfo(skb)->dataref, 1);
++ skb_shinfo(skb)->zccd = zccd;
++ skb_shinfo(skb)->zccd2 = zccd2;
+ return 0;
+
+ nodata:
+--- linux-2.4.22-ac1/net/ipv4/tcp.c~tcp-zero-copy-2.4.22-rh 2003-08-25 15:44:44.000000000 +0400
++++ linux-2.4.22-ac1-alexey/net/ipv4/tcp.c 2003-09-26 00:38:48.000000000 +0400
+@@ -747,7 +747,7 @@ do_interrupted:
+ goto out;
+ }
+
+-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd);
+
+ static inline int
+ can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
+@@ -826,7 +826,8 @@ static int tcp_error(struct sock *sk, in
+ return err;
+ }
+
+-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
++/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
++ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd)
+ {
+ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+ int mss_now;
+@@ -874,6 +875,17 @@ new_segment:
+ copy = size;
+
+ i = skb_shinfo(skb)->nr_frags;
++
++ if (zccd != NULL && /* this is a zcc I/O */
++ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
++ skb_shinfo(skb)->zccd2 != NULL &&
++ skb_shinfo(skb)->zccd != zccd && /* not the same one */
++ skb_shinfo(skb)->zccd2 != zccd)
++ {
++ tcp_mark_push (tp, skb);
++ goto new_segment;
++ }
++
+ if (can_coalesce(skb, i, page, offset)) {
+ skb_shinfo(skb)->frags[i-1].size += copy;
+ } else if (i < MAX_SKB_FRAGS) {
+@@ -884,6 +896,20 @@ new_segment:
+ goto new_segment;
+ }
+
++ if (zccd != NULL && /* this is a zcc I/O */
++ skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
++ skb_shinfo(skb)->zccd2 != zccd)
++ {
++ zccd_get (zccd); /* bump ref count */
++
++ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
++
++ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
++ skb_shinfo(skb)->zccd = zccd;
++ else
++ skb_shinfo(skb)->zccd2 = zccd;
++ }
++
+ skb->len += copy;
+ skb->data_len += copy;
+ skb->ip_summed = CHECKSUM_HW;
+@@ -947,7 +973,31 @@ ssize_t tcp_sendpage(struct socket *sock
+
+ lock_sock(sk);
+ TCP_CHECK_TIMER(sk);
+- res = do_tcp_sendpages(sk, &page, offset, size, flags);
++ res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL);
++ TCP_CHECK_TIMER(sk);
++ release_sock(sk);
++ return res;
++}
++
++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
++ int flags, zccd_t *zccd)
++{
++ ssize_t res;
++ struct sock *sk = sock->sk;
++
++#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
++
++ if (!(sk->route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */
++ !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
++ BUG ();
++
++#undef TCP_ZC_CSUM_FLAGS
++
++ lock_sock(sk);
++ TCP_CHECK_TIMER(sk);
++
++ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
++
+ TCP_CHECK_TIMER(sk);
+ release_sock(sk);
+ return res;
+@@ -1771,6 +1821,202 @@ recv_urg:
+ goto out;
+ }
+
++int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
++ int len, int nonblock)
++{
++ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
++ int copied;
++ long timeo;
++
++ BUG_TRAP (len > 0);
++ /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
++
++ lock_sock(sk);
++
++ TCP_CHECK_TIMER(sk);
++
++ copied = -ENOTCONN;
++ if (sk->state == TCP_LISTEN)
++ goto out;
++
++ copied = 0;
++ timeo = sock_rcvtimeo(sk, nonblock);
++
++ do {
++ struct sk_buff * skb;
++ u32 offset;
++ unsigned long used;
++ int exhausted;
++ int eaten;
++
++ /* Are we at urgent data? Stop if we have read anything. */
++ if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
++ break;
++
++ /* We need to check signals first, to get correct SIGURG
++ * handling. FIXME: Need to check this doesn't impact 1003.1g
++ * and move it down to the bottom of the loop
++ */
++ if (signal_pending(current)) {
++ if (copied)
++ break;
++ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
++ break;
++ }
++
++ /* Next get a buffer. */
++
++ skb = skb_peek(&sk->receive_queue);
++
++ if (skb == NULL) /* nothing ready */
++ {
++ if (copied) {
++ if (sk->err ||
++ sk->state == TCP_CLOSE ||
++ (sk->shutdown & RCV_SHUTDOWN) ||
++ !timeo ||
++ (0))
++ break;
++ } else {
++ if (sk->done)
++ break;
++
++ if (sk->err) {
++ copied = sock_error(sk);
++ break;
++ }
++
++ if (sk->shutdown & RCV_SHUTDOWN)
++ break;
++
++ if (sk->state == TCP_CLOSE) {
++ if (!sk->done) {
++ /* This occurs when user tries to read
++ * from never connected socket.
++ */
++ copied = -ENOTCONN;
++ break;
++ }
++ break;
++ }
++
++ if (!timeo) {
++ copied = -EAGAIN;
++ break;
++ }
++ }
++
++ cleanup_rbuf(sk, copied);
++ timeo = tcp_data_wait(sk, timeo);
++ continue;
++ }
++
++ BUG_TRAP (atomic_read (&skb->users) == 1);
++
++ exhausted = eaten = 0;
++
++ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
++ if (skb->h.th->syn)
++ offset--;
++
++ used = skb->len - offset;
++
++ if (tp->urg_data) {
++ u32 urg_offset = tp->urg_seq - tp->copied_seq;
++ if (urg_offset < used) {
++ if (!urg_offset) { /* at urgent data */
++ if (!sk->urginline) {
++ tp->copied_seq++; /* discard the single byte of urgent data */
++ offset++;
++ used--;
++ }
++ } else /* truncate read */
++ used = urg_offset;
++ }
++ }
++
++ BUG_TRAP (used >= 0);
++ if (len < used)
++ used = len;
++
++ if (used == 0)
++ exhausted = 1;
++ else
++ {
++ if (skb_is_nonlinear (skb))
++ {
++ int rc = skb_linearize (skb, GFP_KERNEL);
++
++ printk ("tcp_recvpackets(): linearising: %d\n", rc);
++
++ if (rc)
++ {
++ if (!copied)
++ copied = rc;
++ break;
++ }
++ }
++
++ if ((offset + used) == skb->len) /* consuming the whole packet */
++ {
++ __skb_unlink (skb, &sk->receive_queue);
++ dst_release (skb->dst);
++ skb_orphan (skb);
++ __skb_pull (skb, offset);
++ __skb_queue_tail (packets, skb);
++ exhausted = eaten = 1;
++ }
++ else /* consuming only part of the packet */
++ {
++ struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
++
++ if (skb2 == NULL)
++ {
++ if (!copied)
++ copied = -ENOMEM;
++ break;
++ }
++
++ dst_release (skb2->dst);
++ __skb_pull (skb2, offset);
++ __skb_trim (skb2, used);
++ __skb_queue_tail (packets, skb2);
++ }
++
++ tp->copied_seq += used;
++ copied += used;
++ len -= used;
++ }
++
++ if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
++ tp->urg_data = 0;
++ tcp_fast_path_check(sk, tp);
++ }
++
++ if (!exhausted)
++ continue;
++
++ if (skb->h.th->fin)
++ {
++ tp->copied_seq++;
++ if (!eaten)
++ tcp_eat_skb (sk, skb);
++ break;
++ }
++
++ if (!eaten)
++ tcp_eat_skb (sk, skb);
++
++ } while (len > 0);
++
++ out:
++ /* Clean up data we have read: This will do ACK frames. */
++ cleanup_rbuf(sk, copied);
++ TCP_CHECK_TIMER(sk);
++ release_sock(sk);
++ return copied;
++}
++
+ /*
+ * State processing on a close. This implements the state shift for
+ * sending our FIN frame. Note that we only send a FIN for some
+--- linux-2.4.22-ac1/net/netsyms.c~tcp-zero-copy-2.4.22-rh 2003-09-25 14:16:26.000000000 +0400
++++ linux-2.4.22-ac1-alexey/net/netsyms.c 2003-09-26 00:39:16.000000000 +0400
+@@ -396,6 +396,8 @@ EXPORT_SYMBOL(sysctl_tcp_wmem);
+ EXPORT_SYMBOL(sysctl_tcp_ecn);
+ EXPORT_SYMBOL(tcp_cwnd_application_limited);
+ EXPORT_SYMBOL(tcp_sendpage);
++EXPORT_SYMBOL(tcp_sendpage_zccd);
++EXPORT_SYMBOL(tcp_recvpackets);
+ EXPORT_SYMBOL(sysctl_tcp_low_latency);
+
+ EXPORT_SYMBOL(tcp_write_xmit);
+
+_
--- /dev/null
+ fs/dcache.c | 19 ++
+ fs/exec.c | 17 +-
+ fs/namei.c | 295 +++++++++++++++++++++++++++++++++++++++-------
+ fs/namespace.c | 28 +++-
+ fs/open.c | 172 +++++++++++++++++++-------
+ fs/stat.c | 52 +++++---
+ include/linux/dcache.h | 60 +++++++++
+ include/linux/fs.h | 32 ++++
+ include/linux/fs_struct.h | 4
+ kernel/exit.c | 3
+ kernel/fork.c | 3
+ kernel/ksyms.c | 1
+ 12 files changed, 558 insertions(+), 128 deletions(-)
+
+--- linux-2.4.22-ac1/fs/dcache.c~vfs_intent-2.4.22-rh 2003-09-25 14:16:29.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/dcache.c 2003-09-25 14:42:46.000000000 +0400
+@@ -181,6 +181,13 @@ int d_invalidate(struct dentry * dentry)
+ spin_unlock(&dcache_lock);
+ return 0;
+ }
++
++ /* network invalidation by Lustre */
++ if (dentry->d_flags & DCACHE_LUSTRE_INVALID) {
++ spin_unlock(&dcache_lock);
++ return 0;
++ }
++
+ /*
+ * Check whether to do a partial shrink_dcache
+ * to get rid of unused child entries.
+@@ -833,13 +840,19 @@ void d_delete(struct dentry * dentry)
+ * Adds a dentry to the hash according to its name.
+ */
+
+-void d_rehash(struct dentry * entry)
++void __d_rehash(struct dentry * entry, int lock)
+ {
+ struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash);
+ if (!list_empty(&entry->d_hash)) BUG();
+- spin_lock(&dcache_lock);
++ if (lock) spin_lock(&dcache_lock);
+ list_add(&entry->d_hash, list);
+- spin_unlock(&dcache_lock);
++ if (lock) spin_unlock(&dcache_lock);
++}
++EXPORT_SYMBOL(__d_rehash);
++
++void d_rehash(struct dentry * entry)
++{
++ __d_rehash(entry, 1);
+ }
+
+ #define do_switch(x,y) do { \
+--- linux-2.4.22-ac1/fs/exec.c~vfs_intent-2.4.22-rh 2003-09-25 14:16:29.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/exec.c 2003-09-25 14:42:46.000000000 +0400
+@@ -115,8 +115,10 @@ asmlinkage long sys_uselib(const char *
+ struct file * file;
+ struct nameidata nd;
+ int error;
++ struct lookup_intent it = { .it_op = IT_OPEN,
++ .it_flags = FMODE_READ|FMODE_EXEC };
+
+- error = user_path_walk(library, &nd);
++ error = user_path_walk_it(library, &nd, &it);
+ if (error)
+ goto out;
+
+@@ -128,7 +130,8 @@ asmlinkage long sys_uselib(const char *
+ if (error)
+ goto exit;
+
+- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
++ intent_release(&it);
+ error = PTR_ERR(file);
+ if (IS_ERR(file))
+ goto out;
+@@ -390,8 +393,10 @@ struct file *open_exec(const char *name)
+ struct inode *inode;
+ struct file *file;
+ int err = 0;
++ struct lookup_intent it = { .it_op = IT_OPEN,
++ .it_flags = FMODE_READ|FMODE_EXEC };
+
+- err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd);
++ err = path_lookup_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it);
+ file = ERR_PTR(err);
+ if (!err) {
+ inode = nd.dentry->d_inode;
+@@ -403,7 +408,8 @@ struct file *open_exec(const char *name)
+ err = -EACCES;
+ file = ERR_PTR(err);
+ if (!err) {
+- file = dentry_open(nd.dentry, nd.mnt, O_RDONLY);
++ file = dentry_open_it(nd.dentry, nd.mnt, O_RDONLY, &it);
++ intent_release(&it);
+ if (!IS_ERR(file)) {
+ err = deny_write_access(file);
+ if (err) {
+@@ -415,6 +421,7 @@ out:
+ return file;
+ }
+ }
++ intent_release(&it);
+ path_release(&nd);
+ }
+ goto out;
+@@ -1322,7 +1329,7 @@ int do_coredump(long signr, int exit_cod
+ goto close_fail;
+ if (!file->f_op->write)
+ goto close_fail;
+- if (do_truncate(file->f_dentry, 0) != 0)
++ if (do_truncate(file->f_dentry, 0, 0) != 0)
+ goto close_fail;
+
+ retval = binfmt->core_dump(signr, regs, file);
+--- linux-2.4.22-ac1/fs/namei.c~vfs_intent-2.4.22-rh 2003-09-25 14:16:23.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/namei.c 2003-09-25 14:44:40.000000000 +0400
+@@ -94,6 +94,13 @@
+ * XEmacs seems to be relying on it...
+ */
+
++void intent_release(struct lookup_intent *it)
++{
++ if (it && it->it_op_release)
++ it->it_op_release(it);
++
++}
++
+ /* In order to reduce some races, while at the same time doing additional
+ * checking and hopefully speeding things up, we copy filenames to the
+ * kernel data space before using them..
+@@ -260,10 +267,19 @@ void path_release(struct nameidata *nd)
+ * Internal lookup() using the new generic dcache.
+ * SMP-safe
+ */
+-static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, int flags)
++static struct dentry *cached_lookup(struct dentry *parent, struct qstr *name,
++ int flags, struct lookup_intent *it)
+ {
+ struct dentry * dentry = d_lookup(parent, name);
+
++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) {
++ if (!dentry->d_op->d_revalidate_it(dentry, flags, it) &&
++ !d_invalidate(dentry)) {
++ dput(dentry);
++ dentry = NULL;
++ }
++ return dentry;
++ } else
+ if (dentry && dentry->d_op && dentry->d_op->d_revalidate) {
+ if (!dentry->d_op->d_revalidate(dentry, flags) && !d_invalidate(dentry)) {
+ dput(dentry);
+@@ -281,11 +297,15 @@ static struct dentry * cached_lookup(str
+ * make sure that nobody added the entry to the dcache in the meantime..
+ * SMP-safe
+ */
+-static struct dentry * real_lookup(struct dentry * parent, struct qstr * name, int flags)
++static struct dentry *real_lookup(struct dentry *parent, struct qstr *name,
++ int flags, struct lookup_intent *it)
+ {
+ struct dentry * result;
+ struct inode *dir = parent->d_inode;
++ int counter = 0;
+
++again:
++ counter++;
+ down(&dir->i_sem);
+ /*
+ * First re-do the cached lookup just in case it was created
+@@ -300,6 +320,9 @@ static struct dentry * real_lookup(struc
+ result = ERR_PTR(-ENOMEM);
+ if (dentry) {
+ lock_kernel();
++ if (dir->i_op->lookup_it)
++ result = dir->i_op->lookup_it(dir, dentry, it, flags);
++ else
+ result = dir->i_op->lookup(dir, dentry);
+ unlock_kernel();
+ if (result)
+@@ -321,6 +344,15 @@ static struct dentry * real_lookup(struc
+ dput(result);
+ result = ERR_PTR(-ENOENT);
+ }
++ } else if (result->d_op && result->d_op->d_revalidate_it) {
++ if (!result->d_op->d_revalidate_it(result, flags, it) &&
++ !d_invalidate(result)) {
++ dput(result);
++ if (counter > 10)
++ result = ERR_PTR(-ESTALE);
++ if (!IS_ERR(result))
++ goto again;
++ }
+ }
+ return result;
+ }
+@@ -332,7 +364,8 @@ static struct dentry * real_lookup(struc
+ * Without that kind of total limit, nasty chains of consecutive
+ * symlinks can cause almost arbitrarily long lookups.
+ */
+-static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd)
++static inline int do_follow_link(struct dentry *dentry, struct nameidata *nd,
++ struct lookup_intent *it)
+ {
+ int err;
+ if (current->link_count >= 5)
+@@ -346,10 +379,12 @@ static inline int do_follow_link(struct
+ current->link_count++;
+ current->total_link_count++;
+ UPDATE_ATIME(dentry->d_inode);
++ nd->intent = it;
+ err = dentry->d_inode->i_op->follow_link(dentry, nd);
+ current->link_count--;
+ return err;
+ loop:
++ intent_release(it);
+ path_release(nd);
+ return -ELOOP;
+ }
+@@ -447,7 +482,8 @@ static inline void follow_dotdot(struct
+ *
+ * We expect 'base' to be positive and a directory.
+ */
+-int link_path_walk(const char * name, struct nameidata *nd)
++int link_path_walk_it(const char *name, struct nameidata *nd,
++ struct lookup_intent *it)
+ {
+ struct dentry *dentry;
+ struct inode *inode;
+@@ -520,9 +556,9 @@ int link_path_walk(const char * name, st
+ break;
+ }
+ /* This does the actual lookups.. */
+- dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE);
++ dentry = cached_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL);
+ if (!dentry) {
+- dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE);
++ dentry = real_lookup(nd->dentry, &this, LOOKUP_CONTINUE, NULL);
+ err = PTR_ERR(dentry);
+ if (IS_ERR(dentry))
+ break;
+@@ -540,7 +576,7 @@ int link_path_walk(const char * name, st
+ goto out_dput;
+
+ if (inode->i_op->follow_link) {
+- err = do_follow_link(dentry, nd);
++ err = do_follow_link(dentry, nd, NULL);
+ dput(dentry);
+ if (err)
+ goto return_err;
+@@ -556,7 +592,7 @@ int link_path_walk(const char * name, st
+ nd->dentry = dentry;
+ }
+ err = -ENOTDIR;
+- if (!inode->i_op->lookup)
++ if (!inode->i_op->lookup && !inode->i_op->lookup_it)
+ break;
+ continue;
+ /* here ends the main loop */
+@@ -583,9 +619,9 @@ last_component:
+ if (err < 0)
+ break;
+ }
+- dentry = cached_lookup(nd->dentry, &this, 0);
++ dentry = cached_lookup(nd->dentry, &this, 0, it);
+ if (!dentry) {
+- dentry = real_lookup(nd->dentry, &this, 0);
++ dentry = real_lookup(nd->dentry, &this, 0, it);
+ err = PTR_ERR(dentry);
+ if (IS_ERR(dentry))
+ break;
+@@ -595,7 +631,7 @@ last_component:
+ inode = dentry->d_inode;
+ if ((lookup_flags & LOOKUP_FOLLOW)
+ && inode && inode->i_op && inode->i_op->follow_link) {
+- err = do_follow_link(dentry, nd);
++ err = do_follow_link(dentry, nd, it);
+ dput(dentry);
+ if (err)
+ goto return_err;
+@@ -609,7 +645,8 @@ last_component:
+ goto no_inode;
+ if (lookup_flags & LOOKUP_DIRECTORY) {
+ err = -ENOTDIR;
+- if (!inode->i_op || !inode->i_op->lookup)
++ if (!inode->i_op ||
++ (!inode->i_op->lookup && !inode->i_op->lookup_it))
+ break;
+ }
+ goto return_base;
+@@ -635,6 +672,25 @@ return_reval:
+ * Check the cached dentry for staleness.
+ */
+ dentry = nd->dentry;
++ if (dentry && dentry->d_op && dentry->d_op->d_revalidate_it) {
++ err = -ESTALE;
++ if (!dentry->d_op->d_revalidate_it(dentry, 0, it)) {
++ struct dentry *new;
++ err = permission(dentry->d_parent->d_inode,
++ MAY_EXEC);
++ if (err)
++ break;
++ new = real_lookup(dentry->d_parent,
++ &dentry->d_name, 0, NULL);
++ d_invalidate(dentry);
++ dput(dentry);
++ if (IS_ERR(new)) {
++ err = PTR_ERR(new);
++ break;
++ }
++ nd->dentry = new;
++ }
++ } else
+ if (dentry && dentry->d_sb
+ && (dentry->d_sb->s_type->fs_flags & FS_ALWAYS_REVAL)) {
+ err = -ESTALE;
+@@ -649,15 +705,28 @@ out_dput:
+ dput(dentry);
+ break;
+ }
++ if (err)
++ intent_release(it);
+ path_release(nd);
+ return_err:
+ return err;
+ }
+
++int link_path_walk(const char * name, struct nameidata *nd)
++{
++ return link_path_walk_it(name, nd, NULL);
++}
++
++int path_walk_it(const char * name, struct nameidata *nd, struct lookup_intent *it)
++{
++ current->total_link_count = 0;
++ return link_path_walk_it(name, nd, it);
++}
++
+ int path_walk(const char * name, struct nameidata *nd)
+ {
+ current->total_link_count = 0;
+- return link_path_walk(name, nd);
++ return link_path_walk_it(name, nd, NULL);
+ }
+
+ /* SMP-safe */
+@@ -742,6 +811,17 @@ walk_init_root(const char *name, struct
+ }
+
+ /* SMP-safe */
++int path_lookup_it(const char *path, unsigned flags, struct nameidata *nd,
++ struct lookup_intent *it)
++{
++ int error = 0;
++ if (path_init(path, flags, nd))
++ error = path_walk_it(path, nd, it);
++ return error;
++}
++
++
++/* SMP-safe */
+ int path_lookup(const char *path, unsigned flags, struct nameidata *nd)
+ {
+ int error = 0;
+@@ -756,6 +836,7 @@ int path_init(const char *name, unsigned
+ {
+ nd->last_type = LAST_ROOT; /* if there are only slashes... */
+ nd->flags = flags;
++ nd->intent = NULL;
+ if (*name=='/')
+ return walk_init_root(name,nd);
+ read_lock(&current->fs->lock);
+@@ -770,7 +851,8 @@ int path_init(const char *name, unsigned
+ * needs parent already locked. Doesn't follow mounts.
+ * SMP-safe.
+ */
+-struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
++struct dentry * lookup_hash_it(struct qstr *name, struct dentry * base,
++ struct lookup_intent *it)
+ {
+ struct dentry * dentry;
+ struct inode *inode;
+@@ -793,13 +875,16 @@ struct dentry * lookup_hash(struct qstr
+ goto out;
+ }
+
+- dentry = cached_lookup(base, name, 0);
++ dentry = cached_lookup(base, name, 0, it);
+ if (!dentry) {
+ struct dentry *new = d_alloc(base, name);
+ dentry = ERR_PTR(-ENOMEM);
+ if (!new)
+ goto out;
+ lock_kernel();
++ if (inode->i_op->lookup_it)
++ dentry = inode->i_op->lookup_it(inode, new, it, 0);
++ else
+ dentry = inode->i_op->lookup(inode, new);
+ unlock_kernel();
+ if (!dentry)
+@@ -811,6 +896,12 @@ out:
+ return dentry;
+ }
+
++struct dentry * lookup_hash(struct qstr *name, struct dentry * base)
++{
++ return lookup_hash_it(name, base, NULL);
++}
++
++
+ /* SMP-safe */
+ struct dentry * lookup_one_len(const char * name, struct dentry * base, int len)
+ {
+@@ -832,7 +923,7 @@ struct dentry * lookup_one_len(const cha
+ }
+ this.hash = end_name_hash(hash);
+
+- return lookup_hash(&this, base);
++ return lookup_hash_it(&this, base, NULL);
+ access:
+ return ERR_PTR(-EACCES);
+ }
+@@ -863,6 +954,23 @@ int __user_walk(const char *name, unsign
+ return err;
+ }
+
++int __user_walk_it(const char *name, unsigned flags, struct nameidata *nd,
++ struct lookup_intent *it)
++{
++ char *tmp;
++ int err;
++
++ tmp = getname(name);
++ err = PTR_ERR(tmp);
++ if (!IS_ERR(tmp)) {
++ err = 0;
++ if (path_init(tmp, flags, nd))
++ err = path_walk_it(tmp, nd, it);
++ putname(tmp);
++ }
++ return err;
++}
++
+ /*
+ * It's inline, so penalty for filesystems that don't use sticky bit is
+ * minimal.
+@@ -958,7 +1066,8 @@ static inline int lookup_flags(unsigned
+ return retval;
+ }
+
+-int vfs_create(struct inode *dir, struct dentry *dentry, int mode)
++static int vfs_create_it(struct inode *dir, struct dentry *dentry, int mode,
++ struct lookup_intent *it)
+ {
+ int error;
+
+@@ -971,12 +1080,15 @@ int vfs_create(struct inode *dir, struct
+ goto exit_lock;
+
+ error = -EACCES; /* shouldn't it be ENOSYS? */
+- if (!dir->i_op || !dir->i_op->create)
++ if (!dir->i_op || (!dir->i_op->create && !dir->i_op->create_it))
+ goto exit_lock;
+
+ DQUOT_INIT(dir);
+ lock_kernel();
+- error = dir->i_op->create(dir, dentry, mode);
++ if (dir->i_op->create_it)
++ error = dir->i_op->create_it(dir, dentry, mode, it);
++ else
++ error = dir->i_op->create(dir, dentry, mode);
+ unlock_kernel();
+ exit_lock:
+ up(&dir->i_zombie);
+@@ -985,6 +1097,11 @@ exit_lock:
+ return error;
+ }
+
++int vfs_create(struct inode *dir, struct dentry *dentry, int mode)
++{
++ return vfs_create_it(dir, dentry, mode, NULL);
++}
++
+ /*
+ * open_namei()
+ *
+@@ -999,7 +1116,8 @@ exit_lock:
+ * for symlinks (where the permissions are checked later).
+ * SMP-safe
+ */
+-int open_namei(const char * pathname, int flag, int mode, struct nameidata *nd)
++int open_namei_it(const char *pathname, int flag, int mode,
++ struct nameidata *nd, struct lookup_intent *it)
+ {
+ int acc_mode, error = 0;
+ struct inode *inode;
+@@ -1009,11 +1127,14 @@ int open_namei(const char * pathname, in
+
+ acc_mode = ACC_MODE(flag);
+
++ if (it)
++ it->it_flags = flag;
++
+ /*
+ * The simplest case - just a plain lookup.
+ */
+ if (!(flag & O_CREAT)) {
+- error = path_lookup(pathname, lookup_flags(flag), nd);
++ error = path_lookup_it(pathname, lookup_flags(flag), nd, it);
+ if (error)
+ return error;
+ dentry = nd->dentry;
+@@ -1023,6 +1144,10 @@ int open_namei(const char * pathname, in
+ /*
+ * Create - we need to know the parent.
+ */
++ if (it) {
++ it->it_create_mode = mode;
++ it->it_op |= IT_CREAT;
++ }
+ error = path_lookup(pathname, LOOKUP_PARENT, nd);
+ if (error)
+ return error;
+@@ -1038,7 +1163,7 @@ int open_namei(const char * pathname, in
+
+ dir = nd->dentry;
+ down(&dir->d_inode->i_sem);
+- dentry = lookup_hash(&nd->last, nd->dentry);
++ dentry = lookup_hash_it(&nd->last, nd->dentry, it);
+
+ do_last:
+ error = PTR_ERR(dentry);
+@@ -1047,10 +1172,11 @@ do_last:
+ goto exit;
+ }
+
++ if (it) it->it_create_mode = mode;
+ /* Negative dentry, just create the file */
+ if (!dentry->d_inode) {
+- error = vfs_create(dir->d_inode, dentry,
+- mode & ~current->fs->umask);
++ error = vfs_create_it(dir->d_inode, dentry,
++ mode & ~current->fs->umask, it);
+ up(&dir->d_inode->i_sem);
+ dput(nd->dentry);
+ nd->dentry = dentry;
+@@ -1154,7 +1280,7 @@ ok:
+ if (!error) {
+ DQUOT_INIT(inode);
+
+- error = do_truncate(dentry, 0);
++ error = do_truncate(dentry, 0, 1);
+ }
+ put_write_access(inode);
+ if (error)
+@@ -1166,8 +1292,10 @@ ok:
+ return 0;
+
+ exit_dput:
++ intent_release(it);
+ dput(dentry);
+ exit:
++ intent_release(it);
+ path_release(nd);
+ return error;
+
+@@ -1186,7 +1314,10 @@ do_link:
+ * are done. Procfs-like symlinks just set LAST_BIND.
+ */
+ UPDATE_ATIME(dentry->d_inode);
++ nd->intent = it;
+ error = dentry->d_inode->i_op->follow_link(dentry, nd);
++ if (error)
++ intent_release(it);
+ dput(dentry);
+ if (error)
+ return error;
+@@ -1208,13 +1339,20 @@ do_link:
+ }
+ dir = nd->dentry;
+ down(&dir->d_inode->i_sem);
+- dentry = lookup_hash(&nd->last, nd->dentry);
++ dentry = lookup_hash_it(&nd->last, nd->dentry, it);
+ putname(nd->last.name);
+ goto do_last;
+ }
+
++int open_namei(const char *pathname, int flag, int mode, struct nameidata *nd)
++{
++ return open_namei_it(pathname, flag, mode, nd, NULL);
++}
++
++
+ /* SMP-safe */
+-static struct dentry *lookup_create(struct nameidata *nd, int is_dir)
++static struct dentry *lookup_create(struct nameidata *nd, int is_dir,
++ struct lookup_intent *it)
+ {
+ struct dentry *dentry;
+
+@@ -1222,7 +1360,7 @@ static struct dentry *lookup_create(stru
+ dentry = ERR_PTR(-EEXIST);
+ if (nd->last_type != LAST_NORM)
+ goto fail;
+- dentry = lookup_hash(&nd->last, nd->dentry);
++ dentry = lookup_hash_it(&nd->last, nd->dentry, it);
+ if (IS_ERR(dentry))
+ goto fail;
+ if (!is_dir && nd->last.name[nd->last.len] && !dentry->d_inode)
+@@ -1278,7 +1416,16 @@ asmlinkage long sys_mknod(const char * f
+ error = path_lookup(tmp, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out;
+- dentry = lookup_create(&nd, 0);
++
++ if (nd.dentry->d_inode->i_op->mknod_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ error = op->mknod_raw(&nd, mode, dev);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto out2;
++ }
++
++ dentry = lookup_create(&nd, 0, NULL);
+ error = PTR_ERR(dentry);
+
+ mode &= ~current->fs->umask;
+@@ -1299,6 +1446,7 @@ asmlinkage long sys_mknod(const char * f
+ dput(dentry);
+ }
+ up(&nd.dentry->d_inode->i_sem);
++out2:
+ path_release(&nd);
+ out:
+ putname(tmp);
+@@ -1346,7 +1494,14 @@ asmlinkage long sys_mkdir(const char * p
+ error = path_lookup(tmp, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out;
+- dentry = lookup_create(&nd, 1);
++ if (nd.dentry->d_inode->i_op->mkdir_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ error = op->mkdir_raw(&nd, mode);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto out2;
++ }
++ dentry = lookup_create(&nd, 1, NULL);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+ error = vfs_mkdir(nd.dentry->d_inode, dentry,
+@@ -1354,6 +1509,7 @@ asmlinkage long sys_mkdir(const char * p
+ dput(dentry);
+ }
+ up(&nd.dentry->d_inode->i_sem);
++out2:
+ path_release(&nd);
+ out:
+ putname(tmp);
+@@ -1454,8 +1610,16 @@ asmlinkage long sys_rmdir(const char * p
+ error = -EBUSY;
+ goto exit1;
+ }
++ if (nd.dentry->d_inode->i_op->rmdir_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++ error = op->rmdir_raw(&nd);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto exit1;
++ }
+ down(&nd.dentry->d_inode->i_sem);
+- dentry = lookup_hash(&nd.last, nd.dentry);
++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+ error = vfs_rmdir(nd.dentry->d_inode, dentry);
+@@ -1513,8 +1677,15 @@ asmlinkage long sys_unlink(const char *
+ error = -EISDIR;
+ if (nd.last_type != LAST_NORM)
+ goto exit1;
++ if (nd.dentry->d_inode->i_op->unlink_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ error = op->unlink_raw(&nd);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto exit1;
++ }
+ down(&nd.dentry->d_inode->i_sem);
+- dentry = lookup_hash(&nd.last, nd.dentry);
++ dentry = lookup_hash_it(&nd.last, nd.dentry, NULL);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+ /* Why not before? Because we want correct error value */
+@@ -1581,15 +1752,23 @@ asmlinkage long sys_symlink(const char *
+ error = path_lookup(to, LOOKUP_PARENT, &nd);
+ if (error)
+ goto out;
+- dentry = lookup_create(&nd, 0);
++ if (nd.dentry->d_inode->i_op->symlink_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ error = op->symlink_raw(&nd, from);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto out2;
++ }
++ dentry = lookup_create(&nd, 0, NULL);
+ error = PTR_ERR(dentry);
+ if (!IS_ERR(dentry)) {
+ error = vfs_symlink(nd.dentry->d_inode, dentry, from);
+ dput(dentry);
+ }
+ up(&nd.dentry->d_inode->i_sem);
++ out2:
+ path_release(&nd);
+-out:
++ out:
+ putname(to);
+ }
+ putname(from);
+@@ -1665,7 +1844,14 @@ asmlinkage long sys_link(const char * ol
+ error = -EXDEV;
+ if (old_nd.mnt != nd.mnt)
+ goto out_release;
+- new_dentry = lookup_create(&nd, 0);
++ if (nd.dentry->d_inode->i_op->link_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++ error = op->link_raw(&old_nd, &nd);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto out_release;
++ }
++ new_dentry = lookup_create(&nd, 0, NULL);
+ error = PTR_ERR(new_dentry);
+ if (!IS_ERR(new_dentry)) {
+ error = vfs_link(old_nd.dentry, nd.dentry->d_inode, new_dentry);
+@@ -1709,7 +1895,7 @@ exit:
+ * locking].
+ */
+ int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
+- struct inode *new_dir, struct dentry *new_dentry)
++ struct inode *new_dir, struct dentry *new_dentry)
+ {
+ int error;
+ struct inode *target;
+@@ -1788,7 +1974,7 @@ out_unlock:
+ }
+
+ int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
+- struct inode *new_dir, struct dentry *new_dentry)
++ struct inode *new_dir, struct dentry *new_dentry)
+ {
+ int error;
+
+@@ -1876,9 +2062,18 @@ static inline int do_rename(const char *
+ if (newnd.last_type != LAST_NORM)
+ goto exit2;
+
++ if (old_dir->d_inode->i_op->rename_raw) {
++ lock_kernel();
++ error = old_dir->d_inode->i_op->rename_raw(&oldnd, &newnd);
++ unlock_kernel();
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto exit2;
++ }
++
+ double_lock(new_dir, old_dir);
+
+- old_dentry = lookup_hash(&oldnd.last, old_dir);
++ old_dentry = lookup_hash_it(&oldnd.last, old_dir, NULL);
+ error = PTR_ERR(old_dentry);
+ if (IS_ERR(old_dentry))
+ goto exit3;
+@@ -1894,16 +2089,16 @@ static inline int do_rename(const char *
+ if (newnd.last.name[newnd.last.len])
+ goto exit4;
+ }
+- new_dentry = lookup_hash(&newnd.last, new_dir);
++ new_dentry = lookup_hash_it(&newnd.last, new_dir, NULL);
+ error = PTR_ERR(new_dentry);
+ if (IS_ERR(new_dentry))
+ goto exit4;
+
++
+ lock_kernel();
+ error = vfs_rename(old_dir->d_inode, old_dentry,
+ new_dir->d_inode, new_dentry);
+ unlock_kernel();
+-
+ dput(new_dentry);
+ exit4:
+ dput(old_dentry);
+@@ -1954,20 +2149,26 @@ out:
+ }
+
+ static inline int
+-__vfs_follow_link(struct nameidata *nd, const char *link)
++__vfs_follow_link(struct nameidata *nd, const char *link,
++ struct lookup_intent *it)
+ {
+ int res = 0;
+ char *name;
+ if (IS_ERR(link))
+ goto fail;
+
++ if (it == NULL)
++ it = nd->intent;
++ else if (it != nd->intent)
++ printk("it != nd->intent: tell phil@clusterfs.com\n");
++
+ if (*link == '/') {
+ path_release(nd);
+ if (!walk_init_root(link, nd))
+ /* weird __emul_prefix() stuff did it */
+ goto out;
+ }
+- res = link_path_walk(link, nd);
++ res = link_path_walk_it(link, nd, it);
+ out:
+ if (current->link_count || res || nd->last_type!=LAST_NORM)
+ return res;
+@@ -1991,7 +2192,13 @@ fail:
+
+ int vfs_follow_link(struct nameidata *nd, const char *link)
+ {
+- return __vfs_follow_link(nd, link);
++ return __vfs_follow_link(nd, link, NULL);
++}
++
++int vfs_follow_link_it(struct nameidata *nd, const char *link,
++ struct lookup_intent *it)
++{
++ return __vfs_follow_link(nd, link, it);
+ }
+
+ /* get the link contents into pagecache */
+@@ -2033,7 +2240,7 @@ int page_follow_link(struct dentry *dent
+ {
+ struct page *page = NULL;
+ char *s = page_getlink(dentry, &page);
+- int res = __vfs_follow_link(nd, s);
++ int res = __vfs_follow_link(nd, s, NULL);
+ if (page) {
+ kunmap(page);
+ page_cache_release(page);
+--- linux-2.4.22-ac1/fs/namespace.c~vfs_intent-2.4.22-rh 2003-09-25 14:16:28.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/namespace.c 2003-09-25 14:42:46.000000000 +0400
+@@ -98,6 +98,7 @@ static void detach_mnt(struct vfsmount *
+ {
+ old_nd->dentry = mnt->mnt_mountpoint;
+ old_nd->mnt = mnt->mnt_parent;
++ UNPIN(old_nd->dentry, old_nd->mnt, 1);
+ mnt->mnt_parent = mnt;
+ mnt->mnt_mountpoint = mnt->mnt_root;
+ list_del_init(&mnt->mnt_child);
+@@ -109,6 +110,7 @@ static void attach_mnt(struct vfsmount *
+ {
+ mnt->mnt_parent = mntget(nd->mnt);
+ mnt->mnt_mountpoint = dget(nd->dentry);
++ PIN(nd->dentry, nd->mnt, 1);
+ list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry));
+ list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts);
+ nd->dentry->d_mounted++;
+@@ -488,14 +490,17 @@ static int do_loopback(struct nameidata
+ {
+ struct nameidata old_nd;
+ struct vfsmount *mnt = NULL;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+ int err = mount_is_safe(nd);
+ if (err)
+ return err;
+ if (!old_name || !*old_name)
+ return -EINVAL;
+- err = path_lookup(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd);
+- if (err)
++ err = path_lookup_it(old_name, LOOKUP_POSITIVE|LOOKUP_FOLLOW, &old_nd, &it);
++ if (err) {
++ intent_release(&it);
+ return err;
++ }
+
+ down_write(&current->namespace->sem);
+ err = -EINVAL;
+@@ -518,6 +523,7 @@ static int do_loopback(struct nameidata
+ }
+
+ up_write(&current->namespace->sem);
++ intent_release(&it);
+ path_release(&old_nd);
+ return err;
+ }
+@@ -701,6 +707,7 @@ long do_mount(char * dev_name, char * di
+ unsigned long flags, void *data_page)
+ {
+ struct nameidata nd;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+ int retval = 0;
+ int mnt_flags = 0;
+
+@@ -725,10 +732,11 @@ long do_mount(char * dev_name, char * di
+ flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV);
+
+ /* ... and get the mountpoint */
+- retval = path_lookup(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd);
+- if (retval)
++ retval = path_lookup_it(dir_name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd, &it);
++ if (retval) {
++ intent_release(&it);
+ return retval;
+-
++ }
+ if (flags & MS_REMOUNT)
+ retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
+ data_page);
+@@ -739,6 +747,8 @@ long do_mount(char * dev_name, char * di
+ else
+ retval = do_add_mount(&nd, type_page, flags, mnt_flags,
+ dev_name, data_page);
++
++ intent_release(&it);
+ path_release(&nd);
+ return retval;
+ }
+@@ -904,6 +914,8 @@ asmlinkage long sys_pivot_root(const cha
+ {
+ struct vfsmount *tmp;
+ struct nameidata new_nd, old_nd, parent_nd, root_parent, user_nd;
++ struct lookup_intent new_it = { .it_op = IT_GETATTR };
++ struct lookup_intent old_it = { .it_op = IT_GETATTR };
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+@@ -911,14 +923,14 @@ asmlinkage long sys_pivot_root(const cha
+
+ lock_kernel();
+
+- error = __user_walk(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd);
++ error = __user_walk_it(new_root, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &new_nd, &new_it);
+ if (error)
+ goto out0;
+ error = -EINVAL;
+ if (!check_mnt(new_nd.mnt))
+ goto out1;
+
+- error = __user_walk(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd);
++ error = __user_walk_it(put_old, LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &old_nd, &old_it);
+ if (error)
+ goto out1;
+
+@@ -973,8 +985,10 @@ out2:
+ up(&old_nd.dentry->d_inode->i_zombie);
+ up_write(&current->namespace->sem);
+ path_release(&user_nd);
++ intent_release(&old_it);
+ path_release(&old_nd);
+ out1:
++ intent_release(&new_it);
+ path_release(&new_nd);
+ out0:
+ unlock_kernel();
+--- linux-2.4.22-ac1/fs/open.c~vfs_intent-2.4.22-rh 2003-08-25 15:44:43.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/open.c 2003-09-25 14:42:46.000000000 +0400
+@@ -19,6 +19,8 @@
+ #include <asm/uaccess.h>
+
+ #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
++extern int path_walk_it(const char *name, struct nameidata *nd,
++ struct lookup_intent *it);
+
+ int vfs_statfs(struct super_block *sb, struct statfs *buf)
+ {
+@@ -95,9 +97,10 @@ void fd_install(unsigned int fd, struct
+ write_unlock(&files->file_lock);
+ }
+
+-int do_truncate(struct dentry *dentry, loff_t length)
++int do_truncate(struct dentry *dentry, loff_t length, int called_from_open)
+ {
+ struct inode *inode = dentry->d_inode;
++ struct inode_operations *op = dentry->d_inode->i_op;
+ int error;
+ struct iattr newattrs;
+
+@@ -109,7 +112,13 @@ int do_truncate(struct dentry *dentry, l
+ down(&inode->i_sem);
+ newattrs.ia_size = length;
+ newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
+- error = notify_change(dentry, &newattrs);
++ if (called_from_open)
++ newattrs.ia_valid |= ATTR_FROM_OPEN;
++ if (op->setattr_raw) {
++ newattrs.ia_valid |= ATTR_RAW;
++ error = op->setattr_raw(inode, &newattrs);
++ } else
++ error = notify_change(dentry, &newattrs);
+ up(&inode->i_sem);
+ up_write(&inode->i_alloc_sem);
+ return error;
+@@ -120,12 +129,13 @@ static inline long do_sys_truncate(const
+ struct nameidata nd;
+ struct inode * inode;
+ int error;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+
+ error = -EINVAL;
+ if (length < 0) /* sorry, but loff_t says... */
+ goto out;
+
+- error = user_path_walk(path, &nd);
++ error = user_path_walk_it(path, &nd, &it);
+ if (error)
+ goto out;
+ inode = nd.dentry->d_inode;
+@@ -165,11 +175,13 @@ static inline long do_sys_truncate(const
+ error = locks_verify_truncate(inode, NULL, length);
+ if (!error) {
+ DQUOT_INIT(inode);
+- error = do_truncate(nd.dentry, length);
++ intent_release(&it);
++ error = do_truncate(nd.dentry, length, 0);
+ }
+ put_write_access(inode);
+
+ dput_and_out:
++ intent_release(&it);
+ path_release(&nd);
+ out:
+ return error;
+@@ -217,7 +229,7 @@ static inline long do_sys_ftruncate(unsi
+
+ error = locks_verify_truncate(inode, file, length);
+ if (!error)
+- error = do_truncate(dentry, length);
++ error = do_truncate(dentry, length, 0);
+ out_putf:
+ fput(file);
+ out:
+@@ -262,11 +274,13 @@ asmlinkage long sys_utime(char * filenam
+ struct inode * inode;
+ struct iattr newattrs;
+
+- error = user_path_walk(filename, &nd);
++ error = user_path_walk_it(filename, &nd, NULL);
+ if (error)
+ goto out;
+ inode = nd.dentry->d_inode;
+
++ /* this is safe without a Lustre lock because it only depends
++ on the super block */
+ error = -EROFS;
+ if (IS_RDONLY(inode))
+ goto dput_and_out;
+@@ -281,11 +295,25 @@ asmlinkage long sys_utime(char * filenam
+ goto dput_and_out;
+
+ newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
+- } else {
++ }
++
++ if (inode->i_op->setattr_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++ newattrs.ia_valid |= ATTR_RAW;
++ error = op->setattr_raw(inode, &newattrs);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto dput_and_out;
++ }
++
++ error = -EPERM;
++ if (!times) {
+ if (current->fsuid != inode->i_uid &&
+ (error = permission(inode,MAY_WRITE)) != 0)
+ goto dput_and_out;
+ }
++
+ error = notify_change(nd.dentry, &newattrs);
+ dput_and_out:
+ path_release(&nd);
+@@ -306,12 +334,14 @@ asmlinkage long sys_utimes(char * filena
+ struct inode * inode;
+ struct iattr newattrs;
+
+- error = user_path_walk(filename, &nd);
++ error = user_path_walk_it(filename, &nd, NULL);
+
+ if (error)
+ goto out;
+ inode = nd.dentry->d_inode;
+
++ /* this is safe without a Lustre lock because it only depends
++ on the super block */
+ error = -EROFS;
+ if (IS_RDONLY(inode))
+ goto dput_and_out;
+@@ -326,7 +356,20 @@ asmlinkage long sys_utimes(char * filena
+ newattrs.ia_atime = times[0].tv_sec;
+ newattrs.ia_mtime = times[1].tv_sec;
+ newattrs.ia_valid |= ATTR_ATIME_SET | ATTR_MTIME_SET;
+- } else {
++ }
++
++ if (inode->i_op->setattr_raw) {
++ struct inode_operations *op = nd.dentry->d_inode->i_op;
++
++ newattrs.ia_valid |= ATTR_RAW;
++ error = op->setattr_raw(inode, &newattrs);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ goto dput_and_out;
++ }
++
++ error = -EPERM;
++ if (!utimes) {
+ if (current->fsuid != inode->i_uid &&
+ (error = permission(inode,MAY_WRITE)) != 0)
+ goto dput_and_out;
+@@ -349,6 +392,7 @@ asmlinkage long sys_access(const char *
+ int old_fsuid, old_fsgid;
+ kernel_cap_t old_cap;
+ int res;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+
+ if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
+ return -EINVAL;
+@@ -366,13 +410,14 @@ asmlinkage long sys_access(const char *
+ else
+ current->cap_effective = current->cap_permitted;
+
+- res = user_path_walk(filename, &nd);
++ res = user_path_walk_it(filename, &nd, &it);
+ if (!res) {
+ res = permission(nd.dentry->d_inode, mode);
+ /* SuS v2 requires we report a read only fs too */
+ if(!res && (mode & S_IWOTH) && IS_RDONLY(nd.dentry->d_inode)
+ && !special_file(nd.dentry->d_inode->i_mode))
+ res = -EROFS;
++ intent_release(&it);
+ path_release(&nd);
+ }
+
+@@ -387,8 +432,9 @@ asmlinkage long sys_chdir(const char * f
+ {
+ int error;
+ struct nameidata nd;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+
+- error = __user_walk(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd);
++ error = __user_walk_it(filename,LOOKUP_POSITIVE|LOOKUP_FOLLOW|LOOKUP_DIRECTORY,&nd, &it);
+ if (error)
+ goto out;
+
+@@ -399,6 +445,7 @@ asmlinkage long sys_chdir(const char * f
+ set_fs_pwd(current->fs, nd.mnt, nd.dentry);
+
+ dput_and_out:
++ intent_release(&it);
+ path_release(&nd);
+ out:
+ return error;
+@@ -438,9 +485,10 @@ asmlinkage long sys_chroot(const char *
+ {
+ int error;
+ struct nameidata nd;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+
+- error = __user_walk(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW |
+- LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd);
++ error = __user_walk_it(filename, LOOKUP_POSITIVE | LOOKUP_FOLLOW |
++ LOOKUP_DIRECTORY | LOOKUP_NOALT, &nd, &it);
+ if (error)
+ goto out;
+
+@@ -456,39 +504,56 @@ asmlinkage long sys_chroot(const char *
+ set_fs_altroot();
+ error = 0;
+ dput_and_out:
++ intent_release(&it);
+ path_release(&nd);
+ out:
+ return error;
+ }
+
+-asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
++int chmod_common(struct dentry *dentry, mode_t mode)
+ {
+- struct inode * inode;
+- struct dentry * dentry;
+- struct file * file;
+- int err = -EBADF;
++ struct inode *inode = dentry->d_inode;
+ struct iattr newattrs;
++ int err = -EROFS;
+
+- file = fget(fd);
+- if (!file)
++ if (IS_RDONLY(inode))
+ goto out;
+
+- dentry = file->f_dentry;
+- inode = dentry->d_inode;
++ if (inode->i_op->setattr_raw) {
++ newattrs.ia_mode = mode;
++ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
++ newattrs.ia_valid |= ATTR_RAW;
++ err = inode->i_op->setattr_raw(inode, &newattrs);
++ /* the file system wants to use normal vfs path now */
++ if (err != -EOPNOTSUPP)
++ goto out;
++ }
+
+- err = -EROFS;
+- if (IS_RDONLY(inode))
+- goto out_putf;
+ err = -EPERM;
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+- goto out_putf;
++ goto out;
++
+ if (mode == (mode_t) -1)
+ mode = inode->i_mode;
+ newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
+ newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+ err = notify_change(dentry, &newattrs);
+
+-out_putf:
++out:
++ return err;
++}
++
++asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
++{
++ struct file * file;
++ int err = -EBADF;
++
++ file = fget(fd);
++ if (!file)
++ goto out;
++
++ err = chmod_common(file->f_dentry, mode);
++
+ fput(file);
+ out:
+ return err;
+@@ -497,30 +562,14 @@ out:
+ asmlinkage long sys_chmod(const char * filename, mode_t mode)
+ {
+ struct nameidata nd;
+- struct inode * inode;
+ int error;
+- struct iattr newattrs;
+
+ error = user_path_walk(filename, &nd);
+ if (error)
+ goto out;
+- inode = nd.dentry->d_inode;
+-
+- error = -EROFS;
+- if (IS_RDONLY(inode))
+- goto dput_and_out;
+
+- error = -EPERM;
+- if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+- goto dput_and_out;
++ error = chmod_common(nd.dentry, mode);
+
+- if (mode == (mode_t) -1)
+- mode = inode->i_mode;
+- newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
+- newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
+- error = notify_change(nd.dentry, &newattrs);
+-
+-dput_and_out:
+ path_release(&nd);
+ out:
+ return error;
+@@ -540,6 +589,20 @@ static int chown_common(struct dentry *
+ error = -EROFS;
+ if (IS_RDONLY(inode))
+ goto out;
++
++ if (inode->i_op->setattr_raw) {
++ struct inode_operations *op = dentry->d_inode->i_op;
++
++ newattrs.ia_uid = user;
++ newattrs.ia_gid = group;
++ newattrs.ia_valid = ATTR_UID | ATTR_GID | ATTR_CTIME;
++ newattrs.ia_valid |= ATTR_RAW;
++ error = op->setattr_raw(inode, &newattrs);
++ /* the file system wants to use normal vfs path now */
++ if (error != -EOPNOTSUPP)
++ return error;
++ }
++
+ error = -EPERM;
+ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+ goto out;
+@@ -644,6 +707,7 @@ struct file *filp_open(const char * file
+ {
+ int namei_flags, error;
+ struct nameidata nd;
++ struct lookup_intent it = { .it_op = IT_OPEN };
+
+ namei_flags = flags;
+ if ((namei_flags+1) & O_ACCMODE)
+@@ -651,14 +715,15 @@ struct file *filp_open(const char * file
+ if (namei_flags & O_TRUNC)
+ namei_flags |= 2;
+
+- error = open_namei(filename, namei_flags, mode, &nd);
+- if (!error)
+- return dentry_open(nd.dentry, nd.mnt, flags);
++ error = open_namei_it(filename, namei_flags, mode, &nd, &it);
++ if (error)
++ return ERR_PTR(error);
+
+- return ERR_PTR(error);
++ return dentry_open_it(nd.dentry, nd.mnt, flags, &it);
+ }
+
+-struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
++struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
++ int flags, struct lookup_intent *it)
+ {
+ struct file * f;
+ struct inode *inode;
+@@ -695,12 +760,15 @@ struct file *dentry_open(struct dentry *
+ }
+
+ if (f->f_op && f->f_op->open) {
++ f->f_it = it;
+ error = f->f_op->open(inode,f);
++ f->f_it = NULL;
+ if (error)
+ goto cleanup_all;
+ }
+ f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
+
++ intent_release(it);
+ return f;
+
+ cleanup_all:
+@@ -715,11 +783,17 @@ cleanup_all:
+ cleanup_file:
+ put_filp(f);
+ cleanup_dentry:
++ intent_release(it);
+ dput(dentry);
+ mntput(mnt);
+ return ERR_PTR(error);
+ }
+
++struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
++{
++ return dentry_open_it(dentry, mnt, flags, NULL);
++}
++
+ /*
+ * Find an empty file descriptor entry, and mark it busy.
+ */
+--- linux-2.4.22-ac1/fs/stat.c~vfs_intent-2.4.22-rh 2003-09-25 14:16:27.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/stat.c 2003-09-25 14:42:46.000000000 +0400
+@@ -17,10 +17,14 @@
+ * Revalidate the inode. This is required for proper NFS attribute caching.
+ */
+ static __inline__ int
+-do_revalidate(struct dentry *dentry)
++do_revalidate(struct dentry *dentry, struct lookup_intent *it)
+ {
+ struct inode * inode = dentry->d_inode;
+- if (inode->i_op && inode->i_op->revalidate)
++ if (!inode)
++ return -ENOENT;
++ if (inode->i_op && inode->i_op->revalidate_it)
++ return inode->i_op->revalidate_it(dentry, it);
++ else if (inode->i_op && inode->i_op->revalidate)
+ return inode->i_op->revalidate(dentry);
+ return 0;
+ }
+@@ -143,13 +147,15 @@ static int cp_new_stat(struct inode * in
+ asmlinkage long sys_stat(char * filename, struct __old_kernel_stat * statbuf)
+ {
+ struct nameidata nd;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+ int error;
+
+- error = user_path_walk(filename, &nd);
++ error = user_path_walk_it(filename, &nd, &it);
+ if (!error) {
+- error = do_revalidate(nd.dentry);
++ error = do_revalidate(nd.dentry, &it);
+ if (!error)
+ error = cp_old_stat(nd.dentry->d_inode, statbuf);
++ intent_release(&it);
+ path_release(&nd);
+ }
+ return error;
+@@ -159,13 +165,15 @@ asmlinkage long sys_stat(char * filename
+ asmlinkage long sys_newstat(char * filename, struct stat * statbuf)
+ {
+ struct nameidata nd;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+ int error;
+
+- error = user_path_walk(filename, &nd);
++ error = user_path_walk_it(filename, &nd, &it);
+ if (!error) {
+- error = do_revalidate(nd.dentry);
++ error = do_revalidate(nd.dentry, &it);
+ if (!error)
+ error = cp_new_stat(nd.dentry->d_inode, statbuf);
++ intent_release(&it);
+ path_release(&nd);
+ }
+ return error;
+@@ -180,13 +188,15 @@ asmlinkage long sys_newstat(char * filen
+ asmlinkage long sys_lstat(char * filename, struct __old_kernel_stat * statbuf)
+ {
+ struct nameidata nd;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+ int error;
+
+- error = user_path_walk_link(filename, &nd);
++ error = user_path_walk_link_it(filename, &nd, &it);
+ if (!error) {
+- error = do_revalidate(nd.dentry);
++ error = do_revalidate(nd.dentry, &it);
+ if (!error)
+ error = cp_old_stat(nd.dentry->d_inode, statbuf);
++ intent_release(&it);
+ path_release(&nd);
+ }
+ return error;
+@@ -197,13 +207,15 @@ asmlinkage long sys_lstat(char * filenam
+ asmlinkage long sys_newlstat(char * filename, struct stat * statbuf)
+ {
+ struct nameidata nd;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+ int error;
+
+- error = user_path_walk_link(filename, &nd);
++ error = user_path_walk_link_it(filename, &nd, &it);
+ if (!error) {
+- error = do_revalidate(nd.dentry);
++ error = do_revalidate(nd.dentry, &it);
+ if (!error)
+ error = cp_new_stat(nd.dentry->d_inode, statbuf);
++ intent_release(&it);
+ path_release(&nd);
+ }
+ return error;
+@@ -224,7 +236,7 @@ asmlinkage long sys_fstat(unsigned int f
+ if (f) {
+ struct dentry * dentry = f->f_dentry;
+
+- err = do_revalidate(dentry);
++ err = do_revalidate(dentry, NULL);
+ if (!err)
+ err = cp_old_stat(dentry->d_inode, statbuf);
+ fput(f);
+@@ -243,7 +255,7 @@ asmlinkage long sys_newfstat(unsigned in
+ if (f) {
+ struct dentry * dentry = f->f_dentry;
+
+- err = do_revalidate(dentry);
++ err = do_revalidate(dentry, NULL);
+ if (!err)
+ err = cp_new_stat(dentry->d_inode, statbuf);
+ fput(f);
+@@ -265,7 +277,7 @@ asmlinkage long sys_readlink(const char
+
+ error = -EINVAL;
+ if (inode->i_op && inode->i_op->readlink &&
+- !(error = do_revalidate(nd.dentry))) {
++ !(error = do_revalidate(nd.dentry, NULL))) {
+ UPDATE_ATIME(inode);
+ error = inode->i_op->readlink(nd.dentry, buf, bufsiz);
+ }
+@@ -341,12 +353,14 @@ asmlinkage long sys_stat64(char * filena
+ {
+ struct nameidata nd;
+ int error;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+
+- error = user_path_walk(filename, &nd);
++ error = user_path_walk_it(filename, &nd, &it);
+ if (!error) {
+- error = do_revalidate(nd.dentry);
++ error = do_revalidate(nd.dentry, &it);
+ if (!error)
+ error = cp_new_stat64(nd.dentry->d_inode, statbuf);
++ intent_release(&it);
+ path_release(&nd);
+ }
+ return error;
+@@ -356,12 +370,14 @@ asmlinkage long sys_lstat64(char * filen
+ {
+ struct nameidata nd;
+ int error;
++ struct lookup_intent it = { .it_op = IT_GETATTR };
+
+- error = user_path_walk_link(filename, &nd);
++ error = user_path_walk_link_it(filename, &nd, &it);
+ if (!error) {
+- error = do_revalidate(nd.dentry);
++ error = do_revalidate(nd.dentry, &it);
+ if (!error)
+ error = cp_new_stat64(nd.dentry->d_inode, statbuf);
++ intent_release(&it);
+ path_release(&nd);
+ }
+ return error;
+@@ -376,7 +392,7 @@ asmlinkage long sys_fstat64(unsigned lon
+ if (f) {
+ struct dentry * dentry = f->f_dentry;
+
+- err = do_revalidate(dentry);
++ err = do_revalidate(dentry, NULL);
+ if (!err)
+ err = cp_new_stat64(dentry->d_inode, statbuf);
+ fput(f);
+--- linux-2.4.22-ac1/include/linux/dcache.h~vfs_intent-2.4.22-rh 2003-09-25 14:16:28.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/linux/dcache.h 2003-09-25 14:42:46.000000000 +0400
+@@ -6,6 +6,51 @@
+ #include <asm/atomic.h>
+ #include <linux/mount.h>
+ #include <linux/kernel.h>
++#include <linux/string.h>
++
++#define IT_OPEN 0x0001
++#define IT_CREAT 0x0002
++#define IT_READDIR 0x0004
++#define IT_GETATTR 0x0008
++#define IT_LOOKUP 0x0010
++#define IT_UNLINK 0x0020
++#define IT_GETXATTR 0x0040
++#define IT_EXEC 0x0080
++#define IT_PIN 0x0100
++
++#define IT_FL_LOCKED 0x0001
++#define IT_FL_FOLLOWED 0x0002 /* set by vfs_follow_link */
++
++#define INTENT_MAGIC 0x19620323
++
++
++struct lustre_intent_data {
++ int it_disposition;
++ int it_status;
++ __u64 it_lock_handle;
++ void *it_data;
++ int it_lock_mode;
++ int it_int_flags;
++};
++struct lookup_intent {
++ int it_magic;
++ void (*it_op_release)(struct lookup_intent *);
++ int it_op;
++ int it_flags;
++ int it_create_mode;
++ union {
++ struct lustre_intent_data lustre;
++ } d;
++};
++
++static inline void intent_init(struct lookup_intent *it, int op, int flags)
++{
++ memset(it, 0, sizeof(*it));
++ it->it_magic = INTENT_MAGIC;
++ it->it_op = op;
++ it->it_flags = flags;
++}
++
+
+ /*
+ * linux/include/linux/dcache.h
+@@ -95,8 +140,22 @@ struct dentry_operations {
+ int (*d_delete)(struct dentry *);
+ void (*d_release)(struct dentry *);
+ void (*d_iput)(struct dentry *, struct inode *);
++ int (*d_revalidate_it)(struct dentry *, int, struct lookup_intent *);
++ void (*d_pin)(struct dentry *, struct vfsmount * , int);
++ void (*d_unpin)(struct dentry *, struct vfsmount *, int);
+ };
+
++#define PIN(de,mnt,flag) if (de->d_op && de->d_op->d_pin) \
++ de->d_op->d_pin(de, mnt, flag);
++#define UNPIN(de,mnt,flag) if (de->d_op && de->d_op->d_unpin) \
++ de->d_op->d_unpin(de, mnt, flag);
++
++
++/* defined in fs/namei.c */
++extern void intent_release(struct lookup_intent *it);
++/* defined in fs/dcache.c */
++extern void __d_rehash(struct dentry * entry, int lock);
++
+ /* the dentry parameter passed to d_hash and d_compare is the parent
+ * directory of the entries to be compared. It is used in case these
+ * functions need any directory specific information for determining
+@@ -128,6 +187,7 @@ d_iput: no no yes
+ * s_nfsd_free_path semaphore will be down
+ */
+ #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */
++#define DCACHE_LUSTRE_INVALID 0x0010 /* Lustre invalidated */
+
+ extern spinlock_t dcache_lock;
+
+--- linux-2.4.22-ac1/include/linux/fs.h~vfs_intent-2.4.22-rh 2003-09-25 14:39:01.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/linux/fs.h 2003-09-25 14:42:46.000000000 +0400
+@@ -73,6 +73,7 @@ extern int leases_enable, dir_notify_ena
+
+ #define FMODE_READ 1
+ #define FMODE_WRITE 2
++#define FMODE_EXEC 4
+
+ #define READ 0
+ #define WRITE 1
+@@ -343,6 +344,9 @@ extern void set_bh_page(struct buffer_he
+ #define ATTR_MTIME_SET 256
+ #define ATTR_FORCE 512 /* Not a change, but a change it */
+ #define ATTR_ATTR_FLAG 1024
++#define ATTR_RAW 0x0800 /* file system, not vfs will massage attrs */
++#define ATTR_FROM_OPEN 0x1000 /* called from open path, ie O_TRUNC */
++#define ATTR_CTIME_SET 0x2000
+
+ /*
+ * This is the Inode Attributes structure, used for notify_change(). It
+@@ -481,6 +485,7 @@ struct inode {
+ struct pipe_inode_info *i_pipe;
+ struct block_device *i_bdev;
+ struct char_device *i_cdev;
++ void *i_filterdata;
+
+ unsigned long i_dnotify_mask; /* Directory notify events */
+ struct dnotify_struct *i_dnotify; /* for directory notifications */
+@@ -583,6 +588,7 @@ struct file {
+
+ /* needed for tty driver, and maybe others */
+ void *private_data;
++ struct lookup_intent *f_it;
+
+ /* preallocated helper kiobuf to speedup O_DIRECT */
+ struct kiobuf *f_iobuf;
+@@ -703,6 +709,7 @@ struct nameidata {
+ struct qstr last;
+ unsigned int flags;
+ int last_type;
++ struct lookup_intent *intent;
+ };
+
+ /*
+@@ -823,7 +830,8 @@ extern int vfs_symlink(struct inode *, s
+ extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
+ extern int vfs_rmdir(struct inode *, struct dentry *);
+ extern int vfs_unlink(struct inode *, struct dentry *);
+-extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *);
++int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
++ struct inode *new_dir, struct dentry *new_dentry);
+
+ /*
+ * File types
+@@ -883,21 +891,32 @@ struct file_operations {
+
+ struct inode_operations {
+ int (*create) (struct inode *,struct dentry *,int);
++ int (*create_it) (struct inode *,struct dentry *,int, struct lookup_intent *);
+ struct dentry * (*lookup) (struct inode *,struct dentry *);
++ struct dentry * (*lookup_it) (struct inode *,struct dentry *, struct lookup_intent *, int flags);
+ int (*link) (struct dentry *,struct inode *,struct dentry *);
++ int (*link_raw) (struct nameidata *,struct nameidata *);
+ int (*unlink) (struct inode *,struct dentry *);
++ int (*unlink_raw) (struct nameidata *);
+ int (*symlink) (struct inode *,struct dentry *,const char *);
++ int (*symlink_raw) (struct nameidata *,const char *);
+ int (*mkdir) (struct inode *,struct dentry *,int);
++ int (*mkdir_raw) (struct nameidata *,int);
+ int (*rmdir) (struct inode *,struct dentry *);
++ int (*rmdir_raw) (struct nameidata *);
+ int (*mknod) (struct inode *,struct dentry *,int,int);
++ int (*mknod_raw) (struct nameidata *,int,dev_t);
+ int (*rename) (struct inode *, struct dentry *,
+ struct inode *, struct dentry *);
++ int (*rename_raw) (struct nameidata *, struct nameidata *);
+ int (*readlink) (struct dentry *, char *,int);
+ int (*follow_link) (struct dentry *, struct nameidata *);
+ void (*truncate) (struct inode *);
+ int (*permission) (struct inode *, int);
+ int (*revalidate) (struct dentry *);
++ int (*revalidate_it) (struct dentry *, struct lookup_intent *);
+ int (*setattr) (struct dentry *, struct iattr *);
++ int (*setattr_raw) (struct inode *, struct iattr *);
+ int (*getattr) (struct dentry *, struct iattr *);
+ int (*setxattr) (struct dentry *, const char *, void *, size_t, int);
+ ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+@@ -1094,10 +1113,14 @@ static inline int get_lease(struct inode
+
+ asmlinkage long sys_open(const char *, int, int);
+ asmlinkage long sys_close(unsigned int); /* yes, it's really unsigned */
+-extern int do_truncate(struct dentry *, loff_t start);
++extern int do_truncate(struct dentry *, loff_t start, int called_from_open);
+
+ extern struct file *filp_open(const char *, int, int);
+ extern struct file * dentry_open(struct dentry *, struct vfsmount *, int);
++extern int open_namei_it(const char *filename, int namei_flags, int mode,
++ struct nameidata *nd, struct lookup_intent *it);
++extern struct file *dentry_open_it(struct dentry *dentry, struct vfsmount *mnt,
++ int flags, struct lookup_intent *it);
+ extern int filp_close(struct file *, fl_owner_t id);
+ extern char * getname(const char *);
+
+@@ -1388,6 +1411,7 @@ typedef int (*read_actor_t)(read_descrip
+ extern loff_t default_llseek(struct file *file, loff_t offset, int origin);
+
+ extern int FASTCALL(__user_walk(const char *, unsigned, struct nameidata *));
++extern int FASTCALL(__user_walk_it(const char *, unsigned, struct nameidata *, struct lookup_intent *it));
+ extern int FASTCALL(path_init(const char *, unsigned, struct nameidata *));
+ extern int FASTCALL(path_walk(const char *, struct nameidata *));
+ extern int FASTCALL(path_lookup(const char *, unsigned, struct nameidata *));
+@@ -1399,6 +1423,8 @@ extern struct dentry * lookup_one_len(co
+ extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
+ #define user_path_walk(name,nd) __user_walk(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd)
+ #define user_path_walk_link(name,nd) __user_walk(name, LOOKUP_POSITIVE, nd)
++#define user_path_walk_it(name,nd,it) __user_walk_it(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, nd, it)
++#define user_path_walk_link_it(name,nd,it) __user_walk_it(name, LOOKUP_POSITIVE, nd, it)
+
+ extern void inode_init_once(struct inode *);
+ extern void iput(struct inode *);
+@@ -1538,6 +1564,8 @@ extern struct file_operations generic_ro
+
+ extern int vfs_readlink(struct dentry *, char *, int, const char *);
+ extern int vfs_follow_link(struct nameidata *, const char *);
++extern int vfs_follow_link_it(struct nameidata *, const char *,
++ struct lookup_intent *it);
+ extern int page_readlink(struct dentry *, char *, int);
+ extern int page_follow_link(struct dentry *, struct nameidata *);
+ extern struct inode_operations page_symlink_inode_operations;
+--- linux-2.4.22-ac1/include/linux/fs_struct.h~vfs_intent-2.4.22-rh 2003-09-25 14:16:24.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/linux/fs_struct.h 2003-09-25 14:42:46.000000000 +0400
+@@ -37,10 +37,12 @@ static inline void set_fs_root(struct fs
+ write_lock(&fs->lock);
+ old_root = fs->root;
+ old_rootmnt = fs->rootmnt;
++ PIN(dentry, mnt, 1);
+ fs->rootmnt = mntget(mnt);
+ fs->root = dget(dentry);
+ write_unlock(&fs->lock);
+ if (old_root) {
++ UNPIN(old_root, old_rootmnt, 1);
+ dput(old_root);
+ mntput(old_rootmnt);
+ }
+@@ -60,10 +62,12 @@ static inline void set_fs_pwd(struct fs_
+ write_lock(&fs->lock);
+ old_pwd = fs->pwd;
+ old_pwdmnt = fs->pwdmnt;
++ PIN(dentry, mnt, 0);
+ fs->pwdmnt = mntget(mnt);
+ fs->pwd = dget(dentry);
+ write_unlock(&fs->lock);
+ if (old_pwd) {
++ UNPIN(old_pwd, old_pwdmnt, 0);
+ dput(old_pwd);
+ mntput(old_pwdmnt);
+ }
+--- linux-2.4.22-ac1/kernel/exit.c~vfs_intent-2.4.22-rh 2003-09-25 14:16:29.000000000 +0400
++++ linux-2.4.22-ac1-alexey/kernel/exit.c 2003-09-25 14:42:46.000000000 +0400
+@@ -342,11 +342,14 @@ static inline void __put_fs_struct(struc
+ {
+ /* No need to hold fs->lock if we are killing it */
+ if (atomic_dec_and_test(&fs->count)) {
++ UNPIN(fs->pwd, fs->pwdmnt, 0);
++ UNPIN(fs->root, fs->rootmnt, 1);
+ dput(fs->root);
+ mntput(fs->rootmnt);
+ dput(fs->pwd);
+ mntput(fs->pwdmnt);
+ if (fs->altroot) {
++ UNPIN(fs->altroot, fs->altrootmnt, 1);
+ dput(fs->altroot);
+ mntput(fs->altrootmnt);
+ }
+--- linux-2.4.22-ac1/kernel/fork.c~vfs_intent-2.4.22-rh 2003-09-25 14:16:28.000000000 +0400
++++ linux-2.4.22-ac1-alexey/kernel/fork.c 2003-09-25 14:42:46.000000000 +0400
+@@ -457,10 +457,13 @@ static inline struct fs_struct *__copy_f
+ fs->umask = old->umask;
+ read_lock(&old->lock);
+ fs->rootmnt = mntget(old->rootmnt);
++ PIN(old->pwd, old->pwdmnt, 0);
++ PIN(old->root, old->rootmnt, 1);
+ fs->root = dget(old->root);
+ fs->pwdmnt = mntget(old->pwdmnt);
+ fs->pwd = dget(old->pwd);
+ if (old->altroot) {
++ PIN(old->altroot, old->altrootmnt, 1);
+ fs->altrootmnt = mntget(old->altrootmnt);
+ fs->altroot = dget(old->altroot);
+ } else {
+--- linux-2.4.22-ac1/kernel/ksyms.c~vfs_intent-2.4.22-rh 2003-09-25 14:39:02.000000000 +0400
++++ linux-2.4.22-ac1-alexey/kernel/ksyms.c 2003-09-25 14:42:46.000000000 +0400
+@@ -295,6 +295,7 @@ EXPORT_SYMBOL(read_cache_page);
+ EXPORT_SYMBOL(set_page_dirty);
+ EXPORT_SYMBOL(vfs_readlink);
+ EXPORT_SYMBOL(vfs_follow_link);
++EXPORT_SYMBOL(vfs_follow_link_it);
+ EXPORT_SYMBOL(page_readlink);
+ EXPORT_SYMBOL(page_follow_link);
+ EXPORT_SYMBOL(page_symlink_inode_operations);
+
+_
--- /dev/null
+ Documentation/Configure.help | 66 ++
+ arch/alpha/defconfig | 7
+ arch/alpha/kernel/entry.S | 12
+ arch/arm/defconfig | 7
+ arch/arm/kernel/calls.S | 24
+ arch/i386/defconfig | 7
+ arch/ia64/defconfig | 7
+ arch/m68k/defconfig | 7
+ arch/mips/defconfig | 7
+ arch/mips64/defconfig | 7
+ arch/ppc/defconfig | 14
+ arch/ppc64/kernel/misc.S | 2
+ arch/s390/defconfig | 7
+ arch/s390/kernel/entry.S | 24
+ arch/s390x/defconfig | 7
+ arch/s390x/kernel/entry.S | 24
+ arch/s390x/kernel/wrapper32.S | 90 +++
+ arch/sparc/defconfig | 7
+ arch/sparc64/defconfig | 7
+ fs/Config.in | 14
+ fs/Makefile | 3
+ fs/ext2/Makefile | 4
+ fs/ext2/file.c | 5
+ fs/ext2/ialloc.c | 2
+ fs/ext2/inode.c | 34 -
+ fs/ext2/namei.c | 14
+ fs/ext2/super.c | 29
+ fs/ext2/symlink.c | 14
+ fs/ext2/xattr.c | 1212 +++++++++++++++++++++++++++++++++++++++++
+ fs/ext2/xattr_user.c | 103 +++
+ fs/ext3/Makefile | 10
+ fs/ext3/ext3-exports.c | 13
+ fs/ext3/file.c | 5
+ fs/ext3/ialloc.c | 2
+ fs/ext3/inode.c | 35 -
+ fs/ext3/namei.c | 21
+ fs/ext3/super.c | 37 +
+ fs/ext3/symlink.c | 14
+ fs/ext3/xattr.c | 1225 ++++++++++++++++++++++++++++++++++++++++++
+ fs/ext3/xattr_user.c | 111 +++
+ fs/jfs/jfs_xattr.h | 6
+ fs/jfs/xattr.c | 6
+ fs/mbcache.c | 648 ++++++++++++++++++++++
+ include/asm-arm/unistd.h | 2
+ include/asm-ppc64/unistd.h | 2
+ include/asm-s390/unistd.h | 13
+ include/asm-s390x/unistd.h | 13
+ include/linux/cache_def.h | 15
+ include/linux/errno.h | 4
+ include/linux/ext2_fs.h | 31 -
+ include/linux/ext2_xattr.h | 157 +++++
+ include/linux/ext3_fs.h | 31 -
+ include/linux/ext3_jbd.h | 8
+ include/linux/ext3_xattr.h | 157 +++++
+ include/linux/fs.h | 2
+ include/linux/mbcache.h | 69 ++
+ kernel/ksyms.c | 4
+ mm/vmscan.c | 35 +
+ 58 files changed, 4306 insertions(+), 137 deletions(-)
+
+--- linux-2.4.22-ac1/arch/alpha/defconfig~xattr-0.8.54-2.4.22-rh 2003-06-13 18:51:29.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/alpha/defconfig 2003-09-25 23:57:02.000000000 +0400
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_ALPHA=y
+ # CONFIG_UID16 is not set
+ # CONFIG_RWSEM_GENERIC_SPINLOCK is not set
+--- linux-2.4.22-ac1/arch/alpha/kernel/entry.S~xattr-0.8.54-2.4.22-rh 2003-09-25 14:16:18.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/alpha/kernel/entry.S 2003-09-25 23:57:02.000000000 +0400
+@@ -1158,6 +1158,18 @@ sys_call_table:
+ .quad sys_readahead
+ .quad sys_ni_syscall /* 380, sys_security */
+ .quad sys_tkill
++ .quad sys_setxattr
++ .quad sys_lsetxattr
++ .quad sys_fsetxattr
++ .quad sys_getxattr /* 385 */
++ .quad sys_lgetxattr
++ .quad sys_fgetxattr
++ .quad sys_listxattr
++ .quad sys_llistxattr
++ .quad sys_flistxattr /* 390 */
++ .quad sys_removexattr
++ .quad sys_lremovexattr
++ .quad sys_fremovexattr
+
+ /* Remember to update everything, kids. */
+ .ifne (. - sys_call_table) - (NR_SYSCALLS * 8)
+--- linux-2.4.22-ac1/arch/arm/defconfig~xattr-0.8.54-2.4.22-rh 2001-05-20 04:43:05.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/arm/defconfig 2003-09-25 23:57:02.000000000 +0400
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_ARM=y
+ # CONFIG_EISA is not set
+ # CONFIG_SBUS is not set
+--- linux-2.4.22-ac1/arch/arm/kernel/calls.S~xattr-0.8.54-2.4.22-rh 2003-08-25 15:44:39.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/arm/kernel/calls.S 2003-09-26 00:00:10.000000000 +0400
+@@ -240,18 +240,18 @@ __syscall_start:
+ .long SYMBOL_NAME(sys_ni_syscall) /* Security */
+ .long SYMBOL_NAME(sys_gettid)
+ /* 225 */ .long SYMBOL_NAME(sys_readahead)
+- .long SYMBOL_NAME(sys_ni_syscall) /* setxattr */
+- .long SYMBOL_NAME(sys_ni_syscall) /* lsetxattr */
+- .long SYMBOL_NAME(sys_ni_syscall) /* fsetxattr */
+- .long SYMBOL_NAME(sys_ni_syscall) /* getxattr */
+-/* 230 */ .long SYMBOL_NAME(sys_ni_syscall) /* lgetxattr */
+- .long SYMBOL_NAME(sys_ni_syscall) /* fgetxattr */
+- .long SYMBOL_NAME(sys_ni_syscall) /* listxattr */
+- .long SYMBOL_NAME(sys_ni_syscall) /* llistxattr */
+- .long SYMBOL_NAME(sys_ni_syscall) /* flistxattr */
+-/* 235 */ .long SYMBOL_NAME(sys_ni_syscall) /* removexattr */
+- .long SYMBOL_NAME(sys_ni_syscall) /* lremovexattr */
+- .long SYMBOL_NAME(sys_ni_syscall) /* fremovexattr */
++ .long SYMBOL_NAME(sys_setxattr)
++ .long SYMBOL_NAME(sys_lsetxattr)
++ .long SYMBOL_NAME(sys_fsetxattr)
++ .long SYMBOL_NAME(sys_getxattr)
++/* 230 */ .long SYMBOL_NAME(sys_lgetxattr)
++ .long SYMBOL_NAME(sys_fgetxattr)
++ .long SYMBOL_NAME(sys_listxattr)
++ .long SYMBOL_NAME(sys_llistxattr)
++ .long SYMBOL_NAME(sys_flistxattr)
++/* 235 */ .long SYMBOL_NAME(sys_removexattr)
++ .long SYMBOL_NAME(sys_lremovexattr)
++ .long SYMBOL_NAME(sys_fremovexattr)
+ .long SYMBOL_NAME(sys_tkill)
+ .long SYMBOL_NAME(sys_ni_syscall) /* sendfile64 */
+ /* 240 */ .long SYMBOL_NAME(sys_ni_syscall) /* futex */
+--- linux-2.4.22-ac1/arch/i386/defconfig~xattr-0.8.54-2.4.22-rh 2003-09-25 14:16:18.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/i386/defconfig 2003-09-25 23:57:02.000000000 +0400
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_X86=y
+ CONFIG_ISA=y
+ # CONFIG_SBUS is not set
+--- linux-2.4.22-ac1/arch/ia64/defconfig~xattr-0.8.54-2.4.22-rh 2003-08-25 15:44:39.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/ia64/defconfig 2003-09-25 23:57:02.000000000 +0400
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+
+ #
+ # Code maturity level options
+--- linux-2.4.22-ac1/arch/m68k/defconfig~xattr-0.8.54-2.4.22-rh 2000-06-19 23:56:08.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/m68k/defconfig 2003-09-25 23:57:02.000000000 +0400
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_UID16=y
+
+ #
+--- linux-2.4.22-ac1/arch/mips64/defconfig~xattr-0.8.54-2.4.22-rh 2003-08-25 15:44:40.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/mips64/defconfig 2003-09-25 23:57:02.000000000 +0400
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_MIPS=y
+ # CONFIG_MIPS32 is not set
+ CONFIG_MIPS64=y
+--- linux-2.4.22-ac1/arch/mips/defconfig~xattr-0.8.54-2.4.22-rh 2003-08-25 15:44:39.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/mips/defconfig 2003-09-25 23:57:02.000000000 +0400
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_MIPS=y
+ CONFIG_MIPS32=y
+ # CONFIG_MIPS64 is not set
+--- linux-2.4.22-ac1/arch/ppc64/kernel/misc.S~xattr-0.8.54-2.4.22-rh 2003-08-25 15:44:40.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/ppc64/kernel/misc.S 2003-09-25 23:57:02.000000000 +0400
+@@ -805,6 +805,7 @@ _GLOBAL(sys_call_table32)
+ .llong .sys_gettid /* 207 */
+ #if 0 /* Reserved syscalls */
+ .llong .sys_tkill /* 208 */
++#endif
+ .llong .sys_setxattr
+ .llong .sys_lsetxattr /* 210 */
+ .llong .sys_fsetxattr
+@@ -817,6 +818,7 @@ _GLOBAL(sys_call_table32)
+ .llong .sys_removexattr
+ .llong .sys_lremovexattr
+ .llong .sys_fremovexattr /* 220 */
++#if 0 /* Reserved syscalls */
+ .llong .sys_futex
+ #endif
+ .llong .sys_perfmonctl /* Put this here for now ... */
+--- linux-2.4.22-ac1/arch/ppc/defconfig~xattr-0.8.54-2.4.22-rh 2003-06-13 18:51:31.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/ppc/defconfig 2003-09-25 23:57:02.000000000 +0400
+@@ -1,6 +1,20 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ # CONFIG_UID16 is not set
+ # CONFIG_RWSEM_GENERIC_SPINLOCK is not set
+ CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+--- linux-2.4.22-ac1/arch/s390/defconfig~xattr-0.8.54-2.4.22-rh 2003-09-25 14:16:18.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/s390/defconfig 2003-09-25 23:57:02.000000000 +0400
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ # CONFIG_ISA is not set
+ # CONFIG_EISA is not set
+ # CONFIG_MCA is not set
+--- linux-2.4.22-ac1/arch/s390/kernel/entry.S~xattr-0.8.54-2.4.22-rh 2003-09-25 14:16:18.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/s390/kernel/entry.S 2003-09-25 23:57:02.000000000 +0400
+@@ -559,18 +559,18 @@ sys_call_table:
+ .long sys_fcntl64
+ .long sys_readahead
+ .long sys_ni_syscall
+- .long sys_ni_syscall /* 224 - reserved for setxattr */
+- .long sys_ni_syscall /* 225 - reserved for lsetxattr */
+- .long sys_ni_syscall /* 226 - reserved for fsetxattr */
+- .long sys_ni_syscall /* 227 - reserved for getxattr */
+- .long sys_ni_syscall /* 228 - reserved for lgetxattr */
+- .long sys_ni_syscall /* 229 - reserved for fgetxattr */
+- .long sys_ni_syscall /* 230 - reserved for listxattr */
+- .long sys_ni_syscall /* 231 - reserved for llistxattr */
+- .long sys_ni_syscall /* 232 - reserved for flistxattr */
+- .long sys_ni_syscall /* 233 - reserved for removexattr */
+- .long sys_ni_syscall /* 234 - reserved for lremovexattr */
+- .long sys_ni_syscall /* 235 - reserved for fremovexattr */
++ .long sys_setxattr
++ .long sys_lsetxattr /* 225 */
++ .long sys_fsetxattr
++ .long sys_getxattr
++ .long sys_lgetxattr
++ .long sys_fgetxattr
++ .long sys_listxattr /* 230 */
++ .long sys_llistxattr
++ .long sys_flistxattr
++ .long sys_removexattr
++ .long sys_lremovexattr
++ .long sys_fremovexattr /* 235 */
+ .long sys_gettid
+ .long sys_tkill
+ .rept 255-237
+--- linux-2.4.22-ac1/arch/s390x/defconfig~xattr-0.8.54-2.4.22-rh 2003-09-25 14:16:18.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/s390x/defconfig 2003-09-25 23:57:02.000000000 +0400
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ # CONFIG_ISA is not set
+ # CONFIG_EISA is not set
+ # CONFIG_MCA is not set
+--- linux-2.4.22-ac1/arch/s390x/kernel/entry.S~xattr-0.8.54-2.4.22-rh 2003-09-25 14:16:18.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/s390x/kernel/entry.S 2003-09-25 23:57:02.000000000 +0400
+@@ -591,18 +591,18 @@ sys_call_table:
+ .long SYSCALL(sys_ni_syscall,sys32_fcntl64_wrapper)
+ .long SYSCALL(sys_readahead,sys32_readahead)
+ .long SYSCALL(sys_ni_syscall,sys_ni_syscall)
+- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 224 - reserved for setxattr */
+- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 225 - reserved for lsetxattr */
+- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 226 - reserved for fsetxattr */
+- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 227 - reserved for getxattr */
+- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 228 - reserved for lgetxattr */
+- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 229 - reserved for fgetxattr */
+- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 230 - reserved for listxattr */
+- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 231 - reserved for llistxattr */
+- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 232 - reserved for flistxattr */
+- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 233 - reserved for removexattr */
+- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 234 - reserved for lremovexattr */
+- .long SYSCALL(sys_ni_syscall,sys_ni_syscall) /* 235 - reserved for fremovexattr */
++ .long SYSCALL(sys_setxattr,sys32_setxattr_wrapper)
++ .long SYSCALL(sys_lsetxattr,sys32_lsetxattr_wrapper) /* 225 */
++ .long SYSCALL(sys_fsetxattr,sys32_fsetxattr_wrapper)
++ .long SYSCALL(sys_getxattr,sys32_getxattr_wrapper)
++ .long SYSCALL(sys_lgetxattr,sys32_lgetxattr_wrapper)
++ .long SYSCALL(sys_fgetxattr,sys32_fgetxattr_wrapper)
++ .long SYSCALL(sys_listxattr,sys32_listxattr_wrapper) /* 230 */
++ .long SYSCALL(sys_llistxattr,sys32_llistxattr_wrapper)
++ .long SYSCALL(sys_flistxattr,sys32_flistxattr_wrapper)
++ .long SYSCALL(sys_removexattr,sys32_removexattr_wrapper)
++ .long SYSCALL(sys_lremovexattr,sys32_lremovexattr_wrapper)
++ .long SYSCALL(sys_fremovexattr,sys32_fremovexattr_wrapper)/* 235 */
+ .long SYSCALL(sys_gettid,sys_gettid)
+ .long SYSCALL(sys_tkill,sys_tkill)
+ .rept 255-237
+--- linux-2.4.22-ac1/arch/s390x/kernel/wrapper32.S~xattr-0.8.54-2.4.22-rh 2003-08-25 15:44:40.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/s390x/kernel/wrapper32.S 2003-09-26 00:05:14.000000000 +0400
+@@ -1097,6 +1097,96 @@ sys32_fstat64_wrapper:
+ llgtr %r3,%r3 # struct stat64 *
+ llgfr %r4,%r4 # long
+ jg sys32_fstat64 # branch to system call
++
++ .globl sys32_setxattr_wrapper
++sys32_setxattr_wrapper:
++ llgtr %r2,%r2 # char *
++ llgtr %r3,%r3 # char *
++ llgtr %r4,%r4 # void *
++ llgfr %r5,%r5 # size_t
++ lgfr %r6,%r6 # int
++ jg sys_setxattr
++
++ .globl sys32_lsetxattr_wrapper
++sys32_lsetxattr_wrapper:
++ llgtr %r2,%r2 # char *
++ llgtr %r3,%r3 # char *
++ llgtr %r4,%r4 # void *
++ llgfr %r5,%r5 # size_t
++ lgfr %r6,%r6 # int
++ jg sys_lsetxattr
++
++ .globl sys32_fsetxattr_wrapper
++sys32_fsetxattr_wrapper:
++ lgfr %r2,%r2 # int
++ llgtr %r3,%r3 # char *
++ llgtr %r4,%r4 # void *
++ llgfr %r5,%r5 # size_t
++ lgfr %r6,%r6 # int
++ jg sys_fsetxattr
++
++ .globl sys32_getxattr_wrapper
++sys32_getxattr_wrapper:
++ llgtr %r2,%r2 # char *
++ llgtr %r3,%r3 # char *
++ llgtr %r4,%r4 # void *
++ llgfr %r5,%r5 # size_t
++ jg sys_getxattr
++
++ .globl sys32_lgetxattr_wrapper
++sys32_lgetxattr_wrapper:
++ llgtr %r2,%r2 # char *
++ llgtr %r3,%r3 # char *
++ llgtr %r4,%r4 # void *
++ llgfr %r5,%r5 # size_t
++ jg sys_lgetxattr
++
++ .globl sys32_fgetxattr_wrapper
++sys32_fgetxattr_wrapper:
++ lgfr %r2,%r2 # int
++ llgtr %r3,%r3 # char *
++ llgtr %r4,%r4 # void *
++ llgfr %r5,%r5 # size_t
++ jg sys_fgetxattr
++
++ .globl sys32_listxattr_wrapper
++sys32_listxattr_wrapper:
++ llgtr %r2,%r2 # char *
++ llgtr %r3,%r3 # char *
++ llgfr %r4,%r4 # size_t
++ jg sys_listxattr
++
++ .globl sys32_llistxattr_wrapper
++sys32_llistxattr_wrapper:
++ llgtr %r2,%r2 # char *
++ llgtr %r3,%r3 # char *
++ llgfr %r4,%r4 # size_t
++ jg sys_llistxattr
++
++ .globl sys32_flistxattr_wrapper
++sys32_flistxattr_wrapper:
++ lgfr %r2,%r2 # int
++ llgtr %r3,%r3 # char *
++ llgfr %r4,%r4 # size_t
++ jg sys_flistxattr
++
++ .globl sys32_removexattr_wrapper
++sys32_removexattr_wrapper:
++ llgtr %r2,%r2 # char *
++ llgtr %r3,%r3 # char *
++ jg sys_removexattr
++
++ .globl sys32_lremovexattr_wrapper
++sys32_lremovexattr_wrapper:
++ llgtr %r2,%r2 # char *
++ llgtr %r3,%r3 # char *
++ jg sys_lremovexattr
++
++ .globl sys32_fremovexattr_wrapper
++sys32_fremovexattr_wrapper:
++ lgfr %r2,%r2 # int
++ llgtr %r3,%r3 # char *
++ jg sys_fremovexattr
+
+ .globl sys32_stime_wrapper
+ sys32_stime_wrapper:
+--- linux-2.4.22-ac1/arch/sparc64/defconfig~xattr-0.8.54-2.4.22-rh 2003-08-25 15:44:40.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/sparc64/defconfig 2003-09-25 23:57:02.000000000 +0400
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+
+ #
+ # Code maturity level options
+--- linux-2.4.22-ac1/arch/sparc/defconfig~xattr-0.8.54-2.4.22-rh 2002-08-03 04:39:43.000000000 +0400
++++ linux-2.4.22-ac1-alexey/arch/sparc/defconfig 2003-09-25 23:57:02.000000000 +0400
+@@ -1,6 +1,13 @@
+ #
+ # Automatically generated make config: don't edit
+ #
++# CONFIG_EXT3_FS_XATTR is not set
++# CONFIG_EXT3_FS_XATTR_SHARING is not set
++# CONFIG_EXT3_FS_XATTR_USER is not set
++# CONFIG_EXT2_FS_XATTR is not set
++# CONFIG_EXT2_FS_XATTR_SHARING is not set
++# CONFIG_EXT2_FS_XATTR_USER is not set
++# CONFIG_FS_MBCACHE is not set
+ CONFIG_UID16=y
+ CONFIG_HIGHMEM=y
+
+--- linux-2.4.22-ac1/Documentation/Configure.help~xattr-0.8.54-2.4.22-rh 2003-09-25 14:16:30.000000000 +0400
++++ linux-2.4.22-ac1-alexey/Documentation/Configure.help 2003-09-25 23:57:02.000000000 +0400
+@@ -16145,6 +16145,39 @@ CONFIG_EXT2_FS
+ be compiled as a module, and so this could be dangerous. Most
+ everyone wants to say Y here.
+
++Ext2 extended attributes
++CONFIG_EXT2_FS_XATTR
++ Extended attributes are name:value pairs associated with inodes by
++ the kernel or by users (see the attr(5) manual page, or visit
++ <http://acl.bestbits.at/> for details).
++
++ If unsure, say N.
++
++Ext2 extended attribute block sharing
++CONFIG_EXT2_FS_XATTR_SHARING
++ This option enables code for sharing identical extended attribute
++ blocks among multiple inodes.
++
++ Usually, say Y.
++
++Ext2 extended user attributes
++CONFIG_EXT2_FS_XATTR_USER
++ This option enables extended user attributes on ext2. Processes can
++ associate extended user attributes with inodes to store additional
++ information such as the character encoding of files, etc. (see the
++ attr(5) manual page, or visit <http://acl.bestbits.at/> for details).
++
++ If unsure, say N.
++
++Ext2 trusted extended attributes
++CONFIG_EXT2_FS_XATTR_TRUSTED
++ This option enables extended attributes on ext2 that are accessible
++ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this
++ is only the super user. Trusted extended attributes are meant for
++ implementing system/security services.
++
++ If unsure, say N.
++
+ Ext3 journalling file system support (EXPERIMENTAL)
+ CONFIG_EXT3_FS
+ This is the journalling version of the Second extended file system
+@@ -16177,6 +16210,39 @@ CONFIG_EXT3_FS
+ of your root partition (the one containing the directory /) cannot
+ be compiled as a module, and so this may be dangerous.
+
++Ext3 extended attributes
++CONFIG_EXT3_FS_XATTR
++ Extended attributes are name:value pairs associated with inodes by
++ the kernel or by users (see the attr(5) manual page, or visit
++ <http://acl.bestbits.at/> for details).
++
++ If unsure, say N.
++
++Ext3 extended attribute block sharing
++CONFIG_EXT3_FS_XATTR_SHARING
++ This option enables code for sharing identical extended attribute
++ blocks among multiple inodes.
++
++ Usually, say Y.
++
++Ext3 extended user attributes
++CONFIG_EXT3_FS_XATTR_USER
++ This option enables extended user attributes on ext3. Processes can
++ associate extended user attributes with inodes to store additional
++ information such as the character encoding of files, etc. (see the
++ attr(5) manual page, or visit <http://acl.bestbits.at/> for details).
++
++ If unsure, say N.
++
++Ext3 trusted extended attributes
++CONFIG_EXT3_FS_XATTR_TRUSTED
++ This option enables extended attributes on ext3 that are accessible
++ (and visible) only to users capable of CAP_SYS_ADMIN. Usually this
++ is only the super user. Trusted extended attributes are meant for
++ implementing system/security services.
++
++ If unsure, say N.
++
+ Journal Block Device support (JBD for ext3) (EXPERIMENTAL)
+ CONFIG_JBD
+ This is a generic journalling layer for block devices. It is
+--- linux-2.4.22-ac1/fs/Config.in~xattr-0.8.54-2.4.22-rh 2003-09-25 14:16:23.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/Config.in 2003-09-25 23:57:02.000000000 +0400
+@@ -29,6 +29,11 @@ dep_mbool ' Debug Befs' CONFIG_BEFS_DEB
+ dep_tristate 'BFS file system support (EXPERIMENTAL)' CONFIG_BFS_FS $CONFIG_EXPERIMENTAL
+
+ tristate 'Ext3 journalling file system support' CONFIG_EXT3_FS
++dep_mbool ' Ext3 extended attributes' CONFIG_EXT3_FS_XATTR $CONFIG_EXT3_FS
++dep_bool ' Ext3 extended attribute block sharing' \
++ CONFIG_EXT3_FS_XATTR_SHARING $CONFIG_EXT3_FS_XATTR
++dep_bool ' Ext3 extended user attributes' \
++ CONFIG_EXT3_FS_XATTR_USER $CONFIG_EXT3_FS_XATTR
+ # CONFIG_JBD could be its own option (even modular), but until there are
+ # other users than ext3, we will simply make it be the same as CONFIG_EXT3_FS
+ # dep_tristate ' Journal Block Device support (JBD for ext3)' CONFIG_JBD $CONFIG_EXT3_FS
+@@ -88,6 +93,11 @@ dep_mbool ' QNX4FS write support (DANGE
+ tristate 'ROM file system support' CONFIG_ROMFS_FS
+
+ tristate 'Second extended fs support' CONFIG_EXT2_FS
++dep_mbool ' Ext2 extended attributes' CONFIG_EXT2_FS_XATTR $CONFIG_EXT2_FS
++dep_bool ' Ext2 extended attribute block sharing' \
++ CONFIG_EXT2_FS_XATTR_SHARING $CONFIG_EXT2_FS_XATTR
++dep_bool ' Ext2 extended user attributes' \
++ CONFIG_EXT2_FS_XATTR_USER $CONFIG_EXT2_FS_XATTR
+
+ tristate 'System V/Xenix/V7/Coherent file system support' CONFIG_SYSV_FS
+
+@@ -164,6 +174,10 @@ else
+ define_tristate CONFIG_ZISOFS_FS n
+ fi
+
++# Meta block cache for Extended Attributes (ext2/ext3)
++#tristate 'Meta block cache' CONFIG_FS_MBCACHE
++define_tristate CONFIG_FS_MBCACHE y
++
+ mainmenu_option next_comment
+ comment 'Partition Types'
+ source fs/partitions/Config.in
+--- linux-2.4.22-ac1/fs/ext2/file.c~xattr-0.8.54-2.4.22-rh 2001-10-11 19:05:18.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext2/file.c 2003-09-25 23:57:02.000000000 +0400
+@@ -20,6 +20,7 @@
+
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/sched.h>
+
+ /*
+@@ -51,4 +52,8 @@ struct file_operations ext2_file_operati
+
+ struct inode_operations ext2_file_inode_operations = {
+ truncate: ext2_truncate,
++ setxattr: ext2_setxattr,
++ getxattr: ext2_getxattr,
++ listxattr: ext2_listxattr,
++ removexattr: ext2_removexattr,
+ };
+--- linux-2.4.22-ac1/fs/ext2/ialloc.c~xattr-0.8.54-2.4.22-rh 2003-06-13 18:51:37.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext2/ialloc.c 2003-09-25 23:57:02.000000000 +0400
+@@ -15,6 +15,7 @@
+ #include <linux/config.h>
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/locks.h>
+ #include <linux/quotaops.h>
+
+@@ -167,6 +168,7 @@ void ext2_free_inode (struct inode * ino
+ */
+ if (!is_bad_inode(inode)) {
+ /* Quota is already initialized in iput() */
++ ext2_xattr_delete_inode(inode);
+ DQUOT_FREE_INODE(inode);
+ DQUOT_DROP(inode);
+ }
+--- linux-2.4.22-ac1/fs/ext2/inode.c~xattr-0.8.54-2.4.22-rh 2003-06-13 18:51:37.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext2/inode.c 2003-09-25 23:57:02.000000000 +0400
+@@ -39,6 +39,18 @@ MODULE_LICENSE("GPL");
+ static int ext2_update_inode(struct inode * inode, int do_sync);
+
+ /*
++ * Test whether an inode is a fast symlink.
++ */
++static inline int ext2_inode_is_fast_symlink(struct inode *inode)
++{
++ int ea_blocks = inode->u.ext2_i.i_file_acl ?
++ (inode->i_sb->s_blocksize >> 9) : 0;
++
++ return (S_ISLNK(inode->i_mode) &&
++ inode->i_blocks - ea_blocks == 0);
++}
++
++/*
+ * Called at each iput()
+ */
+ void ext2_put_inode (struct inode * inode)
+@@ -53,9 +65,7 @@ void ext2_delete_inode (struct inode * i
+ {
+ lock_kernel();
+
+- if (is_bad_inode(inode) ||
+- inode->i_ino == EXT2_ACL_IDX_INO ||
+- inode->i_ino == EXT2_ACL_DATA_INO)
++ if (is_bad_inode(inode))
+ goto no_delete;
+ inode->u.ext2_i.i_dtime = CURRENT_TIME;
+ mark_inode_dirty(inode);
+@@ -801,6 +811,8 @@ void ext2_truncate (struct inode * inode
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)))
+ return;
++ if (ext2_inode_is_fast_symlink(inode))
++ return;
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return;
+
+@@ -903,8 +915,7 @@ void ext2_read_inode (struct inode * ino
+ unsigned long offset;
+ struct ext2_group_desc * gdp;
+
+- if ((inode->i_ino != EXT2_ROOT_INO && inode->i_ino != EXT2_ACL_IDX_INO &&
+- inode->i_ino != EXT2_ACL_DATA_INO &&
++ if ((inode->i_ino != EXT2_ROOT_INO &&
+ inode->i_ino < EXT2_FIRST_INO(inode->i_sb)) ||
+ inode->i_ino > le32_to_cpu(inode->i_sb->u.ext2_sb.s_es->s_inodes_count)) {
+ ext2_error (inode->i_sb, "ext2_read_inode",
+@@ -989,10 +1000,7 @@ void ext2_read_inode (struct inode * ino
+ for (block = 0; block < EXT2_N_BLOCKS; block++)
+ inode->u.ext2_i.i_data[block] = raw_inode->i_block[block];
+
+- if (inode->i_ino == EXT2_ACL_IDX_INO ||
+- inode->i_ino == EXT2_ACL_DATA_INO)
+- /* Nothing to do */ ;
+- else if (S_ISREG(inode->i_mode)) {
++ if (S_ISREG(inode->i_mode)) {
+ inode->i_op = &ext2_file_inode_operations;
+ inode->i_fop = &ext2_file_operations;
+ inode->i_mapping->a_ops = &ext2_aops;
+@@ -1001,15 +1009,17 @@ void ext2_read_inode (struct inode * ino
+ inode->i_fop = &ext2_dir_operations;
+ inode->i_mapping->a_ops = &ext2_aops;
+ } else if (S_ISLNK(inode->i_mode)) {
+- if (!inode->i_blocks)
++ if (ext2_inode_is_fast_symlink(inode))
+ inode->i_op = &ext2_fast_symlink_inode_operations;
+ else {
+- inode->i_op = &page_symlink_inode_operations;
++ inode->i_op = &ext2_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ext2_aops;
+ }
+- } else
++ } else {
++ inode->i_op = &ext2_special_inode_operations;
+ init_special_inode(inode, inode->i_mode,
+ le32_to_cpu(raw_inode->i_block[0]));
++ }
+ brelse (bh);
+ inode->i_attr_flags = 0;
+ ext2_set_inode_flags(inode);
+--- linux-2.4.22-ac1/fs/ext2/Makefile~xattr-0.8.54-2.4.22-rh 2001-10-11 19:05:18.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext2/Makefile 2003-09-25 23:57:02.000000000 +0400
+@@ -13,4 +13,8 @@ obj-y := balloc.o bitmap.o dir.o file
+ ioctl.o namei.o super.o symlink.o
+ obj-m := $(O_TARGET)
+
++export-objs += xattr.o
++obj-$(CONFIG_EXT2_FS_XATTR) += xattr.o
++obj-$(CONFIG_EXT2_FS_XATTR_USER) += xattr_user.o
++
+ include $(TOPDIR)/Rules.make
+--- linux-2.4.22-ac1/fs/ext2/namei.c~xattr-0.8.54-2.4.22-rh 2001-10-04 09:57:36.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext2/namei.c 2003-09-25 23:57:02.000000000 +0400
+@@ -31,6 +31,7 @@
+
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/pagemap.h>
+
+ /*
+@@ -136,7 +137,7 @@ static int ext2_symlink (struct inode *
+
+ if (l > sizeof (inode->u.ext2_i.i_data)) {
+ /* slow symlink */
+- inode->i_op = &page_symlink_inode_operations;
++ inode->i_op = &ext2_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ext2_aops;
+ err = block_symlink(inode, symname, l);
+ if (err)
+@@ -345,4 +346,15 @@ struct inode_operations ext2_dir_inode_o
+ rmdir: ext2_rmdir,
+ mknod: ext2_mknod,
+ rename: ext2_rename,
++ setxattr: ext2_setxattr,
++ getxattr: ext2_getxattr,
++ listxattr: ext2_listxattr,
++ removexattr: ext2_removexattr,
++};
++
++struct inode_operations ext2_special_inode_operations = {
++ setxattr: ext2_setxattr,
++ getxattr: ext2_getxattr,
++ listxattr: ext2_listxattr,
++ removexattr: ext2_removexattr,
+ };
+--- linux-2.4.22-ac1/fs/ext2/super.c~xattr-0.8.54-2.4.22-rh 2002-11-29 02:53:15.000000000 +0300
++++ linux-2.4.22-ac1-alexey/fs/ext2/super.c 2003-09-25 23:57:02.000000000 +0400
+@@ -21,6 +21,7 @@
+ #include <linux/string.h>
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+ #include <linux/slab.h>
+ #include <linux/init.h>
+ #include <linux/locks.h>
+@@ -125,6 +126,7 @@ void ext2_put_super (struct super_block
+ int db_count;
+ int i;
+
++ ext2_xattr_put_super(sb);
+ if (!(sb->s_flags & MS_RDONLY)) {
+ struct ext2_super_block *es = EXT2_SB(sb)->s_es;
+
+@@ -175,6 +177,13 @@ static int parse_options (char * options
+ this_char = strtok (NULL, ",")) {
+ if ((value = strchr (this_char, '=')) != NULL)
+ *value++ = 0;
++#ifdef CONFIG_EXT2_FS_XATTR_USER
++ if (!strcmp (this_char, "user_xattr"))
++ set_opt (*mount_options, XATTR_USER);
++ else if (!strcmp (this_char, "nouser_xattr"))
++ clear_opt (*mount_options, XATTR_USER);
++ else
++#endif
+ if (!strcmp (this_char, "bsddf"))
+ clear_opt (*mount_options, MINIX_DF);
+ else if (!strcmp (this_char, "nouid32")) {
+@@ -424,6 +433,9 @@ struct super_block * ext2_read_super (st
+ blocksize = BLOCK_SIZE;
+
+ sb->u.ext2_sb.s_mount_opt = 0;
++#ifdef CONFIG_EXT2_FS_XATTR_USER
++ /* set_opt (sb->u.ext2_sb.s_mount_opt, XATTR_USER); */
++#endif
+ if (!parse_options ((char *) data, &sb_block, &resuid, &resgid,
+ &sb->u.ext2_sb.s_mount_opt)) {
+ return NULL;
+@@ -813,12 +825,27 @@ static DECLARE_FSTYPE_DEV(ext2_fs_type,
+
+ static int __init init_ext2_fs(void)
+ {
+- return register_filesystem(&ext2_fs_type);
++ int error = init_ext2_xattr();
++ if (error)
++ return error;
++ error = init_ext2_xattr_user();
++ if (error)
++ goto fail;
++ error = register_filesystem(&ext2_fs_type);
++ if (!error)
++ return 0;
++
++ exit_ext2_xattr_user();
++fail:
++ exit_ext2_xattr();
++ return error;
+ }
+
+ static void __exit exit_ext2_fs(void)
+ {
+ unregister_filesystem(&ext2_fs_type);
++ exit_ext2_xattr_user();
++ exit_ext2_xattr();
+ }
+
+ EXPORT_NO_SYMBOLS;
+--- linux-2.4.22-ac1/fs/ext2/symlink.c~xattr-0.8.54-2.4.22-rh 2000-09-28 00:41:33.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext2/symlink.c 2003-09-25 23:57:02.000000000 +0400
+@@ -19,6 +19,7 @@
+
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
+
+ static int ext2_readlink(struct dentry *dentry, char *buffer, int buflen)
+ {
+@@ -32,7 +33,20 @@ static int ext2_follow_link(struct dentr
+ return vfs_follow_link(nd, s);
+ }
+
++struct inode_operations ext2_symlink_inode_operations = {
++ readlink: page_readlink,
++ follow_link: page_follow_link,
++ setxattr: ext2_setxattr,
++ getxattr: ext2_getxattr,
++ listxattr: ext2_listxattr,
++ removexattr: ext2_removexattr,
++};
++
+ struct inode_operations ext2_fast_symlink_inode_operations = {
+ readlink: ext2_readlink,
+ follow_link: ext2_follow_link,
++ setxattr: ext2_setxattr,
++ getxattr: ext2_getxattr,
++ listxattr: ext2_listxattr,
++ removexattr: ext2_removexattr,
+ };
+--- /dev/null 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.22-ac1-alexey/fs/ext2/xattr.c 2003-09-25 23:57:02.000000000 +0400
+@@ -0,0 +1,1212 @@
++/*
++ * linux/fs/ext2/xattr.c
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ *
++ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
++ * Extended attributes for symlinks and special files added per
++ * suggestion of Luka Renko <luka.renko@hermes.si>.
++ */
++
++/*
++ * Extended attributes are stored on disk blocks allocated outside of
++ * any inode. The i_file_acl field is then made to point to this allocated
++ * block. If all extended attributes of an inode are identical, these
++ * inodes may share the same extended attribute block. Such situations
++ * are automatically detected by keeping a cache of recent attribute block
++ * numbers and hashes over the block's contents in memory.
++ *
++ *
++ * Extended attribute block layout:
++ *
++ * +------------------+
++ * | header |
++ * | entry 1 | |
++ * | entry 2 | | growing downwards
++ * | entry 3 | v
++ * | four null bytes |
++ * | . . . |
++ * | value 1 | ^
++ * | value 3 | | growing upwards
++ * | value 2 | |
++ * +------------------+
++ *
++ * The block header is followed by multiple entry descriptors. These entry
++ * descriptors are variable in size, and aligned to EXT2_XATTR_PAD
++ * byte boundaries. The entry descriptors are sorted by attribute name,
++ * so that two extended attribute blocks can be compared efficiently.
++ *
++ * Attribute values are aligned to the end of the block, stored in
++ * no specific order. They are also padded to EXT2_XATTR_PAD byte
++ * boundaries. No additional gaps are left between them.
++ *
++ * Locking strategy
++ * ----------------
++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of
++ * the xattr inode operations are called, so we are guaranteed that only one
++ * process accesses extended attributes of an inode at any time.
++ *
++ * For writing we also grab the ext2_xattr_sem semaphore. This ensures that
++ * only a single process is modifying an extended attribute block, even
++ * if the block is shared among inodes.
++ *
++ * Note for porting to 2.5
++ * -----------------------
++ * The BKL will no longer be held in the xattr inode operations.
++ */
++
++#include <linux/module.h>
++#include <linux/locks.h>
++#include <linux/slab.h>
++#include <linux/fs.h>
++#include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
++#include <linux/mbcache.h>
++#include <linux/quotaops.h>
++#include <asm/semaphore.h>
++#include <linux/compatmac.h>
++
++/* These symbols may be needed by a module. */
++EXPORT_SYMBOL(ext2_xattr_register);
++EXPORT_SYMBOL(ext2_xattr_unregister);
++EXPORT_SYMBOL(ext2_xattr_get);
++EXPORT_SYMBOL(ext2_xattr_list);
++EXPORT_SYMBOL(ext2_xattr_set);
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1)
++#endif
++
++#define HDR(bh) ((struct ext2_xattr_header *)((bh)->b_data))
++#define ENTRY(ptr) ((struct ext2_xattr_entry *)(ptr))
++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
++
++#ifdef EXT2_XATTR_DEBUG
++# define ea_idebug(inode, f...) do { \
++ printk(KERN_DEBUG "inode %s:%ld: ", \
++ kdevname(inode->i_dev), inode->i_ino); \
++ printk(f); \
++ printk("\n"); \
++ } while (0)
++# define ea_bdebug(bh, f...) do { \
++ printk(KERN_DEBUG "block %s:%ld: ", \
++ kdevname(bh->b_dev), bh->b_blocknr); \
++ printk(f); \
++ printk("\n"); \
++ } while (0)
++#else
++# define ea_idebug(f...)
++# define ea_bdebug(f...)
++#endif
++
++static int ext2_xattr_set2(struct inode *, struct buffer_head *,
++ struct ext2_xattr_header *);
++
++#ifdef CONFIG_EXT2_FS_XATTR_SHARING
++
++static int ext2_xattr_cache_insert(struct buffer_head *);
++static struct buffer_head *ext2_xattr_cache_find(struct inode *,
++ struct ext2_xattr_header *);
++static void ext2_xattr_cache_remove(struct buffer_head *);
++static void ext2_xattr_rehash(struct ext2_xattr_header *,
++ struct ext2_xattr_entry *);
++
++static struct mb_cache *ext2_xattr_cache;
++
++#else
++# define ext2_xattr_cache_insert(bh) 0
++# define ext2_xattr_cache_find(inode, header) NULL
++# define ext2_xattr_cache_remove(bh) while(0) {}
++# define ext2_xattr_rehash(header, entry) while(0) {}
++#endif
++
++/*
++ * If a file system does not share extended attributes among inodes,
++ * we should not need the ext2_xattr_sem semaphore. However, the
++ * filesystem may still contain shared blocks, so we always take
++ * the lock.
++ */
++
++DECLARE_MUTEX(ext2_xattr_sem);
++
++static inline int
++ext2_xattr_new_block(struct inode *inode, int * errp, int force)
++{
++ struct super_block *sb = inode->i_sb;
++ int goal = le32_to_cpu(EXT2_SB(sb)->s_es->s_first_data_block) +
++ EXT2_I(inode)->i_block_group * EXT2_BLOCKS_PER_GROUP(sb);
++
++ /* How can we enforce the allocation? */
++ int block = ext2_new_block(inode, goal, 0, 0, errp);
++#ifdef OLD_QUOTAS
++ if (!*errp)
++ inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#endif
++ return block;
++}
++
++static inline int
++ext2_xattr_quota_alloc(struct inode *inode, int force)
++{
++ /* How can we enforce the allocation? */
++#ifdef OLD_QUOTAS
++ int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
++ if (!error)
++ inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#else
++ int error = DQUOT_ALLOC_BLOCK(inode, 1);
++#endif
++ return error;
++}
++
++#ifdef OLD_QUOTAS
++
++static inline void
++ext2_xattr_quota_free(struct inode *inode)
++{
++ DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++static inline void
++ext2_xattr_free_block(struct inode * inode, unsigned long block)
++{
++ ext2_free_blocks(inode, block, 1);
++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++#else
++# define ext2_xattr_quota_free(inode) \
++ DQUOT_FREE_BLOCK(inode, 1)
++# define ext2_xattr_free_block(inode, block) \
++ ext2_free_blocks(inode, block, 1)
++#endif
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
++
++static inline struct buffer_head *
++sb_bread(struct super_block *sb, int block)
++{
++ return bread(sb->s_dev, block, sb->s_blocksize);
++}
++
++static inline struct buffer_head *
++sb_getblk(struct super_block *sb, int block)
++{
++ return getblk(sb->s_dev, block, sb->s_blocksize);
++}
++
++#endif
++
++struct ext2_xattr_handler *ext2_xattr_handlers[EXT2_XATTR_INDEX_MAX];
++rwlock_t ext2_handler_lock = RW_LOCK_UNLOCKED;
++
++int
++ext2_xattr_register(int name_index, struct ext2_xattr_handler *handler)
++{
++ int error = -EINVAL;
++
++ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
++ write_lock(&ext2_handler_lock);
++ if (!ext2_xattr_handlers[name_index-1]) {
++ ext2_xattr_handlers[name_index-1] = handler;
++ error = 0;
++ }
++ write_unlock(&ext2_handler_lock);
++ }
++ return error;
++}
++
++void
++ext2_xattr_unregister(int name_index, struct ext2_xattr_handler *handler)
++{
++ if (name_index > 0 || name_index <= EXT2_XATTR_INDEX_MAX) {
++ write_lock(&ext2_handler_lock);
++ ext2_xattr_handlers[name_index-1] = NULL;
++ write_unlock(&ext2_handler_lock);
++ }
++}
++
++static inline const char *
++strcmp_prefix(const char *a, const char *a_prefix)
++{
++ while (*a_prefix && *a == *a_prefix) {
++ a++;
++ a_prefix++;
++ }
++ return *a_prefix ? NULL : a;
++}
++
++/*
++ * Decode the extended attribute name, and translate it into
++ * the name_index and name suffix.
++ */
++static struct ext2_xattr_handler *
++ext2_xattr_resolve_name(const char **name)
++{
++ struct ext2_xattr_handler *handler = NULL;
++ int i;
++
++ if (!*name)
++ return NULL;
++ read_lock(&ext2_handler_lock);
++ for (i=0; i<EXT2_XATTR_INDEX_MAX; i++) {
++ if (ext2_xattr_handlers[i]) {
++ const char *n = strcmp_prefix(*name,
++ ext2_xattr_handlers[i]->prefix);
++ if (n) {
++ handler = ext2_xattr_handlers[i];
++ *name = n;
++ break;
++ }
++ }
++ }
++ read_unlock(&ext2_handler_lock);
++ return handler;
++}
++
++static inline struct ext2_xattr_handler *
++ext2_xattr_handler(int name_index)
++{
++ struct ext2_xattr_handler *handler = NULL;
++ if (name_index > 0 && name_index <= EXT2_XATTR_INDEX_MAX) {
++ read_lock(&ext2_handler_lock);
++ handler = ext2_xattr_handlers[name_index-1];
++ read_unlock(&ext2_handler_lock);
++ }
++ return handler;
++}
++
++/*
++ * Inode operation getxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext2_getxattr(struct dentry *dentry, const char *name,
++ void *buffer, size_t size)
++{
++ struct ext2_xattr_handler *handler;
++ struct inode *inode = dentry->d_inode;
++
++ handler = ext2_xattr_resolve_name(&name);
++ if (!handler)
++ return -ENOTSUP;
++ return handler->get(inode, name, buffer, size);
++}
++
++/*
++ * Inode operation listxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext2_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++ return ext2_xattr_list(dentry->d_inode, buffer, size);
++}
++
++/*
++ * Inode operation setxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext2_setxattr(struct dentry *dentry, const char *name,
++ const void *value, size_t size, int flags)
++{
++ struct ext2_xattr_handler *handler;
++ struct inode *inode = dentry->d_inode;
++
++ if (size == 0)
++ value = ""; /* empty EA, do not remove */
++ handler = ext2_xattr_resolve_name(&name);
++ if (!handler)
++ return -ENOTSUP;
++ return handler->set(inode, name, value, size, flags);
++}
++
++/*
++ * Inode operation removexattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext2_removexattr(struct dentry *dentry, const char *name)
++{
++ struct ext2_xattr_handler *handler;
++ struct inode *inode = dentry->d_inode;
++
++ handler = ext2_xattr_resolve_name(&name);
++ if (!handler)
++ return -ENOTSUP;
++ return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
++}
++
++/*
++ * ext2_xattr_get()
++ *
++ * Copy an extended attribute into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext2_xattr_get(struct inode *inode, int name_index, const char *name,
++ void *buffer, size_t buffer_size)
++{
++ struct buffer_head *bh = NULL;
++ struct ext2_xattr_entry *entry;
++ unsigned int block, size;
++ char *end;
++ int name_len, error;
++
++ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
++ name_index, name, buffer, (long)buffer_size);
++
++ if (name == NULL)
++ return -EINVAL;
++ if (!EXT2_I(inode)->i_file_acl)
++ return -ENOATTR;
++ block = EXT2_I(inode)->i_file_acl;
++ ea_idebug(inode, "reading block %d", block);
++ bh = sb_bread(inode->i_sb, block);
++ if (!bh)
++ return -EIO;
++ ea_bdebug(bh, "b_count=%d, refcount=%d",
++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++ end = bh->b_data + bh->b_size;
++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++ HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block: ext2_error(inode->i_sb, "ext2_xattr_get",
++ "inode %ld: bad block %d", inode->i_ino, block);
++ error = -EIO;
++ goto cleanup;
++ }
++ /* find named attribute */
++ name_len = strlen(name);
++
++ error = -ERANGE;
++ if (name_len > 255)
++ goto cleanup;
++ entry = FIRST_ENTRY(bh);
++ while (!IS_LAST_ENTRY(entry)) {
++ struct ext2_xattr_entry *next =
++ EXT2_XATTR_NEXT(entry);
++ if ((char *)next >= end)
++ goto bad_block;
++ if (name_index == entry->e_name_index &&
++ name_len == entry->e_name_len &&
++ memcmp(name, entry->e_name, name_len) == 0)
++ goto found;
++ entry = next;
++ }
++ /* Check the remaining name entries */
++ while (!IS_LAST_ENTRY(entry)) {
++ struct ext2_xattr_entry *next =
++ EXT2_XATTR_NEXT(entry);
++ if ((char *)next >= end)
++ goto bad_block;
++ entry = next;
++ }
++ if (ext2_xattr_cache_insert(bh))
++ ea_idebug(inode, "cache insert failed");
++ error = -ENOATTR;
++ goto cleanup;
++found:
++ /* check the buffer size */
++ if (entry->e_value_block != 0)
++ goto bad_block;
++ size = le32_to_cpu(entry->e_value_size);
++ if (size > inode->i_sb->s_blocksize ||
++ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
++ goto bad_block;
++
++ if (ext2_xattr_cache_insert(bh))
++ ea_idebug(inode, "cache insert failed");
++ if (buffer) {
++ error = -ERANGE;
++ if (size > buffer_size)
++ goto cleanup;
++ /* return value of attribute */
++ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
++ size);
++ }
++ error = size;
++
++cleanup:
++ brelse(bh);
++
++ return error;
++}
++
++/*
++ * ext2_xattr_list()
++ *
++ * Copy a list of attribute names into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext2_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
++{
++ struct buffer_head *bh = NULL;
++ struct ext2_xattr_entry *entry;
++ unsigned int block, size = 0;
++ char *buf, *end;
++ int error;
++
++ ea_idebug(inode, "buffer=%p, buffer_size=%ld",
++ buffer, (long)buffer_size);
++
++ if (!EXT2_I(inode)->i_file_acl)
++ return 0;
++ block = EXT2_I(inode)->i_file_acl;
++ ea_idebug(inode, "reading block %d", block);
++ bh = sb_bread(inode->i_sb, block);
++ if (!bh)
++ return -EIO;
++ ea_bdebug(bh, "b_count=%d, refcount=%d",
++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++ end = bh->b_data + bh->b_size;
++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++ HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
++ "inode %ld: bad block %d", inode->i_ino, block);
++ error = -EIO;
++ goto cleanup;
++ }
++ /* compute the size required for the list of attribute names */
++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++ entry = EXT2_XATTR_NEXT(entry)) {
++ struct ext2_xattr_handler *handler;
++ struct ext2_xattr_entry *next =
++ EXT2_XATTR_NEXT(entry);
++ if ((char *)next >= end)
++ goto bad_block;
++
++ handler = ext2_xattr_handler(entry->e_name_index);
++ if (handler)
++ size += handler->list(NULL, inode, entry->e_name,
++ entry->e_name_len);
++ }
++
++ if (ext2_xattr_cache_insert(bh))
++ ea_idebug(inode, "cache insert failed");
++ if (!buffer) {
++ error = size;
++ goto cleanup;
++ } else {
++ error = -ERANGE;
++ if (size > buffer_size)
++ goto cleanup;
++ }
++
++ /* list the attribute names */
++ buf = buffer;
++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++ entry = EXT2_XATTR_NEXT(entry)) {
++ struct ext2_xattr_handler *handler;
++
++ handler = ext2_xattr_handler(entry->e_name_index);
++ if (handler)
++ buf += handler->list(buf, inode, entry->e_name,
++ entry->e_name_len);
++ }
++ error = size;
++
++cleanup:
++ brelse(bh);
++
++ return error;
++}
++
++/*
++ * If the EXT2_FEATURE_COMPAT_EXT_ATTR feature of this file system is
++ * not set, set it.
++ */
++static void ext2_xattr_update_super_block(struct super_block *sb)
++{
++ if (EXT2_HAS_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR))
++ return;
++
++ lock_super(sb);
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++ EXT2_SB(sb)->s_feature_compat |= EXT2_FEATURE_COMPAT_EXT_ATTR;
++#endif
++ EXT2_SB(sb)->s_es->s_feature_compat |=
++ cpu_to_le32(EXT2_FEATURE_COMPAT_EXT_ATTR);
++ sb->s_dirt = 1;
++ mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
++ unlock_super(sb);
++}
++
++/*
++ * ext2_xattr_set()
++ *
++ * Create, replace or remove an extended attribute for this inode. Buffer
++ * is NULL to remove an existing extended attribute, and non-NULL to
++ * either replace an existing extended attribute, or create a new extended
++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
++ * specify that an extended attribute must exist and must not exist
++ * previous to the call, respectively.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++int
++ext2_xattr_set(struct inode *inode, int name_index, const char *name,
++ const void *value, size_t value_len, int flags)
++{
++ struct super_block *sb = inode->i_sb;
++ struct buffer_head *bh = NULL;
++ struct ext2_xattr_header *header = NULL;
++ struct ext2_xattr_entry *here, *last;
++ unsigned int name_len;
++ int block = EXT2_I(inode)->i_file_acl;
++ int min_offs = sb->s_blocksize, not_found = 1, free, error;
++ char *end;
++
++ /*
++ * header -- Points either into bh, or to a temporarily
++ * allocated buffer.
++ * here -- The named entry found, or the place for inserting, within
++ * the block pointed to by header.
++ * last -- Points right after the last named entry within the block
++ * pointed to by header.
++ * min_offs -- The offset of the first value (values are aligned
++ * towards the end of the block).
++ * end -- Points right after the block pointed to by header.
++ */
++
++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
++ name_index, name, value, (long)value_len);
++
++ if (IS_RDONLY(inode))
++ return -EROFS;
++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
++ return -EPERM;
++ if (value == NULL)
++ value_len = 0;
++ if (name == NULL)
++ return -EINVAL;
++ name_len = strlen(name);
++ if (name_len > 255 || value_len > sb->s_blocksize)
++ return -ERANGE;
++ down(&ext2_xattr_sem);
++
++ if (block) {
++ /* The inode already has an extended attribute block. */
++
++ bh = sb_bread(sb, block);
++ error = -EIO;
++ if (!bh)
++ goto cleanup;
++ ea_bdebug(bh, "b_count=%d, refcount=%d",
++ atomic_read(&(bh->b_count)),
++ le32_to_cpu(HDR(bh)->h_refcount));
++ header = HDR(bh);
++ end = bh->b_data + bh->b_size;
++ if (header->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++ header->h_blocks != cpu_to_le32(1)) {
++bad_block: ext2_error(sb, "ext2_xattr_set",
++ "inode %ld: bad block %d", inode->i_ino, block);
++ error = -EIO;
++ goto cleanup;
++ }
++ /* Find the named attribute. */
++ here = FIRST_ENTRY(bh);
++ while (!IS_LAST_ENTRY(here)) {
++ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(here);
++ if ((char *)next >= end)
++ goto bad_block;
++ if (!here->e_value_block && here->e_value_size) {
++ int offs = le16_to_cpu(here->e_value_offs);
++ if (offs < min_offs)
++ min_offs = offs;
++ }
++ not_found = name_index - here->e_name_index;
++ if (!not_found)
++ not_found = name_len - here->e_name_len;
++ if (!not_found)
++ not_found = memcmp(name, here->e_name,name_len);
++ if (not_found <= 0)
++ break;
++ here = next;
++ }
++ last = here;
++ /* We still need to compute min_offs and last. */
++ while (!IS_LAST_ENTRY(last)) {
++ struct ext2_xattr_entry *next = EXT2_XATTR_NEXT(last);
++ if ((char *)next >= end)
++ goto bad_block;
++ if (!last->e_value_block && last->e_value_size) {
++ int offs = le16_to_cpu(last->e_value_offs);
++ if (offs < min_offs)
++ min_offs = offs;
++ }
++ last = next;
++ }
++
++ /* Check whether we have enough space left. */
++ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32);
++ } else {
++ /* We will use a new extended attribute block. */
++ free = sb->s_blocksize -
++ sizeof(struct ext2_xattr_header) - sizeof(__u32);
++ here = last = NULL; /* avoid gcc uninitialized warning. */
++ }
++
++ if (not_found) {
++ /* Request to remove a nonexistent attribute? */
++ error = -ENOATTR;
++ if (flags & XATTR_REPLACE)
++ goto cleanup;
++ error = 0;
++ if (value == NULL)
++ goto cleanup;
++ else
++ free -= EXT2_XATTR_LEN(name_len);
++ } else {
++ /* Request to create an existing attribute? */
++ error = -EEXIST;
++ if (flags & XATTR_CREATE)
++ goto cleanup;
++ if (!here->e_value_block && here->e_value_size) {
++ unsigned int size = le32_to_cpu(here->e_value_size);
++
++ if (le16_to_cpu(here->e_value_offs) + size >
++ sb->s_blocksize || size > sb->s_blocksize)
++ goto bad_block;
++ free += EXT2_XATTR_SIZE(size);
++ }
++ }
++ free -= EXT2_XATTR_SIZE(value_len);
++ error = -ENOSPC;
++ if (free < 0)
++ goto cleanup;
++
++ /* Here we know that we can set the new attribute. */
++
++ if (header) {
++ if (header->h_refcount == cpu_to_le32(1)) {
++ ea_bdebug(bh, "modifying in-place");
++ ext2_xattr_cache_remove(bh);
++ } else {
++ int offset;
++
++ ea_bdebug(bh, "cloning");
++ header = kmalloc(bh->b_size, GFP_KERNEL);
++ error = -ENOMEM;
++ if (header == NULL)
++ goto cleanup;
++ memcpy(header, HDR(bh), bh->b_size);
++ header->h_refcount = cpu_to_le32(1);
++ offset = (char *)header - bh->b_data;
++ here = ENTRY((char *)here + offset);
++ last = ENTRY((char *)last + offset);
++ }
++ } else {
++ /* Allocate a buffer where we construct the new block. */
++ header = kmalloc(sb->s_blocksize, GFP_KERNEL);
++ error = -ENOMEM;
++ if (header == NULL)
++ goto cleanup;
++ memset(header, 0, sb->s_blocksize);
++ end = (char *)header + sb->s_blocksize;
++ header->h_magic = cpu_to_le32(EXT2_XATTR_MAGIC);
++ header->h_blocks = header->h_refcount = cpu_to_le32(1);
++ last = here = ENTRY(header+1);
++ }
++
++ if (not_found) {
++ /* Insert the new name. */
++ int size = EXT2_XATTR_LEN(name_len);
++ int rest = (char *)last - (char *)here;
++ memmove((char *)here + size, here, rest);
++ memset(here, 0, size);
++ here->e_name_index = name_index;
++ here->e_name_len = name_len;
++ memcpy(here->e_name, name, name_len);
++ } else {
++ /* Remove the old value. */
++ if (!here->e_value_block && here->e_value_size) {
++ char *first_val = (char *)header + min_offs;
++ int offs = le16_to_cpu(here->e_value_offs);
++ char *val = (char *)header + offs;
++ size_t size = EXT2_XATTR_SIZE(
++ le32_to_cpu(here->e_value_size));
++ memmove(first_val + size, first_val, val - first_val);
++ memset(first_val, 0, size);
++ here->e_value_offs = 0;
++ min_offs += size;
++
++ /* Adjust all value offsets. */
++ last = ENTRY(header+1);
++ while (!IS_LAST_ENTRY(last)) {
++ int o = le16_to_cpu(last->e_value_offs);
++ if (!last->e_value_block && o < offs)
++ last->e_value_offs =
++ cpu_to_le16(o + size);
++ last = EXT2_XATTR_NEXT(last);
++ }
++ }
++ if (value == NULL) {
++ /* Remove this attribute. */
++ if (EXT2_XATTR_NEXT(ENTRY(header+1)) == last) {
++ /* This block is now empty. */
++ error = ext2_xattr_set2(inode, bh, NULL);
++ goto cleanup;
++ } else {
++ /* Remove the old name. */
++ int size = EXT2_XATTR_LEN(name_len);
++ last = ENTRY((char *)last - size);
++ memmove(here, (char*)here + size,
++ (char*)last - (char*)here);
++ memset(last, 0, size);
++ }
++ }
++ }
++
++ if (value != NULL) {
++ /* Insert the new value. */
++ here->e_value_size = cpu_to_le32(value_len);
++ if (value_len) {
++ size_t size = EXT2_XATTR_SIZE(value_len);
++ char *val = (char *)header + min_offs - size;
++ here->e_value_offs =
++ cpu_to_le16((char *)val - (char *)header);
++ memset(val + size - EXT2_XATTR_PAD, 0,
++ EXT2_XATTR_PAD); /* Clear the pad bytes. */
++ memcpy(val, value, value_len);
++ }
++ }
++ ext2_xattr_rehash(header, here);
++
++ error = ext2_xattr_set2(inode, bh, header);
++
++cleanup:
++ brelse(bh);
++ if (!(bh && header == HDR(bh)))
++ kfree(header);
++ up(&ext2_xattr_sem);
++
++ return error;
++}
++
++/*
++ * Second half of ext2_xattr_set(): Update the file system.
++ */
++static int
++ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
++ struct ext2_xattr_header *header)
++{
++ struct super_block *sb = inode->i_sb;
++ struct buffer_head *new_bh = NULL;
++ int error;
++
++ if (header) {
++ new_bh = ext2_xattr_cache_find(inode, header);
++ if (new_bh) {
++ /*
++ * We found an identical block in the cache.
++ * The old block will be released after updating
++ * the inode.
++ */
++ ea_bdebug(old_bh, "reusing block %ld",
++ new_bh->b_blocknr);
++
++ error = -EDQUOT;
++ if (ext2_xattr_quota_alloc(inode, 1))
++ goto cleanup;
++
++ HDR(new_bh)->h_refcount = cpu_to_le32(
++ le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
++ ea_bdebug(new_bh, "refcount now=%d",
++ le32_to_cpu(HDR(new_bh)->h_refcount));
++ } else if (old_bh && header == HDR(old_bh)) {
++ /* Keep this block. */
++ new_bh = old_bh;
++ ext2_xattr_cache_insert(new_bh);
++ } else {
++ /* We need to allocate a new block */
++ int force = EXT2_I(inode)->i_file_acl != 0;
++ int block = ext2_xattr_new_block(inode, &error, force);
++ if (error)
++ goto cleanup;
++ ea_idebug(inode, "creating block %d", block);
++
++ new_bh = sb_getblk(sb, block);
++ if (!new_bh) {
++ ext2_xattr_free_block(inode, block);
++ error = -EIO;
++ goto cleanup;
++ }
++ lock_buffer(new_bh);
++ memcpy(new_bh->b_data, header, new_bh->b_size);
++ mark_buffer_uptodate(new_bh, 1);
++ unlock_buffer(new_bh);
++ ext2_xattr_cache_insert(new_bh);
++
++ ext2_xattr_update_super_block(sb);
++ }
++ mark_buffer_dirty(new_bh);
++ if (IS_SYNC(inode)) {
++ ll_rw_block(WRITE, 1, &new_bh);
++ wait_on_buffer(new_bh);
++ error = -EIO;
++ if (buffer_req(new_bh) && !buffer_uptodate(new_bh))
++ goto cleanup;
++ }
++ }
++
++ /* Update the inode. */
++ EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
++ inode->i_ctime = CURRENT_TIME;
++ if (IS_SYNC(inode)) {
++ error = ext2_sync_inode (inode);
++ if (error)
++ goto cleanup;
++ } else
++ mark_inode_dirty(inode);
++
++ error = 0;
++ if (old_bh && old_bh != new_bh) {
++ /*
++ * If there was an old block, and we are not still using it,
++ * we now release the old block.
++ */
++ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
++
++ if (refcount == 1) {
++ /* Free the old block. */
++ ea_bdebug(old_bh, "freeing");
++ ext2_xattr_free_block(inode, old_bh->b_blocknr);
++ mark_buffer_clean(old_bh);
++ } else {
++ /* Decrement the refcount only. */
++ refcount--;
++ HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
++ ext2_xattr_quota_free(inode);
++ mark_buffer_dirty(old_bh);
++ ea_bdebug(old_bh, "refcount now=%d", refcount);
++ }
++ }
++
++cleanup:
++ if (old_bh != new_bh)
++ brelse(new_bh);
++
++ return error;
++}
++
++/*
++ * ext2_xattr_delete_inode()
++ *
++ * Free extended attribute resources associated with this inode. This
++ * is called immediately before an inode is freed.
++ */
++void
++ext2_xattr_delete_inode(struct inode *inode)
++{
++ struct buffer_head *bh;
++ unsigned int block = EXT2_I(inode)->i_file_acl;
++
++ if (!block)
++ return;
++ down(&ext2_xattr_sem);
++
++ bh = sb_bread(inode->i_sb, block);
++ if (!bh) {
++ ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
++ "inode %ld: block %d read error", inode->i_ino, block);
++ goto cleanup;
++ }
++ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
++ if (HDR(bh)->h_magic != cpu_to_le32(EXT2_XATTR_MAGIC) ||
++ HDR(bh)->h_blocks != cpu_to_le32(1)) {
++ ext2_error(inode->i_sb, "ext2_xattr_delete_inode",
++ "inode %ld: bad block %d", inode->i_ino, block);
++ goto cleanup;
++ }
++ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
++ if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
++ ext2_xattr_cache_remove(bh);
++ ext2_xattr_free_block(inode, block);
++ bforget(bh);
++ bh = NULL;
++ } else {
++ HDR(bh)->h_refcount = cpu_to_le32(
++ le32_to_cpu(HDR(bh)->h_refcount) - 1);
++ mark_buffer_dirty(bh);
++ if (IS_SYNC(inode)) {
++ ll_rw_block(WRITE, 1, &bh);
++ wait_on_buffer(bh);
++ }
++ ext2_xattr_quota_free(inode);
++ }
++ EXT2_I(inode)->i_file_acl = 0;
++
++cleanup:
++ brelse(bh);
++ up(&ext2_xattr_sem);
++}
++
++/*
++ * ext2_xattr_put_super()
++ *
++ * This is called when a file system is unmounted.
++ */
++void
++ext2_xattr_put_super(struct super_block *sb)
++{
++#ifdef CONFIG_EXT2_FS_XATTR_SHARING
++ mb_cache_shrink(ext2_xattr_cache, sb->s_dev);
++#endif
++}
++
++#ifdef CONFIG_EXT2_FS_XATTR_SHARING
++
++/*
++ * ext2_xattr_cache_insert()
++ *
++ * Create a new entry in the extended attribute cache, and insert
++ * it unless such an entry is already in the cache.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++static int
++ext2_xattr_cache_insert(struct buffer_head *bh)
++{
++ __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
++ struct mb_cache_entry *ce;
++ int error;
++
++ ce = mb_cache_entry_alloc(ext2_xattr_cache);
++ if (!ce)
++ return -ENOMEM;
++ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash);
++ if (error) {
++ mb_cache_entry_free(ce);
++ if (error == -EBUSY) {
++ ea_bdebug(bh, "already in cache (%d cache entries)",
++ atomic_read(&ext2_xattr_cache->c_entry_count));
++ error = 0;
++ }
++ } else {
++ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
++ atomic_read(&ext2_xattr_cache->c_entry_count));
++ mb_cache_entry_release(ce);
++ }
++ return error;
++}
++
++/*
++ * ext2_xattr_cmp()
++ *
++ * Compare two extended attribute blocks for equality.
++ *
++ * Returns 0 if the blocks are equal, 1 if they differ, and
++ * a negative error number on errors.
++ */
++static int
++ext2_xattr_cmp(struct ext2_xattr_header *header1,
++ struct ext2_xattr_header *header2)
++{
++ struct ext2_xattr_entry *entry1, *entry2;
++
++ entry1 = ENTRY(header1+1);
++ entry2 = ENTRY(header2+1);
++ while (!IS_LAST_ENTRY(entry1)) {
++ if (IS_LAST_ENTRY(entry2))
++ return 1;
++ if (entry1->e_hash != entry2->e_hash ||
++ entry1->e_name_len != entry2->e_name_len ||
++ entry1->e_value_size != entry2->e_value_size ||
++ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
++ return 1;
++ if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
++ return -EIO;
++ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
++ (char *)header2 + le16_to_cpu(entry2->e_value_offs),
++ le32_to_cpu(entry1->e_value_size)))
++ return 1;
++
++ entry1 = EXT2_XATTR_NEXT(entry1);
++ entry2 = EXT2_XATTR_NEXT(entry2);
++ }
++ if (!IS_LAST_ENTRY(entry2))
++ return 1;
++ return 0;
++}
++
++/*
++ * ext2_xattr_cache_find()
++ *
++ * Find an identical extended attribute block.
++ *
++ * Returns a pointer to the block found, or NULL if such a block was
++ * not found or an error occurred.
++ */
++static struct buffer_head *
++ext2_xattr_cache_find(struct inode *inode, struct ext2_xattr_header *header)
++{
++ __u32 hash = le32_to_cpu(header->h_hash);
++ struct mb_cache_entry *ce;
++
++ if (!header->h_hash)
++ return NULL; /* never share */
++ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
++ ce = mb_cache_entry_find_first(ext2_xattr_cache, 0, inode->i_dev, hash);
++ while (ce) {
++ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
++
++ if (!bh) {
++ ext2_error(inode->i_sb, "ext2_xattr_cache_find",
++ "inode %ld: block %ld read error",
++ inode->i_ino, ce->e_block);
++ } else if (le32_to_cpu(HDR(bh)->h_refcount) >
++ EXT2_XATTR_REFCOUNT_MAX) {
++ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block,
++ le32_to_cpu(HDR(bh)->h_refcount),
++ EXT2_XATTR_REFCOUNT_MAX);
++ } else if (!ext2_xattr_cmp(header, HDR(bh))) {
++ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
++ mb_cache_entry_release(ce);
++ return bh;
++ }
++ brelse(bh);
++ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash);
++ }
++ return NULL;
++}
++
++/*
++ * ext2_xattr_cache_remove()
++ *
++ * Remove the cache entry of a block from the cache. Called when a
++ * block becomes invalid.
++ */
++static void
++ext2_xattr_cache_remove(struct buffer_head *bh)
++{
++ struct mb_cache_entry *ce;
++
++ ce = mb_cache_entry_get(ext2_xattr_cache, bh->b_dev, bh->b_blocknr);
++ if (ce) {
++ ea_bdebug(bh, "removing (%d cache entries remaining)",
++ atomic_read(&ext2_xattr_cache->c_entry_count)-1);
++ mb_cache_entry_free(ce);
++ } else
++ ea_bdebug(bh, "no cache entry");
++}
++
++#define NAME_HASH_SHIFT 5
++#define VALUE_HASH_SHIFT 16
++
++/*
++ * ext2_xattr_hash_entry()
++ *
++ * Compute the hash of an extended attribute.
++ */
++static inline void ext2_xattr_hash_entry(struct ext2_xattr_header *header,
++ struct ext2_xattr_entry *entry)
++{
++ __u32 hash = 0;
++ char *name = entry->e_name;
++ int n;
++
++ for (n=0; n < entry->e_name_len; n++) {
++ hash = (hash << NAME_HASH_SHIFT) ^
++ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
++ *name++;
++ }
++
++ if (entry->e_value_block == 0 && entry->e_value_size != 0) {
++ __u32 *value = (__u32 *)((char *)header +
++ le16_to_cpu(entry->e_value_offs));
++ for (n = (le32_to_cpu(entry->e_value_size) +
++ EXT2_XATTR_ROUND) >> EXT2_XATTR_PAD_BITS; n; n--) {
++ hash = (hash << VALUE_HASH_SHIFT) ^
++ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
++ le32_to_cpu(*value++);
++ }
++ }
++ entry->e_hash = cpu_to_le32(hash);
++}
++
++#undef NAME_HASH_SHIFT
++#undef VALUE_HASH_SHIFT
++
++#define BLOCK_HASH_SHIFT 16
++
++/*
++ * ext2_xattr_rehash()
++ *
++ * Re-compute the extended attribute hash value after an entry has changed.
++ */
++static void ext2_xattr_rehash(struct ext2_xattr_header *header,
++ struct ext2_xattr_entry *entry)
++{
++ struct ext2_xattr_entry *here;
++ __u32 hash = 0;
++
++ ext2_xattr_hash_entry(header, entry);
++ here = ENTRY(header+1);
++ while (!IS_LAST_ENTRY(here)) {
++ if (!here->e_hash) {
++ /* Block is not shared if an entry's hash value == 0 */
++ hash = 0;
++ break;
++ }
++ hash = (hash << BLOCK_HASH_SHIFT) ^
++ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
++ le32_to_cpu(here->e_hash);
++ here = EXT2_XATTR_NEXT(here);
++ }
++ header->h_hash = cpu_to_le32(hash);
++}
++
++#undef BLOCK_HASH_SHIFT
++
++int __init
++init_ext2_xattr(void)
++{
++ ext2_xattr_cache = mb_cache_create("ext2_xattr", NULL,
++ sizeof(struct mb_cache_entry) +
++ sizeof(struct mb_cache_entry_index), 1, 61);
++ if (!ext2_xattr_cache)
++ return -ENOMEM;
++
++ return 0;
++}
++
++void
++exit_ext2_xattr(void)
++{
++ mb_cache_destroy(ext2_xattr_cache);
++}
++
++#else /* CONFIG_EXT2_FS_XATTR_SHARING */
++
++int __init
++init_ext2_xattr(void)
++{
++ return 0;
++}
++
++void
++exit_ext2_xattr(void)
++{
++}
++
++#endif /* CONFIG_EXT2_FS_XATTR_SHARING */
+--- /dev/null 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.22-ac1-alexey/fs/ext2/xattr_user.c 2003-09-25 23:57:02.000000000 +0400
+@@ -0,0 +1,103 @@
++/*
++ * linux/fs/ext2/xattr_user.c
++ * Handler for extended user attributes.
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/fs.h>
++#include <linux/ext2_fs.h>
++#include <linux/ext2_xattr.h>
++
++#ifdef CONFIG_EXT2_FS_POSIX_ACL
++# include <linux/ext2_acl.h>
++#endif
++
++#define XATTR_USER_PREFIX "user."
++
++static size_t
++ext2_xattr_user_list(char *list, struct inode *inode,
++ const char *name, int name_len)
++{
++ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1;
++
++ if (!test_opt(inode->i_sb, XATTR_USER))
++ return 0;
++
++ if (list) {
++ memcpy(list, XATTR_USER_PREFIX, prefix_len);
++ memcpy(list+prefix_len, name, name_len);
++ list[prefix_len + name_len] = '\0';
++ }
++ return prefix_len + name_len + 1;
++}
++
++static int
++ext2_xattr_user_get(struct inode *inode, const char *name,
++ void *buffer, size_t size)
++{
++ int error;
++
++ if (strcmp(name, "") == 0)
++ return -EINVAL;
++ if (!test_opt(inode->i_sb, XATTR_USER))
++ return -ENOTSUP;
++#ifdef CONFIG_EXT2_FS_POSIX_ACL
++ error = ext2_permission_locked(inode, MAY_READ);
++#else
++ error = permission(inode, MAY_READ);
++#endif
++ if (error)
++ return error;
++
++ return ext2_xattr_get(inode, EXT2_XATTR_INDEX_USER, name,
++ buffer, size);
++}
++
++static int
++ext2_xattr_user_set(struct inode *inode, const char *name,
++ const void *value, size_t size, int flags)
++{
++ int error;
++
++ if (strcmp(name, "") == 0)
++ return -EINVAL;
++ if (!test_opt(inode->i_sb, XATTR_USER))
++ return -ENOTSUP;
++ if ( !S_ISREG(inode->i_mode) &&
++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
++ return -EPERM;
++#ifdef CONFIG_EXT2_FS_POSIX_ACL
++ error = ext2_permission_locked(inode, MAY_WRITE);
++#else
++ error = permission(inode, MAY_WRITE);
++#endif
++ if (error)
++ return error;
++
++ return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name,
++ value, size, flags);
++}
++
++struct ext2_xattr_handler ext2_xattr_user_handler = {
++ prefix: XATTR_USER_PREFIX,
++ list: ext2_xattr_user_list,
++ get: ext2_xattr_user_get,
++ set: ext2_xattr_user_set,
++};
++
++int __init
++init_ext2_xattr_user(void)
++{
++ return ext2_xattr_register(EXT2_XATTR_INDEX_USER,
++ &ext2_xattr_user_handler);
++}
++
++void
++exit_ext2_xattr_user(void)
++{
++ ext2_xattr_unregister(EXT2_XATTR_INDEX_USER,
++ &ext2_xattr_user_handler);
++}
+--- /dev/null 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.22-ac1-alexey/fs/ext3/ext3-exports.c 2003-09-25 23:57:02.000000000 +0400
+@@ -0,0 +1,13 @@
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
++
++EXPORT_SYMBOL(ext3_force_commit);
++EXPORT_SYMBOL(ext3_bread);
++EXPORT_SYMBOL(ext3_xattr_register);
++EXPORT_SYMBOL(ext3_xattr_unregister);
++EXPORT_SYMBOL(ext3_xattr_get);
++EXPORT_SYMBOL(ext3_xattr_list);
++EXPORT_SYMBOL(ext3_xattr_set);
+--- linux-2.4.22-ac1/fs/ext3/file.c~xattr-0.8.54-2.4.22-rh 2003-09-25 14:55:12.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/file.c 2003-09-25 23:57:02.000000000 +0400
+@@ -23,6 +23,7 @@
+ #include <linux/locks.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/ext3_jbd.h>
+ #include <linux/smp_lock.h>
+
+@@ -127,5 +128,9 @@ struct file_operations ext3_file_operati
+ struct inode_operations ext3_file_inode_operations = {
+ truncate: ext3_truncate, /* BKL held */
+ setattr: ext3_setattr, /* BKL held */
++ setxattr: ext3_setxattr, /* BKL held */
++ getxattr: ext3_getxattr, /* BKL held */
++ listxattr: ext3_listxattr, /* BKL held */
++ removexattr: ext3_removexattr, /* BKL held */
+ };
+
+--- linux-2.4.22-ac1/fs/ext3/ialloc.c~xattr-0.8.54-2.4.22-rh 2003-09-25 14:16:29.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/ialloc.c 2003-09-25 23:57:02.000000000 +0400
+@@ -17,6 +17,7 @@
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+ #include <linux/locks.h>
+@@ -217,6 +218,7 @@ void ext3_free_inode (handle_t *handle,
+ * as writing the quota to disk may need the lock as well.
+ */
+ DQUOT_INIT(inode);
++ ext3_xattr_delete_inode(handle, inode);
+ DQUOT_FREE_INODE(inode);
+ DQUOT_DROP(inode);
+
+--- linux-2.4.22-ac1/fs/ext3/inode.c~xattr-0.8.54-2.4.22-rh 2003-09-25 14:16:29.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/inode.c 2003-09-26 00:10:09.000000000 +0400
+@@ -39,6 +39,18 @@
+ */
+ #undef SEARCH_FROM_ZERO
+
++/*
++ * Test whether an inode is a fast symlink.
++ */
++static inline int ext3_inode_is_fast_symlink(struct inode *inode)
++{
++ int ea_blocks = inode->u.ext3_i.i_file_acl ?
++ (inode->i_sb->s_blocksize >> 9) : 0;
++
++ return (S_ISLNK(inode->i_mode) &&
++ inode->i_blocks - ea_blocks == 0);
++}
++
+ /* The ext3 forget function must perform a revoke if we are freeing data
+ * which has been journaled. Metadata (eg. indirect blocks) must be
+ * revoked in all cases.
+@@ -48,7 +60,7 @@
+ * still needs to be revoked.
+ */
+
+-static int ext3_forget(handle_t *handle, int is_metadata,
++int ext3_forget(handle_t *handle, int is_metadata,
+ struct inode *inode, struct buffer_head *bh,
+ int blocknr)
+ {
+@@ -179,9 +191,7 @@ void ext3_delete_inode (struct inode * i
+ {
+ handle_t *handle;
+
+- if (is_bad_inode(inode) ||
+- inode->i_ino == EXT3_ACL_IDX_INO ||
+- inode->i_ino == EXT3_ACL_DATA_INO)
++ if (is_bad_inode(inode))
+ goto no_delete;
+
+ lock_kernel();
+@@ -1874,6 +1884,8 @@ void ext3_truncate(struct inode * inode)
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ S_ISLNK(inode->i_mode)))
+ return;
++ if (ext3_inode_is_fast_symlink(inode))
++ return;
+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+ return;
+
+@@ -2021,8 +2033,6 @@ int ext3_get_inode_loc (struct inode *in
+ struct ext3_group_desc * gdp;
+
+ if ((inode->i_ino != EXT3_ROOT_INO &&
+- inode->i_ino != EXT3_ACL_IDX_INO &&
+- inode->i_ino != EXT3_ACL_DATA_INO &&
+ inode->i_ino != EXT3_JOURNAL_INO &&
+ inode->i_ino < EXT3_FIRST_INO(inode->i_sb)) ||
+ inode->i_ino > le32_to_cpu(
+@@ -2163,10 +2173,7 @@ void ext3_read_inode(struct inode * inod
+ inode->u.ext3_i.i_data[block] = iloc.raw_inode->i_block[block];
+ INIT_LIST_HEAD(&inode->u.ext3_i.i_orphan);
+
+- if (inode->i_ino == EXT3_ACL_IDX_INO ||
+- inode->i_ino == EXT3_ACL_DATA_INO)
+- /* Nothing to do */ ;
+- else if (S_ISREG(inode->i_mode)) {
++ if (S_ISREG(inode->i_mode)) {
+ inode->i_op = &ext3_file_inode_operations;
+ inode->i_fop = &ext3_file_operations;
+ inode->i_mapping->a_ops = &ext3_aops;
+@@ -2174,15 +2181,17 @@ void ext3_read_inode(struct inode * inod
+ inode->i_op = &ext3_dir_inode_operations;
+ inode->i_fop = &ext3_dir_operations;
+ } else if (S_ISLNK(inode->i_mode)) {
+- if (!inode->i_blocks)
++ if (ext3_inode_is_fast_symlink(inode))
+ inode->i_op = &ext3_fast_symlink_inode_operations;
+ else {
+- inode->i_op = &page_symlink_inode_operations;
++ inode->i_op = &ext3_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ext3_aops;
+ }
+- } else
++ } else {
++ inode->i_op = &ext3_special_inode_operations;
+ init_special_inode(inode, inode->i_mode,
+ le32_to_cpu(iloc.raw_inode->i_block[0]));
++ }
+ brelse(iloc.bh);
+ ext3_set_inode_flags(inode);
+ return;
+--- linux-2.4.22-ac1/fs/ext3/Makefile~xattr-0.8.54-2.4.22-rh 2003-09-25 14:55:12.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/Makefile 2003-09-25 23:57:02.000000000 +0400
+@@ -1,5 +1,5 @@
+ #
+-# Makefile for the linux ext2-filesystem routines.
++# Makefile for the linux ext3-filesystem routines.
+ #
+ # Note! Dependencies are done automagically by 'make dep', which also
+ # removes any old dependencies. DON'T put your own dependencies here
+@@ -9,10 +9,14 @@
+
+ O_TARGET := ext3.o
+
+-export-objs := super.o inode.o
++export-objs := ext3-exports.o
+
+ obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
+- ioctl.o namei.o super.o symlink.o hash.o
++ ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o
+ obj-m := $(O_TARGET)
+
++export-objs += xattr.o
++obj-$(CONFIG_EXT3_FS_XATTR) += xattr.o
++obj-$(CONFIG_EXT3_FS_XATTR_USER) += xattr_user.o
++
+ include $(TOPDIR)/Rules.make
+--- linux-2.4.22-ac1/fs/ext3/namei.c~xattr-0.8.54-2.4.22-rh 2003-09-25 14:58:37.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/namei.c 2003-09-25 23:57:02.000000000 +0400
+@@ -29,6 +29,7 @@
+ #include <linux/sched.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/fcntl.h>
+ #include <linux/stat.h>
+ #include <linux/string.h>
+@@ -1614,7 +1615,7 @@ static int ext3_mkdir(struct inode * dir
+ if (IS_SYNC(dir))
+ handle->h_sync = 1;
+
+- inode = ext3_new_inode (handle, dir, S_IFDIR);
++ inode = ext3_new_inode (handle, dir, S_IFDIR | mode);
+ err = PTR_ERR(inode);
+ if (IS_ERR(inode))
+ goto out_stop;
+@@ -1622,7 +1623,6 @@ static int ext3_mkdir(struct inode * dir
+ inode->i_op = &ext3_dir_inode_operations;
+ inode->i_fop = &ext3_dir_operations;
+ inode->i_size = EXT3_I(inode)->i_disksize = inode->i_sb->s_blocksize;
+- inode->i_blocks = 0;
+ dir_block = ext3_bread (handle, inode, 0, 1, &err);
+ if (!dir_block) {
+ inode->i_nlink--; /* is this nlink == 0? */
+@@ -1649,9 +1649,6 @@ static int ext3_mkdir(struct inode * dir
+ BUFFER_TRACE(dir_block, "call ext3_journal_dirty_metadata");
+ ext3_journal_dirty_metadata(handle, dir_block);
+ brelse (dir_block);
+- inode->i_mode = S_IFDIR | mode;
+- if (dir->i_mode & S_ISGID)
+- inode->i_mode |= S_ISGID;
+ ext3_mark_inode_dirty(handle, inode);
+ err = ext3_add_entry (handle, dentry, inode);
+ if (err) {
+@@ -2020,7 +2017,7 @@ static int ext3_symlink (struct inode *
+ goto out_stop;
+
+ if (l > sizeof (EXT3_I(inode)->i_data)) {
+- inode->i_op = &page_symlink_inode_operations;
++ inode->i_op = &ext3_symlink_inode_operations;
+ inode->i_mapping->a_ops = &ext3_aops;
+ /*
+ * block_symlink() calls back into ext3_prepare/commit_write.
+@@ -2245,4 +2242,16 @@ struct inode_operations ext3_dir_inode_o
+ rmdir: ext3_rmdir, /* BKL held */
+ mknod: ext3_mknod, /* BKL held */
+ rename: ext3_rename, /* BKL held */
++ setxattr: ext3_setxattr, /* BKL held */
++ getxattr: ext3_getxattr, /* BKL held */
++ listxattr: ext3_listxattr, /* BKL held */
++ removexattr: ext3_removexattr, /* BKL held */
+ };
++
++struct inode_operations ext3_special_inode_operations = {
++ setxattr: ext3_setxattr, /* BKL held */
++ getxattr: ext3_getxattr, /* BKL held */
++ listxattr: ext3_listxattr, /* BKL held */
++ removexattr: ext3_removexattr, /* BKL held */
++};
++
+--- linux-2.4.22-ac1/fs/ext3/super.c~xattr-0.8.54-2.4.22-rh 2003-09-25 14:55:12.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/ext3/super.c 2003-09-26 00:12:23.000000000 +0400
+@@ -24,6 +24,7 @@
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
+ #include <linux/ext3_jbd.h>
++#include <linux/ext3_xattr.h>
+ #include <linux/slab.h>
+ #include <linux/init.h>
+ #include <linux/locks.h>
+@@ -406,6 +407,7 @@ void ext3_put_super (struct super_block
+ kdev_t j_dev = sbi->s_journal->j_dev;
+ int i;
+
++ ext3_xattr_put_super(sb);
+ journal_destroy(sbi->s_journal);
+ if (!(sb->s_flags & MS_RDONLY)) {
+ EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
+@@ -506,6 +508,7 @@ static int parse_options (char * options
+ int is_remount)
+ {
+ unsigned long *mount_options = &sbi->s_mount_opt;
++
+ uid_t *resuid = &sbi->s_resuid;
+ gid_t *resgid = &sbi->s_resgid;
+ char * this_char;
+@@ -518,6 +521,13 @@ static int parse_options (char * options
+ this_char = strtok (NULL, ",")) {
+ if ((value = strchr (this_char, '=')) != NULL)
+ *value++ = 0;
++#ifdef CONFIG_EXT3_FS_XATTR_USER
++ if (!strcmp (this_char, "user_xattr"))
++ set_opt (*mount_options, XATTR_USER);
++ else if (!strcmp (this_char, "nouser_xattr"))
++ clear_opt (*mount_options, XATTR_USER);
++ else
++#endif
+ if (!strcmp (this_char, "bsddf"))
+ clear_opt (*mount_options, MINIX_DF);
+ else if (!strcmp (this_char, "nouid32")) {
+@@ -935,6 +945,12 @@ struct super_block * ext3_read_super (st
+ sbi->s_mount_opt = 0;
+ sbi->s_resuid = EXT3_DEF_RESUID;
+ sbi->s_resgid = EXT3_DEF_RESGID;
++
++ /* Default extended attribute flags */
++#ifdef CONFIG_EXT3_FS_XATTR_USER
++ /* set_opt(sbi->s_mount_opt, XATTR_USER); */
++#endif
++
+ if (!parse_options ((char *) data, &sb_block, sbi, &journal_inum, 0)) {
+ sb->s_dev = 0;
+ goto out_fail;
+@@ -1839,22 +1855,35 @@ static DECLARE_FSTYPE_DEV(ext3_fs_type,
+
+ static int __init init_ext3_fs(void)
+ {
++ int error;
+ #ifdef CONFIG_QUOTA
+ init_dquot_operations(&ext3_qops);
+ old_sync_dquot = ext3_qops.sync_dquot;
+ ext3_qops.sync_dquot = ext3_sync_dquot;
+ #endif
+- return register_filesystem(&ext3_fs_type);
++ error = init_ext3_xattr();
++ if (error)
++ return error;
++ error = init_ext3_xattr_user();
++ if (error)
++ goto fail;
++ error = register_filesystem(&ext3_fs_type);
++ if (!error)
++ return 0;
++
++ exit_ext3_xattr_user();
++fail:
++ exit_ext3_xattr();
++ return error;
+ }
+
+ static void __exit exit_ext3_fs(void)
+ {
+ unregister_filesystem(&ext3_fs_type);
++ exit_ext3_xattr_user();
++ exit_ext3_xattr();
+ }
+
+-EXPORT_SYMBOL(ext3_force_commit);
+-EXPORT_SYMBOL(ext3_bread);
+-
+ MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
+ MODULE_DESCRIPTION("Second Extended Filesystem with journaling extensions");
+ MODULE_LICENSE("GPL");
+--- linux-2.4.22-ac1/fs/ext3/symlink.c~xattr-0.8.54-2.4.22-rh 2001-11-10 01:25:04.000000000 +0300
++++ linux-2.4.22-ac1-alexey/fs/ext3/symlink.c 2003-09-25 23:57:02.000000000 +0400
+@@ -20,6 +20,7 @@
+ #include <linux/fs.h>
+ #include <linux/jbd.h>
+ #include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
+
+ static int ext3_readlink(struct dentry *dentry, char *buffer, int buflen)
+ {
+@@ -33,7 +34,20 @@ static int ext3_follow_link(struct dentr
+ return vfs_follow_link(nd, s);
+ }
+
++struct inode_operations ext3_symlink_inode_operations = {
++ readlink: page_readlink, /* BKL not held. Don't need */
++ follow_link: page_follow_link, /* BKL not held. Don't need */
++ setxattr: ext3_setxattr, /* BKL held */
++ getxattr: ext3_getxattr, /* BKL held */
++ listxattr: ext3_listxattr, /* BKL held */
++ removexattr: ext3_removexattr, /* BKL held */
++};
++
+ struct inode_operations ext3_fast_symlink_inode_operations = {
+ readlink: ext3_readlink, /* BKL not held. Don't need */
+ follow_link: ext3_follow_link, /* BKL not held. Don't need */
++ setxattr: ext3_setxattr, /* BKL held */
++ getxattr: ext3_getxattr, /* BKL held */
++ listxattr: ext3_listxattr, /* BKL held */
++ removexattr: ext3_removexattr, /* BKL held */
+ };
+--- /dev/null 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.22-ac1-alexey/fs/ext3/xattr.c 2003-09-25 23:57:02.000000000 +0400
+@@ -0,0 +1,1225 @@
++/*
++ * linux/fs/ext3/xattr.c
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ *
++ * Fix by Harrison Xing <harrison@mountainviewdata.com>.
++ * Ext3 code with a lot of help from Eric Jarman <ejarman@acm.org>.
++ * Extended attributes for symlinks and special files added per
++ * suggestion of Luka Renko <luka.renko@hermes.si>.
++ */
++
++/*
++ * Extended attributes are stored on disk blocks allocated outside of
++ * any inode. The i_file_acl field is then made to point to this allocated
++ * block. If all extended attributes of an inode are identical, these
++ * inodes may share the same extended attribute block. Such situations
++ * are automatically detected by keeping a cache of recent attribute block
++ * numbers and hashes over the block's contents in memory.
++ *
++ *
++ * Extended attribute block layout:
++ *
++ * +------------------+
++ * | header |
++ * | entry 1 | |
++ * | entry 2 | | growing downwards
++ * | entry 3 | v
++ * | four null bytes |
++ * | . . . |
++ * | value 1 | ^
++ * | value 3 | | growing upwards
++ * | value 2 | |
++ * +------------------+
++ *
++ * The block header is followed by multiple entry descriptors. These entry
++ * descriptors are variable in size, and aligned to EXT3_XATTR_PAD
++ * byte boundaries. The entry descriptors are sorted by attribute name,
++ * so that two extended attribute blocks can be compared efficiently.
++ *
++ * Attribute values are aligned to the end of the block, stored in
++ * no specific order. They are also padded to EXT3_XATTR_PAD byte
++ * boundaries. No additional gaps are left between them.
++ *
++ * Locking strategy
++ * ----------------
++ * The VFS already holds the BKL and the inode->i_sem semaphore when any of
++ * the xattr inode operations are called, so we are guaranteed that only one
++ * process accesses extended attributes of an inode at any time.
++ *
++ * For writing we also grab the ext3_xattr_sem semaphore. This ensures that
++ * only a single process is modifying an extended attribute block, even
++ * if the block is shared among inodes.
++ *
++ * Note for porting to 2.5
++ * -----------------------
++ * The BKL will no longer be held in the xattr inode operations.
++ */
++
++#include <linux/module.h>
++#include <linux/fs.h>
++#include <linux/locks.h>
++#include <linux/slab.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
++#include <linux/mbcache.h>
++#include <linux/quotaops.h>
++#include <asm/semaphore.h>
++#include <linux/compatmac.h>
++
++#define EXT3_EA_USER "user."
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++# define mark_buffer_dirty(bh) mark_buffer_dirty(bh, 1)
++#endif
++
++#define HDR(bh) ((struct ext3_xattr_header *)((bh)->b_data))
++#define ENTRY(ptr) ((struct ext3_xattr_entry *)(ptr))
++#define FIRST_ENTRY(bh) ENTRY(HDR(bh)+1)
++#define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0)
++
++#ifdef EXT3_XATTR_DEBUG
++# define ea_idebug(inode, f...) do { \
++ printk(KERN_DEBUG "inode %s:%ld: ", \
++ kdevname(inode->i_dev), inode->i_ino); \
++ printk(f); \
++ printk("\n"); \
++ } while (0)
++# define ea_bdebug(bh, f...) do { \
++ printk(KERN_DEBUG "block %s:%ld: ", \
++ kdevname(bh->b_dev), bh->b_blocknr); \
++ printk(f); \
++ printk("\n"); \
++ } while (0)
++#else
++# define ea_idebug(f...)
++# define ea_bdebug(f...)
++#endif
++
++static int ext3_xattr_set2(handle_t *, struct inode *, struct buffer_head *,
++ struct ext3_xattr_header *);
++
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++
++static int ext3_xattr_cache_insert(struct buffer_head *);
++static struct buffer_head *ext3_xattr_cache_find(struct inode *,
++ struct ext3_xattr_header *);
++static void ext3_xattr_cache_remove(struct buffer_head *);
++static void ext3_xattr_rehash(struct ext3_xattr_header *,
++ struct ext3_xattr_entry *);
++
++static struct mb_cache *ext3_xattr_cache;
++
++#else
++# define ext3_xattr_cache_insert(bh) 0
++# define ext3_xattr_cache_find(inode, header) NULL
++# define ext3_xattr_cache_remove(bh) while(0) {}
++# define ext3_xattr_rehash(header, entry) while(0) {}
++#endif
++
++/*
++ * If a file system does not share extended attributes among inodes,
++ * we should not need the ext3_xattr_sem semaphore. However, the
++ * filesystem may still contain shared blocks, so we always take
++ * the lock.
++ */
++
++DECLARE_MUTEX(ext3_xattr_sem);
++
++static inline int
++ext3_xattr_new_block(handle_t *handle, struct inode *inode,
++ int * errp, int force)
++{
++ struct super_block *sb = inode->i_sb;
++ int goal = le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
++ EXT3_I(inode)->i_block_group * EXT3_BLOCKS_PER_GROUP(sb);
++
++ /* How can we enforce the allocation? */
++ int block = ext3_new_block(handle, inode, goal, 0, 0, errp);
++#ifdef OLD_QUOTAS
++ if (!*errp)
++ inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#endif
++ return block;
++}
++
++static inline int
++ext3_xattr_quota_alloc(struct inode *inode, int force)
++{
++ /* How can we enforce the allocation? */
++#ifdef OLD_QUOTAS
++ int error = DQUOT_ALLOC_BLOCK(inode->i_sb, inode, 1);
++ if (!error)
++ inode->i_blocks += inode->i_sb->s_blocksize >> 9;
++#else
++ int error = DQUOT_ALLOC_BLOCK(inode, 1);
++#endif
++ return error;
++}
++
++#ifdef OLD_QUOTAS
++
++static inline void
++ext3_xattr_quota_free(struct inode *inode)
++{
++ DQUOT_FREE_BLOCK(inode->i_sb, inode, 1);
++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++static inline void
++ext3_xattr_free_block(handle_t *handle, struct inode * inode,
++ unsigned long block)
++{
++ ext3_free_blocks(handle, inode, block, 1);
++ inode->i_blocks -= inode->i_sb->s_blocksize >> 9;
++}
++
++#else
++# define ext3_xattr_quota_free(inode) \
++ DQUOT_FREE_BLOCK(inode, 1)
++# define ext3_xattr_free_block(handle, inode, block) \
++ ext3_free_blocks(handle, inode, block, 1)
++#endif
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,18)
++
++static inline struct buffer_head *
++sb_bread(struct super_block *sb, int block)
++{
++ return bread(sb->s_dev, block, sb->s_blocksize);
++}
++
++static inline struct buffer_head *
++sb_getblk(struct super_block *sb, int block)
++{
++ return getblk(sb->s_dev, block, sb->s_blocksize);
++}
++
++#endif
++
++struct ext3_xattr_handler *ext3_xattr_handlers[EXT3_XATTR_INDEX_MAX];
++rwlock_t ext3_handler_lock = RW_LOCK_UNLOCKED;
++
++int
++ext3_xattr_register(int name_index, struct ext3_xattr_handler *handler)
++{
++ int error = -EINVAL;
++
++ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++ write_lock(&ext3_handler_lock);
++ if (!ext3_xattr_handlers[name_index-1]) {
++ ext3_xattr_handlers[name_index-1] = handler;
++ error = 0;
++ }
++ write_unlock(&ext3_handler_lock);
++ }
++ return error;
++}
++
++void
++ext3_xattr_unregister(int name_index, struct ext3_xattr_handler *handler)
++{
++ if (name_index > 0 || name_index <= EXT3_XATTR_INDEX_MAX) {
++ write_lock(&ext3_handler_lock);
++ ext3_xattr_handlers[name_index-1] = NULL;
++ write_unlock(&ext3_handler_lock);
++ }
++}
++
++static inline const char *
++strcmp_prefix(const char *a, const char *a_prefix)
++{
++ while (*a_prefix && *a == *a_prefix) {
++ a++;
++ a_prefix++;
++ }
++ return *a_prefix ? NULL : a;
++}
++
++/*
++ * Decode the extended attribute name, and translate it into
++ * the name_index and name suffix.
++ */
++static inline struct ext3_xattr_handler *
++ext3_xattr_resolve_name(const char **name)
++{
++ struct ext3_xattr_handler *handler = NULL;
++ int i;
++
++ if (!*name)
++ return NULL;
++ read_lock(&ext3_handler_lock);
++ for (i=0; i<EXT3_XATTR_INDEX_MAX; i++) {
++ if (ext3_xattr_handlers[i]) {
++ const char *n = strcmp_prefix(*name,
++ ext3_xattr_handlers[i]->prefix);
++ if (n) {
++ handler = ext3_xattr_handlers[i];
++ *name = n;
++ break;
++ }
++ }
++ }
++ read_unlock(&ext3_handler_lock);
++ return handler;
++}
++
++static inline struct ext3_xattr_handler *
++ext3_xattr_handler(int name_index)
++{
++ struct ext3_xattr_handler *handler = NULL;
++ if (name_index > 0 && name_index <= EXT3_XATTR_INDEX_MAX) {
++ read_lock(&ext3_handler_lock);
++ handler = ext3_xattr_handlers[name_index-1];
++ read_unlock(&ext3_handler_lock);
++ }
++ return handler;
++}
++
++/*
++ * Inode operation getxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_getxattr(struct dentry *dentry, const char *name,
++ void *buffer, size_t size)
++{
++ struct ext3_xattr_handler *handler;
++ struct inode *inode = dentry->d_inode;
++
++ handler = ext3_xattr_resolve_name(&name);
++ if (!handler)
++ return -ENOTSUP;
++ return handler->get(inode, name, buffer, size);
++}
++
++/*
++ * Inode operation listxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++ssize_t
++ext3_listxattr(struct dentry *dentry, char *buffer, size_t size)
++{
++ return ext3_xattr_list(dentry->d_inode, buffer, size);
++}
++
++/*
++ * Inode operation setxattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext3_setxattr(struct dentry *dentry, const char *name,
++ const void *value, size_t size, int flags)
++{
++ struct ext3_xattr_handler *handler;
++ struct inode *inode = dentry->d_inode;
++
++ if (size == 0)
++ value = ""; /* empty EA, do not remove */
++ handler = ext3_xattr_resolve_name(&name);
++ if (!handler)
++ return -ENOTSUP;
++ return handler->set(inode, name, value, size, flags);
++}
++
++/*
++ * Inode operation removexattr()
++ *
++ * dentry->d_inode->i_sem down
++ * BKL held [before 2.5.x]
++ */
++int
++ext3_removexattr(struct dentry *dentry, const char *name)
++{
++ struct ext3_xattr_handler *handler;
++ struct inode *inode = dentry->d_inode;
++
++ handler = ext3_xattr_resolve_name(&name);
++ if (!handler)
++ return -ENOTSUP;
++ return handler->set(inode, name, NULL, 0, XATTR_REPLACE);
++}
++
++/*
++ * ext3_xattr_get()
++ *
++ * Copy an extended attribute into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext3_xattr_get(struct inode *inode, int name_index, const char *name,
++ void *buffer, size_t buffer_size)
++{
++ struct buffer_head *bh = NULL;
++ struct ext3_xattr_entry *entry;
++ unsigned int block, size;
++ char *end;
++ int name_len, error;
++
++ ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
++ name_index, name, buffer, (long)buffer_size);
++
++ if (name == NULL)
++ return -EINVAL;
++ if (!EXT3_I(inode)->i_file_acl)
++ return -ENOATTR;
++ block = EXT3_I(inode)->i_file_acl;
++ ea_idebug(inode, "reading block %d", block);
++ bh = sb_bread(inode->i_sb, block);
++ if (!bh)
++ return -EIO;
++ ea_bdebug(bh, "b_count=%d, refcount=%d",
++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++ end = bh->b_data + bh->b_size;
++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++ HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block: ext3_error(inode->i_sb, "ext3_xattr_get",
++ "inode %ld: bad block %d", inode->i_ino, block);
++ error = -EIO;
++ goto cleanup;
++ }
++ /* find named attribute */
++ name_len = strlen(name);
++
++ error = -ERANGE;
++ if (name_len > 255)
++ goto cleanup;
++ entry = FIRST_ENTRY(bh);
++ while (!IS_LAST_ENTRY(entry)) {
++ struct ext3_xattr_entry *next =
++ EXT3_XATTR_NEXT(entry);
++ if ((char *)next >= end)
++ goto bad_block;
++ if (name_index == entry->e_name_index &&
++ name_len == entry->e_name_len &&
++ memcmp(name, entry->e_name, name_len) == 0)
++ goto found;
++ entry = next;
++ }
++ /* Check the remaining name entries */
++ while (!IS_LAST_ENTRY(entry)) {
++ struct ext3_xattr_entry *next =
++ EXT3_XATTR_NEXT(entry);
++ if ((char *)next >= end)
++ goto bad_block;
++ entry = next;
++ }
++ if (ext3_xattr_cache_insert(bh))
++ ea_idebug(inode, "cache insert failed");
++ error = -ENOATTR;
++ goto cleanup;
++found:
++ /* check the buffer size */
++ if (entry->e_value_block != 0)
++ goto bad_block;
++ size = le32_to_cpu(entry->e_value_size);
++ if (size > inode->i_sb->s_blocksize ||
++ le16_to_cpu(entry->e_value_offs) + size > inode->i_sb->s_blocksize)
++ goto bad_block;
++
++ if (ext3_xattr_cache_insert(bh))
++ ea_idebug(inode, "cache insert failed");
++ if (buffer) {
++ error = -ERANGE;
++ if (size > buffer_size)
++ goto cleanup;
++ /* return value of attribute */
++ memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
++ size);
++ }
++ error = size;
++
++cleanup:
++ brelse(bh);
++
++ return error;
++}
++
++/*
++ * ext3_xattr_list()
++ *
++ * Copy a list of attribute names into the buffer
++ * provided, or compute the buffer size required.
++ * Buffer is NULL to compute the size of the buffer required.
++ *
++ * Returns a negative error number on failure, or the number of bytes
++ * used / required on success.
++ */
++int
++ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size)
++{
++ struct buffer_head *bh = NULL;
++ struct ext3_xattr_entry *entry;
++ unsigned int block, size = 0;
++ char *buf, *end;
++ int error;
++
++ ea_idebug(inode, "buffer=%p, buffer_size=%ld",
++ buffer, (long)buffer_size);
++
++ if (!EXT3_I(inode)->i_file_acl)
++ return 0;
++ block = EXT3_I(inode)->i_file_acl;
++ ea_idebug(inode, "reading block %d", block);
++ bh = sb_bread(inode->i_sb, block);
++ if (!bh)
++ return -EIO;
++ ea_bdebug(bh, "b_count=%d, refcount=%d",
++ atomic_read(&(bh->b_count)), le32_to_cpu(HDR(bh)->h_refcount));
++ end = bh->b_data + bh->b_size;
++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++ HDR(bh)->h_blocks != cpu_to_le32(1)) {
++bad_block: ext3_error(inode->i_sb, "ext3_xattr_list",
++ "inode %ld: bad block %d", inode->i_ino, block);
++ error = -EIO;
++ goto cleanup;
++ }
++ /* compute the size required for the list of attribute names */
++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++ entry = EXT3_XATTR_NEXT(entry)) {
++ struct ext3_xattr_handler *handler;
++ struct ext3_xattr_entry *next =
++ EXT3_XATTR_NEXT(entry);
++ if ((char *)next >= end)
++ goto bad_block;
++
++ handler = ext3_xattr_handler(entry->e_name_index);
++ if (handler)
++ size += handler->list(NULL, inode, entry->e_name,
++ entry->e_name_len);
++ }
++
++ if (ext3_xattr_cache_insert(bh))
++ ea_idebug(inode, "cache insert failed");
++ if (!buffer) {
++ error = size;
++ goto cleanup;
++ } else {
++ error = -ERANGE;
++ if (size > buffer_size)
++ goto cleanup;
++ }
++
++ /* list the attribute names */
++ buf = buffer;
++ for (entry = FIRST_ENTRY(bh); !IS_LAST_ENTRY(entry);
++ entry = EXT3_XATTR_NEXT(entry)) {
++ struct ext3_xattr_handler *handler;
++
++ handler = ext3_xattr_handler(entry->e_name_index);
++ if (handler)
++ buf += handler->list(buf, inode, entry->e_name,
++ entry->e_name_len);
++ }
++ error = size;
++
++cleanup:
++ brelse(bh);
++
++ return error;
++}
++
++/*
++ * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is
++ * not set, set it.
++ */
++static void ext3_xattr_update_super_block(handle_t *handle,
++ struct super_block *sb)
++{
++ if (EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR))
++ return;
++
++ lock_super(sb);
++ ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
++ EXT3_SB(sb)->s_feature_compat |= EXT3_FEATURE_COMPAT_EXT_ATTR;
++#endif
++ EXT3_SB(sb)->s_es->s_feature_compat |=
++ cpu_to_le32(EXT3_FEATURE_COMPAT_EXT_ATTR);
++ sb->s_dirt = 1;
++ ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
++ unlock_super(sb);
++}
++
++/*
++ * ext3_xattr_set()
++ *
++ * Create, replace or remove an extended attribute for this inode. Buffer
++ * is NULL to remove an existing extended attribute, and non-NULL to
++ * either replace an existing extended attribute, or create a new extended
++ * attribute. The flags XATTR_REPLACE and XATTR_CREATE
++ * specify that an extended attribute must exist and must not exist
++ * previous to the call, respectively.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++int
++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
++ const char *name, const void *value, size_t value_len, int flags)
++{
++ struct super_block *sb = inode->i_sb;
++ struct buffer_head *bh = NULL;
++ struct ext3_xattr_header *header = NULL;
++ struct ext3_xattr_entry *here, *last;
++ unsigned int name_len;
++ int block = EXT3_I(inode)->i_file_acl;
++ int min_offs = sb->s_blocksize, not_found = 1, free, error;
++ char *end;
++
++ /*
++ * header -- Points either into bh, or to a temporarily
++ * allocated buffer.
++ * here -- The named entry found, or the place for inserting, within
++ * the block pointed to by header.
++ * last -- Points right after the last named entry within the block
++ * pointed to by header.
++ * min_offs -- The offset of the first value (values are aligned
++ * towards the end of the block).
++ * end -- Points right after the block pointed to by header.
++ */
++
++ ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld",
++ name_index, name, value, (long)value_len);
++
++ if (IS_RDONLY(inode))
++ return -EROFS;
++ if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
++ return -EPERM;
++ if (value == NULL)
++ value_len = 0;
++ if (name == NULL)
++ return -EINVAL;
++ name_len = strlen(name);
++ if (name_len > 255 || value_len > sb->s_blocksize)
++ return -ERANGE;
++ down(&ext3_xattr_sem);
++
++ if (block) {
++ /* The inode already has an extended attribute block. */
++ bh = sb_bread(sb, block);
++ error = -EIO;
++ if (!bh)
++ goto cleanup;
++ ea_bdebug(bh, "b_count=%d, refcount=%d",
++ atomic_read(&(bh->b_count)),
++ le32_to_cpu(HDR(bh)->h_refcount));
++ header = HDR(bh);
++ end = bh->b_data + bh->b_size;
++ if (header->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++ header->h_blocks != cpu_to_le32(1)) {
++bad_block: ext3_error(sb, "ext3_xattr_set",
++ "inode %ld: bad block %d", inode->i_ino, block);
++ error = -EIO;
++ goto cleanup;
++ }
++ /* Find the named attribute. */
++ here = FIRST_ENTRY(bh);
++ while (!IS_LAST_ENTRY(here)) {
++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(here);
++ if ((char *)next >= end)
++ goto bad_block;
++ if (!here->e_value_block && here->e_value_size) {
++ int offs = le16_to_cpu(here->e_value_offs);
++ if (offs < min_offs)
++ min_offs = offs;
++ }
++ not_found = name_index - here->e_name_index;
++ if (!not_found)
++ not_found = name_len - here->e_name_len;
++ if (!not_found)
++ not_found = memcmp(name, here->e_name,name_len);
++ if (not_found <= 0)
++ break;
++ here = next;
++ }
++ last = here;
++ /* We still need to compute min_offs and last. */
++ while (!IS_LAST_ENTRY(last)) {
++ struct ext3_xattr_entry *next = EXT3_XATTR_NEXT(last);
++ if ((char *)next >= end)
++ goto bad_block;
++ if (!last->e_value_block && last->e_value_size) {
++ int offs = le16_to_cpu(last->e_value_offs);
++ if (offs < min_offs)
++ min_offs = offs;
++ }
++ last = next;
++ }
++
++ /* Check whether we have enough space left. */
++ free = min_offs - ((char*)last - (char*)header) - sizeof(__u32);
++ } else {
++ /* We will use a new extended attribute block. */
++ free = sb->s_blocksize -
++ sizeof(struct ext3_xattr_header) - sizeof(__u32);
++ here = last = NULL; /* avoid gcc uninitialized warning. */
++ }
++
++ if (not_found) {
++ /* Request to remove a nonexistent attribute? */
++ error = -ENOATTR;
++ if (flags & XATTR_REPLACE)
++ goto cleanup;
++ error = 0;
++ if (value == NULL)
++ goto cleanup;
++ else
++ free -= EXT3_XATTR_LEN(name_len);
++ } else {
++ /* Request to create an existing attribute? */
++ error = -EEXIST;
++ if (flags & XATTR_CREATE)
++ goto cleanup;
++ if (!here->e_value_block && here->e_value_size) {
++ unsigned int size = le32_to_cpu(here->e_value_size);
++
++ if (le16_to_cpu(here->e_value_offs) + size >
++ sb->s_blocksize || size > sb->s_blocksize)
++ goto bad_block;
++ free += EXT3_XATTR_SIZE(size);
++ }
++ }
++ free -= EXT3_XATTR_SIZE(value_len);
++ error = -ENOSPC;
++ if (free < 0)
++ goto cleanup;
++
++ /* Here we know that we can set the new attribute. */
++
++ if (header) {
++ if (header->h_refcount == cpu_to_le32(1)) {
++ ea_bdebug(bh, "modifying in-place");
++ ext3_xattr_cache_remove(bh);
++ error = ext3_journal_get_write_access(handle, bh);
++ if (error)
++ goto cleanup;
++ } else {
++ int offset;
++
++ ea_bdebug(bh, "cloning");
++ header = kmalloc(bh->b_size, GFP_KERNEL);
++ error = -ENOMEM;
++ if (header == NULL)
++ goto cleanup;
++ memcpy(header, HDR(bh), bh->b_size);
++ header->h_refcount = cpu_to_le32(1);
++ offset = (char *)header - bh->b_data;
++ here = ENTRY((char *)here + offset);
++ last = ENTRY((char *)last + offset);
++ }
++ } else {
++ /* Allocate a buffer where we construct the new block. */
++ header = kmalloc(sb->s_blocksize, GFP_KERNEL);
++ error = -ENOMEM;
++ if (header == NULL)
++ goto cleanup;
++ memset(header, 0, sb->s_blocksize);
++ end = (char *)header + sb->s_blocksize;
++ header->h_magic = cpu_to_le32(EXT3_XATTR_MAGIC);
++ header->h_blocks = header->h_refcount = cpu_to_le32(1);
++ last = here = ENTRY(header+1);
++ }
++
++ if (not_found) {
++ /* Insert the new name. */
++ int size = EXT3_XATTR_LEN(name_len);
++ int rest = (char *)last - (char *)here;
++ memmove((char *)here + size, here, rest);
++ memset(here, 0, size);
++ here->e_name_index = name_index;
++ here->e_name_len = name_len;
++ memcpy(here->e_name, name, name_len);
++ } else {
++ /* Remove the old value. */
++ if (!here->e_value_block && here->e_value_size) {
++ char *first_val = (char *)header + min_offs;
++ int offs = le16_to_cpu(here->e_value_offs);
++ char *val = (char *)header + offs;
++ size_t size = EXT3_XATTR_SIZE(
++ le32_to_cpu(here->e_value_size));
++ memmove(first_val + size, first_val, val - first_val);
++ memset(first_val, 0, size);
++ here->e_value_offs = 0;
++ min_offs += size;
++
++ /* Adjust all value offsets. */
++ last = ENTRY(header+1);
++ while (!IS_LAST_ENTRY(last)) {
++ int o = le16_to_cpu(last->e_value_offs);
++ if (!last->e_value_block && o < offs)
++ last->e_value_offs =
++ cpu_to_le16(o + size);
++ last = EXT3_XATTR_NEXT(last);
++ }
++ }
++ if (value == NULL) {
++ /* Remove this attribute. */
++ if (EXT3_XATTR_NEXT(ENTRY(header+1)) == last) {
++ /* This block is now empty. */
++ error = ext3_xattr_set2(handle, inode, bh,NULL);
++ goto cleanup;
++ } else {
++ /* Remove the old name. */
++ int size = EXT3_XATTR_LEN(name_len);
++ last = ENTRY((char *)last - size);
++ memmove(here, (char*)here + size,
++ (char*)last - (char*)here);
++ memset(last, 0, size);
++ }
++ }
++ }
++
++ if (value != NULL) {
++ /* Insert the new value. */
++ here->e_value_size = cpu_to_le32(value_len);
++ if (value_len) {
++ size_t size = EXT3_XATTR_SIZE(value_len);
++ char *val = (char *)header + min_offs - size;
++ here->e_value_offs =
++ cpu_to_le16((char *)val - (char *)header);
++ memset(val + size - EXT3_XATTR_PAD, 0,
++ EXT3_XATTR_PAD); /* Clear the pad bytes. */
++ memcpy(val, value, value_len);
++ }
++ }
++ ext3_xattr_rehash(header, here);
++
++ error = ext3_xattr_set2(handle, inode, bh, header);
++
++cleanup:
++ brelse(bh);
++ if (!(bh && header == HDR(bh)))
++ kfree(header);
++ up(&ext3_xattr_sem);
++
++ return error;
++}
++
++/*
++ * Second half of ext3_xattr_set(): Update the file system.
++ */
++static int
++ext3_xattr_set2(handle_t *handle, struct inode *inode,
++ struct buffer_head *old_bh, struct ext3_xattr_header *header)
++{
++ struct super_block *sb = inode->i_sb;
++ struct buffer_head *new_bh = NULL;
++ int error;
++
++ if (header) {
++ new_bh = ext3_xattr_cache_find(inode, header);
++ if (new_bh) {
++ /*
++ * We found an identical block in the cache.
++ * The old block will be released after updating
++ * the inode.
++ */
++ ea_bdebug(old_bh, "reusing block %ld",
++ new_bh->b_blocknr);
++
++ error = -EDQUOT;
++ if (ext3_xattr_quota_alloc(inode, 1))
++ goto cleanup;
++
++ error = ext3_journal_get_write_access(handle, new_bh);
++ if (error)
++ goto cleanup;
++ HDR(new_bh)->h_refcount = cpu_to_le32(
++ le32_to_cpu(HDR(new_bh)->h_refcount) + 1);
++ ea_bdebug(new_bh, "refcount now=%d",
++ le32_to_cpu(HDR(new_bh)->h_refcount));
++ } else if (old_bh && header == HDR(old_bh)) {
++ /* Keep this block. */
++ new_bh = old_bh;
++ ext3_xattr_cache_insert(new_bh);
++ } else {
++ /* We need to allocate a new block */
++ int force = EXT3_I(inode)->i_file_acl != 0;
++ int block = ext3_xattr_new_block(handle, inode,
++ &error, force);
++ if (error)
++ goto cleanup;
++ ea_idebug(inode, "creating block %d", block);
++
++ new_bh = sb_getblk(sb, block);
++ if (!new_bh) {
++getblk_failed: ext3_xattr_free_block(handle, inode, block);
++ error = -EIO;
++ goto cleanup;
++ }
++ lock_buffer(new_bh);
++ error = ext3_journal_get_create_access(handle, new_bh);
++ if (error) {
++ unlock_buffer(new_bh);
++ goto getblk_failed;
++ }
++ memcpy(new_bh->b_data, header, new_bh->b_size);
++ mark_buffer_uptodate(new_bh, 1);
++ unlock_buffer(new_bh);
++ ext3_xattr_cache_insert(new_bh);
++
++ ext3_xattr_update_super_block(handle, sb);
++ }
++ error = ext3_journal_dirty_metadata(handle, new_bh);
++ if (error)
++ goto cleanup;
++ }
++
++ /* Update the inode. */
++ EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0;
++ inode->i_ctime = CURRENT_TIME;
++ ext3_mark_inode_dirty(handle, inode);
++ if (IS_SYNC(inode))
++ handle->h_sync = 1;
++
++ error = 0;
++ if (old_bh && old_bh != new_bh) {
++ /*
++ * If there was an old block, and we are not still using it,
++ * we now release the old block.
++ */
++ unsigned int refcount = le32_to_cpu(HDR(old_bh)->h_refcount);
++
++ error = ext3_journal_get_write_access(handle, old_bh);
++ if (error)
++ goto cleanup;
++ if (refcount == 1) {
++ /* Free the old block. */
++ ea_bdebug(old_bh, "freeing");
++ ext3_xattr_free_block(handle, inode, old_bh->b_blocknr);
++
++ /* ext3_forget() calls bforget() for us, but we
++ let our caller release old_bh, so we need to
++ duplicate the handle before. */
++ get_bh(old_bh);
++ ext3_forget(handle, 1, inode, old_bh,old_bh->b_blocknr);
++ } else {
++ /* Decrement the refcount only. */
++ refcount--;
++ HDR(old_bh)->h_refcount = cpu_to_le32(refcount);
++ ext3_xattr_quota_free(inode);
++ ext3_journal_dirty_metadata(handle, old_bh);
++ ea_bdebug(old_bh, "refcount now=%d", refcount);
++ }
++ }
++
++cleanup:
++ if (old_bh != new_bh)
++ brelse(new_bh);
++
++ return error;
++}
++
++/*
++ * ext3_xattr_delete_inode()
++ *
++ * Free extended attribute resources associated with this inode. This
++ * is called immediately before an inode is freed.
++ */
++void
++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
++{
++ struct buffer_head *bh;
++ unsigned int block = EXT3_I(inode)->i_file_acl;
++
++ if (!block)
++ return;
++ down(&ext3_xattr_sem);
++
++ bh = sb_bread(inode->i_sb, block);
++ if (!bh) {
++ ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
++ "inode %ld: block %d read error", inode->i_ino, block);
++ goto cleanup;
++ }
++ ea_bdebug(bh, "b_count=%d", atomic_read(&(bh->b_count)));
++ if (HDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) ||
++ HDR(bh)->h_blocks != cpu_to_le32(1)) {
++ ext3_error(inode->i_sb, "ext3_xattr_delete_inode",
++ "inode %ld: bad block %d", inode->i_ino, block);
++ goto cleanup;
++ }
++ ext3_journal_get_write_access(handle, bh);
++ ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1);
++ if (HDR(bh)->h_refcount == cpu_to_le32(1)) {
++ ext3_xattr_cache_remove(bh);
++ ext3_xattr_free_block(handle, inode, block);
++ ext3_forget(handle, 1, inode, bh, block);
++ bh = NULL;
++ } else {
++ HDR(bh)->h_refcount = cpu_to_le32(
++ le32_to_cpu(HDR(bh)->h_refcount) - 1);
++ ext3_journal_dirty_metadata(handle, bh);
++ if (IS_SYNC(inode))
++ handle->h_sync = 1;
++ ext3_xattr_quota_free(inode);
++ }
++ EXT3_I(inode)->i_file_acl = 0;
++
++cleanup:
++ brelse(bh);
++ up(&ext3_xattr_sem);
++}
++
++/*
++ * ext3_xattr_put_super()
++ *
++ * This is called when a file system is unmounted.
++ */
++void
++ext3_xattr_put_super(struct super_block *sb)
++{
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++ mb_cache_shrink(ext3_xattr_cache, sb->s_dev);
++#endif
++}
++
++#ifdef CONFIG_EXT3_FS_XATTR_SHARING
++
++/*
++ * ext3_xattr_cache_insert()
++ *
++ * Create a new entry in the extended attribute cache, and insert
++ * it unless such an entry is already in the cache.
++ *
++ * Returns 0, or a negative error number on failure.
++ */
++static int
++ext3_xattr_cache_insert(struct buffer_head *bh)
++{
++ __u32 hash = le32_to_cpu(HDR(bh)->h_hash);
++ struct mb_cache_entry *ce;
++ int error;
++
++ ce = mb_cache_entry_alloc(ext3_xattr_cache);
++ if (!ce)
++ return -ENOMEM;
++ error = mb_cache_entry_insert(ce, bh->b_dev, bh->b_blocknr, &hash);
++ if (error) {
++ mb_cache_entry_free(ce);
++ if (error == -EBUSY) {
++ ea_bdebug(bh, "already in cache (%d cache entries)",
++ atomic_read(&ext3_xattr_cache->c_entry_count));
++ error = 0;
++ }
++ } else {
++ ea_bdebug(bh, "inserting [%x] (%d cache entries)", (int)hash,
++ atomic_read(&ext3_xattr_cache->c_entry_count));
++ mb_cache_entry_release(ce);
++ }
++ return error;
++}
++
++/*
++ * ext3_xattr_cmp()
++ *
++ * Compare two extended attribute blocks for equality.
++ *
++ * Returns 0 if the blocks are equal, 1 if they differ, and
++ * a negative error number on errors.
++ */
++static int
++ext3_xattr_cmp(struct ext3_xattr_header *header1,
++ struct ext3_xattr_header *header2)
++{
++ struct ext3_xattr_entry *entry1, *entry2;
++
++ entry1 = ENTRY(header1+1);
++ entry2 = ENTRY(header2+1);
++ while (!IS_LAST_ENTRY(entry1)) {
++ if (IS_LAST_ENTRY(entry2))
++ return 1;
++ if (entry1->e_hash != entry2->e_hash ||
++ entry1->e_name_len != entry2->e_name_len ||
++ entry1->e_value_size != entry2->e_value_size ||
++ memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
++ return 1;
++ if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
++ return -EIO;
++ if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
++ (char *)header2 + le16_to_cpu(entry2->e_value_offs),
++ le32_to_cpu(entry1->e_value_size)))
++ return 1;
++
++ entry1 = EXT3_XATTR_NEXT(entry1);
++ entry2 = EXT3_XATTR_NEXT(entry2);
++ }
++ if (!IS_LAST_ENTRY(entry2))
++ return 1;
++ return 0;
++}
++
++/*
++ * ext3_xattr_cache_find()
++ *
++ * Find an identical extended attribute block.
++ *
++ * Returns a pointer to the block found, or NULL if such a block was
++ * not found or an error occurred.
++ */
++static struct buffer_head *
++ext3_xattr_cache_find(struct inode *inode, struct ext3_xattr_header *header)
++{
++ __u32 hash = le32_to_cpu(header->h_hash);
++ struct mb_cache_entry *ce;
++
++ if (!header->h_hash)
++ return NULL; /* never share */
++ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
++ ce = mb_cache_entry_find_first(ext3_xattr_cache, 0, inode->i_dev, hash);
++ while (ce) {
++ struct buffer_head *bh = sb_bread(inode->i_sb, ce->e_block);
++
++ if (!bh) {
++ ext3_error(inode->i_sb, "ext3_xattr_cache_find",
++ "inode %ld: block %ld read error",
++ inode->i_ino, ce->e_block);
++ } else if (le32_to_cpu(HDR(bh)->h_refcount) >
++ EXT3_XATTR_REFCOUNT_MAX) {
++ ea_idebug(inode, "block %ld refcount %d>%d",ce->e_block,
++ le32_to_cpu(HDR(bh)->h_refcount),
++ EXT3_XATTR_REFCOUNT_MAX);
++ } else if (!ext3_xattr_cmp(header, HDR(bh))) {
++ ea_bdebug(bh, "b_count=%d",atomic_read(&(bh->b_count)));
++ mb_cache_entry_release(ce);
++ return bh;
++ }
++ brelse(bh);
++ ce = mb_cache_entry_find_next(ce, 0, inode->i_dev, hash);
++ }
++ return NULL;
++}
++
++/*
++ * ext3_xattr_cache_remove()
++ *
++ * Remove the cache entry of a block from the cache. Called when a
++ * block becomes invalid.
++ */
++static void
++ext3_xattr_cache_remove(struct buffer_head *bh)
++{
++ struct mb_cache_entry *ce;
++
++ ce = mb_cache_entry_get(ext3_xattr_cache, bh->b_dev, bh->b_blocknr);
++ if (ce) {
++ ea_bdebug(bh, "removing (%d cache entries remaining)",
++ atomic_read(&ext3_xattr_cache->c_entry_count)-1);
++ mb_cache_entry_free(ce);
++ } else
++ ea_bdebug(bh, "no cache entry");
++}
++
++#define NAME_HASH_SHIFT 5
++#define VALUE_HASH_SHIFT 16
++
++/*
++ * ext3_xattr_hash_entry()
++ *
++ * Compute the hash of an extended attribute.
++ */
++static inline void ext3_xattr_hash_entry(struct ext3_xattr_header *header,
++ struct ext3_xattr_entry *entry)
++{
++ __u32 hash = 0;
++ char *name = entry->e_name;
++ int n;
++
++ for (n=0; n < entry->e_name_len; n++) {
++ hash = (hash << NAME_HASH_SHIFT) ^
++ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
++ *name++;
++ }
++
++ if (entry->e_value_block == 0 && entry->e_value_size != 0) {
++ __u32 *value = (__u32 *)((char *)header +
++ le16_to_cpu(entry->e_value_offs));
++ for (n = (le32_to_cpu(entry->e_value_size) +
++ EXT3_XATTR_ROUND) >> EXT3_XATTR_PAD_BITS; n; n--) {
++ hash = (hash << VALUE_HASH_SHIFT) ^
++ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^
++ le32_to_cpu(*value++);
++ }
++ }
++ entry->e_hash = cpu_to_le32(hash);
++}
++
++#undef NAME_HASH_SHIFT
++#undef VALUE_HASH_SHIFT
++
++#define BLOCK_HASH_SHIFT 16
++
++/*
++ * ext3_xattr_rehash()
++ *
++ * Re-compute the extended attribute hash value after an entry has changed.
++ */
++static void ext3_xattr_rehash(struct ext3_xattr_header *header,
++ struct ext3_xattr_entry *entry)
++{
++ struct ext3_xattr_entry *here;
++ __u32 hash = 0;
++
++ ext3_xattr_hash_entry(header, entry);
++ here = ENTRY(header+1);
++ while (!IS_LAST_ENTRY(here)) {
++ if (!here->e_hash) {
++ /* Block is not shared if an entry's hash value == 0 */
++ hash = 0;
++ break;
++ }
++ hash = (hash << BLOCK_HASH_SHIFT) ^
++ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^
++ le32_to_cpu(here->e_hash);
++ here = EXT3_XATTR_NEXT(here);
++ }
++ header->h_hash = cpu_to_le32(hash);
++}
++
++#undef BLOCK_HASH_SHIFT
++
++int __init
++init_ext3_xattr(void)
++{
++ ext3_xattr_cache = mb_cache_create("ext3_xattr", NULL,
++ sizeof(struct mb_cache_entry) +
++ sizeof(struct mb_cache_entry_index), 1, 61);
++ if (!ext3_xattr_cache)
++ return -ENOMEM;
++
++ return 0;
++}
++
++void
++exit_ext3_xattr(void)
++{
++ if (ext3_xattr_cache)
++ mb_cache_destroy(ext3_xattr_cache);
++ ext3_xattr_cache = NULL;
++}
++
++#else /* CONFIG_EXT3_FS_XATTR_SHARING */
++
++int __init
++init_ext3_xattr(void)
++{
++ return 0;
++}
++
++void
++exit_ext3_xattr(void)
++{
++}
++
++#endif /* CONFIG_EXT3_FS_XATTR_SHARING */
+--- /dev/null 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.22-ac1-alexey/fs/ext3/xattr_user.c 2003-09-25 23:57:02.000000000 +0400
+@@ -0,0 +1,111 @@
++/*
++ * linux/fs/ext3/xattr_user.c
++ * Handler for extended user attributes.
++ *
++ * Copyright (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++#include <linux/module.h>
++#include <linux/string.h>
++#include <linux/fs.h>
++#include <linux/ext3_jbd.h>
++#include <linux/ext3_fs.h>
++#include <linux/ext3_xattr.h>
++
++#ifdef CONFIG_EXT3_FS_POSIX_ACL
++# include <linux/ext3_acl.h>
++#endif
++
++#define XATTR_USER_PREFIX "user."
++
++static size_t
++ext3_xattr_user_list(char *list, struct inode *inode,
++ const char *name, int name_len)
++{
++ const int prefix_len = sizeof(XATTR_USER_PREFIX)-1;
++
++ if (!test_opt(inode->i_sb, XATTR_USER))
++ return 0;
++
++ if (list) {
++ memcpy(list, XATTR_USER_PREFIX, prefix_len);
++ memcpy(list+prefix_len, name, name_len);
++ list[prefix_len + name_len] = '\0';
++ }
++ return prefix_len + name_len + 1;
++}
++
++static int
++ext3_xattr_user_get(struct inode *inode, const char *name,
++ void *buffer, size_t size)
++{
++ int error;
++
++ if (strcmp(name, "") == 0)
++ return -EINVAL;
++ if (!test_opt(inode->i_sb, XATTR_USER))
++ return -ENOTSUP;
++#ifdef CONFIG_EXT3_FS_POSIX_ACL
++ error = ext3_permission_locked(inode, MAY_READ);
++#else
++ error = permission(inode, MAY_READ);
++#endif
++ if (error)
++ return error;
++
++ return ext3_xattr_get(inode, EXT3_XATTR_INDEX_USER, name,
++ buffer, size);
++}
++
++static int
++ext3_xattr_user_set(struct inode *inode, const char *name,
++ const void *value, size_t size, int flags)
++{
++ handle_t *handle;
++ int error;
++
++ if (strcmp(name, "") == 0)
++ return -EINVAL;
++ if (!test_opt(inode->i_sb, XATTR_USER))
++ return -ENOTSUP;
++ if ( !S_ISREG(inode->i_mode) &&
++ (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
++ return -EPERM;
++#ifdef CONFIG_EXT3_FS_POSIX_ACL
++ error = ext3_permission_locked(inode, MAY_WRITE);
++#else
++ error = permission(inode, MAY_WRITE);
++#endif
++ if (error)
++ return error;
++
++ handle = ext3_journal_start(inode, EXT3_XATTR_TRANS_BLOCKS);
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++ error = ext3_xattr_set(handle, inode, EXT3_XATTR_INDEX_USER, name,
++ value, size, flags);
++ ext3_journal_stop(handle, inode);
++
++ return error;
++}
++
++struct ext3_xattr_handler ext3_xattr_user_handler = {
++ prefix: XATTR_USER_PREFIX,
++ list: ext3_xattr_user_list,
++ get: ext3_xattr_user_get,
++ set: ext3_xattr_user_set,
++};
++
++int __init
++init_ext3_xattr_user(void)
++{
++ return ext3_xattr_register(EXT3_XATTR_INDEX_USER,
++ &ext3_xattr_user_handler);
++}
++
++void
++exit_ext3_xattr_user(void)
++{
++ ext3_xattr_unregister(EXT3_XATTR_INDEX_USER,
++ &ext3_xattr_user_handler);
++}
+--- linux-2.4.22-ac1/fs/jfs/jfs_xattr.h~xattr-0.8.54-2.4.22-rh 2002-11-29 02:53:15.000000000 +0300
++++ linux-2.4.22-ac1-alexey/fs/jfs/jfs_xattr.h 2003-09-25 23:57:02.000000000 +0400
+@@ -52,8 +52,10 @@ struct jfs_ea_list {
+ #define END_EALIST(ealist) \
+ ((struct jfs_ea *) (((char *) (ealist)) + EALIST_SIZE(ealist)))
+
+-extern int __jfs_setxattr(struct inode *, const char *, void *, size_t, int);
+-extern int jfs_setxattr(struct dentry *, const char *, void *, size_t, int);
++extern int __jfs_setxattr(struct inode *, const char *, const void *, size_t,
++ int);
++extern int jfs_setxattr(struct dentry *, const char *, const void *, size_t,
++ int);
+ extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t);
+ extern ssize_t jfs_getxattr(struct dentry *, const char *, void *, size_t);
+ extern ssize_t jfs_listxattr(struct dentry *, char *, size_t);
+--- linux-2.4.22-ac1/fs/jfs/xattr.c~xattr-0.8.54-2.4.22-rh 2002-11-29 02:53:15.000000000 +0300
++++ linux-2.4.22-ac1-alexey/fs/jfs/xattr.c 2003-09-25 23:57:02.000000000 +0400
+@@ -641,7 +641,7 @@ static int ea_put(struct inode *inode, s
+ }
+
+ static int can_set_xattr(struct inode *inode, const char *name,
+- void *value, size_t value_len)
++ const void *value, size_t value_len)
+ {
+ if (IS_RDONLY(inode))
+ return -EROFS;
+@@ -660,7 +660,7 @@ static int can_set_xattr(struct inode *i
+ return permission(inode, MAY_WRITE);
+ }
+
+-int __jfs_setxattr(struct inode *inode, const char *name, void *value,
++int __jfs_setxattr(struct inode *inode, const char *name, const void *value,
+ size_t value_len, int flags)
+ {
+ struct jfs_ea_list *ealist;
+@@ -799,7 +799,7 @@ int __jfs_setxattr(struct inode *inode,
+ return rc;
+ }
+
+-int jfs_setxattr(struct dentry *dentry, const char *name, void *value,
++int jfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+ size_t value_len, int flags)
+ {
+ if (value == NULL) { /* empty EA, do not remove */
+--- linux-2.4.22-ac1/fs/Makefile~xattr-0.8.54-2.4.22-rh 2003-09-25 14:50:00.000000000 +0400
++++ linux-2.4.22-ac1-alexey/fs/Makefile 2003-09-25 23:57:02.000000000 +0400
+@@ -82,6 +82,9 @@ obj-y += binfmt_script.o
+
+ obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
+
++export-objs += mbcache.o
++obj-$(CONFIG_FS_MBCACHE) += mbcache.o
++
+ # persistent filesystems
+ obj-y += $(join $(subdir-y),$(subdir-y:%=/%.o))
+
+--- /dev/null 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.22-ac1-alexey/fs/mbcache.c 2003-09-25 23:57:02.000000000 +0400
+@@ -0,0 +1,648 @@
++/*
++ * linux/fs/mbcache.c
++ * (C) 2001-2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++/*
++ * Filesystem Meta Information Block Cache (mbcache)
++ *
++ * The mbcache caches blocks of block devices that need to be located
++ * by their device/block number, as well as by other criteria (such
++ * as the block's contents).
++ *
++ * There can only be one cache entry in a cache per device and block number.
++ * Additional indexes need not be unique in this sense. The number of
++ * additional indexes (=other criteria) can be hardwired at compile time
++ * or specified at cache create time.
++ *
++ * Each cache entry is of fixed size. An entry may be `valid' or `invalid'
++ * in the cache. A valid entry is in the main hash tables of the cache,
++ * and may also be in the lru list. An invalid entry is not in any hashes
++ * or lists.
++ *
++ * A valid cache entry is only in the lru list if no handles refer to it.
++ * Invalid cache entries will be freed when the last handle to the cache
++ * entry is released. Entries that cannot be freed immediately are put
++ * back on the lru list.
++ */
++
++#include <linux/kernel.h>
++#include <linux/module.h>
++
++#include <linux/fs.h>
++#include <linux/slab.h>
++#include <linux/sched.h>
++#include <linux/cache_def.h>
++#include <linux/version.h>
++#include <linux/init.h>
++#include <linux/mbcache.h>
++
++
++#ifdef MB_CACHE_DEBUG
++# define mb_debug(f...) do { \
++ printk(KERN_DEBUG f); \
++ printk("\n"); \
++ } while (0)
++#define mb_assert(c) do { if (!(c)) \
++ printk(KERN_ERR "assertion " #c " failed\n"); \
++ } while(0)
++#else
++# define mb_debug(f...) do { } while(0)
++# define mb_assert(c) do { } while(0)
++#endif
++#define mb_error(f...) do { \
++ printk(KERN_ERR f); \
++ printk("\n"); \
++ } while(0)
++
++MODULE_AUTHOR("Andreas Gruenbacher <a.gruenbacher@computer.org>");
++MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0)
++MODULE_LICENSE("GPL");
++#endif
++
++EXPORT_SYMBOL(mb_cache_create);
++EXPORT_SYMBOL(mb_cache_shrink);
++EXPORT_SYMBOL(mb_cache_destroy);
++EXPORT_SYMBOL(mb_cache_entry_alloc);
++EXPORT_SYMBOL(mb_cache_entry_insert);
++EXPORT_SYMBOL(mb_cache_entry_release);
++EXPORT_SYMBOL(mb_cache_entry_takeout);
++EXPORT_SYMBOL(mb_cache_entry_free);
++EXPORT_SYMBOL(mb_cache_entry_dup);
++EXPORT_SYMBOL(mb_cache_entry_get);
++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
++EXPORT_SYMBOL(mb_cache_entry_find_first);
++EXPORT_SYMBOL(mb_cache_entry_find_next);
++#endif
++
++
++/*
++ * Global data: list of all mbcache's, lru list, and a spinlock for
++ * accessing cache data structures on SMP machines. The lru list is
++ * global across all mbcaches.
++ */
++
++static LIST_HEAD(mb_cache_list);
++static LIST_HEAD(mb_cache_lru_list);
++static spinlock_t mb_cache_spinlock = SPIN_LOCK_UNLOCKED;
++
++static inline int
++mb_cache_indexes(struct mb_cache *cache)
++{
++#ifdef MB_CACHE_INDEXES_COUNT
++ return MB_CACHE_INDEXES_COUNT;
++#else
++ return cache->c_indexes_count;
++#endif
++}
++
++/*
++ * What the mbcache registers as to get shrunk dynamically.
++ */
++
++static void
++mb_cache_memory_pressure(int priority, unsigned int gfp_mask);
++
++static struct cache_definition mb_cache_definition = {
++ "mb_cache",
++ mb_cache_memory_pressure
++};
++
++
++static inline int
++__mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
++{
++ return !list_empty(&ce->e_block_list);
++}
++
++
++static inline void
++__mb_cache_entry_unhash(struct mb_cache_entry *ce)
++{
++ int n;
++
++ if (__mb_cache_entry_is_hashed(ce)) {
++ list_del_init(&ce->e_block_list);
++ for (n=0; n<mb_cache_indexes(ce->e_cache); n++)
++ list_del(&ce->e_indexes[n].o_list);
++ }
++}
++
++
++static inline void
++__mb_cache_entry_forget(struct mb_cache_entry *ce, int gfp_mask)
++{
++ struct mb_cache *cache = ce->e_cache;
++
++ mb_assert(atomic_read(&ce->e_used) == 0);
++ if (cache->c_op.free && cache->c_op.free(ce, gfp_mask)) {
++ /* free failed -- put back on the lru list
++ for freeing later. */
++ spin_lock(&mb_cache_spinlock);
++ list_add(&ce->e_lru_list, &mb_cache_lru_list);
++ spin_unlock(&mb_cache_spinlock);
++ } else {
++ kmem_cache_free(cache->c_entry_cache, ce);
++ atomic_dec(&cache->c_entry_count);
++ }
++}
++
++
++static inline void
++__mb_cache_entry_release_unlock(struct mb_cache_entry *ce)
++{
++ if (atomic_dec_and_test(&ce->e_used)) {
++ if (__mb_cache_entry_is_hashed(ce))
++ list_add_tail(&ce->e_lru_list, &mb_cache_lru_list);
++ else {
++ spin_unlock(&mb_cache_spinlock);
++ __mb_cache_entry_forget(ce, GFP_KERNEL);
++ return;
++ }
++ }
++ spin_unlock(&mb_cache_spinlock);
++}
++
++
++/*
++ * mb_cache_memory_pressure() memory pressure callback
++ *
++ * This function is called by the kernel memory management when memory
++ * gets low.
++ *
++ * @priority: Amount by which to shrink the cache (0 = highest priority)
++ * @gfp_mask: (ignored)
++ */
++static void
++mb_cache_memory_pressure(int priority, unsigned int gfp_mask)
++{
++ LIST_HEAD(free_list);
++ struct list_head *l, *ltmp;
++ int count = 0;
++
++ spin_lock(&mb_cache_spinlock);
++ list_for_each(l, &mb_cache_list) {
++ struct mb_cache *cache =
++ list_entry(l, struct mb_cache, c_cache_list);
++ mb_debug("cache %s (%d)", cache->c_name,
++ atomic_read(&cache->c_entry_count));
++ count += atomic_read(&cache->c_entry_count);
++ }
++ mb_debug("trying to free %d of %d entries",
++ count / (priority ? priority : 1), count);
++ if (priority)
++ count /= priority;
++ while (count-- && !list_empty(&mb_cache_lru_list)) {
++ struct mb_cache_entry *ce =
++ list_entry(mb_cache_lru_list.next,
++ struct mb_cache_entry, e_lru_list);
++ list_del(&ce->e_lru_list);
++ __mb_cache_entry_unhash(ce);
++ list_add_tail(&ce->e_lru_list, &free_list);
++ }
++ spin_unlock(&mb_cache_spinlock);
++ list_for_each_safe(l, ltmp, &free_list) {
++ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++ e_lru_list), gfp_mask);
++ }
++}
++
++
++/*
++ * mb_cache_create() create a new cache
++ *
++ * All entries in one cache are equal size. Cache entries may be from
++ * multiple devices. If this is the first mbcache created, registers
++ * the cache with kernel memory management. Returns NULL if no more
++ * memory was available.
++ *
++ * @name: name of the cache (informal)
++ * @cache_op: contains the callback called when freeing a cache entry
++ * @entry_size: The size of a cache entry, including
++ * struct mb_cache_entry
++ * @indexes_count: number of additional indexes in the cache. Must equal
++ * MB_CACHE_INDEXES_COUNT if the number of indexes is
++ * hardwired.
++ * @bucket_count: number of hash buckets
++ */
++struct mb_cache *
++mb_cache_create(const char *name, struct mb_cache_op *cache_op,
++ size_t entry_size, int indexes_count, int bucket_count)
++{
++ int m=0, n;
++ struct mb_cache *cache = NULL;
++
++ if(entry_size < sizeof(struct mb_cache_entry) +
++ indexes_count * sizeof(struct mb_cache_entry_index))
++ return NULL;
++
++ MOD_INC_USE_COUNT;
++ cache = kmalloc(sizeof(struct mb_cache) +
++ indexes_count * sizeof(struct list_head), GFP_KERNEL);
++ if (!cache)
++ goto fail;
++ cache->c_name = name;
++ cache->c_op.free = NULL;
++ if (cache_op)
++ cache->c_op.free = cache_op->free;
++ atomic_set(&cache->c_entry_count, 0);
++ cache->c_bucket_count = bucket_count;
++#ifdef MB_CACHE_INDEXES_COUNT
++ mb_assert(indexes_count == MB_CACHE_INDEXES_COUNT);
++#else
++ cache->c_indexes_count = indexes_count;
++#endif
++ cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head),
++ GFP_KERNEL);
++ if (!cache->c_block_hash)
++ goto fail;
++ for (n=0; n<bucket_count; n++)
++ INIT_LIST_HEAD(&cache->c_block_hash[n]);
++ for (m=0; m<indexes_count; m++) {
++ cache->c_indexes_hash[m] = kmalloc(bucket_count *
++ sizeof(struct list_head),
++ GFP_KERNEL);
++ if (!cache->c_indexes_hash[m])
++ goto fail;
++ for (n=0; n<bucket_count; n++)
++ INIT_LIST_HEAD(&cache->c_indexes_hash[m][n]);
++ }
++ cache->c_entry_cache = kmem_cache_create(name, entry_size, 0,
++ 0 /*SLAB_POISON | SLAB_RED_ZONE*/, NULL, NULL);
++ if (!cache->c_entry_cache)
++ goto fail;
++
++ spin_lock(&mb_cache_spinlock);
++ list_add(&cache->c_cache_list, &mb_cache_list);
++ spin_unlock(&mb_cache_spinlock);
++ return cache;
++
++fail:
++ if (cache) {
++ while (--m >= 0)
++ kfree(cache->c_indexes_hash[m]);
++ if (cache->c_block_hash)
++ kfree(cache->c_block_hash);
++ kfree(cache);
++ }
++ MOD_DEC_USE_COUNT;
++ return NULL;
++}
++
++
++/*
++ * mb_cache_shrink()
++ *
++ * Removes all cache entries of a device from the cache. All cache entries
++ * currently in use cannot be freed, and thus remain in the cache.
++ *
++ * @cache: which cache to shrink
++ * @dev: which device's cache entries to shrink
++ */
++void
++mb_cache_shrink(struct mb_cache *cache, kdev_t dev)
++{
++ LIST_HEAD(free_list);
++ struct list_head *l, *ltmp;
++
++ spin_lock(&mb_cache_spinlock);
++ list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
++ struct mb_cache_entry *ce =
++ list_entry(l, struct mb_cache_entry, e_lru_list);
++ if (ce->e_dev == dev) {
++ list_del(&ce->e_lru_list);
++ list_add_tail(&ce->e_lru_list, &free_list);
++ __mb_cache_entry_unhash(ce);
++ }
++ }
++ spin_unlock(&mb_cache_spinlock);
++ list_for_each_safe(l, ltmp, &free_list) {
++ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++ e_lru_list), GFP_KERNEL);
++ }
++}
++
++
++/*
++ * mb_cache_destroy()
++ *
++ * Shrinks the cache to its minimum possible size (hopefully 0 entries),
++ * and then destroys it. If this was the last mbcache, un-registers the
++ * mbcache from kernel memory management.
++ */
++void
++mb_cache_destroy(struct mb_cache *cache)
++{
++ LIST_HEAD(free_list);
++ struct list_head *l, *ltmp;
++ int n;
++
++ spin_lock(&mb_cache_spinlock);
++ list_for_each_safe(l, ltmp, &mb_cache_lru_list) {
++ struct mb_cache_entry *ce =
++ list_entry(l, struct mb_cache_entry, e_lru_list);
++ if (ce->e_cache == cache) {
++ list_del(&ce->e_lru_list);
++ list_add_tail(&ce->e_lru_list, &free_list);
++ __mb_cache_entry_unhash(ce);
++ }
++ }
++ list_del(&cache->c_cache_list);
++ spin_unlock(&mb_cache_spinlock);
++ list_for_each_safe(l, ltmp, &free_list) {
++ __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
++ e_lru_list), GFP_KERNEL);
++ }
++
++ if (atomic_read(&cache->c_entry_count) > 0) {
++ mb_error("cache %s: %d orphaned entries",
++ cache->c_name,
++ atomic_read(&cache->c_entry_count));
++ }
++
++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,3,0))
++ /* We don't have kmem_cache_destroy() in 2.2.x */
++ kmem_cache_shrink(cache->c_entry_cache);
++#else
++ kmem_cache_destroy(cache->c_entry_cache);
++#endif
++ for (n=0; n < mb_cache_indexes(cache); n++)
++ kfree(cache->c_indexes_hash[n]);
++ kfree(cache->c_block_hash);
++ kfree(cache);
++
++ MOD_DEC_USE_COUNT;
++}
++
++
++/*
++ * mb_cache_entry_alloc()
++ *
++ * Allocates a new cache entry. The new entry will not be valid initially,
++ * and thus cannot be looked up yet. It should be filled with data, and
++ * then inserted into the cache using mb_cache_entry_insert(). Returns NULL
++ * if no more memory was available.
++ */
++struct mb_cache_entry *
++mb_cache_entry_alloc(struct mb_cache *cache)
++{
++ struct mb_cache_entry *ce;
++
++ atomic_inc(&cache->c_entry_count);
++ ce = kmem_cache_alloc(cache->c_entry_cache, GFP_KERNEL);
++ if (ce) {
++ INIT_LIST_HEAD(&ce->e_lru_list);
++ INIT_LIST_HEAD(&ce->e_block_list);
++ ce->e_cache = cache;
++ atomic_set(&ce->e_used, 1);
++ }
++ return ce;
++}
++
++
++/*
++ * mb_cache_entry_insert()
++ *
++ * Inserts an entry that was allocated using mb_cache_entry_alloc() into
++ * the cache. After this, the cache entry can be looked up, but is not yet
++ * in the lru list as the caller still holds a handle to it. Returns 0 on
++ * success, or -EBUSY if a cache entry for that device + block exists
++ * already (this may happen after a failed lookup, if another process has
++ * inserted the same cache entry in the meantime).
++ *
++ * @dev: device the cache entry belongs to
++ * @block: block number
++ * @keys: array of additional keys. There must be indexes_count entries
++ * in the array (as specified when creating the cache).
++ */
++int
++mb_cache_entry_insert(struct mb_cache_entry *ce, kdev_t dev,
++ unsigned long block, unsigned int keys[])
++{
++ struct mb_cache *cache = ce->e_cache;
++ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count;
++ struct list_head *l;
++ int error = -EBUSY, n;
++
++ spin_lock(&mb_cache_spinlock);
++ list_for_each(l, &cache->c_block_hash[bucket]) {
++ struct mb_cache_entry *ce =
++ list_entry(l, struct mb_cache_entry, e_block_list);
++ if (ce->e_dev == dev && ce->e_block == block)
++ goto out;
++ }
++ __mb_cache_entry_unhash(ce);
++ ce->e_dev = dev;
++ ce->e_block = block;
++ list_add(&ce->e_block_list, &cache->c_block_hash[bucket]);
++ for (n=0; n<mb_cache_indexes(cache); n++) {
++ ce->e_indexes[n].o_key = keys[n];
++ bucket = keys[n] % cache->c_bucket_count;
++ list_add(&ce->e_indexes[n].o_list,
++ &cache->c_indexes_hash[n][bucket]);
++ }
++	error = 0;
++out:	spin_unlock(&mb_cache_spinlock);
++	return error;
++}
++
++
++/*
++ * mb_cache_entry_release()
++ *
++ * Release a handle to a cache entry. When the last handle to a cache entry
++ * is released it is either freed (if it is invalid) or otherwise inserted
++ * in to the lru list.
++ */
++void
++mb_cache_entry_release(struct mb_cache_entry *ce)
++{
++ spin_lock(&mb_cache_spinlock);
++ __mb_cache_entry_release_unlock(ce);
++}
++
++
++/*
++ * mb_cache_entry_takeout()
++ *
++ * Take a cache entry out of the cache, making it invalid. The entry can later
++ * be re-inserted using mb_cache_entry_insert(), or released using
++ * mb_cache_entry_release().
++ */
++void
++mb_cache_entry_takeout(struct mb_cache_entry *ce)
++{
++ spin_lock(&mb_cache_spinlock);
++ mb_assert(list_empty(&ce->e_lru_list));
++ __mb_cache_entry_unhash(ce);
++ spin_unlock(&mb_cache_spinlock);
++}
++
++
++/*
++ * mb_cache_entry_free()
++ *
++ * This is equivalent to the sequence mb_cache_entry_takeout() --
++ * mb_cache_entry_release().
++ */
++void
++mb_cache_entry_free(struct mb_cache_entry *ce)
++{
++ spin_lock(&mb_cache_spinlock);
++ mb_assert(list_empty(&ce->e_lru_list));
++ __mb_cache_entry_unhash(ce);
++ __mb_cache_entry_release_unlock(ce);
++}
++
++
++/*
++ * mb_cache_entry_dup()
++ *
++ * Duplicate a handle to a cache entry (does not duplicate the cache entry
++ * itself). After the call, both the old and the new handle must be released.
++ */
++struct mb_cache_entry *
++mb_cache_entry_dup(struct mb_cache_entry *ce)
++{
++ atomic_inc(&ce->e_used);
++ return ce;
++}
++
++
++/*
++ * mb_cache_entry_get()
++ *
++ * Get a cache entry by device / block number. (There can only be one entry
++ * in the cache per device and block.) Returns NULL if no such cache entry
++ * exists.
++ */
++struct mb_cache_entry *
++mb_cache_entry_get(struct mb_cache *cache, kdev_t dev, unsigned long block)
++{
++ unsigned int bucket = (HASHDEV(dev) + block) % cache->c_bucket_count;
++ struct list_head *l;
++ struct mb_cache_entry *ce;
++
++ spin_lock(&mb_cache_spinlock);
++ list_for_each(l, &cache->c_block_hash[bucket]) {
++ ce = list_entry(l, struct mb_cache_entry, e_block_list);
++ if (ce->e_dev == dev && ce->e_block == block) {
++ if (!list_empty(&ce->e_lru_list))
++ list_del_init(&ce->e_lru_list);
++ atomic_inc(&ce->e_used);
++ goto cleanup;
++ }
++ }
++ ce = NULL;
++
++cleanup:
++ spin_unlock(&mb_cache_spinlock);
++ return ce;
++}
++
++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
++
++static struct mb_cache_entry *
++__mb_cache_entry_find(struct list_head *l, struct list_head *head,
++ int index, kdev_t dev, unsigned int key)
++{
++ while (l != head) {
++ struct mb_cache_entry *ce =
++ list_entry(l, struct mb_cache_entry,
++ e_indexes[index].o_list);
++ if (ce->e_dev == dev && ce->e_indexes[index].o_key == key) {
++ if (!list_empty(&ce->e_lru_list))
++ list_del_init(&ce->e_lru_list);
++ atomic_inc(&ce->e_used);
++ return ce;
++ }
++ l = l->next;
++ }
++ return NULL;
++}
++
++
++/*
++ * mb_cache_entry_find_first()
++ *
++ * Find the first cache entry on a given device with a certain key in
++ * an additional index. Additional matches can be found with
++ * mb_cache_entry_find_next(). Returns NULL if no match was found.
++ *
++ * @cache: the cache to search
++ * @index: the number of the additional index to search (0<=index<indexes_count)
++ * @dev: the device the cache entry should belong to
++ * @key: the key in the index
++ */
++struct mb_cache_entry *
++mb_cache_entry_find_first(struct mb_cache *cache, int index, kdev_t dev,
++ unsigned int key)
++{
++ unsigned int bucket = key % cache->c_bucket_count;
++ struct list_head *l;
++ struct mb_cache_entry *ce;
++
++ mb_assert(index < mb_cache_indexes(cache));
++ spin_lock(&mb_cache_spinlock);
++ l = cache->c_indexes_hash[index][bucket].next;
++ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
++ index, dev, key);
++ spin_unlock(&mb_cache_spinlock);
++ return ce;
++}
++
++
++/*
++ * mb_cache_entry_find_next()
++ *
++ * Find the next cache entry on a given device with a certain key in an
++ * additional index. Returns NULL if no match could be found. The previous
++ * entry is automatically released, so that mb_cache_entry_find_next() can
++ * be called like this:
++ *
++ * entry = mb_cache_entry_find_first();
++ * while (entry) {
++ * ...
++ * entry = mb_cache_entry_find_next(entry, ...);
++ * }
++ *
++ * @prev: The previous match
++ * @index: the number of the additional index to search (0<=index<indexes_count)
++ * @dev: the device the cache entry should belong to
++ * @key: the key in the index
++ */
++struct mb_cache_entry *
++mb_cache_entry_find_next(struct mb_cache_entry *prev, int index, kdev_t dev,
++ unsigned int key)
++{
++ struct mb_cache *cache = prev->e_cache;
++ unsigned int bucket = key % cache->c_bucket_count;
++ struct list_head *l;
++ struct mb_cache_entry *ce;
++
++ mb_assert(index < mb_cache_indexes(cache));
++ spin_lock(&mb_cache_spinlock);
++ l = prev->e_indexes[index].o_list.next;
++ ce = __mb_cache_entry_find(l, &cache->c_indexes_hash[index][bucket],
++ index, dev, key);
++ __mb_cache_entry_release_unlock(prev);
++ return ce;
++}
++
++#endif /* !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) */
++
++static int __init init_mbcache(void)
++{
++ register_cache(&mb_cache_definition);
++ return 0;
++}
++
++static void __exit exit_mbcache(void)
++{
++ unregister_cache(&mb_cache_definition);
++}
++
++module_init(init_mbcache)
++module_exit(exit_mbcache)
++
+--- linux-2.4.22-ac1/include/asm-arm/unistd.h~xattr-0.8.54-2.4.22-rh 2003-08-25 15:44:43.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/asm-arm/unistd.h 2003-09-25 23:57:02.000000000 +0400
+@@ -250,7 +250,6 @@
+ #define __NR_security (__NR_SYSCALL_BASE+223)
+ #define __NR_gettid (__NR_SYSCALL_BASE+224)
+ #define __NR_readahead (__NR_SYSCALL_BASE+225)
+-#if 0 /* allocated in 2.5 */
+ #define __NR_setxattr (__NR_SYSCALL_BASE+226)
+ #define __NR_lsetxattr (__NR_SYSCALL_BASE+227)
+ #define __NR_fsetxattr (__NR_SYSCALL_BASE+228)
+@@ -263,7 +262,6 @@
+ #define __NR_removexattr (__NR_SYSCALL_BASE+235)
+ #define __NR_lremovexattr (__NR_SYSCALL_BASE+236)
+ #define __NR_fremovexattr (__NR_SYSCALL_BASE+237)
+-#endif
+ #define __NR_tkill (__NR_SYSCALL_BASE+238)
+ #if 0 /* allocated in 2.5 */
+ #define __NR_sendfile64 (__NR_SYSCALL_BASE+239)
+--- linux-2.4.22-ac1/include/asm-ppc64/unistd.h~xattr-0.8.54-2.4.22-rh 2003-06-13 18:51:38.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/asm-ppc64/unistd.h 2003-09-25 23:57:02.000000000 +0400
+@@ -218,6 +218,7 @@
+ #define __NR_mincore 206
+ #define __NR_gettid 207
+ #define __NR_tkill 208
++#endif
+ #define __NR_setxattr 209
+ #define __NR_lsetxattr 210
+ #define __NR_fsetxattr 211
+@@ -230,6 +231,7 @@
+ #define __NR_removexattr 218
+ #define __NR_lremovexattr 219
+ #define __NR_fremovexattr 220
++#if 0 /* Reserved syscalls */
+ #define __NR_futex 221
+ #define __NR_sched_setaffinity 222
+ #define __NR_sched_getaffinity 223
+--- linux-2.4.22-ac1/include/asm-s390/unistd.h~xattr-0.8.54-2.4.22-rh 2003-06-13 18:51:38.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/asm-s390/unistd.h 2003-09-26 00:14:23.000000000 +0400
+@@ -213,6 +213,19 @@
+ #define __NR_getdents64 220
+ #define __NR_fcntl64 221
+ #define __NR_readahead 222
++#define __NR_setxattr 224
++#define __NR_lsetxattr 225
++#define __NR_fsetxattr 226
++#define __NR_getxattr 227
++#define __NR_lgetxattr 228
++#define __NR_fgetxattr 229
++#define __NR_listxattr 230
++#define __NR_llistxattr 231
++#define __NR_flistxattr 232
++#define __NR_removexattr 233
++#define __NR_lremovexattr 234
++#define __NR_fremovexattr 235
++
+ /*
+ * Numbers 224-235 are reserved for posix acl
+ */
+--- linux-2.4.22-ac1/include/asm-s390x/unistd.h~xattr-0.8.54-2.4.22-rh 2003-06-13 18:51:38.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/asm-s390x/unistd.h 2003-09-26 00:15:11.000000000 +0400
+@@ -181,6 +181,19 @@
+ #define __NR_mincore 218
+ #define __NR_madvise 219
+ #define __NR_readahead 222
++#define __NR_setxattr 224
++#define __NR_lsetxattr 225
++#define __NR_fsetxattr 226
++#define __NR_getxattr 227
++#define __NR_lgetxattr 228
++#define __NR_fgetxattr 229
++#define __NR_listxattr 230
++#define __NR_llistxattr 231
++#define __NR_flistxattr 232
++#define __NR_removexattr 233
++#define __NR_lremovexattr 234
++#define __NR_fremovexattr 235
++
+ /*
+ * Numbers 224-235 are reserved for posix acl
+ */
+--- /dev/null 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.22-ac1-alexey/include/linux/cache_def.h 2003-09-25 23:57:02.000000000 +0400
+@@ -0,0 +1,15 @@
++/*
++ * linux/cache_def.h
++ * Handling of caches defined in drivers, filesystems, ...
++ *
++ * Copyright (C) 2002 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++ */
++
++struct cache_definition {
++ const char *name;
++ void (*shrink)(int, unsigned int);
++ struct list_head link;
++};
++
++extern void register_cache(struct cache_definition *);
++extern void unregister_cache(struct cache_definition *);
+--- linux-2.4.22-ac1/include/linux/errno.h~xattr-0.8.54-2.4.22-rh 2001-02-10 01:46:13.000000000 +0300
++++ linux-2.4.22-ac1-alexey/include/linux/errno.h 2003-09-25 23:57:02.000000000 +0400
+@@ -23,4 +23,8 @@
+
+ #endif
+
++/* Defined for extended attributes */
++#define ENOATTR ENODATA /* No such attribute */
++#define ENOTSUP EOPNOTSUPP /* Operation not supported */
++
+ #endif
+--- linux-2.4.22-ac1/include/linux/ext2_fs.h~xattr-0.8.54-2.4.22-rh 2003-06-13 18:51:38.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/linux/ext2_fs.h 2003-09-25 23:57:02.000000000 +0400
+@@ -57,8 +57,6 @@
+ */
+ #define EXT2_BAD_INO 1 /* Bad blocks inode */
+ #define EXT2_ROOT_INO 2 /* Root inode */
+-#define EXT2_ACL_IDX_INO 3 /* ACL inode */
+-#define EXT2_ACL_DATA_INO 4 /* ACL inode */
+ #define EXT2_BOOT_LOADER_INO 5 /* Boot loader inode */
+ #define EXT2_UNDEL_DIR_INO 6 /* Undelete directory inode */
+
+@@ -86,7 +84,6 @@
+ #else
+ # define EXT2_BLOCK_SIZE(s) (EXT2_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+ #endif
+-#define EXT2_ACLE_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (struct ext2_acl_entry))
+ #define EXT2_ADDR_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (__u32))
+ #ifdef __KERNEL__
+ # define EXT2_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
+@@ -121,28 +118,6 @@
+ #endif
+
+ /*
+- * ACL structures
+- */
+-struct ext2_acl_header /* Header of Access Control Lists */
+-{
+- __u32 aclh_size;
+- __u32 aclh_file_count;
+- __u32 aclh_acle_count;
+- __u32 aclh_first_acle;
+-};
+-
+-struct ext2_acl_entry /* Access Control List Entry */
+-{
+- __u32 acle_size;
+- __u16 acle_perms; /* Access permissions */
+- __u16 acle_type; /* Type of entry */
+- __u16 acle_tag; /* User or group identity */
+- __u16 acle_pad1;
+- __u32 acle_next; /* Pointer on next entry for the */
+- /* same inode or on next free entry */
+-};
+-
+-/*
+ * Structure of a blocks group descriptor
+ */
+ struct ext2_group_desc
+@@ -314,6 +289,7 @@ struct ext2_inode {
+ #define EXT2_MOUNT_ERRORS_PANIC 0x0040 /* Panic on errors */
+ #define EXT2_MOUNT_MINIX_DF 0x0080 /* Mimics the Minix statfs */
+ #define EXT2_MOUNT_NO_UID32 0x0200 /* Disable 32-bit UIDs */
++#define EXT2_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
+
+ #define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt
+ #define set_opt(o, opt) o |= EXT2_MOUNT_##opt
+@@ -397,6 +373,7 @@ struct ext2_super_block {
+
+ #ifdef __KERNEL__
+ #define EXT2_SB(sb) (&((sb)->u.ext2_sb))
++#define EXT2_I(inode) (&((inode)->u.ext2_i))
+ #else
+ /* Assume that user mode programs are passing in an ext2fs superblock, not
+ * a kernel struct super_block. This will allow us to call the feature-test
+@@ -466,7 +443,7 @@ struct ext2_super_block {
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008
+ #define EXT2_FEATURE_INCOMPAT_ANY 0xffffffff
+
+-#define EXT2_FEATURE_COMPAT_SUPP 0
++#define EXT2_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT2_FEATURE_INCOMPAT_SUPP EXT2_FEATURE_INCOMPAT_FILETYPE
+ #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT2_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+ EXT2_FEATURE_RO_COMPAT_LARGE_FILE| \
+@@ -624,8 +601,10 @@ extern struct address_space_operations e
+
+ /* namei.c */
+ extern struct inode_operations ext2_dir_inode_operations;
++extern struct inode_operations ext2_special_inode_operations;
+
+ /* symlink.c */
++extern struct inode_operations ext2_symlink_inode_operations;
+ extern struct inode_operations ext2_fast_symlink_inode_operations;
+
+ #endif /* __KERNEL__ */
+--- /dev/null 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.22-ac1-alexey/include/linux/ext2_xattr.h 2003-09-25 23:57:02.000000000 +0400
+@@ -0,0 +1,157 @@
++/*
++ File: linux/ext2_xattr.h
++
++ On-disk format of extended attributes for the ext2 filesystem.
++
++ (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++#include <linux/config.h>
++#include <linux/init.h>
++#include <linux/xattr.h>
++
++/* Magic value in attribute blocks */
++#define EXT2_XATTR_MAGIC 0xEA020000
++
++/* Maximum number of references to one attribute block */
++#define EXT2_XATTR_REFCOUNT_MAX 1024
++
++/* Name indexes */
++#define EXT2_XATTR_INDEX_MAX 10
++#define EXT2_XATTR_INDEX_USER 1
++#define EXT2_XATTR_INDEX_POSIX_ACL_ACCESS 2
++#define EXT2_XATTR_INDEX_POSIX_ACL_DEFAULT 3
++
++struct ext2_xattr_header {
++ __u32 h_magic; /* magic number for identification */
++ __u32 h_refcount; /* reference count */
++ __u32 h_blocks; /* number of disk blocks used */
++ __u32 h_hash; /* hash value of all attributes */
++ __u32 h_reserved[4]; /* zero right now */
++};
++
++struct ext2_xattr_entry {
++ __u8 e_name_len; /* length of name */
++ __u8 e_name_index; /* attribute name index */
++ __u16 e_value_offs; /* offset in disk block of value */
++ __u32 e_value_block; /* disk block attribute is stored on (n/i) */
++ __u32 e_value_size; /* size of attribute value */
++ __u32 e_hash; /* hash value of name and value */
++ char e_name[0]; /* attribute name */
++};
++
++#define EXT2_XATTR_PAD_BITS 2
++#define EXT2_XATTR_PAD (1<<EXT2_XATTR_PAD_BITS)
++#define EXT2_XATTR_ROUND (EXT2_XATTR_PAD-1)
++#define EXT2_XATTR_LEN(name_len) \
++ (((name_len) + EXT2_XATTR_ROUND + \
++ sizeof(struct ext2_xattr_entry)) & ~EXT2_XATTR_ROUND)
++#define EXT2_XATTR_NEXT(entry) \
++ ( (struct ext2_xattr_entry *)( \
++ (char *)(entry) + EXT2_XATTR_LEN((entry)->e_name_len)) )
++#define EXT2_XATTR_SIZE(size) \
++ (((size) + EXT2_XATTR_ROUND) & ~EXT2_XATTR_ROUND)
++
++#ifdef __KERNEL__
++
++# ifdef CONFIG_EXT2_FS_XATTR
++
++struct ext2_xattr_handler {
++ char *prefix;
++ size_t (*list)(char *list, struct inode *inode, const char *name,
++ int name_len);
++ int (*get)(struct inode *inode, const char *name, void *buffer,
++ size_t size);
++ int (*set)(struct inode *inode, const char *name, const void *buffer,
++ size_t size, int flags);
++};
++
++extern int ext2_xattr_register(int, struct ext2_xattr_handler *);
++extern void ext2_xattr_unregister(int, struct ext2_xattr_handler *);
++
++extern int ext2_setxattr(struct dentry *, const char *, const void *, size_t, int);
++extern ssize_t ext2_getxattr(struct dentry *, const char *, void *, size_t);
++extern ssize_t ext2_listxattr(struct dentry *, char *, size_t);
++extern int ext2_removexattr(struct dentry *, const char *);
++
++extern int ext2_xattr_get(struct inode *, int, const char *, void *, size_t);
++extern int ext2_xattr_list(struct inode *, char *, size_t);
++extern int ext2_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
++
++extern void ext2_xattr_delete_inode(struct inode *);
++extern void ext2_xattr_put_super(struct super_block *);
++
++extern int init_ext2_xattr(void) __init;
++extern void exit_ext2_xattr(void);
++
++# else /* CONFIG_EXT2_FS_XATTR */
++# define ext2_setxattr NULL
++# define ext2_getxattr NULL
++# define ext2_listxattr NULL
++# define ext2_removexattr NULL
++
++static inline int
++ext2_xattr_get(struct inode *inode, int name_index,
++ const char *name, void *buffer, size_t size)
++{
++ return -ENOTSUP;
++}
++
++static inline int
++ext2_xattr_list(struct inode *inode, char *buffer, size_t size)
++{
++ return -ENOTSUP;
++}
++
++static inline int
++ext2_xattr_set(struct inode *inode, int name_index, const char *name,
++ const void *value, size_t size, int flags)
++{
++ return -ENOTSUP;
++}
++
++static inline void
++ext2_xattr_delete_inode(struct inode *inode)
++{
++}
++
++static inline void
++ext2_xattr_put_super(struct super_block *sb)
++{
++}
++
++static inline int
++init_ext2_xattr(void)
++{
++ return 0;
++}
++
++static inline void
++exit_ext2_xattr(void)
++{
++}
++
++# endif /* CONFIG_EXT2_FS_XATTR */
++
++# ifdef CONFIG_EXT2_FS_XATTR_USER
++
++extern int init_ext2_xattr_user(void) __init;
++extern void exit_ext2_xattr_user(void);
++
++# else /* CONFIG_EXT2_FS_XATTR_USER */
++
++static inline int
++init_ext2_xattr_user(void)
++{
++ return 0;
++}
++
++static inline void
++exit_ext2_xattr_user(void)
++{
++}
++
++# endif /* CONFIG_EXT2_FS_XATTR_USER */
++
++#endif /* __KERNEL__ */
++
+--- linux-2.4.22-ac1/include/linux/ext3_fs.h~xattr-0.8.54-2.4.22-rh 2003-09-25 14:58:30.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/linux/ext3_fs.h 2003-09-25 23:57:02.000000000 +0400
+@@ -63,8 +63,6 @@
+ */
+ #define EXT3_BAD_INO 1 /* Bad blocks inode */
+ #define EXT3_ROOT_INO 2 /* Root inode */
+-#define EXT3_ACL_IDX_INO 3 /* ACL inode */
+-#define EXT3_ACL_DATA_INO 4 /* ACL inode */
+ #define EXT3_BOOT_LOADER_INO 5 /* Boot loader inode */
+ #define EXT3_UNDEL_DIR_INO 6 /* Undelete directory inode */
+ #define EXT3_RESIZE_INO 7 /* Reserved group descriptors inode */
+@@ -94,7 +92,6 @@
+ #else
+ # define EXT3_BLOCK_SIZE(s) (EXT3_MIN_BLOCK_SIZE << (s)->s_log_block_size)
+ #endif
+-#define EXT3_ACLE_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (struct ext3_acl_entry))
+ #define EXT3_ADDR_PER_BLOCK(s) (EXT3_BLOCK_SIZE(s) / sizeof (__u32))
+ #ifdef __KERNEL__
+ # define EXT3_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
+@@ -129,28 +126,6 @@
+ #endif
+
+ /*
+- * ACL structures
+- */
+-struct ext3_acl_header /* Header of Access Control Lists */
+-{
+- __u32 aclh_size;
+- __u32 aclh_file_count;
+- __u32 aclh_acle_count;
+- __u32 aclh_first_acle;
+-};
+-
+-struct ext3_acl_entry /* Access Control List Entry */
+-{
+- __u32 acle_size;
+- __u16 acle_perms; /* Access permissions */
+- __u16 acle_type; /* Type of entry */
+- __u16 acle_tag; /* User or group identity */
+- __u16 acle_pad1;
+- __u32 acle_next; /* Pointer on next entry for the */
+- /* same inode or on next free entry */
+-};
+-
+-/*
+ * Structure of a blocks group descriptor
+ */
+ struct ext3_group_desc
+@@ -346,6 +321,7 @@ struct ext3_inode {
+ #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */
+ #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */
+ #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */
++#define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */
+
+ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
+ #ifndef _LINUX_EXT2_FS_H
+@@ -523,7 +499,7 @@ struct ext3_super_block {
+ #define EXT3_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
+ #define EXT3_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
+
+-#define EXT3_FEATURE_COMPAT_SUPP 0
++#define EXT3_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+ #define EXT3_FEATURE_INCOMPAT_SUPP (EXT3_FEATURE_INCOMPAT_FILETYPE| \
+ EXT3_FEATURE_INCOMPAT_RECOVER)
+ #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+@@ -707,6 +683,7 @@ extern void ext3_check_inodes_bitmap (st
+ extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
+
+ /* inode.c */
++extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+ extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+ extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+
+@@ -776,8 +753,10 @@ extern struct address_space_operations e
+
+ /* namei.c */
+ extern struct inode_operations ext3_dir_inode_operations;
++extern struct inode_operations ext3_special_inode_operations;
+
+ /* symlink.c */
++extern struct inode_operations ext3_symlink_inode_operations;
+ extern struct inode_operations ext3_fast_symlink_inode_operations;
+
+
+--- linux-2.4.22-ac1/include/linux/ext3_jbd.h~xattr-0.8.54-2.4.22-rh 2003-09-25 14:55:12.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/linux/ext3_jbd.h 2003-09-25 23:57:02.000000000 +0400
+@@ -30,13 +30,19 @@
+
+ #define EXT3_SINGLEDATA_TRANS_BLOCKS 8U
+
++/* Extended attributes may touch two data buffers, two bitmap buffers,
++ * and two group and summaries. */
++
++#define EXT3_XATTR_TRANS_BLOCKS 8
++
+ /* Define the minimum size for a transaction which modifies data. This
+ * needs to take into account the fact that we may end up modifying two
+ * quota files too (one for the group, one for the user quota). The
+ * superblock only gets updated once, of course, so don't bother
+ * counting that again for the quota updates. */
+
+-#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS - 2)
++#define EXT3_DATA_TRANS_BLOCKS (3 * EXT3_SINGLEDATA_TRANS_BLOCKS + \
++ EXT3_XATTR_TRANS_BLOCKS - 2)
+
+ extern int ext3_writepage_trans_blocks(struct inode *inode);
+
+--- /dev/null 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.22-ac1-alexey/include/linux/ext3_xattr.h 2003-09-25 23:57:02.000000000 +0400
+@@ -0,0 +1,157 @@
++/*
++ File: linux/ext3_xattr.h
++
++ On-disk format of extended attributes for the ext3 filesystem.
++
++ (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++#include <linux/config.h>
++#include <linux/init.h>
++#include <linux/xattr.h>
++
++/* Magic value in attribute blocks */
++#define EXT3_XATTR_MAGIC 0xEA020000
++
++/* Maximum number of references to one attribute block */
++#define EXT3_XATTR_REFCOUNT_MAX 1024
++
++/* Name indexes */
++#define EXT3_XATTR_INDEX_MAX 10
++#define EXT3_XATTR_INDEX_USER 1
++#define EXT3_XATTR_INDEX_POSIX_ACL_ACCESS 2
++#define EXT3_XATTR_INDEX_POSIX_ACL_DEFAULT 3
++
++struct ext3_xattr_header {
++ __u32 h_magic; /* magic number for identification */
++ __u32 h_refcount; /* reference count */
++ __u32 h_blocks; /* number of disk blocks used */
++ __u32 h_hash; /* hash value of all attributes */
++ __u32 h_reserved[4]; /* zero right now */
++};
++
++struct ext3_xattr_entry {
++ __u8 e_name_len; /* length of name */
++ __u8 e_name_index; /* attribute name index */
++ __u16 e_value_offs; /* offset in disk block of value */
++ __u32 e_value_block; /* disk block attribute is stored on (n/i) */
++ __u32 e_value_size; /* size of attribute value */
++ __u32 e_hash; /* hash value of name and value */
++ char e_name[0]; /* attribute name */
++};
++
++#define EXT3_XATTR_PAD_BITS 2
++#define EXT3_XATTR_PAD (1<<EXT3_XATTR_PAD_BITS)
++#define EXT3_XATTR_ROUND (EXT3_XATTR_PAD-1)
++#define EXT3_XATTR_LEN(name_len) \
++ (((name_len) + EXT3_XATTR_ROUND + \
++ sizeof(struct ext3_xattr_entry)) & ~EXT3_XATTR_ROUND)
++#define EXT3_XATTR_NEXT(entry) \
++ ( (struct ext3_xattr_entry *)( \
++ (char *)(entry) + EXT3_XATTR_LEN((entry)->e_name_len)) )
++#define EXT3_XATTR_SIZE(size) \
++ (((size) + EXT3_XATTR_ROUND) & ~EXT3_XATTR_ROUND)
++
++#ifdef __KERNEL__
++
++# ifdef CONFIG_EXT3_FS_XATTR
++
++struct ext3_xattr_handler {
++ char *prefix;
++ size_t (*list)(char *list, struct inode *inode, const char *name,
++ int name_len);
++ int (*get)(struct inode *inode, const char *name, void *buffer,
++ size_t size);
++ int (*set)(struct inode *inode, const char *name, const void *buffer,
++ size_t size, int flags);
++};
++
++extern int ext3_xattr_register(int, struct ext3_xattr_handler *);
++extern void ext3_xattr_unregister(int, struct ext3_xattr_handler *);
++
++extern int ext3_setxattr(struct dentry *, const char *, const void *, size_t, int);
++extern ssize_t ext3_getxattr(struct dentry *, const char *, void *, size_t);
++extern ssize_t ext3_listxattr(struct dentry *, char *, size_t);
++extern int ext3_removexattr(struct dentry *, const char *);
++
++extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t);
++extern int ext3_xattr_list(struct inode *, char *, size_t);
++extern int ext3_xattr_set(handle_t *handle, struct inode *, int, const char *, const void *, size_t, int);
++
++extern void ext3_xattr_delete_inode(handle_t *, struct inode *);
++extern void ext3_xattr_put_super(struct super_block *);
++
++extern int init_ext3_xattr(void) __init;
++extern void exit_ext3_xattr(void);
++
++# else /* CONFIG_EXT3_FS_XATTR */
++# define ext3_setxattr NULL
++# define ext3_getxattr NULL
++# define ext3_listxattr NULL
++# define ext3_removexattr NULL
++
++static inline int
++ext3_xattr_get(struct inode *inode, int name_index, const char *name,
++ void *buffer, size_t size)
++{
++ return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_list(struct inode *inode, void *buffer, size_t size)
++{
++ return -ENOTSUP;
++}
++
++static inline int
++ext3_xattr_set(handle_t *handle, struct inode *inode, int name_index,
++ const char *name, const void *value, size_t size, int flags)
++{
++ return -ENOTSUP;
++}
++
++static inline void
++ext3_xattr_delete_inode(handle_t *handle, struct inode *inode)
++{
++}
++
++static inline void
++ext3_xattr_put_super(struct super_block *sb)
++{
++}
++
++static inline int
++init_ext3_xattr(void)
++{
++ return 0;
++}
++
++static inline void
++exit_ext3_xattr(void)
++{
++}
++
++# endif /* CONFIG_EXT3_FS_XATTR */
++
++# ifdef CONFIG_EXT3_FS_XATTR_USER
++
++extern int init_ext3_xattr_user(void) __init;
++extern void exit_ext3_xattr_user(void);
++
++# else /* CONFIG_EXT3_FS_XATTR_USER */
++
++static inline int
++init_ext3_xattr_user(void)
++{
++ return 0;
++}
++
++static inline void
++exit_ext3_xattr_user(void)
++{
++}
++
++#endif /* CONFIG_EXT3_FS_XATTR_USER */
++
++#endif /* __KERNEL__ */
++
+--- linux-2.4.22-ac1/include/linux/fs.h~xattr-0.8.54-2.4.22-rh 2003-09-25 14:45:32.000000000 +0400
++++ linux-2.4.22-ac1-alexey/include/linux/fs.h 2003-09-25 23:57:02.000000000 +0400
+@@ -918,7 +918,7 @@ struct inode_operations {
+ int (*setattr) (struct dentry *, struct iattr *);
+ int (*setattr_raw) (struct inode *, struct iattr *);
+ int (*getattr) (struct dentry *, struct iattr *);
+- int (*setxattr) (struct dentry *, const char *, void *, size_t, int);
++ int (*setxattr) (struct dentry *, const char *, const void *, size_t, int);
+ ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
+ ssize_t (*listxattr) (struct dentry *, char *, size_t);
+ int (*removexattr) (struct dentry *, const char *);
+--- /dev/null 2003-01-30 13:24:37.000000000 +0300
++++ linux-2.4.22-ac1-alexey/include/linux/mbcache.h 2003-09-25 23:57:02.000000000 +0400
+@@ -0,0 +1,69 @@
++/*
++ File: linux/mbcache.h
++
++ (C) 2001 by Andreas Gruenbacher, <a.gruenbacher@computer.org>
++*/
++
++/* Hardwire the number of additional indexes */
++#define MB_CACHE_INDEXES_COUNT 1
++
++struct mb_cache_entry;
++
++struct mb_cache_op {
++ int (*free)(struct mb_cache_entry *, int);
++};
++
++struct mb_cache {
++ struct list_head c_cache_list;
++ const char *c_name;
++ struct mb_cache_op c_op;
++ atomic_t c_entry_count;
++ int c_bucket_count;
++#ifndef MB_CACHE_INDEXES_COUNT
++ int c_indexes_count;
++#endif
++ kmem_cache_t *c_entry_cache;
++ struct list_head *c_block_hash;
++ struct list_head *c_indexes_hash[0];
++};
++
++struct mb_cache_entry_index {
++ struct list_head o_list;
++ unsigned int o_key;
++};
++
++struct mb_cache_entry {
++ struct list_head e_lru_list;
++ struct mb_cache *e_cache;
++ atomic_t e_used;
++ kdev_t e_dev;
++ unsigned long e_block;
++ struct list_head e_block_list;
++ struct mb_cache_entry_index e_indexes[0];
++};
++
++/* Functions on caches */
++
++struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t,
++ int, int);
++void mb_cache_shrink(struct mb_cache *, kdev_t);
++void mb_cache_destroy(struct mb_cache *);
++
++/* Functions on cache entries */
++
++struct mb_cache_entry *mb_cache_entry_alloc(struct mb_cache *);
++int mb_cache_entry_insert(struct mb_cache_entry *, kdev_t, unsigned long,
++ unsigned int[]);
++void mb_cache_entry_rehash(struct mb_cache_entry *, unsigned int[]);
++void mb_cache_entry_release(struct mb_cache_entry *);
++void mb_cache_entry_takeout(struct mb_cache_entry *);
++void mb_cache_entry_free(struct mb_cache_entry *);
++struct mb_cache_entry *mb_cache_entry_dup(struct mb_cache_entry *);
++struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *, kdev_t,
++ unsigned long);
++#if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0)
++struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, int,
++ kdev_t, unsigned int);
++struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache_entry *, int,
++ kdev_t, unsigned int);
++#endif
+--- linux-2.4.22-ac1/kernel/ksyms.c~xattr-0.8.54-2.4.22-rh 2003-09-25 14:42:46.000000000 +0400
++++ linux-2.4.22-ac1-alexey/kernel/ksyms.c 2003-09-26 00:19:05.000000000 +0400
+@@ -11,6 +11,7 @@
+
+ #include <linux/config.h>
+ #include <linux/slab.h>
++#include <linux/cache_def.h>
+ #include <linux/smp.h>
+ #include <linux/module.h>
+ #include <linux/blkdev.h>
+@@ -106,6 +107,7 @@ EXPORT_SYMBOL(exit_files);
+ EXPORT_SYMBOL(exit_fs);
+ EXPORT_SYMBOL(exit_sighand);
+ EXPORT_SYMBOL(unshare_files);
++EXPORT_SYMBOL(copy_fs_struct);
+
+ /* internal kernel memory management */
+ EXPORT_SYMBOL(_alloc_pages);
+@@ -124,6 +126,8 @@ EXPORT_SYMBOL(kmem_cache_validate);
+ EXPORT_SYMBOL(kmem_cache_alloc);
+ EXPORT_SYMBOL(kmem_cache_free);
+ EXPORT_SYMBOL(kmem_cache_size);
++EXPORT_SYMBOL(register_cache);
++EXPORT_SYMBOL(unregister_cache);
+ EXPORT_SYMBOL(kmalloc);
+ EXPORT_SYMBOL(kfree);
+ EXPORT_SYMBOL(vfree);
+--- linux-2.4.22-ac1/mm/vmscan.c~xattr-0.8.54-2.4.22-rh 2003-09-25 14:16:28.000000000 +0400
++++ linux-2.4.22-ac1-alexey/mm/vmscan.c 2003-09-25 23:57:02.000000000 +0400
+@@ -18,6 +18,7 @@
+ #include <linux/kernel_stat.h>
+ #include <linux/swap.h>
+ #include <linux/swapctl.h>
++#include <linux/cache_def.h>
+ #include <linux/smp_lock.h>
+ #include <linux/pagemap.h>
+ #include <linux/init.h>
+@@ -34,6 +35,39 @@
+ */
+ #define DEF_PRIORITY (6)
+
++static DECLARE_MUTEX(other_caches_sem);
++static LIST_HEAD(cache_definitions);
++
++void register_cache(struct cache_definition *cache)
++{
++ down(&other_caches_sem);
++ list_add(&cache->link, &cache_definitions);
++ up(&other_caches_sem);
++}
++
++void unregister_cache(struct cache_definition *cache)
++{
++ down(&other_caches_sem);
++ list_del(&cache->link);
++ up(&other_caches_sem);
++}
++
++static void shrink_other_caches(unsigned int priority, int gfp_mask)
++{
++ struct list_head *p;
++
++ if (down_trylock(&other_caches_sem))
++ return;
++
++ list_for_each_prev(p, &cache_definitions) {
++ struct cache_definition *cache =
++ list_entry(p, struct cache_definition, link);
++
++ cache->shrink(priority, gfp_mask);
++ }
++ up(&other_caches_sem);
++}
++
+ /*
+ * The swap-out function returns 1 if it successfully
+ * scanned all the pages it was asked to (`count').
+@@ -577,6 +611,7 @@ static int shrink_caches(zone_t * classz
+
+ shrink_dcache_memory(priority, gfp_mask);
+ shrink_icache_memory(priority, gfp_mask);
++ shrink_other_caches(priority, gfp_mask);
+ #ifdef CONFIG_QUOTA
+ shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
+ #endif
+
+_
--- /dev/null
+fs/ext3/dir.c
+fs/ext3/file.c
+fs/ext3/hash.c
+fs/ext3/Makefile
+fs/ext3/namei.c
+fs/ext3/super.c
+include/linux/ext3_fs.h
+include/linux/ext3_fs_sb.h
+include/linux/ext3_jbd.h
+include/linux/rbtree.h
+lib/rbtree.c
--- /dev/null
+fs/ext3/namei.c
+fs/ext3/namei.c.orig
+fs/ext3/super.c
+include/linux/ext3_fs_sb.h
+include/linux/ext3_fs_sb.h.orig
--- /dev/null
+fs/ext3/ialloc.c
+fs/ext3/inode.c
+fs/ext3/ioctl.c
+fs/ext3/namei.c
+include/linux/dcache.h
+include/linux/ext3_fs.h
--- /dev/null
+fs/ext3/ialloc.c
+fs/ext3/inode.c
+fs/ext3/ioctl.c
+fs/ext3/namei.c
+include/linux/dcache.h
+include/linux/ext3_fs.h
--- /dev/null
+fs/inode.c
+fs/Makefile
+mm/page_alloc.c
--- /dev/null
+include/linux/socket.h
+net/netsyms.c
+net/socket.c
--- /dev/null
+include/linux/skbuff.h
+include/net/tcp.h
+net/core/skbuff.c
+net/ipv4/tcp.c
+net/netsyms.c
--- /dev/null
+fs/dcache.c
+fs/exec.c
+fs/namei.c
+fs/namespace.c
+fs/open.c
+fs/stat.c
+include/linux/dcache.h
+include/linux/fs.h
+include/linux/fs_struct.h
+kernel/exit.c
+kernel/fork.c
+kernel/ksyms.c
--- /dev/null
+arch/alpha/defconfig
+arch/alpha/kernel/entry.S
+arch/arm/defconfig
+arch/arm/kernel/calls.S
+arch/i386/defconfig
+arch/ia64/defconfig
+arch/m68k/defconfig
+arch/mips64/defconfig
+arch/mips/defconfig
+arch/ppc64/kernel/misc.S
+arch/ppc/defconfig
+arch/s390/defconfig
+arch/s390/kernel/entry.S
+arch/s390x/defconfig
+arch/s390x/kernel/entry.S
+arch/s390x/kernel/wrapper32.S
+arch/sparc64/defconfig
+arch/sparc/defconfig
+Documentation/Configure.help
+fs/Config.in
+fs/ext2/file.c
+fs/ext2/ialloc.c
+fs/ext2/inode.c
+fs/ext2/Makefile
+fs/ext2/namei.c
+fs/ext2/super.c
+fs/ext2/symlink.c
+fs/ext2/xattr.c
+fs/ext2/xattr_user.c
+fs/ext3/ext3-exports.c
+fs/ext3/file.c
+fs/ext3/ialloc.c
+fs/ext3/inode.c
+fs/ext3/Makefile
+fs/ext3/namei.c
+fs/ext3/super.c
+fs/ext3/symlink.c
+fs/ext3/xattr.c
+fs/ext3/xattr_user.c
+fs/jfs/jfs_xattr.h
+fs/jfs/xattr.c
+fs/Makefile
+fs/mbcache.c
+include/asm-arm/unistd.h
+include/asm-ppc64/unistd.h
+include/asm-s390/unistd.h
+include/asm-s390x/unistd.h
+include/linux/cache_def.h
+include/linux/errno.h
+include/linux/ext2_fs.h
+include/linux/ext2_xattr.h
+include/linux/ext3_fs.h
+include/linux/ext3_jbd.h
+include/linux/ext3_xattr.h
+include/linux/fs.h
+include/linux/mbcache.h
+kernel/ksyms.c
+mm/vmscan.c
--- /dev/null
+dev_read_only_2.4.20-rh.patch
+exports_2.4.20-rh-hp.patch
+kmem_cache_validate_2.4.20.patch
+lustre_version.patch
+vfs_intent-2.4.22-rh.patch
+invalidate_show-2.4.20-rh.patch
+export-truncate.patch
+iod-stock-exports-2.4.22-rh.patch
+ext3-htree-2.4.22-rh.patch
+xattr-0.8.54-2.4.22-rh.patch
+ext3-orphan_lock-2.4.22-rh.patch
+ext3-noread-2.4.20.patch
+ext3_delete_thread_2.4.20_chaos.patch
+extN-wantedi-2.4.22-rh.patch
+ext3-san-2.4.20.patch
+ext3-map_inode_page.patch
+ext3-error-export.patch
+iopen-2.4.20.patch
+tcp-zero-copy-2.4.22-rh.patch
+jbd-dont-account-blocks-twice.patch
+jbd-commit-tricks.patch
+add_page_private.patch
+socket-exports-2.4.22-rh.patch
--- /dev/null
+KERNEL=linux-2.4.20-20.9.tar.gz
+SERIES=rh-2.4.20
+CONFIG=linux-2.4.20-rh-i686-smp.config
+VERSION=2.4.20
+EXTRA_VERSION=20.9
if test "${with_gm}" = yes; then
with_gm="-I/usr/local/gm/include"
else
- with_gm=-I"$with_gm/include"
+ with_gm="-I$with_gm/include -I$with_gm/drivers -I$with_gm/drivers/linux/gm"
fi
GMNAL="gmnal"
else
extern unsigned int portal_stack;
extern unsigned int portal_debug;
extern unsigned int portal_printk;
+extern unsigned int portal_cerror;
/* Debugging subsystems (32 bits, non-overlapping) */
#define S_UNDEFINED (1 << 0)
#define S_MDC (1 << 1)
#if 1
#define CDEBUG(mask, format, a...) \
do { \
+ if (portal_cerror == 0) \
+ break; \
CHECK_STACK(CDEBUG_STACK); \
if (!(mask) || ((mask) & (D_ERROR | D_EMERG)) || \
(portal_debug & (mask) && \
const int line);
#define LASSERT(e) ((e) ? 0 : kportal_assertion_failed( #e , __FILE__, \
__FUNCTION__, __LINE__))
+/* it would be great to dump_stack() here, but some kernels
+ * export it as show_stack() and I can't be bothered to
+ * proprely engage in that dance right now */
+#define LASSERTF(cond, fmt...) \
+ do { \
+ if (unlikely(!(cond))) { \
+ portals_debug_msg(0, D_EMERG, __FILE__, __FUNCTION__,\
+ __LINE__, CDEBUG_STACK, \
+ "ASSERTION(" #cond ") failed:" fmt);\
+ LBUG(); \
+ } \
+ } while (0)
+
#else
#define LASSERT(e)
+#define LASSERTF(cond, fmt...) do { } while (0)
#endif
#ifdef __arch_um__
} kpr_fwd_desc_t;
typedef void (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd);
+typedef void (*kpr_notify_t)(void *arg, ptl_nid_t peer, int alive);
/* NAL's routing interface (Kernel Portals Routing Nal Interface) */
typedef const struct {
int kprni_nalid; /* NAL's id */
void *kprni_arg; /* Arg to pass when calling into NAL */
kpr_fwd_t kprni_fwd; /* NAL's forwarding entrypoint */
+ kpr_notify_t kprni_notify; /* NAL's notification entrypoint */
} kpr_nal_interface_t;
/* Router's routing interface (Kernel Portals Routing Router Interface) */
int (*kprri_register) (kpr_nal_interface_t *nal_interface,
void **router_arg);
- /* ask the router to find a gateway that forwards to 'nid' and is a peer
- * of the calling NAL */
- int (*kprri_lookup) (void *router_arg, ptl_nid_t nid,
+ /* ask the router to find a gateway that forwards to 'nid' and is a
+ * peer of the calling NAL; assume caller will send 'nob' bytes of
+ * payload there */
+ int (*kprri_lookup) (void *router_arg, ptl_nid_t nid, int nob,
ptl_nid_t *gateway_nid);
/* hand a packet over to the router for forwarding */
void (*kprri_fwd_done) (void *router_arg, kpr_fwd_desc_t *fwd,
int error);
+ /* notify the router about peer state */
+ void (*kprri_notify) (void *router_arg, ptl_nid_t peer,
+ int alive, time_t when);
+
/* the calling NAL is shutting down */
void (*kprri_shutdown) (void *router_arg);
typedef const struct {
int (*kprci_add_route)(int gateway_nal, ptl_nid_t gateway_nid,
ptl_nid_t lo_nid, ptl_nid_t hi_nid);
- int (*kprci_del_route)(ptl_nid_t nid);
+ int (*kprci_del_route)(int gateway_nal, ptl_nid_t gateway_nid,
+ ptl_nid_t lo_nid, ptl_nid_t hi_nid);
int (*kprci_get_route)(int index, int *gateway_nal,
- ptl_nid_t *gateway, ptl_nid_t *lo_nid,
- ptl_nid_t *hi_nid);
+ ptl_nid_t *gateway,
+ ptl_nid_t *lo_nid, ptl_nid_t *hi_nid,
+ int *alive);
+ int (*kprci_notify)(int gateway_nal, ptl_nid_t gateway_nid,
+ int alive, time_t when);
} kpr_control_interface_t;
extern kpr_control_interface_t kpr_control_interface;
}
static inline int
-kpr_lookup (kpr_router_t *router, ptl_nid_t nid, ptl_nid_t *gateway_nid)
+kpr_lookup (kpr_router_t *router, ptl_nid_t nid, int nob, ptl_nid_t *gateway_nid)
{
if (!kpr_routing (router))
- return (-EHOSTUNREACH);
+ return (-ENETUNREACH);
- return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid,
+ return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid, nob,
gateway_nid));
}
kpr_fwd_start (kpr_router_t *router, kpr_fwd_desc_t *fwd)
{
if (!kpr_routing (router))
- fwd->kprfd_callback (fwd->kprfd_callback_arg, -EHOSTUNREACH);
+ fwd->kprfd_callback (fwd->kprfd_callback_arg, -ENETUNREACH);
else
router->kpr_interface->kprri_fwd_start (router->kpr_arg, fwd);
}
}
static inline void
+kpr_notify (kpr_router_t *router,
+ ptl_nid_t peer, int alive, time_t when)
+{
+ if (!kpr_routing (router))
+ return;
+
+ router->kpr_interface->kprri_notify(router->kpr_arg, peer, alive, when);
+}
+
+static inline void
kpr_shutdown (kpr_router_t *router)
{
if (kpr_routing (router))
#endif /* PORTALS_PROFILING */
/* debug.c */
+void portals_run_upcall(char **argv);
void portals_run_lbug_upcall(char * file, const char *fn, const int line);
void portals_debug_dumplog(void);
int portals_debug_init(unsigned long bufsize);
# undef NDEBUG
# include <assert.h>
# define LASSERT(e) assert(e)
+# define LASSERTF(cond, args...) assert(cond)
# else
# define LASSERT(e)
+# define LASSERTF(cond, args...) do { } while (0)
# endif
# define printk(format, args...) printf (format, ## args)
# define PORTAL_ALLOC(ptr, size) do { (ptr) = malloc(size); } while (0);
# define CURRENT_TIME time(0)
#endif
+/******************************************************************************/
+/* Light-weight trace
+ * Support for temporary event tracing with minimal Heisenberg effect. */
+#define LWT_SUPPORT 1
+
+typedef struct {
+ cycles_t lwte_when;
+ char *lwte_where;
+ void *lwte_task;
+ long lwte_p1;
+ long lwte_p2;
+ long lwte_p3;
+ long lwte_p4;
+} lwt_event_t;
+
+#if LWT_SUPPORT
+#ifdef __KERNEL__
+#define LWT_EVENTS_PER_PAGE (PAGE_SIZE / sizeof (lwt_event_t))
+
+typedef struct _lwt_page {
+ struct list_head lwtp_list;
+ struct page *lwtp_page;
+ lwt_event_t *lwtp_events;
+} lwt_page_t;
+
+typedef struct {
+ int lwtc_current_index;
+ lwt_page_t *lwtc_current_page;
+} lwt_cpu_t;
+
+extern int lwt_enabled;
+extern lwt_cpu_t lwt_cpus[];
+
+extern int lwt_init (void);
+extern void lwt_fini (void);
+extern int lwt_lookup_string (int *size, char *knlptr,
+ char *usrptr, int usrsize);
+extern int lwt_control (int enable, int clear);
+extern int lwt_snapshot (int *ncpu, int *total_size,
+ void *user_ptr, int user_size);
+
+/* Note that we _don't_ define LWT_EVENT at all if LWT_SUPPORT isn't set.
+ * This stuff is meant for finding specific problems; it never stays in
+ * production code... */
+
+#define LWTSTR(n) #n
+#define LWTWHERE(f,l) f ":" LWTSTR(l)
+
+#define LWT_EVENT(p1, p2, p3, p4) \
+do { \
+ unsigned long flags; \
+ lwt_cpu_t *cpu; \
+ lwt_page_t *p; \
+ lwt_event_t *e; \
+ \
+ local_irq_save (flags); \
+ \
+ if (lwt_enabled) { \
+ cpu = &lwt_cpus[smp_processor_id()]; \
+ p = cpu->lwtc_current_page; \
+ e = &p->lwtp_events[cpu->lwtc_current_index++]; \
+ \
+ if (cpu->lwtc_current_index >= LWT_EVENTS_PER_PAGE) { \
+ cpu->lwtc_current_page = \
+ list_entry (p->lwtp_list.next, \
+ lwt_page_t, lwtp_list); \
+ cpu->lwtc_current_index = 0; \
+ } \
+ \
+ e->lwte_when = get_cycles(); \
+ e->lwte_where = LWTWHERE(__FILE__,__LINE__); \
+ e->lwte_task = current; \
+ e->lwte_p1 = (long)(p1); \
+ e->lwte_p2 = (long)(p2); \
+ e->lwte_p3 = (long)(p3); \
+ e->lwte_p4 = (long)(p4); \
+ } \
+ \
+ local_irq_restore (flags); \
+} while (0)
+#else /* __KERNEL__ */
+#define LWT_EVENT(p1,p2,p3,p4) /* no userland implementation yet */
+#endif /* __KERNEL__ */
+#endif /* LWT_SUPPORT */
+
+
#include <linux/portals_lib.h>
/*
#define IOC_PORTAL_GET_NID _IOWR('e', 39, long)
#define IOC_PORTAL_FAIL_NID _IOWR('e', 40, long)
#define IOC_PORTAL_SET_DAEMON _IOWR('e', 41, long)
-
-#define IOC_PORTAL_MAX_NR 41
+#define IOC_PORTAL_NOTIFY_ROUTER _IOWR('e', 42, long)
+#define IOC_PORTAL_LWT_CONTROL _IOWR('e', 43, long)
+#define IOC_PORTAL_LWT_SNAPSHOT _IOWR('e', 44, long)
+#define IOC_PORTAL_LWT_LOOKUP_STRING _IOWR('e', 45, long)
+#define IOC_PORTAL_MAX_NR 45
enum {
QSWNAL = 1,
int jt_ptl_nagle (int argc, char **argv);
int jt_ptl_add_route (int argc, char **argv);
int jt_ptl_del_route (int argc, char **argv);
+int jt_ptl_notify_router (int argc, char **argv);
int jt_ptl_print_routes (int argc, char **argv);
int jt_ptl_fail_nid (int argc, char **argv);
+int jt_ptl_lwt(int argc, char **argv);
int dbg_initialize(int argc, char **argv);
int jt_dbg_filter(int argc, char **argv);
modulenet_DATA = kgmnal.o
EXTRA_PROGRAMS = kgmnal
-DEFS =
-kgmnal_SOURCES = gmnal.c gmnal_cb.c gmnal.h
+DEFS = -DGM_KERNEL
+kgmnal_SOURCES = gmnal.h gmnal_api.c gmnal_cb.c gmnal_comm.c gmnal_utils.c gmnal_module.c
+++ /dev/null
-diff -ru gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c
---- gm-1.5.2.1_Linux/drivers/linux/gm/gm_arch.c Mon Jul 1 10:35:09 2002
-+++ gm-1.5.2.1_Linux-cfs/drivers/linux/gm/gm_arch.c Thu Sep 19 14:19:38 2002
-@@ -30,6 +30,8 @@
- *
- ************************************************************************/
-
-+#define EXPORT_SYMTAB
-+
- #include <linux/config.h>
- #include <linux/module.h>
-
-@@ -4075,6 +4077,28 @@
- return 0;
- }
-
-+EXPORT_SYMBOL(gm_blocking_receive_no_spin);
-+EXPORT_SYMBOL(gm_close);
-+EXPORT_SYMBOL(gm_dma_free);
-+EXPORT_SYMBOL(gm_dma_malloc);
-+EXPORT_SYMBOL(gm_drop_sends);
-+EXPORT_SYMBOL(gm_finalize);
-+EXPORT_SYMBOL(gm_get_node_id);
-+EXPORT_SYMBOL(gm_init);
-+EXPORT_SYMBOL(gm_initialize_alarm);
-+EXPORT_SYMBOL(gm_max_node_id_in_use);
-+EXPORT_SYMBOL(gm_min_size_for_length);
-+EXPORT_SYMBOL(gm_num_receive_tokens);
-+EXPORT_SYMBOL(gm_num_send_tokens);
-+EXPORT_SYMBOL(gm_open);
-+EXPORT_SYMBOL(gm_provide_receive_buffer);
-+EXPORT_SYMBOL(gm_resume_sending);
-+EXPORT_SYMBOL(gm_send_with_callback);
-+EXPORT_SYMBOL(gm_set_acceptable_sizes);
-+EXPORT_SYMBOL(gm_set_alarm);
-+EXPORT_SYMBOL(gm_unknown);
-+
-+
- /*
- This file uses GM standard indentation.
-
-Only in gm-1.5.2.1_Linux-cfs/drivers/linux/gm: gm_arch.c~
-Only in gm-1.5.2.1_Linux-cfs/: trace
+++ /dev/null
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Based on ksocknal and qswnal
- *
- * Copyright (C) 2002 Cluster File Systems, Inc.
- * Author: Robert Read <rread@datarithm.net>
- *
- * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- *
- * Portals is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * Portals is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with Portals; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include "gmnal.h"
-
-ptl_handle_ni_t kgmnal_ni;
-nal_t kgmnal_api;
-
-kgmnal_data_t kgmnal_data;
-int gmnal_debug = 0;
-
-kpr_nal_interface_t kqswnal_router_interface = {
- kprni_nalid: GMNAL,
- kprni_arg: NULL,
- kprni_fwd: kgmnal_fwd_packet,
-};
-
-static int kgmnal_forward(nal_t *nal,
- int id,
- void *args, size_t args_len,
- void *ret, size_t ret_len)
-{
- kgmnal_data_t *k = nal->nal_data;
- nal_cb_t *nal_cb = k->kgm_cb;
-
- LASSERT (nal == &kgmnal_api);
- LASSERT (k == &kgmnal_data);
- LASSERT (nal_cb == &kgmnal_lib);
-
- lib_dispatch(nal_cb, k, id, args, ret); /* nal needs k */
- return PTL_OK;
-}
-
-static void kgmnal_lock(nal_t *nal, unsigned long *flags)
-{
- kgmnal_data_t *k = nal->nal_data;
- nal_cb_t *nal_cb = k->kgm_cb;
-
-
- LASSERT (nal == &kgmnal_api);
- LASSERT (k == &kgmnal_data);
- LASSERT (nal_cb == &kgmnal_lib);
-
- nal_cb->cb_cli(nal_cb,flags);
-}
-
-static void kgmnal_unlock(nal_t *nal, unsigned long *flags)
-{
- kgmnal_data_t *k = nal->nal_data;
- nal_cb_t *nal_cb = k->kgm_cb;
-
-
- LASSERT (nal == &kgmnal_api);
- LASSERT (k == &kgmnal_data);
- LASSERT (nal_cb == &kgmnal_lib);
-
- nal_cb->cb_sti(nal_cb,flags);
-}
-
-static int kgmnal_shutdown(nal_t *nal, int ni)
-{
- LASSERT (nal == &kgmnal_api);
- return 0;
-}
-
-static void kgmnal_yield( nal_t *nal )
-{
- LASSERT (nal == &kgmnal_api);
-
- if (current->need_resched)
- schedule();
- return;
-}
-
-kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *data,int ndx)
-{
- kgmnal_rx_t *conn;
-
- PORTAL_ALLOC(conn, sizeof(kgmnal_rx_t));
- /* Check for out of mem here */
- if (conn==NULL) {
- printk("kgm_add_recv: memory alloc failed\n");
- return NULL;
- }
-
- list_add(&conn->krx_item,(struct list_head *)&data->kgm_list);
- // conn->ndx=ndx;
- // conn->len=conn->ptlhdr_copied=0;
- // conn->loopback=0;
- return conn;
-}
-
-static nal_t *kgmnal_init(int interface, ptl_pt_index_t ptl_size,
- ptl_ac_index_t ac_size, ptl_pid_t requested_pid)
-{
- unsigned int nnids;
-
- gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids);
-
- CDEBUG(D_NET, "calling lib_init with nid 0x%Lx of %d\n",
- kgmnal_data.kgm_nid, nnids);
- lib_init(&kgmnal_lib, kgmnal_data.kgm_nid, 0, nnids,ptl_size, ac_size);
- return &kgmnal_api;
-}
-
-static void /*__exit*/
-kgmnal_finalize(void)
-{
- struct list_head *tmp;
-
- PORTAL_SYMBOL_UNREGISTER (kgmnal_ni);
- PtlNIFini(kgmnal_ni);
- lib_fini(&kgmnal_api);
-
- if (kgmnal_data.kgm_port) {
- gm_close(kgmnal_data.kgm_port);
- }
-
- /* FIXME: free dma buffers */
- /* FIXME: kill receiver thread */
-
- PORTAL_FREE (kgmnal_data.kgm_trans, bsizeof(kgmnal_tx_t)*TXMSGS);
-
- list_for_each(tmp, &kgmnal_data.kgm_list) {
- kgmnal_rx_t *conn;
- conn = list_entry(tmp, kgmnal_rx_t, krx_item);
- CDEBUG(D_IOCTL, "freeing conn %p\n",conn);
- tmp = tmp->next;
- list_del(&conn->krx_item);
- PORTAL_FREE(conn, sizeof(*conn));
- }
-
- CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read (&portal_kmemory));
-
- return;
-}
-
-static int __init
-kgmnal_initialize(void)
-{
- int rc;
- int ntok;
- unsigned long sizemask;
- unsigned int nid;
-
- CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory));
-
- kgmnal_api.forward = kgmnal_forward;
- kgmnal_api.shutdown = kgmnal_shutdown;
- kgmnal_api.yield = kgmnal_yield;
- kgmnal_api.validate = NULL; /* our api validate is a NOOP */
- kgmnal_api.lock= kgmnal_lock;
- kgmnal_api.unlock= kgmnal_unlock;
- kgmnal_api.nal_data = &kgmnal_data;
-
- kgmnal_lib.nal_data = &kgmnal_data;
-
- memset(&kgmnal_data, 0, sizeof(kgmnal_data));
-
- INIT_LIST_HEAD(&kgmnal_data.kgm_list);
- kgmnal_data.kgm_cb = &kgmnal_lib;
-
- /* Allocate transmit descriptors */
- PORTAL_ALLOC (kgmnal_data.kgm_trans, sizeof(kgmnal_tx_t)*TXMSGS);
- if (kgmnal_data.kgm_trans==NULL) {
- printk("kgmnal: init: failed to allocate transmit "
- "descriptors\n");
- return -1;
- }
- memset(kgmnal_data.kgm_trans,-1,sizeof(kgmnal_tx_t)*(TXMSGS));
-
- spin_lock_init(&kgmnal_data.kgm_dispatch_lock);
- spin_lock_init(&kgmnal_data.kgm_update_lock);
- spin_lock_init(&kgmnal_data.kgm_send_lock);
-
- /* Do the receiver and xmtr allocation */
-
- rc = gm_init();
- if (rc != GM_SUCCESS) {
- CERROR("gm_init failed: %d\n", rc);
- return -1;
- }
-
- rc = gm_open(&kgmnal_data.kgm_port, 0 , KGM_PORT_NUM, KGM_HOSTNAME,
- GM_API_VERSION_1_1);
- if (rc != GM_SUCCESS) {
- gm_finalize();
- kgmnal_data.kgm_port = NULL;
- CERROR("gm_open failed: %d\n", rc);
- return -1;
- }
- gm_get_node_id(kgmnal_data.kgm_port, &nid);
- kgmnal_data.kgm_nid = nid;
- /* Allocate 2 different sizes of buffers. For new, use half
- the tokens for each. */
- ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2;
- CDEBUG(D_NET, "gmnal_init: creating %d large %d byte recv buffers\n",
- ntok, MSG_LEN_LARGE);
- while (ntok-- > 0) {
- void * buffer = gm_dma_malloc(kgmnal_data.kgm_port,
- MSG_LEN_LARGE);
- if (buffer == NULL) {
- CERROR("gm_init failed: %d\n", rc);
- return (-ENOMEM);
- }
- CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d "
- "pri %d\n ", kgmnal_data.kgm_port, buffer,
- MSG_LEN_LARGE, MSG_SIZE_LARGE, GM_LOW_PRIORITY);
-
- gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer,
- MSG_SIZE_LARGE, GM_LOW_PRIORITY);
- }
-
- ntok = gm_num_receive_tokens(kgmnal_data.kgm_port)/2;
- CDEBUG(D_NET, "gmnal_init: creating %d small %d byte recv buffers\n",
- ntok, MSG_LEN_SMALL);
- while (ntok-- > 0) {
- void * buffer = gm_dma_malloc(kgmnal_data.kgm_port,
- MSG_LEN_SMALL);
- if (buffer == NULL) {
- CERROR("gm_init failed: %d\n", rc);
- return (-ENOMEM);
- }
- CDEBUG(D_NET, " add buffer: port %p buf %p len %d size %d "
- "pri %d\n ", kgmnal_data.kgm_port, buffer,
- MSG_LEN_SMALL, MSG_SIZE_SMALL, GM_LOW_PRIORITY);
-
- gm_provide_receive_buffer(kgmnal_data.kgm_port, buffer,
- MSG_SIZE_SMALL, GM_LOW_PRIORITY);
-
- }
- sizemask = (1 << MSG_SIZE_LARGE) | (1 << MSG_SIZE_SMALL);
- CDEBUG(D_NET, "gm_set_acceptable_sizes port %p pri %d mask 0x%x\n",
- kgmnal_data.kgm_port, GM_LOW_PRIORITY, sizemask);
- gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_LOW_PRIORITY,
- sizemask);
- gm_set_acceptable_sizes(kgmnal_data.kgm_port, GM_HIGH_PRIORITY, 0);
-
- /* Initialize Network Interface */
- rc = PtlNIInit(kgmnal_init, 32, 4, 0, &kgmnal_ni);
- if (rc) {
- CERROR("PtlNIInit failed %d\n", rc);
- return (-ENOMEM);
- }
-
- /* Start receiver thread */
- kernel_thread(kgmnal_recv_thread, &kgmnal_data, 0);
-
- PORTAL_SYMBOL_REGISTER(kgmnal_ni);
-
- kgmnal_data.kgm_init = 1;
-
- return 0;
-}
-
-MODULE_AUTHOR("Robert Read <rread@datarithm.net>");
-MODULE_DESCRIPTION("Kernel Myrinet GM NAL v0.1");
-MODULE_LICENSE("GPL");
-
-module_init (kgmnal_initialize);
-module_exit (kgmnal_finalize);
-
-EXPORT_SYMBOL (kgmnal_ni);
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (c) 2003 Los Alamos National Laboratory (LANL)
+ *
+ * This file is part of Lustre, http://www.lustre.org/
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#ifndef _GMNAL_H
-#define _GMNAL_H
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/errno.h>
-#include <linux/locks.h>
-#include <linux/unistd.h>
-#include <linux/init.h>
-
-#include <asm/system.h>
-#include <asm/uaccess.h>
-
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/list.h>
-#include <asm/uaccess.h>
-#include <asm/segment.h>
+
+
+/*
+ * Portals GM kernel NAL header file
+ * This file makes all declaration and prototypes
+ * for the API side and CB side of the NAL
+ */
+#ifndef __INCLUDE_GMNAL_H__
+#define __INCLUDE_GMNAL_H__
+
+#include "linux/config.h"
+#include "linux/module.h"
+#include "linux/tty.h"
+#include "linux/kernel.h"
+#include "linux/mm.h"
+#include "linux/string.h"
+#include "linux/stat.h"
+#include "linux/errno.h"
+#include "linux/locks.h"
+#include "linux/unistd.h"
+#include "linux/init.h"
+#include "linux/sem.h"
+#include "linux/vmalloc.h"
+#ifdef MODVERSIONS
+#include <linux/modversions.h>
+#endif
#define DEBUG_SUBSYSTEM S_GMNAL
-#include <linux/kp30.h>
-#include <portals/p30.h>
-#include <portals/lib-p30.h>
+#include "portals/nal.h"
+#include "portals/api.h"
+#include "portals/errno.h"
+#include "linux/kp30.h"
+#include "portals/p30.h"
+
+#include "portals/lib-nal.h"
+#include "portals/lib-p30.h"
+
+#define GM_STRONG_TYPES 1
+#include "gm.h"
+#include "gm_internal.h"
+
+
+/*
+ * Defines for the API NAL
+ */
+
+/*
+ * Small message size is configurable
+ * insmod can set small_msg_size
+ * which is used to populate nal_data.small_msg_size
+ */
+#define GMNAL_SMALL_MESSAGE 1078
+#define GMNAL_LARGE_MESSAGE_INIT 1079
+#define GMNAL_LARGE_MESSAGE_ACK 1080
+#define GMNAL_LARGE_MESSAGE_FINI 1081
+
+extern int gmnal_small_msg_size;
+extern int num_rx_threads;
+extern int num_stxds;
+#define GMNAL_SMALL_MSG_SIZE(a) a->small_msg_size
+#define GMNAL_IS_SMALL_MESSAGE(n,a,b,c) gmnal_is_small_msg(n, a, b, c)
+#define GMNAL_MAGIC 0x1234abcd
+
+
+/*
+ * Small Transmit Descriptor
+ * A structre to keep track of a small transmit operation
+ * This structure has a one-to-one relationship with a small
+ * transmit buffer (both create by gmnal_stxd_alloc).
+ * There are two free list of stxd. One for use by clients of the NAL
+ * and the other by the NAL rxthreads when doing sends.
+ * This helps prevent deadlock caused by stxd starvation.
+ */
+typedef struct _gmnal_stxd_t {
+ void *buffer;
+ int buffer_size;
+ gm_size_t gm_size;
+ int msg_size;
+ int gm_target_node;
+ int gm_priority;
+ int type;
+ struct _gmnal_data_t *nal_data;
+ lib_msg_t *cookie;
+ int niov;
+ struct iovec iov[PTL_MD_MAX_IOV];
+ struct _gmnal_srxd_t *srxd;
+ struct _gmnal_stxd_t *next;
+ int rxt;
+ int kniov;
+ struct iovec *iovec_dup;
+} gmnal_stxd_t;
+
+/*
+ * as for gmnal_stxd_t
+ * a hash table in nal_data find srxds from
+ * the rx buffer address. hash table populated at init time
+ */
+typedef struct _gmnal_srxd_t {
+ void *buffer;
+ int size;
+ gm_size_t gmsize;
+ unsigned int gm_source_node;
+ gmnal_stxd_t *source_stxd;
+ int type;
+ int nsiov;
+ int nriov;
+ struct iovec *riov;
+ int ncallbacks;
+ spinlock_t callback_lock;
+ int callback_status;
+ lib_msg_t *cookie;
+ struct _gmnal_srxd_t *next;
+ struct _gmnal_data_t *nal_data;
+} gmnal_srxd_t;
+
+/*
+ * Header which lmgnal puts at the start of each message
+ */
+typedef struct _gmnal_msghdr {
+ int magic;
+ int type;
+ unsigned int sender_node_id;
+ gmnal_stxd_t *stxd;
+ int niov;
+ } gmnal_msghdr_t;
+#define GMNAL_MSGHDR_SIZE sizeof(gmnal_msghdr_t)
+
+/*
+ * the caretaker thread (ct_thread) gets receive events
+ * (and other events) from the myrinet device via the GM2 API.
+ * caretaker thread populates one work entry for each receive event,
+ * puts it on a Q in nal_data and wakes a receive thread to
+ * process the receive.
+ * Processing a portals receive can involve a transmit operation.
+ * Because of this the caretaker thread cannot process receives
+ * as it may get deadlocked when supply of transmit descriptors
+ * is exhausted (as caretaker thread is responsible for replacing
+ * transmit descriptors on the free list)
+ */
+typedef struct _gmnal_rxtwe {
+ gm_recv_event_t *rx;
+ struct _gmnal_rxtwe *next;
+} gmnal_rxtwe_t;
+
+/*
+ * 1 receive thread started on each CPU
+ */
+#define NRXTHREADS 10 /* max number of receiver threads */
+
+typedef struct _gmnal_data_t {
+ int refcnt;
+ spinlock_t cb_lock;
+ spinlock_t stxd_lock;
+ struct semaphore stxd_token;
+ gmnal_stxd_t *stxd;
+ spinlock_t rxt_stxd_lock;
+ struct semaphore rxt_stxd_token;
+ gmnal_stxd_t *rxt_stxd;
+ spinlock_t srxd_lock;
+ struct semaphore srxd_token;
+ gmnal_srxd_t *srxd;
+ struct gm_hash *srxd_hash;
+ nal_t *nal;
+ nal_cb_t *nal_cb;
+ struct gm_port *gm_port;
+ unsigned int gm_local_nid;
+ unsigned int gm_global_nid;
+ spinlock_t gm_lock;
+ long rxthread_pid[NRXTHREADS];
+ int rxthread_stop_flag;
+ spinlock_t rxthread_flag_lock;
+ long rxthread_flag;
+ long ctthread_pid;
+ int ctthread_flag;
+ gm_alarm_t ctthread_alarm;
+ int small_msg_size;
+ int small_msg_gmsize;
+ gmnal_rxtwe_t *rxtwe_head;
+ gmnal_rxtwe_t *rxtwe_tail;
+ spinlock_t rxtwe_lock;
+ struct semaphore rxtwe_wait;
+} gmnal_data_t;
+
+/*
+ * Flags to start/stop and check status of threads
+ * each rxthread sets 1 bit (any bit) of the flag on startup
+ * and clears 1 bit when exiting
+ */
+#define GMNAL_THREAD_RESET 0
+#define GMNAL_THREAD_STOP 666
+#define GMNAL_CTTHREAD_STARTED 333
+#define GMNAL_RXTHREADS_STARTED ( (1<<num_rx_threads)-1)
+
+
+extern gmnal_data_t *global_nal_data;
+
+/*
+ * The gm_port to use for gmnal
+ */
+#define GMNAL_GM_PORT 4
+
+/*
+ * for ioctl get pid
+ */
+#define GMNAL_IOC_GET_GNID 1
+
+/*
+ * Return codes
+ */
+#define GMNAL_STATUS_OK 0
+#define GMNAL_STATUS_FAIL 1
+#define GMNAL_STATUS_NOMEM 2
-#include <gm.h>
+/*
+ * FUNCTION PROTOTYPES
+ */
+
+/*
+ * Locking macros
+ */
/*
- * Myrinet GM NAL
+ * For the Small tx and rx descriptor lists
*/
-#define NPAGES_LARGE 16
-#define NPAGES_SMALL 1
-#define MSG_LEN_LARGE NPAGES_LARGE*PAGE_SIZE
-#define MSG_LEN_SMALL NPAGES_SMALL*PAGE_SIZE
-#define MSG_SIZE_LARGE (gm_min_size_for_length(MSG_LEN_LARGE))
-#define MSG_SIZE_SMALL (gm_min_size_for_length(MSG_LEN_SMALL))
+#define GMNAL_TXD_LOCK_INIT(a) spin_lock_init(&a->stxd_lock);
+#define GMNAL_TXD_LOCK(a) spin_lock(&a->stxd_lock);
+#define GMNAL_TXD_UNLOCK(a) spin_unlock(&a->stxd_lock);
+#define GMNAL_TXD_TOKEN_INIT(a, n) sema_init(&a->stxd_token, n);
+#define GMNAL_TXD_GETTOKEN(a) down(&a->stxd_token);
+#define GMNAL_TXD_TRYGETTOKEN(a) down_trylock(&a->stxd_token)
+#define GMNAL_TXD_RETURNTOKEN(a) up(&a->stxd_token);
-#define TXMSGS 64 /* Number of Transmit Messages */
-#define ENVELOPES 8 /* Number of outstanding receive msgs */
+#define GMNAL_RXT_TXD_LOCK_INIT(a) spin_lock_init(&a->rxt_stxd_lock);
+#define GMNAL_RXT_TXD_LOCK(a) spin_lock(&a->rxt_stxd_lock);
+#define GMNAL_RXT_TXD_UNLOCK(a) spin_unlock(&a->rxt_stxd_lock);
+#define GMNAL_RXT_TXD_TOKEN_INIT(a, n) sema_init(&a->rxt_stxd_token, n);
+#define GMNAL_RXT_TXD_GETTOKEN(a) down(&a->rxt_stxd_token);
+#define GMNAL_RXT_TXD_TRYGETTOKEN(a) down_trylock(&a->rxt_stxd_token)
+#define GMNAL_RXT_TXD_RETURNTOKEN(a) up(&a->rxt_stxd_token);
-#define KGM_PORT_NUM 3
-#define KGM_HOSTNAME "kgmnal"
+#define GMNAL_RXD_LOCK_INIT(a) spin_lock_init(&a->srxd_lock);
+#define GMNAL_RXD_LOCK(a) spin_lock(&a->srxd_lock);
+#define GMNAL_RXD_UNLOCK(a) spin_unlock(&a->srxd_lock);
+#define GMNAL_RXD_TOKEN_INIT(a, n) sema_init(&a->srxd_token, n);
+#define GMNAL_RXD_GETTOKEN(a) down(&a->srxd_token);
+#define GMNAL_RXD_TRYGETTOKEN(a) down_trylock(&a->srxd_token)
+#define GMNAL_RXD_RETURNTOKEN(a) up(&a->srxd_token);
+#define GMNAL_GM_LOCK_INIT(a) spin_lock_init(&a->gm_lock);
+#define GMNAL_GM_LOCK(a) spin_lock(&a->gm_lock);
+#define GMNAL_GM_UNLOCK(a) spin_unlock(&a->gm_lock);
+#define GMNAL_CB_LOCK_INIT(a) spin_lock_init(&a->cb_lock);
-typedef struct {
- char *krx_buffer;
- unsigned long krx_len;
- unsigned int krx_size;
- unsigned int krx_priority;
- struct list_head krx_item;
-} kgmnal_rx_t;
+/*
+ * Memory Allocator
+ */
+
+/*
+ * API NAL
+ */
+int gmnal_api_forward(nal_t *, int, void *, size_t, void *, size_t);
+
+int gmnal_api_shutdown(nal_t *, int);
+
+int gmnal_api_validate(nal_t *, void *, size_t);
+
+void gmnal_api_yield(nal_t *);
+
+void gmnal_api_lock(nal_t *, unsigned long *);
+
+void gmnal_api_unlock(nal_t *, unsigned long *);
+
+
+#define GMNAL_INIT_NAL(a) do { \
+ a->forward = gmnal_api_forward; \
+ a->shutdown = gmnal_api_shutdown; \
+ a->validate = NULL; \
+ a->yield = gmnal_api_yield; \
+ a->lock = gmnal_api_lock; \
+ a->unlock = gmnal_api_unlock; \
+ a->timeout = NULL; \
+ a->refct = 1; \
+ a->nal_data = NULL; \
+ } while (0)
+
+
+/*
+ * CB NAL
+ */
+
+int gmnal_cb_send(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *,
+ int, ptl_nid_t, ptl_pid_t, unsigned int, struct iovec *, size_t);
+
+int gmnal_cb_send_pages(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *,
+ int, ptl_nid_t, ptl_pid_t, unsigned int, ptl_kiov_t *, size_t);
+
+int gmnal_cb_recv(nal_cb_t *, void *, lib_msg_t *,
+ unsigned int, struct iovec *, size_t, size_t);
+
+int gmnal_cb_recv_pages(nal_cb_t *, void *, lib_msg_t *,
+ unsigned int, ptl_kiov_t *, size_t, size_t);
+
+int gmnal_cb_read(nal_cb_t *, void *private, void *, user_ptr, size_t);
+
+int gmnal_cb_write(nal_cb_t *, void *private, user_ptr, void *, size_t);
+
+int gmnal_cb_callback(nal_cb_t *, void *, lib_eq_t *, ptl_event_t *);
+
+void *gmnal_cb_malloc(nal_cb_t *, size_t);
+
+void gmnal_cb_free(nal_cb_t *, void *, size_t);
+
+void gmnal_cb_unmap(nal_cb_t *, unsigned int, struct iovec*, void **);
+
+int gmnal_cb_map(nal_cb_t *, unsigned int, struct iovec*, void **);
+
+void gmnal_cb_printf(nal_cb_t *, const char *fmt, ...);
+
+void gmnal_cb_cli(nal_cb_t *, unsigned long *);
+
+void gmnal_cb_sti(nal_cb_t *, unsigned long *);
+
+int gmnal_cb_dist(nal_cb_t *, ptl_nid_t, unsigned long *);
+
+nal_t *gmnal_init(int, ptl_pt_index_t, ptl_ac_index_t, ptl_pid_t rpid);
+
+void gmnal_fini(void);
+
+
+
+#define GMNAL_INIT_NAL_CB(a) do { \
+ a->cb_send = gmnal_cb_send; \
+ a->cb_send_pages = gmnal_cb_send_pages; \
+ a->cb_recv = gmnal_cb_recv; \
+ a->cb_recv_pages = gmnal_cb_recv_pages; \
+ a->cb_read = gmnal_cb_read; \
+ a->cb_write = gmnal_cb_write; \
+ a->cb_callback = gmnal_cb_callback; \
+ a->cb_malloc = gmnal_cb_malloc; \
+ a->cb_free = gmnal_cb_free; \
+ a->cb_map = NULL; \
+ a->cb_unmap = NULL; \
+ a->cb_printf = gmnal_cb_printf; \
+ a->cb_cli = gmnal_cb_cli; \
+ a->cb_sti = gmnal_cb_sti; \
+ a->cb_dist = gmnal_cb_dist; \
+ a->nal_data = NULL; \
+ } while (0)
+
+
+/*
+ * Small Transmit and Receive Descriptor Functions
+ */
+int gmnal_alloc_stxd(gmnal_data_t *);
+void gmnal_free_stxd(gmnal_data_t *);
+gmnal_stxd_t* gmnal_get_stxd(gmnal_data_t *, int);
+void gmnal_return_stxd(gmnal_data_t *, gmnal_stxd_t *);
+
+int gmnal_alloc_srxd(gmnal_data_t *);
+void gmnal_free_srxd(gmnal_data_t *);
+gmnal_srxd_t* gmnal_get_srxd(gmnal_data_t *, int);
+void gmnal_return_srxd(gmnal_data_t *, gmnal_srxd_t *);
+
+/*
+ * general utility functions
+ */
+gmnal_srxd_t *gmnal_rxbuffer_to_srxd(gmnal_data_t *, void*);
+void gmnal_stop_rxthread(gmnal_data_t *);
+void gmnal_stop_ctthread(gmnal_data_t *);
+void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t);
+void gmnal_drop_sends_callback(gm_port_t *, void *, gm_status_t);
+char *gmnal_gm_error(gm_status_t);
+char *gmnal_rxevent(gm_recv_event_t*);
+int gmnal_is_small_msg(gmnal_data_t*, int, struct iovec*, int);
+void gmnal_yield(int);
+int gmnal_start_kernel_threads(gmnal_data_t *);
+
+
+/*
+ * Communication functions
+ */
+
+/*
+ * Receive threads
+ */
+int gmnal_ct_thread(void *); /* caretaker thread */
+int gmnal_rx_thread(void *); /* receive thread */
+int gmnal_pre_receive(gmnal_data_t*, gm_recv_t*, int);
+int gmnal_rx_bad(gmnal_data_t *, gm_recv_t *, gmnal_srxd_t *);
+int gmnal_rx_requeue_buffer(gmnal_data_t *, gmnal_srxd_t *);
+int gmnal_add_rxtwe(gmnal_data_t *, gm_recv_event_t *);
+gmnal_rxtwe_t * gmnal_get_rxtwe(gmnal_data_t *);
+void gmnal_remove_rxtwe(gmnal_data_t *);
+
+
+/*
+ * Small messages
+ */
+int gmnal_small_rx(nal_cb_t *, void *, lib_msg_t *, unsigned int,
+ struct iovec *, size_t, size_t);
+int gmnal_small_tx(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *,
+ int, ptl_nid_t, ptl_pid_t,
+ unsigned int, struct iovec*, int);
+void gmnal_small_tx_callback(gm_port_t *, void *, gm_status_t);
+
+
+
+/*
+ * Large messages
+ */
+int gmnal_large_rx(nal_cb_t *, void *, lib_msg_t *, unsigned int,
+ struct iovec *, size_t, size_t);
-typedef struct {
- nal_cb_t *ktx_nal;
- void *ktx_private;
- lib_msg_t *ktx_cookie;
- char *ktx_buffer;
- size_t ktx_len;
- unsigned long ktx_size;
- int ktx_ndx;
- unsigned int ktx_priority;
- unsigned int ktx_tgt_node;
- unsigned int ktx_tgt_port_id;
-} kgmnal_tx_t;
+int gmnal_large_tx(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *,
+ int, ptl_nid_t, ptl_pid_t, unsigned int,
+ struct iovec*, int);
+void gmnal_large_tx_callback(gm_port_t *, void *, gm_status_t);
-typedef struct {
- char kgm_init;
- char kgm_shuttingdown;
- struct gm_port *kgm_port;
- struct list_head kgm_list;
- ptl_nid_t kgm_nid;
- nal_cb_t *kgm_cb;
- struct kgm_trans *kgm_trans;
- struct tq_struct kgm_ready_tq;
- spinlock_t kgm_dispatch_lock;
- spinlock_t kgm_update_lock;
- spinlock_t kgm_send_lock;
-} kgmnal_data_t;
+int gmnal_remote_get(gmnal_srxd_t *, int, struct iovec*, int,
+ struct iovec*);
-int kgm_init(kgmnal_data_t *kgm_data);
-int kgmnal_recv_thread(void *);
-int gm_return_mynid(void);
-void kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
+void gmnal_remote_get_callback(gm_port_t *, void *, gm_status_t);
-extern kgmnal_data_t kgmnal_data;
-extern nal_t kgmnal_api;
-extern nal_cb_t kgmnal_lib;
+int gmnal_copyiov(int, gmnal_srxd_t *, int, struct iovec*, int,
+ struct iovec*);
-#endif /* _GMNAL_H */
+void gmnal_large_tx_ack(gmnal_data_t *, gmnal_srxd_t *);
+void gmnal_large_tx_ack_callback(gm_port_t *, void *, gm_status_t);
+void gmnal_large_tx_ack_received(gmnal_data_t *, gmnal_srxd_t *);
+#endif /*__INCLUDE_GMNAL_H__*/
/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*
- * Based on ksocknal and qswnal
+ * Copyright (c) 2003 Los Alamos National Laboratory (LANL)
*
- * Copyright (C) 2002 Cluster File Systems, Inc.
- * Author: Robert Read <rread@datarithm.net>
+ * This file is part of Lustre, http://www.lustre.org/
*
- * This file is part of Portals, http://www.sf.net/projects/sandiaportals/
- *
- * Portals is free software; you can redistribute it and/or
+ * Lustre is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
- * Portals is distributed in the hope that it will be useful,
+ * Lustre is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with Portals; if not, write to the Free Software
+ * along with Lustre; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-/* TODO
- * preallocate send buffers, store on list
- * put receive buffers on queue, handle with receive threads
- * use routing
- */
-
-#include "gmnal.h"
-
-extern kgmnal_rx_t *kgm_add_recv(kgmnal_data_t *,int);
-
-static kgmnal_tx_t *
-get_trans(void)
-{
- kgmnal_tx_t *t;
- PORTAL_ALLOC(t, (sizeof(kgmnal_tx_t)));
- return t;
-}
-
-static void
-put_trans(kgmnal_tx_t *t)
-{
- PORTAL_FREE(t, sizeof(kgmnal_tx_t));
-}
-
-int
-kgmnal_ispeer (ptl_nid_t nid)
-{
- unsigned int gmnid = (unsigned int)nid;
- unsigned int nnids;
-
- gm_max_node_id_in_use(kgmnal_data.kgm_port, &nnids);
-
- return ((ptl_nid_t)gmnid == nid &&/* didn't lose high bits on conversion ? */
- gmnid < nnids); /* it's in this machine */
-}
/*
- * LIB functions follow
- *
+ * This file implements the nal cb functions
*/
-static int
-kgmnal_read (nal_cb_t *nal, void *private, void *dst_addr, user_ptr src_addr,
- size_t len)
-{
- CDEBUG(D_NET, "0x%Lx: reading %ld bytes from %p -> %p\n",
- nal->ni.nid, (long)len, src_addr, dst_addr );
- memcpy( dst_addr, src_addr, len );
- return 0;
-}
-
-static int
-kgmnal_write(nal_cb_t *nal, void *private, user_ptr dst_addr, void *src_addr,
- size_t len)
-{
- CDEBUG(D_NET, "0x%Lx: writing %ld bytes from %p -> %p\n",
- nal->ni.nid, (long)len, src_addr, dst_addr );
- memcpy( dst_addr, src_addr, len );
- return 0;
-}
-static void *
-kgmnal_malloc(nal_cb_t *nal, size_t len)
-{
- void *buf;
- PORTAL_ALLOC(buf, len);
- return buf;
-}
+#include "gmnal.h"
-static void
-kgmnal_free(nal_cb_t *nal, void *buf, size_t len)
+int gmnal_cb_recv(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie,
+ unsigned int niov, struct iovec *iov, size_t mlen,
+ size_t rlen)
{
- PORTAL_FREE(buf, len);
+ gmnal_srxd_t *srxd = (gmnal_srxd_t*)private;
+ int status = PTL_OK;
+
+
+	CDEBUG(D_TRACE, "gmnal_cb_recv nal_cb [%p], private[%p], cookie[%p], "
+		"niov[%d], iov [%p], mlen["LPSZ"], rlen["LPSZ"]\n",
+		nal_cb, private, cookie, niov, iov, mlen, rlen);
+
+ switch(srxd->type) {
+ case(GMNAL_SMALL_MESSAGE):
+ CDEBUG(D_INFO, "gmnal_cb_recv got small message\n");
+ status = gmnal_small_rx(nal_cb, private, cookie, niov,
+ iov, mlen, rlen);
+ break;
+ case(GMNAL_LARGE_MESSAGE_INIT):
+ CDEBUG(D_INFO, "gmnal_cb_recv got large message init\n");
+ status = gmnal_large_rx(nal_cb, private, cookie, niov,
+ iov, mlen, rlen);
+ }
+
+
+ CDEBUG(D_INFO, "gmnal_cb_recv gmnal_return status [%d]\n", status);
+ return(status);
}
-static void
-kgmnal_printf(nal_cb_t *nal, const char *fmt, ...)
+int gmnal_cb_recv_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, 
+			 unsigned int kniov, ptl_kiov_t *kiov, size_t mlen, 
+			 size_t rlen)
+{
+	gmnal_srxd_t	*srxd = (gmnal_srxd_t*)private;
+	int		status = PTL_OK;
+	struct iovec	*iovec = NULL, *iovec_dup = NULL;
+	int		i = 0;
+
+
+	/* string literals concatenated: a raw newline inside "" is not valid C */
+	CDEBUG(D_TRACE, "gmnal_cb_recv_pages nal_cb [%p],private[%p], "
+	       "cookie[%p], kniov[%d], kiov [%p], mlen["LPSZ"], rlen["LPSZ"]\n",
+	       nal_cb, private, cookie, kniov, kiov, mlen, rlen);
+
+	if (srxd->type == GMNAL_SMALL_MESSAGE) {
+		PORTAL_ALLOC(iovec, sizeof(struct iovec)*kniov);
+		if (!iovec) {
+			CDEBUG(D_ERROR, "Can't malloc\n");
+			/* return a PTL_* code; callers don't understand
+			 * GMNAL_STATUS_* values */
+			return(PTL_FAIL);
+		}
+		iovec_dup = iovec;
+
+		/*
+		 *	map each page and create an iovec for it
+		 */
+		for (i=0; i<kniov; i++) {
+			CDEBUG(D_INFO, "processing kniov [%d] [%p]\n", i, kiov);
+			CDEBUG(D_INFO, "kniov page [%p] len [%d] offset[%d]\n",
+			       kiov->kiov_page, kiov->kiov_len, 
+			       kiov->kiov_offset);
+			iovec->iov_len = kiov->kiov_len;
+			CDEBUG(D_INFO, "Calling kmap[%p]", kiov->kiov_page);
+
+			iovec->iov_base = kmap(kiov->kiov_page) + 
+					  kiov->kiov_offset;
+
+			CDEBUG(D_INFO, "iov_base is [%p]\n", iovec->iov_base);
+			iovec++;
+			kiov++;
+		}
+		CDEBUG(D_INFO, "calling gmnal_small_rx\n");
+		status = gmnal_small_rx(nal_cb, private, cookie, kniov, 
+					iovec_dup, mlen, rlen);
+		/* drop the kmap()s taken above; kiov is one past the
+		 * last mapped page here, so walk it back */
+		for (i=0; i<kniov; i++) {
+			kiov--;
+			kunmap(kiov->kiov_page);
+		}
+		PORTAL_FREE(iovec_dup, sizeof(struct iovec)*kniov);
+	}
+
+
+	CDEBUG(D_INFO, "gmnal_return status [%d]\n", status);
+	return(status);
+}
-static void
-kgmnal_cli(nal_cb_t *nal, unsigned long *flags)
+int gmnal_cb_send(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, 
+		  ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, 
+		  unsigned int niov, struct iovec *iov, size_t len)
+{
+
+	gmnal_data_t	*nal_data;
+
+
+	CDEBUG(D_TRACE, "gmnal_cb_send niov[%d] len["LPSZ"] nid["LPU64"]\n", 
+	       niov, len, nid);
+	nal_data = nal_cb->nal_data;
+	
+	if (GMNAL_IS_SMALL_MESSAGE(nal_data, niov, iov, len)) {
+		CDEBUG(D_INFO, "This is a small message send\n");
+		gmnal_small_tx(nal_cb, private, cookie, hdr, type, nid, pid, 
+			       niov, iov, len);
+	} else {
+		/* TODO: call gmnal_large_tx() here once large message
+		 * sends are implemented; the old call after the return
+		 * below was unreachable dead code */
+		CDEBUG(D_ERROR, "Large message send it is not supported\n");
+		lib_finalize(nal_cb, private, cookie);
+		return(PTL_FAIL);
+	}
+	return(PTL_OK);
+}
-
-static void
-kgmnal_sti(nal_cb_t *nal, unsigned long *flags)
+int gmnal_cb_send_pages(nal_cb_t *nal_cb, void *private, lib_msg_t *cookie, 
+	ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, unsigned int kniov, ptl_kiov_t *kiov, size_t len)
+{
+
+	int	i = 0;
+	gmnal_data_t	*nal_data;
+	struct	iovec 	*iovec = NULL, *iovec_dup = NULL;
+
+	CDEBUG(D_TRACE, "gmnal_cb_send_pages nid ["LPU64"] niov[%d] len["LPSZ"]\n", nid, kniov, len);
+	nal_data = nal_cb->nal_data;
+
+	/* check the message class before allocating; the old code
+	 * allocated first and then leaked the iovec on the
+	 * large-message early return (and had dead code after it) */
+	if (!GMNAL_IS_SMALL_MESSAGE(nal_data, 0, NULL, len)) {
+		CDEBUG(D_ERROR, "Large message send it is not supported yet\n");
+		return(PTL_FAIL);
+	}
+
+	PORTAL_ALLOC(iovec, kniov*sizeof(struct iovec));
+	if (!iovec) {
+		/* allocation can fail; don't dereference NULL below */
+		CDEBUG(D_ERROR, "Can't malloc\n");
+		return(PTL_FAIL);
+	}
+	iovec_dup = iovec;
+
+	CDEBUG(D_INFO, "This is a small message send\n");
+
+	for (i=0; i<kniov; i++) {
+		CDEBUG(D_INFO, "processing kniov [%d] [%p]\n", i, kiov);
+		CDEBUG(D_INFO, "kniov page [%p] len [%d] offset[%d]\n",
+		       kiov->kiov_page, kiov->kiov_len, 
+		       kiov->kiov_offset);
+
+		iovec->iov_base = kmap(kiov->kiov_page) 
+				  + kiov->kiov_offset;
+
+		iovec->iov_len = kiov->kiov_len;
+		iovec++;
+		kiov++;
+	}
+	gmnal_small_tx(nal_cb, private, cookie, hdr, type, nid, 
+		       pid, kniov, iovec_dup, len);
+	/* drop the kmap()s taken above; kiov is one past the last
+	 * mapped page here, so walk it back */
+	for (i=0; i<kniov; i++) {
+		kiov--;
+		kunmap(kiov->kiov_page);
+	}
+	PORTAL_FREE(iovec_dup, kniov*sizeof(struct iovec));
+	return(PTL_OK);
+}
-
-static int
-kgmnal_dist(nal_cb_t *nal, ptl_nid_t nid, unsigned long *dist)
+int gmnal_cb_read(nal_cb_t *nal_cb, void *private, void *dst,
+ user_ptr src, size_t len)
{
- /* network distance doesn't mean much for this nal */
- if ( nal->ni.nid == nid ) {
- *dist = 0;
- } else {
- *dist = 1;
- }
-
- return 0;
+ gm_bcopy(src, dst, len);
+ return(PTL_OK);
}
-/* FIXME rmr: add rounting code here */
-static void
-kgmnal_tx_done(kgmnal_tx_t *trans, int error)
-{
- lib_finalize(trans->ktx_nal, trans->ktx_private, trans->ktx_cookie);
-
- gm_dma_free(kgmnal_data.kgm_port, trans->ktx_buffer);
-
- trans->ktx_buffer = NULL;
- trans->ktx_len = 0;
-
- put_trans(trans);
-}
-static char * gm_error_strings[GM_NUM_STATUS_CODES] = {
- [GM_SUCCESS] = "GM_SUCCESS",
- [GM_SEND_TIMED_OUT] = "GM_SEND_TIMED_OUT",
- [GM_SEND_REJECTED] = "GM_SEND_REJECTED",
- [GM_SEND_TARGET_PORT_CLOSED] = "GM_SEND_TARGET_PORT_CLOSED",
- [GM_SEND_TARGET_NODE_UNREACHABLE] = "GM_SEND_TARGET_NODE_UNREACHABLE",
- [GM_SEND_DROPPED] = "GM_SEND_DROPPED",
- [GM_SEND_PORT_CLOSED] = "GM_SEND_PORT_CLOSED",
-};
-
-inline char * get_error(int status)
+int gmnal_cb_write(nal_cb_t *nal_cb, void *private, user_ptr dst,
+ void *src, size_t len)
{
- if (gm_error_strings[status] != NULL)
- return gm_error_strings[status];
- else
- return "Unknown error";
+ gm_bcopy(src, dst, len);
+ return(PTL_OK);
}
-static void
-kgmnal_errhandler(struct gm_port *p, void *context, gm_status_t status)
+int gmnal_cb_callback(nal_cb_t *nal_cb, void *private, lib_eq_t *eq,
+ ptl_event_t *ev)
{
- CDEBUG(D_NET,"error callback: ktx %p status %d\n", context, status);
-}
-static void
-kgmnal_txhandler(struct gm_port *p, void *context, gm_status_t status)
-{
- kgmnal_tx_t *ktx = (kgmnal_tx_t *)context;
- int err = 0;
-
- LASSERT (p != NULL);
- LASSERT (ktx != NULL);
-
- CDEBUG(D_NET,"ktx %p status %d nid 0x%x pid %d\n", ktx, status,
- ktx->ktx_tgt_node, ktx->ktx_tgt_port_id);
-
- switch((int)status) {
- case GM_SUCCESS: /* normal */
- break;
- case GM_SEND_TIMED_OUT: /* application error */
- case GM_SEND_REJECTED: /* size of msg unacceptable */
- case GM_SEND_TARGET_PORT_CLOSED:
- CERROR("%s (%d):\n", get_error(status), status);
- gm_resume_sending(kgmnal_data.kgm_port, ktx->ktx_priority,
- ktx->ktx_tgt_node, ktx->ktx_tgt_port_id,
- kgmnal_errhandler, NULL);
- err = -EIO;
- break;
- case GM_SEND_TARGET_NODE_UNREACHABLE:
- case GM_SEND_PORT_CLOSED:
- CERROR("%s (%d):\n", get_error(status), status);
- gm_drop_sends(kgmnal_data.kgm_port, ktx->ktx_priority,
- ktx->ktx_tgt_node, ktx->ktx_tgt_port_id,
- kgmnal_errhandler, NULL);
- err = -EIO;
- break;
- case GM_SEND_DROPPED:
- CERROR("%s (%d):\n", get_error(status), status);
- err = -EIO;
- break;
- default:
- CERROR("Unknown status: %d\n", status);
- err = -EIO;
- break;
- }
-
- kgmnal_tx_done(ktx, err);
+ if (eq->event_callback != NULL) {
+ CDEBUG(D_INFO, "found callback\n");
+ eq->event_callback(ev);
+ }
+
+ return(PTL_OK);
}
-/*
- */
-
-static int
-kgmnal_send(nal_cb_t *nal,
- void *private,
- lib_msg_t *cookie,
- ptl_hdr_t *hdr,
- int type,
- ptl_nid_t nid,
- ptl_pid_t pid,
- int options,
- unsigned int niov,
- lib_md_iov_t *iov,
- size_t len)
+void *gmnal_cb_malloc(nal_cb_t *nal_cb, size_t len)
{
- /*
- * ipnal assumes that this is the private as passed to lib_dispatch..
- * so do we :/
- */
- kgmnal_tx_t *ktx=NULL;
- int rc=0;
- void * buf;
- int buf_len = sizeof(ptl_hdr_t) + len;
- int buf_size = 0;
-
- LASSERT ((options & PTL_MD_KIOV) == 0);
-
- PROF_START(gmnal_send);
-
-
- CDEBUG(D_NET, "sending %d bytes from %p to nid: 0x%Lx pid %d\n",
- len, iov, nid, KGM_PORT_NUM);
-
- /* ensure there is an available tx handle */
-
- /* save transaction info to trans for later finalize and cleanup */
- ktx = get_trans();
- if (ktx == NULL) {
- rc = -ENOMEM;
- goto send_exit;
- }
-
- /* hmmm... GM doesn't support vectored write, so need to allocate buffer to coalesce
- header and data.
- Also, memory must be dma'able or registered with GM. */
-
- if (buf_len <= MSG_LEN_SMALL) {
- buf_size = MSG_SIZE_SMALL;
- } else if (buf_len <= MSG_LEN_LARGE) {
- buf_size = MSG_SIZE_LARGE;
- } else {
- printk("kgmnal:request exceeds TX MTU size (%d).\n",
- MSG_SIZE_LARGE);
- rc = -1;
- goto send_exit;
- }
-
- buf = gm_dma_malloc(kgmnal_data.kgm_port, buf_len);
- if (buf == NULL) {
- rc = -ENOMEM;
- goto send_exit;
- }
- memcpy(buf, hdr, sizeof(ptl_hdr_t));
-
- if (len != 0)
- lib_copy_iov2buf(((char *)buf) + sizeof (ptl_hdr_t),
- options, niov, iov, len);
-
- ktx->ktx_nal = nal;
- ktx->ktx_private = private;
- ktx->ktx_cookie = cookie;
- ktx->ktx_len = buf_len;
- ktx->ktx_size = buf_size;
- ktx->ktx_buffer = buf;
- ktx->ktx_priority = GM_LOW_PRIORITY;
- ktx->ktx_tgt_node = nid;
- ktx->ktx_tgt_port_id = KGM_PORT_NUM;
-
- CDEBUG(D_NET, "gm_send %d bytes (size %d) from %p to nid: 0x%Lx "
- "pid %d pri %d\n", buf_len, buf_size, iov, nid, KGM_PORT_NUM,
- GM_LOW_PRIORITY);
-
- gm_send_with_callback(kgmnal_data.kgm_port, buf, buf_size,
- buf_len, GM_LOW_PRIORITY,
- nid, KGM_PORT_NUM,
- kgmnal_txhandler, ktx);
-
- PROF_FINISH(gmnal_send);
- send_exit:
- return rc;
-}
-void
-kgmnal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd)
-{
- CERROR ("forwarding not implemented\n");
+ void *ptr = NULL;
+ CDEBUG(D_TRACE, "gmnal_cb_malloc len["LPSZ"]\n", len);
+ PORTAL_ALLOC(ptr, len);
+ return(ptr);
}
-void
-kqswnal_fwd_callback (void *arg, int error)
+void gmnal_cb_free(nal_cb_t *nal_cb, void *buf, size_t len)
{
- CERROR ("forwarding not implemented\n");
+ CDEBUG(D_TRACE, "gmnal_cb_free :: buf[%p] len["LPSZ"]\n", buf, len);
+ PORTAL_FREE(buf, len);
+ return;
}
-
-static inline void
-kgmnal_requeue_rx(kgmnal_rx_t *krx)
+void gmnal_cb_unmap(nal_cb_t *nal_cb, unsigned int niov, struct iovec *iov,
+ void **addrkey)
{
- gm_provide_receive_buffer(kgmnal_data.kgm_port, krx->krx_buffer,
- krx->krx_size, krx->krx_priority);
+ return;
}
-/* Process a received portals packet */
-
-/* Receive Interrupt Handler */
-static void kgmnal_rx(kgmnal_data_t *kgm, unsigned long len, unsigned int size,
- void * buf, unsigned int pri)
+int gmnal_cb_map(nal_cb_t *nal_cb, unsigned int niov, struct iovec *iov,
+ void**addrkey)
{
- ptl_hdr_t *hdr = buf;
- kgmnal_rx_t krx;
-
- CDEBUG(D_NET,"buf %p, len %ld\n", buf, len);
-
- if ( len < sizeof( ptl_hdr_t ) ) {
- /* XXX what's this for? */
- if (kgm->kgm_shuttingdown)
- return;
- CERROR("kgmnal: did not receive complete portal header, "
- "len= %ld", len);
- gm_provide_receive_buffer(kgm->kgm_port, buf, size, pri);
- return;
- }
-
- /* might want to use seperate threads to handle receive */
- krx.krx_buffer = buf;
- krx.krx_len = len;
- krx.krx_size = size;
- krx.krx_priority = pri;
-
- if ( hdr->dest_nid == kgmnal_lib.ni.nid ) {
- PROF_START(lib_parse);
- lib_parse(&kgmnal_lib, (ptl_hdr_t *)krx.krx_buffer, &krx);
- PROF_FINISH(lib_parse);
- } else if (kgmnal_ispeer(hdr->dest_nid)) {
- /* should have gone direct to peer */
- CERROR("dropping packet from 0x%llx to 0x%llx: target is "
- "a peer", hdr->src_nid, hdr->dest_nid);
- kgmnal_requeue_rx(&krx);
- } else {
- /* forward to gateway */
- CERROR("forwarding not implemented yet");
- kgmnal_requeue_rx(&krx);
- }
-
- return;
+ return(PTL_OK);
}
-
-static int kgmnal_recv(nal_cb_t *nal,
- void *private,
- lib_msg_t *cookie,
- int options,
- unsigned int niov,
- lib_md_iov_t *iov,
- size_t mlen,
- size_t rlen)
+void gmnal_cb_printf(nal_cb_t *nal_cb, const char *fmt, ...)
{
- kgmnal_rx_t *krx = private;
-
- LASSERT ((options & PTL_MD_KIOV) == 0);
-
- CDEBUG(D_NET,"mlen=%d, rlen=%d\n", mlen, rlen);
-
- /* What was actually received must be >= what sender claims to
- * have sent. This is an LASSERT, since lib-move doesn't
- * check cb return code yet. */
- LASSERT (krx->krx_len >= sizeof (ptl_hdr_t) + rlen);
- LASSERT (mlen <= rlen);
-
- PROF_START(gmnal_recv);
-
- if(mlen != 0) {
- PROF_START(memcpy);
- lib_copy_buf2iov (options, niov, iov,
- krx->krx_buffer + sizeof (ptl_hdr_t), mlen);
- PROF_FINISH(memcpy);
- }
-
- PROF_START(lib_finalize);
- lib_finalize(nal, private, cookie);
- PROF_FINISH(lib_finalize);
-
- kgmnal_requeue_rx(krx);
-
- PROF_FINISH(gmnal_recv);
-
- return rlen;
+	va_list ap;
+	char msg[256];
+
+	CDEBUG(D_TRACE, "gmnal_cb_printf\n");
+	/* actually consume the varargs; passing fmt straight to
+	 * printk() dropped the arguments and treated caller data as a
+	 * format string */
+	va_start(ap, fmt);
+	vsnprintf(msg, sizeof(msg), fmt, ap);
+	va_end(ap);
+	printk("%s", msg);
+	return;
}
-
-static void kgmnal_shutdown(void * none)
+void gmnal_cb_cli(nal_cb_t *nal_cb, unsigned long *flags)
{
- CERROR("called\n");
- return;
+ gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data;
+
+ spin_lock_irqsave(&nal_data->cb_lock, *flags);
+ return;
}
-/*
- * Set terminate and use alarm to wake up the recv thread.
- */
-static void recv_shutdown(kgmnal_data_t *kgm)
+void gmnal_cb_sti(nal_cb_t *nal_cb, unsigned long *flags)
{
- gm_alarm_t alarm;
+ gmnal_data_t *nal_data = (gmnal_data_t*)nal_cb->nal_data;
- kgm->kgm_shuttingdown = 1;
- gm_initialize_alarm(&alarm);
- gm_set_alarm(kgm->kgm_port, &alarm, 1, kgmnal_shutdown, NULL);
+ spin_unlock_irqrestore(&nal_data->cb_lock, *flags);
+ return;
}
-int kgmnal_end(kgmnal_data_t *kgm)
+int gmnal_cb_dist(nal_cb_t *nal_cb, ptl_nid_t nid, unsigned long *dist)
{
+ CDEBUG(D_TRACE, "gmnal_cb_dist\n");
+ if (dist)
+ *dist = 27;
+ return(PTL_OK);
+}
- /* wait for sends to finish ? */
- /* remove receive buffers */
- /* shutdown receive thread */
- recv_shutdown(kgm);
- return 0;
-}
-
-/* Used only for the spinner */
-int kgmnal_recv_thread(void *arg)
-{
- kgmnal_data_t *kgm = arg;
-
- LASSERT(kgm != NULL);
-
- kportal_daemonize("kgmnal_rx");
-
- while(1) {
- gm_recv_event_t *e;
- int priority = GM_LOW_PRIORITY;
- if (kgm->kgm_shuttingdown)
- break;
-
- e = gm_blocking_receive_no_spin(kgm->kgm_port);
- if (e == NULL) {
- CERROR("gm_blocking_receive returned NULL\n");
- break;
- }
-
- switch(gm_ntohc(e->recv.type)) {
- case GM_HIGH_RECV_EVENT:
- priority = GM_HIGH_PRIORITY;
- /* fall through */
- case GM_RECV_EVENT:
- kgmnal_rx(kgm, gm_ntohl(e->recv.length),
- gm_ntohc(e->recv.size),
- gm_ntohp(e->recv.buffer), priority);
- break;
- case GM_ALARM_EVENT:
- CERROR("received alarm");
- gm_unknown(kgm->kgm_port, e);
- break;
- case GM_BAD_SEND_DETECTED_EVENT: /* ?? */
- CERROR("received bad send!\n");
- break;
- default:
- gm_unknown(kgm->kgm_port, e);
- }
- }
-
- CERROR("shuttting down.\n");
- return 0;
-}
-nal_cb_t kgmnal_lib = {
- nal_data: &kgmnal_data, /* NAL private data */
- cb_send: kgmnal_send,
- cb_recv: kgmnal_recv,
- cb_read: kgmnal_read,
- cb_write: kgmnal_write,
- cb_malloc: kgmnal_malloc,
- cb_free: kgmnal_free,
- cb_printf: kgmnal_printf,
- cb_cli: kgmnal_cli,
- cb_sti: kgmnal_sti,
- cb_dist: kgmnal_dist
-};
+EXPORT_SYMBOL(gmnal_cb_send);
+EXPORT_SYMBOL(gmnal_cb_send_pages);
+EXPORT_SYMBOL(gmnal_cb_recv);
+EXPORT_SYMBOL(gmnal_cb_recv_pages);
+EXPORT_SYMBOL(gmnal_cb_read);
+EXPORT_SYMBOL(gmnal_cb_write);
+EXPORT_SYMBOL(gmnal_cb_cli);
+EXPORT_SYMBOL(gmnal_cb_sti);
+EXPORT_SYMBOL(gmnal_cb_dist);
+EXPORT_SYMBOL(gmnal_cb_printf);
+EXPORT_SYMBOL(gmnal_cb_map);
+EXPORT_SYMBOL(gmnal_cb_unmap);
+EXPORT_SYMBOL(gmnal_cb_callback);
+EXPORT_SYMBOL(gmnal_cb_free);
+EXPORT_SYMBOL(gmnal_cb_malloc);
kprni_nalid: QSWNAL,
kprni_arg: NULL,
kprni_fwd: kqswnal_fwd_packet,
+ kprni_notify: NULL, /* we're connectionless */
};
CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&portal_kmemory));
- printk (KERN_INFO "Routing QSW NAL unloaded (final mem %d)\n",
+ printk (KERN_INFO "Lustre: Routing QSW NAL unloaded (final mem %d)\n",
atomic_read(&portal_kmemory));
}
PORTAL_SYMBOL_REGISTER(kqswnal_ni);
kqswnal_data.kqn_init = KQN_INIT_ALL;
- printk(KERN_INFO "Routing QSW NAL loaded on node %d of %d "
+ printk(KERN_INFO "Lustre: Routing QSW NAL loaded on node %d of %d "
"(Routing %s, initial mem %d)\n",
kqswnal_data.kqn_elanid, kqswnal_data.kqn_nnodes,
kpr_routing (&kqswnal_data.kqn_router) ? "enabled" : "disabled",
void *ktx_args[2]; /* completion passthru */
E3_Addr ktx_ebuffer; /* elan address of ktx_buffer */
char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */
+ unsigned long ktx_launchtime; /* when (in jiffies) the transmit was launched */
/* debug/info fields */
pid_t ktx_launcher; /* pid of launching process */
}
void
+kqswnal_notify_peer_down(kqswnal_tx_t *ktx)
+{
+ struct timeval now;
+ time_t then;
+
+ do_gettimeofday (&now);
+ then = now.tv_sec - (jiffies - ktx->ktx_launchtime)/HZ;
+
+ kpr_notify(&kqswnal_data.kqn_router, ktx->ktx_nid, 0, then);
+}
+
+void
kqswnal_unmap_tx (kqswnal_tx_t *ktx)
{
if (ktx->ktx_nmappedpages == 0)
if (status != EP_SUCCESS)
{
- CERROR ("kqswnal: Transmit failed with %d\n", status);
+ CERROR ("Tx completion to "LPX64" failed: %d\n",
+ ktx->ktx_nid, status);
+
+ kqswnal_notify_peer_down(ktx);
status = -EIO;
} else if (ktx->ktx_state == KTX_GETTING) {
int dest = kqswnal_nid2elanid (ktx->ktx_nid);
long flags;
int rc;
-
+
+ ktx->ktx_launchtime = jiffies;
+
LASSERT (dest >= 0); /* must be a peer */
if (ktx->ktx_state == KTX_GETTING) {
LASSERT (KQSW_OPTIMIZE_GETS);
ktx, ktx->ktx_frags.iov, ktx->ktx_nfrag);
}
- if (rc != ENOMEM)
- return (rc);
-
- /* can't allocate ep txd => queue for later */
+ switch (rc) {
+ case ESUCCESS: /* success */
+ return (0);
- LASSERT (in_interrupt()); /* not called by thread (not looping) */
+ case ENOMEM: /* can't allocate ep txd => queue for later */
+ LASSERT (in_interrupt());
- spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
+ spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags);
- list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds);
- if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
- wake_up (&kqswnal_data.kqn_sched_waitq);
+ list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds);
+ if (waitqueue_active (&kqswnal_data.kqn_sched_waitq))
+ wake_up (&kqswnal_data.kqn_sched_waitq);
- spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+ spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags);
+ return (0);
- return (0);
+ default: /* fatal error */
+ CERROR ("Tx to "LPX64" failed: %d\n", ktx->ktx_nid, rc);
+ kqswnal_notify_peer_down(ktx);
+ return (rc);
+ }
}
-
static char *
hdr_type_string (ptl_hdr_t *hdr)
{
targetnid = nid;
if (kqswnal_nid2elanid (nid) < 0) { /* Can't send direct: find gateway? */
- rc = kpr_lookup (&kqswnal_data.kqn_router, nid, &targetnid);
+ rc = kpr_lookup (&kqswnal_data.kqn_router, nid,
+ sizeof (ptl_hdr_t) + payload_nob, &targetnid);
if (rc != 0) {
CERROR("Can't route to "LPX64": router error %d\n",
nid, rc);
#if KQSW_OPTIMIZE_GETS
if (type == PTL_MSG_REPLY &&
ep_rxd_isrpc(((kqswnal_rx_t *)private)->krx_rxd)) {
+ if (nid != targetnid ||
+ kqswnal_nid2elanid(nid) !=
+ ep_rxd_node(((kqswnal_rx_t *)private)->krx_rxd)) {
+ CERROR("Optimized reply nid conflict: "
+ "nid "LPX64" via "LPX64" elanID %d\n",
+ nid, targetnid,
+ ep_rxd_node(((kqswnal_rx_t *)private)->krx_rxd));
+ return(PTL_FAIL);
+ }
+
/* peer expects RPC completion with GET data */
rc = kqswnal_dma_reply (ktx,
payload_niov, payload_iov,
return (PTL_FAIL);
}
- CDEBUG(D_NET, "send to "LPSZ" bytes to "LPX64"\n", payload_nob, targetnid);
+ CDEBUG(D_NET, "sent "LPSZ" bytes to "LPX64" via "LPX64"\n",
+ payload_nob, nid, targetnid);
return (PTL_OK);
}
vsnprintf( msg, sizeof(msg), fmt, ap );
va_end( ap );
- printk("CPUId: %d %s",smp_processor_id(), msg);
+ printk("Lustre: CPUId: %d %s",smp_processor_id(), msg);
}
}
kprni_nalid: SOCKNAL,
kprni_arg: &ksocknal_data,
kprni_fwd: ksocknal_fwd_packet,
+ kprni_notify: ksocknal_notify,
};
+#define SOCKNAL_SYSCTL	200
+
+#define SOCKNAL_SYSCTL_TIMEOUT     1
+#define SOCKNAL_SYSCTL_EAGER_ACK   2
+#define SOCKNAL_SYSCTL_ZERO_COPY   3
+
+static ctl_table ksocknal_ctl_table[] = {
+	{SOCKNAL_SYSCTL_TIMEOUT, "timeout", 
+	 &ksocknal_data.ksnd_io_timeout, sizeof (int),
+	 0644, NULL, &proc_dointvec},
+	{SOCKNAL_SYSCTL_EAGER_ACK, "eager_ack", 
+	 &ksocknal_data.ksnd_eager_ack, sizeof (int),
+	 0644, NULL, &proc_dointvec},
+#if SOCKNAL_ZC
+	/* use the ZERO_COPY id; reusing EAGER_ACK here created two
+	 * entries with the same binary sysctl id */
+	{SOCKNAL_SYSCTL_ZERO_COPY, "zero_copy", 
+	 &ksocknal_data.ksnd_zc_min_frag, sizeof (int),
+	 0644, NULL, &proc_dointvec},
+#endif
+	{ 0 }
+};
+
+static ctl_table ksocknal_top_ctl_table[] = {
+	{SOCKNAL_SYSCTL, "socknal", NULL, 0, 0555, ksocknal_ctl_table},
+	{ 0 }
+};
int
ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len,
snprintf (cmdline, sizeof (cmdline),
"echo %d > /proc/irq/%u/smp_affinity", 1 << info->ksni_sched, irq);
- printk (KERN_INFO "Binding irq %u to CPU %d with cmd: %s\n",
+ printk (KERN_INFO "Lustre: Binding irq %u to CPU %d with cmd: %s\n",
irq, info->ksni_sched, cmdline);
/* FIXME: Find a better method of setting IRQ affinity...
ksock_route_t *
ksocknal_create_route (__u32 ipaddr, int port, int buffer_size,
- int irq_affinity, int xchange_nids, int nonagel)
+ int nonagel, int xchange_nids, int irq_affinity, int eager)
{
ksock_route_t *route;
atomic_set (&route->ksnr_refcount, 1);
route->ksnr_sharecount = 0;
route->ksnr_peer = NULL;
- route->ksnr_timeout = jiffies_64;
+ route->ksnr_timeout = jiffies;
route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL;
route->ksnr_ipaddr = ipaddr;
route->ksnr_port = port;
route->ksnr_irq_affinity = irq_affinity;
route->ksnr_xchange_nids = xchange_nids;
route->ksnr_nonagel = nonagel;
+ route->ksnr_eager = eager;
route->ksnr_connecting = 0;
route->ksnr_deleted = 0;
route->ksnr_generation = 0;
int
ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob,
- int nonagle, int xchange_nids, int bind_irq, int share)
+ int nonagle, int xchange_nids, int bind_irq,
+ int share, int eager)
{
unsigned long flags;
ksock_peer_t *peer;
if (peer == NULL)
return (-ENOMEM);
- route = ksocknal_create_route (ipaddr, port, bufnob,
- nonagle, xchange_nids, bind_irq);
+ route = ksocknal_create_route (ipaddr, port, bufnob, nonagle,
+ xchange_nids, bind_irq, eager);
if (route == NULL) {
ksocknal_put_peer (peer);
return (-ENOMEM);
if (conn != NULL) {
if (!keep_conn)
- ksocknal_close_conn_locked (conn);
+ ksocknal_close_conn_locked (conn, 0);
else {
/* keeping the conn; just dissociate it and route... */
conn->ksnc_route = NULL;
struct sockaddr_in sin;
int len = sizeof (sin);
int rc;
-
- rc = ksocknal_getconnsock (conn);
- LASSERT (rc == 0);
rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock,
(struct sockaddr *)&sin, &len, 2);
+ /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+ LASSERT (!conn->ksnc_closing);
LASSERT (len <= sizeof (sin));
- ksocknal_putconnsock (conn);
if (rc != 0) {
CERROR ("Error %d getting sock peer IP\n", rc);
ksocknal_conn_irq (ksock_conn_t *conn)
{
int irq = 0;
- int rc;
struct dst_entry *dst;
- rc = ksocknal_getconnsock (conn);
- LASSERT (rc == 0);
-
dst = sk_dst_get (conn->ksnc_sock->sk);
if (dst != NULL) {
if (dst->dev != NULL) {
dst_release (dst);
}
- ksocknal_putconnsock (conn);
+ /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+ LASSERT (!conn->ksnc_closing);
return (irq);
}
int rc;
/* NB, sock has an associated file since (a) this connection might
- * have been created in userland and (b) we need the refcounting so
- * that we don't close the socket while I/O is being done on it. */
+ * have been created in userland and (b) we need to refcount the
+ * socket so that we don't close it while I/O is being done on
+ * it, and sock->file has that pre-cooked... */
LASSERT (sock->file != NULL);
+ LASSERT (file_count(sock->file) > 0);
- rc = ksocknal_set_linger (sock);
+ rc = ksocknal_setup_sock (sock);
if (rc != 0)
return (rc);
ksocknal_new_packet (conn, 0);
INIT_LIST_HEAD (&conn->ksnc_tx_queue);
-#if SOCKNAL_ZC
- INIT_LIST_HEAD (&conn->ksnc_tx_pending);
-#endif
conn->ksnc_tx_ready = 0;
conn->ksnc_tx_scheduled = 0;
atomic_set (&conn->ksnc_tx_nob, 0);
conn->ksnc_peer = peer;
atomic_inc (&peer->ksnp_refcount);
+ peer->ksnp_last_alive = jiffies;
+ peer->ksnp_error = 0;
list_add (&conn->ksnc_list, &peer->ksnp_conns);
atomic_inc (&conn->ksnc_refcount);
}
void
-ksocknal_close_conn_locked (ksock_conn_t *conn)
+ksocknal_close_conn_locked (ksock_conn_t *conn, int error)
{
/* This just does the immmediate housekeeping, and queues the
* connection for the reaper to terminate.
ksock_peer_t *peer = conn->ksnc_peer;
ksock_route_t *route;
+ LASSERT (peer->ksnp_error == 0);
LASSERT (!conn->ksnc_closing);
conn->ksnc_closing = 1;
atomic_inc (&ksocknal_data.ksnd_nclosing_conns);
/* ksnd_deathrow_conns takes over peer's ref */
list_del (&conn->ksnc_list);
- if (list_empty (&peer->ksnp_conns) &&
- list_empty (&peer->ksnp_routes)) {
- /* I've just closed last conn belonging to a
- * non-autoconnecting peer */
- ksocknal_unlink_peer_locked (peer);
+ if (list_empty (&peer->ksnp_conns)) {
+ /* No more connections to this peer */
+
+ peer->ksnp_error = error; /* stash last conn close reason */
+
+ if (list_empty (&peer->ksnp_routes)) {
+ /* I've just closed last conn belonging to a
+ * non-autoconnecting peer */
+ ksocknal_unlink_peer_locked (peer);
+ }
}
spin_lock (&ksocknal_data.ksnd_reaper_lock);
}
int
-ksocknal_close_conn_unlocked (ksock_conn_t *conn)
+ksocknal_close_conn_unlocked (ksock_conn_t *conn, int why)
{
unsigned long flags;
int did_it = 0;
if (!conn->ksnc_closing) {
did_it = 1;
- ksocknal_close_conn_locked (conn);
+ ksocknal_close_conn_locked (conn, why);
}
write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
* ksnc_refcount will eventually hit zero, and then the reaper will
* destroy it. */
unsigned long flags;
+ ksock_peer_t *peer = conn->ksnc_peer;
+ struct timeval now;
+ time_t then = 0;
+ int notify = 0;
/* serialise with callbacks */
write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags);
conn->ksnc_scheduler->kss_nconns--;
+ if (peer->ksnp_error != 0) {
+ /* peer's last conn closed in error */
+ LASSERT (list_empty (&peer->ksnp_conns));
+
+ /* convert peer's last-known-alive timestamp from jiffies */
+ do_gettimeofday (&now);
+ then = now.tv_sec - (jiffies - peer->ksnp_last_alive)/HZ;
+ notify = 1;
+ }
+
write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags);
/* The socket is closed on the final put; either here, or in
* immediately, aborting anything buffered in it. Any hung
* zero-copy transmits will therefore complete in finite time. */
ksocknal_putconnsock (conn);
+
+ if (notify)
+ kpr_notify (&ksocknal_data.ksnd_router, peer->ksnp_nid,
+ 0, then);
}
void
LASSERT (conn->ksnc_route == NULL);
LASSERT (!conn->ksnc_tx_scheduled);
LASSERT (!conn->ksnc_rx_scheduled);
-#if SOCKNAL_ZC
- LASSERT (list_empty (&conn->ksnc_tx_pending));
-#endif
+
/* complete queued packets */
while (!list_empty (&conn->ksnc_tx_queue)) {
ksock_tx_t *tx = list_entry (conn->ksnc_tx_queue.next,
continue;
rc = 0;
- ksocknal_close_conn_locked (conn);
+ ksocknal_close_conn_locked (conn, 0);
}
}
}
return (rc);
}
+void
+ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive)
+{
+ /* The router is telling me she's been notified of a change in
+ * gateway state.... */
+
+ CDEBUG (D_NET, "gw "LPX64" %s\n", gw_nid, alive ? "up" : "down");
+
+ if (!alive) {
+ /* If the gateway crashed, close all open connections... */
+ ksocknal_close_conn (gw_nid, 0);
+ return;
+ }
+
+ /* ...otherwise do nothing. We can only establish new connections
+ * if we have autroutes, and these connect on demand. */
+}
+
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
struct tcp_opt *sock2tcp_opt(struct sock *sk)
{
data->ioc_wait = route->ksnr_sharecount;
data->ioc_flags = (route->ksnr_nonagel ? 1 : 0) |
(route->ksnr_xchange_nids ? 2 : 0) |
- (route->ksnr_irq_affinity ? 4 : 0);
+ (route->ksnr_irq_affinity ? 4 : 0) |
+ (route->ksnr_eager ? 8 : 0);
ksocknal_put_route (route);
}
break;
case NAL_CMD_ADD_AUTOCONN: {
rc = ksocknal_add_route (data->ioc_nid, data->ioc_id,
data->ioc_misc, data->ioc_size,
- (data->ioc_flags & 1) != 0,
- (data->ioc_flags & 2) != 0,
- (data->ioc_flags & 4) != 0,
- (data->ioc_flags & 8) != 0);
+ (data->ioc_flags & 0x01) != 0,
+ (data->ioc_flags & 0x02) != 0,
+ (data->ioc_flags & 0x04) != 0,
+ (data->ioc_flags & 0x08) != 0,
+ (data->ioc_flags & 0x10) != 0);
break;
}
case NAL_CMD_DEL_AUTOCONN: {
LASSERT (0);
case SOCKNAL_INIT_ALL:
+#ifdef CONFIG_SYSCTL
+		if (ksocknal_data.ksnd_sysctl != NULL)
+			unregister_sysctl_table (ksocknal_data.ksnd_sysctl);
+#endif
kportal_nal_unregister(SOCKNAL);
PORTAL_SYMBOL_UNREGISTER (ksocknal_ni);
/* fall through */
CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
atomic_read (&portal_kmemory));
- printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n",
+ printk(KERN_INFO "Lustre: Routing socket NAL unloaded (final mem %d)\n",
atomic_read(&portal_kmemory));
}
/* packet descriptor must fit in a router descriptor's scratchpad */
LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t));
+ /* the following must be sizeof(int) for proc_dointvec() */
+ LASSERT(sizeof (ksocknal_data.ksnd_io_timeout) == sizeof (int));
+ LASSERT(sizeof (ksocknal_data.ksnd_eager_ack) == sizeof (int));
LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING);
memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */
+ ksocknal_data.ksnd_io_timeout = SOCKNAL_IO_TIMEOUT;
+ ksocknal_data.ksnd_eager_ack = SOCKNAL_EAGER_ACK;
+#if SOCKNAL_ZC
+ ksocknal_data.ksnd_zc_min_frag = SOCKNAL_ZC_MIN_FRAG;
+#endif
+
ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE;
PORTAL_ALLOC (ksocknal_data.ksnd_peers,
sizeof (struct list_head) * ksocknal_data.ksnd_peer_hash_size);
PORTAL_SYMBOL_REGISTER(ksocknal_ni);
+#ifdef CONFIG_SYSCTL
+ /* Press on regardless even if registering sysctl doesn't work */
+ ksocknal_data.ksnd_sysctl = register_sysctl_table (ksocknal_top_ctl_table, 0);
+#endif
/* flag everything initialised */
ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL;
- printk(KERN_INFO "Routing socket NAL loaded (Routing %s, initial "
- "mem %d)\n",
+ printk(KERN_INFO "Lustre: Routing socket NAL loaded "
+ "(Routing %s, initial mem %d)\n",
kpr_routing (&ksocknal_data.ksnd_router) ?
"enabled" : "disabled", pkmem);
#include <linux/stat.h>
#include <linux/list.h>
#include <linux/kmod.h>
+#include <linux/sysctl.h>
#include <asm/uaccess.h>
#include <asm/segment.h>
#include <asm/div64.h>
#define SOCKNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */
#define SOCKNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */
-#define SOCKNAL_IO_TIMEOUT (60*HZ) /* default comms timeout */
+/* default vals for runtime tunables */
+#define SOCKNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */
+#define SOCKNAL_EAGER_ACK 1 /* default eager ack (boolean) */
+#define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */
+
+#define SOCKNAL_USE_KEEPALIVES 0 /* use tcp/ip keepalive? */
#define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */
# define SOCKNAL_MAX_FWD_PAYLOAD (64<<10) /* biggest payload I can forward */
#endif
-#define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */
-
#define SOCKNAL_NLTXS 128 /* # normal transmit messages */
#define SOCKNAL_NNBLK_LTXS 128 /* # transmit messages reserved if can't block */
#define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sk_sndbuf*8)/10)
-#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
-# define jiffies_64 jiffies
-#endif
-
#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72))
# define sk_data_ready data_ready
# define sk_write_space write_space
typedef struct {
int ksnd_init; /* initialisation state */
+ int ksnd_io_timeout; /* "stuck" socket timeout (seconds) */
+ int ksnd_eager_ack; /* make TCP ack eagerly? */
+#if SOCKNAL_ZC
+ unsigned int ksnd_zc_min_frag; /* minimum zero copy frag size */
+#endif
+ struct ctl_table_header *ksnd_sysctl; /* sysctl interface */
rwlock_t ksnd_global_lock; /* stabilize peer/conn ops */
struct list_head *ksnd_peers; /* hash table of all my known peers */
typedef struct /* transmit packet */
{
struct list_head tx_list; /* queue on conn for transmission etc */
- __u64 tx_deadline; /* when (in jiffies) tx times out */
char tx_isfwd; /* forwarding / sourced here */
int tx_nob; /* # packet bytes */
int tx_resid; /* residual bytes */
__u32 ksnc_ipaddr; /* peer's IP */
int ksnc_port; /* peer's port */
int ksnc_closing; /* being shut down */
-
+
/* READER */
struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */
- __u64 ksnc_rx_deadline; /* when receive times out */
+ unsigned long ksnc_rx_deadline; /* when (in jiffies) receive times out */
+ int ksnc_rx_started; /* started receiving a message */
int ksnc_rx_ready; /* data ready to read */
int ksnc_rx_scheduled; /* being progressed */
int ksnc_rx_state; /* what is being read */
/* WRITER */
struct list_head ksnc_tx_list; /* where I enq waiting for output space */
struct list_head ksnc_tx_queue; /* packets waiting to be sent */
-#if SOCKNAL_ZC
- struct list_head ksnc_tx_pending; /* zc packets pending callback */
-#endif
+ unsigned long ksnc_tx_deadline; /* when (in jiffies) tx times out */
atomic_t ksnc_tx_nob; /* # bytes queued */
int ksnc_tx_ready; /* write space */
int ksnc_tx_scheduled; /* being progressed */
struct ksock_peer *ksnr_peer; /* owning peer */
atomic_t ksnr_refcount; /* # users */
int ksnr_sharecount; /* lconf usage counter */
- __u64 ksnr_timeout; /* when reconnection can happen next */
+ unsigned long ksnr_timeout; /* when (in jiffies) reconnection can happen next */
unsigned int ksnr_retry_interval; /* how long between retries */
__u32 ksnr_ipaddr; /* an IP address for this peer */
int ksnr_port; /* port to connect to */
unsigned int ksnr_irq_affinity:1; /* set affinity? */
unsigned int ksnr_xchange_nids:1; /* do hello protocol? */
unsigned int ksnr_nonagel:1; /* disable nagle? */
- unsigned int ksnr_connecting; /* autoconnect in progress? */
- unsigned int ksnr_deleted; /* been removed from peer? */
+        unsigned int        ksnr_eager:1;       /* connect eagerly? */
+ unsigned int ksnr_connecting:1; /* autoconnect in progress? */
+ unsigned int ksnr_deleted:1; /* been removed from peer? */
int ksnr_generation; /* connection incarnation # */
ksock_conn_t *ksnr_conn; /* NULL/active connection */
} ksock_route_t;
ptl_nid_t ksnp_nid; /* who's on the other end(s) */
atomic_t ksnp_refcount; /* # users */
int ksnp_closing; /* being closed */
+ int ksnp_error; /* errno on closing last conn */
struct list_head ksnp_conns; /* all active connections */
struct list_head ksnp_routes; /* routes */
struct list_head ksnp_tx_queue; /* waiting packets */
+ unsigned long ksnp_last_alive; /* when (in jiffies) I was last alive */
} ksock_peer_t;
-
extern nal_cb_t ksocknal_lib;
extern ksock_nal_data_t ksocknal_data;
int single, int keep_conn);
extern int ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route,
struct socket *sock, int bind_irq);
-extern void ksocknal_close_conn_locked (ksock_conn_t *conn);
-extern int ksocknal_close_conn_unlocked (ksock_conn_t *conn);
+extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why);
+extern int ksocknal_close_conn_unlocked (ksock_conn_t *conn, int why);
extern void ksocknal_terminate_conn (ksock_conn_t *conn);
extern void ksocknal_destroy_conn (ksock_conn_t *conn);
extern void ksocknal_put_conn (ksock_conn_t *conn);
extern void ksocknal_tx_done (ksock_tx_t *tx, int asynch);
extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd);
extern void ksocknal_fmb_callback (void *arg, int error);
+extern void ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive);
extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg);
extern int ksocknal_new_packet (ksock_conn_t *conn, int skip);
extern int ksocknal_scheduler (void *arg);
extern void ksocknal_write_space(struct sock *sk);
extern int ksocknal_autoconnectd (void *arg);
extern int ksocknal_reaper (void *arg);
-extern int ksocknal_set_linger (struct socket *sock);
+extern int ksocknal_setup_sock (struct socket *sock);
#include "socknal.h"
-int ksocknal_io_timeout = SOCKNAL_IO_TIMEOUT;
-#if SOCKNAL_ZC
-int ksocknal_do_zc = 1;
-int ksocknal_zc_min_frag = SOCKNAL_ZC_MIN_FRAG;
-#endif
-
/*
* LIB functions follow
*
struct iovec *iov = tx->tx_iov;
int fragsize = iov->iov_len;
unsigned long vaddr = (unsigned long)iov->iov_base;
- int more = !list_empty (&conn->ksnc_tx_queue) |
+ int more = (!list_empty (&conn->ksnc_tx_queue)) |
(tx->tx_niov > 1) |
(tx->tx_nkiov > 1);
#if SOCKNAL_ZC
LASSERT (tx->tx_niov > 0);
#if SOCKNAL_ZC
- if (ksocknal_do_zc &&
+ if (zcsize >= ksocknal_data.ksnd_zc_min_frag &&
(sock->sk->route_caps & NETIF_F_SG) &&
(sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
- zcsize >= ksocknal_zc_min_frag &&
(page = ksocknal_kvaddr_to_page (vaddr)) != NULL) {
CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n",
int fragsize = kiov->kiov_len;
struct page *page = kiov->kiov_page;
int offset = kiov->kiov_offset;
- int more = !list_empty (&conn->ksnc_tx_queue) |
+ int more = (!list_empty (&conn->ksnc_tx_queue)) |
(tx->tx_nkiov > 1);
int rc;
LASSERT (tx->tx_nkiov > 0);
#if SOCKNAL_ZC
- if (ksocknal_do_zc &&
+ if (fragsize >= ksocknal_data.ksnd_zc_min_frag &&
(sock->sk->route_caps & NETIF_F_SG) &&
- (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) &&
- fragsize >= ksocknal_zc_min_frag) {
+ (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) {
CDEBUG(D_NET, "page %p + offset %x for %d\n",
page, offset, fragsize);
set_fs (KERNEL_DS);
rc = sock_sendmsg(sock, &msg, fragsize);
set_fs (oldmm);
+
kunmap (page);
}
break;
}
+ /* Consider the connection alive since we managed to chuck
+ * more data into it. Really, we'd like to consider it
+ * alive only when the peer ACKs something, but
+ * write_space() only gets called back while SOCK_NOSPACE
+ * is set. Instead, we presume peer death has occurred if
+                 * the socket doesn't drain within a timeout */
+ conn->ksnc_tx_deadline = jiffies +
+ ksocknal_data.ksnd_io_timeout * HZ;
+ conn->ksnc_peer->ksnp_last_alive = jiffies;
+
if (tx->tx_resid == 0) { /* sent everything */
rc = 0;
break;
RETURN (rc);
}
+void
+ksocknal_eager_ack (ksock_conn_t *conn)
+{
+ int opt = 1;
+ mm_segment_t oldmm = get_fs();
+ struct socket *sock = conn->ksnc_sock;
+
+ /* Remind the socket to ACK eagerly. If I don't, the socket might
+ * think I'm about to send something it could piggy-back the ACK
+ * on, introducing delay in completing zero-copy sends in my
+ * peer. */
+
+ set_fs(KERNEL_DS);
+ sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK,
+ (char *)&opt, sizeof (opt));
+ set_fs(oldmm);
+}
+
int
ksocknal_recv_iov (ksock_conn_t *conn)
{
if (rc <= 0)
return (rc);
+ /* received something... */
+ conn->ksnc_peer->ksnp_last_alive = jiffies;
+ conn->ksnc_rx_deadline = jiffies +
+ ksocknal_data.ksnd_io_timeout * HZ;
+ mb(); /* order with setting rx_started */
+ conn->ksnc_rx_started = 1;
+
conn->ksnc_rx_nob_wanted -= rc;
conn->ksnc_rx_nob_left -= rc;
rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT);
/* NB this is just a boolean............................^ */
set_fs (oldmm);
+
kunmap (page);
if (rc <= 0)
return (rc);
+ /* received something... */
+ conn->ksnc_peer->ksnp_last_alive = jiffies;
+ conn->ksnc_rx_deadline = jiffies +
+ ksocknal_data.ksnd_io_timeout * HZ;
+ mb(); /* order with setting rx_started */
+ conn->ksnc_rx_started = 1;
+
conn->ksnc_rx_nob_wanted -= rc;
conn->ksnc_rx_nob_left -= rc;
rc = -ESHUTDOWN;
break;
}
-
+
if (conn->ksnc_rx_niov != 0)
rc = ksocknal_recv_iov (conn);
else
rc = ksocknal_recv_kiov (conn);
-
+
if (rc <= 0) {
/* error/EOF or partial receive */
- if (rc == -EAGAIN)
+ if (rc == -EAGAIN) {
rc = 1;
+ } else if (rc == 0 && conn->ksnc_rx_started) {
+ /* EOF in the middle of a message */
+ rc = -EPROTO;
+ }
break;
}
+ /* Completed a fragment */
+
if (conn->ksnc_rx_nob_wanted == 0) {
+ /* Completed a message segment (header or payload) */
+ if (ksocknal_data.ksnd_eager_ack &&
+ (conn->ksnc_rx_state == SOCKNAL_RX_BODY ||
+ conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD)) {
+ /* Remind the socket to ack eagerly... */
+ ksocknal_eager_ack(conn);
+ }
rc = 1;
break;
}
spin_lock_irqsave (&sched->kss_lock, flags);
- list_del (&tx->tx_list); /* remove from kss_zctxpending_list */
list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list);
if (waitqueue_active (&sched->kss_waitq))
wake_up (&sched->kss_waitq);
{
#if SOCKNAL_ZC
if (atomic_read (&tx->tx_zccd.zccd_count) != 1) {
- unsigned long flags;
ksock_conn_t *conn = tx->tx_conn;
- ksock_sched_t *sched = conn->ksnc_scheduler;
/* zccd skbufs are still in-flight. First take a ref on
* conn, so it hangs about for ksocknal_tx_done... */
atomic_inc (&conn->ksnc_refcount);
- /* Stash it for timeout...
- * NB We have to hold a lock to stash the tx, and we have
- * stash it before we zcc_put(), but we have to _not_ hold
- * this lock when we zcc_put(), otherwise we could deadlock
- * if it turns out to be the last put. Aaaaarrrrggghhh! */
- spin_lock_irqsave (&sched->kss_lock, flags);
- list_add_tail (&tx->tx_list, &conn->ksnc_tx_pending);
- spin_unlock_irqrestore (&sched->kss_lock, flags);
-
/* ...then drop the initial ref on zccd, so the zero copy
* callback can occur */
zccd_put (&tx->tx_zccd);
void
ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags)
{
- ksock_conn_t *conn;
- ksock_tx_t *tx;
- int rc;
-
+ ksock_conn_t *conn;
+ ksock_tx_t *tx;
+ int rc;
+
LASSERT (!list_empty (&sched->kss_tx_conns));
conn = list_entry(sched->kss_tx_conns.next, ksock_conn_t, ksnc_tx_list);
list_del (&conn->ksnc_tx_list);
CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc);
if (rc != 0) {
- if (ksocknal_close_conn_unlocked (conn)) {
+ if (ksocknal_close_conn_unlocked (conn, rc)) {
/* I'm the first to close */
CERROR ("[%p] Error %d on write to "LPX64" ip %08x:%d\n",
conn, rc, conn->ksnc_peer->ksnp_nid,
spin_lock_irqsave (&sched->kss_lock, *irq_flags);
} else if (tx->tx_resid == 0) {
-
/* everything went; assume more can go, and avoid
* write_space locking */
conn->ksnc_tx_ready = 1;
return (NULL);
}
- rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, &target_nid);
+ rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, tx->tx_nob,
+ &target_nid);
if (rc != 0) {
CERROR ("Can't route to "LPX64": router error %d\n", nid, rc);
return (NULL);
#endif
spin_lock_irqsave (&sched->kss_lock, flags);
-
- tx->tx_deadline = jiffies_64 + ksocknal_io_timeout;
+
list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue);
if (conn->ksnc_tx_ready && /* able to send */
}
ksock_route_t *
-ksocknal_find_connectable_route_locked (ksock_peer_t *peer)
+ksocknal_find_connectable_route_locked (ksock_peer_t *peer, int eager_only)
{
struct list_head *tmp;
ksock_route_t *route;
if (route->ksnr_conn == NULL && /* not connected */
!route->ksnr_connecting && /* not connecting */
- route->ksnr_timeout <= jiffies_64) /* OK to retry */
+ (!eager_only || route->ksnr_eager) && /* wants to connect */
+ time_after_eq (jiffies, route->ksnr_timeout)) /* OK to retry */
return (route);
}
ksock_conn_t *conn;
ksock_route_t *route;
rwlock_t *g_lock;
-
+
/* Ensure the frags we've been given EXACTLY match the number of
* bytes we want to send. Many TCP/IP stacks disregard any total
* size parameters passed to them and just look at the frags.
return (PTL_FAIL);
}
- /* Any routes need to be connected? (need write lock if so) */
- if (ksocknal_find_connectable_route_locked (peer) == NULL) {
+ if (ksocknal_find_connectable_route_locked(peer, 1) == NULL) {
conn = ksocknal_find_conn_locked (tx, peer);
if (conn != NULL) {
+ /* I've got no unconnected autoconnect routes that
+ * need to be connected, and I do have an actual
+ * connection... */
ksocknal_queue_tx_locked (tx, conn);
read_unlock (g_lock);
return (PTL_OK);
}
}
- /* need a write lock now to change peer state... */
+ /* Making one or more connections; I'll need a write lock... */
atomic_inc (&peer->ksnp_refcount); /* +1 ref for me while I unlock */
read_unlock (g_lock);
}
ksocknal_put_peer (peer); /* drop ref I got above */
- /* I may launch autoconnects, now we're write locked... */
- while ((route = ksocknal_find_connectable_route_locked (peer)) != NULL)
+
+ for (;;) {
+ /* launch all eager autoconnections */
+ route = ksocknal_find_connectable_route_locked (peer, 1);
+ if (route == NULL)
+ break;
+
ksocknal_launch_autoconnect_locked (route);
+ }
conn = ksocknal_find_conn_locked (tx, peer);
if (conn != NULL) {
+ /* Connection exists; queue message on it */
ksocknal_queue_tx_locked (tx, conn);
write_unlock_irqrestore (g_lock, flags);
return (PTL_OK);
}
-
+
if (ksocknal_find_connecting_route_locked (peer) == NULL) {
- /* no routes actually connecting now */
- write_unlock_irqrestore (g_lock, flags);
- return (PTL_FAIL);
+ /* no autoconnect routes actually connecting now. Scrape
+ * the barrel for non-eager autoconnects */
+ route = ksocknal_find_connectable_route_locked (peer, 0);
+ if (route != NULL) {
+ ksocknal_launch_autoconnect_locked (route);
+ } else {
+ write_unlock_irqrestore (g_lock, flags);
+ return (PTL_FAIL);
+ }
}
+ /* At least 1 connection is being established; queue the message... */
list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue);
write_unlock_irqrestore (g_lock, flags);
CDEBUG (D_NET, "routed packet from "LPX64" to "LPX64": OK\n",
NTOH__u64 (hdr->src_nid), NTOH__u64 (hdr->dest_nid));
+ /* drop peer ref taken on init */
+ ksocknal_put_peer (fmb->fmb_peer);
+
spin_lock_irqsave (&fmp->fmp_lock, flags);
list_add (&fmb->fmb_list, &fmp->fmp_idle_fmbs);
spin_unlock_irqrestore (&fmp->fmp_lock, flags);
- /* drop peer ref taken on init */
- ksocknal_put_peer (fmb->fmb_peer);
-
if (conn == NULL)
return;
conn->ksnc_cookie = fmb; /* stash fmb for later */
conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */
- conn->ksnc_rx_deadline = jiffies_64 + ksocknal_io_timeout; /* start timeout */
/* payload is desc's iov-ed buffer, but skipping the hdr */
LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) /
dest_nid, body_len);
ksocknal_new_packet (conn, 0); /* on to new packet */
- ksocknal_close_conn_unlocked (conn); /* give up on conn */
+ ksocknal_close_conn_unlocked (conn, -EINVAL); /* give up on conn */
return;
}
int skipped;
if (nob_to_skip == 0) { /* right at next packet boundary now */
+ conn->ksnc_rx_started = 0;
+ mb (); /* racing with timeout thread */
+
conn->ksnc_rx_state = SOCKNAL_RX_HEADER;
conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t);
conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t);
rc = ksocknal_recvmsg(conn);
if (rc <= 0) {
- if (ksocknal_close_conn_unlocked (conn)) {
+ if (ksocknal_close_conn_unlocked (conn, rc)) {
/* I'm the first to close */
if (rc < 0)
CERROR ("[%p] Error %d on read from "LPX64" ip %08x:%d\n",
conn, rc, conn->ksnc_peer->ksnp_nid,
conn->ksnc_ipaddr, conn->ksnc_port);
else
- CERROR ("[%p] EOF from "LPX64" ip %08x:%d\n",
- conn, conn->ksnc_peer->ksnp_nid,
- conn->ksnc_ipaddr, conn->ksnc_port);
+ CWARN ("[%p] EOF from "LPX64" ip %08x:%d\n",
+ conn, conn->ksnc_peer->ksnp_nid,
+ conn->ksnc_ipaddr, conn->ksnc_port);
}
goto out;
}
+
if (conn->ksnc_rx_nob_wanted != 0) /* short read */
goto out; /* try again later */
/* sets wanted_len, iovs etc */
lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn);
- /* start timeout (lib is waiting for finalize) */
- conn->ksnc_rx_deadline = jiffies_64 + ksocknal_io_timeout;
-
if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */
conn->ksnc_rx_state = SOCKNAL_RX_BODY;
goto try_read; /* go read the payload */
case SOCKNAL_RX_BODY:
/* payload all received */
- conn->ksnc_rx_deadline = 0; /* cancel timeout */
lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie);
/* Fall through */
NTOH__u64 (conn->ksnc_hdr.dest_nid),
conn->ksnc_rx_nob_left);
- /* cancel timeout (only needed it while fmb allocated) */
- conn->ksnc_rx_deadline = 0;
-
/* forward the packet. NB ksocknal_init_fmb() put fmb into
* conn->ksnc_cookie */
fmb = (ksock_fmb_t *)conn->ksnc_cookie;
int id = sched - ksocknal_data.ksnd_schedulers;
char name[16];
- snprintf (name, sizeof (name),"ksocknald[%d]", id);
+ snprintf (name, sizeof (name),"ksocknald_%02d", id);
kportal_daemonize (name);
kportal_blockallsigs ();
#if (CONFIG_SMP && CPU_AFFINITY)
- if ((cpu_online_map & (1 << id)) != 0)
+ if ((cpu_online_map & (1 << id)) != 0) {
+#if 1
current->cpus_allowed = (1 << id);
- else
+#else
+ set_cpus_allowed (current, 1<<id);
+#endif
+ } else {
CERROR ("Can't set CPU affinity for %s\n", name);
+ }
#endif /* CONFIG_SMP && CPU_AFFINITY */
spin_lock_irqsave (&sched->kss_lock, flags);
if (conn == NULL) { /* raced with ksocknal_close_sock */
LASSERT (sk->sk_data_ready != &ksocknal_data_ready);
sk->sk_data_ready (sk, n);
- } else if (!conn->ksnc_rx_ready) { /* new news */
+ goto out;
+ }
+
+ if (!conn->ksnc_rx_ready) { /* new news */
/* Set ASAP in case of concurrent calls to me */
conn->ksnc_rx_ready = 1;
spin_unlock_irqrestore (&sched->kss_lock, flags);
}
+ out:
read_unlock (&ksocknal_data.ksnd_global_lock);
EXIT;
if (conn == NULL) { /* raced with ksocknal_close_sock */
LASSERT (sk->sk_write_space != &ksocknal_write_space);
sk->sk_write_space (sk);
- } else if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */
+
+ read_unlock (&ksocknal_data.ksnd_global_lock);
+ return;
+ }
+
+ if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */
clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags);
if (!conn->ksnc_tx_ready) { /* new news */
}
int
-ksocknal_set_linger (struct socket *sock)
+ksocknal_setup_sock (struct socket *sock)
{
mm_segment_t oldmm = get_fs ();
int rc;
CERROR ("Can't set SO_LINGER2: %d\n", rc);
return (rc);
}
+
+#if SOCKNAL_USE_KEEPALIVES
+ /* Keepalives: If 3/4 of the timeout elapses, start probing every
+ * second until the timeout elapses. */
+
+ option = (ksocknal_data.ksnd_io_timeout * 3) / 4;
+ set_fs (KERNEL_DS);
+ rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE,
+ (char *)&option, sizeof (option));
+ set_fs (oldmm);
+ if (rc != 0) {
+ CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc);
+ return (rc);
+ }
+
+ option = 1;
+ set_fs (KERNEL_DS);
+ rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL,
+ (char *)&option, sizeof (option));
+ set_fs (oldmm);
+ if (rc != 0) {
+ CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc);
+ return (rc);
+ }
+ option = ksocknal_data.ksnd_io_timeout / 4;
+ set_fs (KERNEL_DS);
+ rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT,
+ (char *)&option, sizeof (option));
+ set_fs (oldmm);
+ if (rc != 0) {
+                CERROR ("Can't set TCP_KEEPCNT: %d\n", rc);
+ return (rc);
+ }
+
+ option = 1;
+ set_fs (KERNEL_DS);
+ rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE,
+ (char *)&option, sizeof (option));
+ set_fs (oldmm);
+ if (rc != 0) {
+ CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
+ return (rc);
+ }
+#endif
return (0);
}
{
struct sockaddr_in peer_addr;
mm_segment_t oldmm = get_fs();
- __u64 n;
struct timeval tv;
int fd;
struct socket *sock;
}
/* Ugh; have to map_fd for compatibility with sockets passed in
- * from userspace. And we actually need the refcounting that
- * this gives you :) */
+ * from userspace. And we actually need the sock->file refcounting
+ * that this gives you :) */
fd = sock_map_fd (sock);
if (fd < 0) {
/* NB the fd now owns the ref on sock->file */
LASSERT (sock->file != NULL);
LASSERT (file_count(sock->file) == 1);
-
+
/* Set the socket timeouts, so our connection attempt completes in
* finite time */
- tv.tv_sec = ksocknal_io_timeout / HZ;
- n = ksocknal_io_timeout % HZ;
- n = n * 1000000 + HZ - 1;
- do_div (n, HZ);
- tv.tv_usec = n;
+ tv.tv_sec = ksocknal_data.ksnd_io_timeout;
+ tv.tv_usec = 0;
set_fs (KERNEL_DS);
rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO,
(char *)&tv, sizeof (tv));
set_fs (oldmm);
if (rc != 0) {
- CERROR ("Can't set send timeout %d (in HZ): %d\n",
- ksocknal_io_timeout, rc);
+ CERROR ("Can't set send timeout %d: %d\n",
+ ksocknal_data.ksnd_io_timeout, rc);
goto out;
}
(char *)&tv, sizeof (tv));
set_fs (oldmm);
if (rc != 0) {
- CERROR ("Can't set receive timeout %d (in HZ): %d\n",
- ksocknal_io_timeout, rc);
+ CERROR ("Can't set receive timeout %d: %d\n",
+ ksocknal_data.ksnd_io_timeout, rc);
goto out;
}
route->ksnr_connecting = 0;
LASSERT (route->ksnr_retry_interval != 0);
- route->ksnr_timeout = jiffies_64 + route->ksnr_retry_interval;
+ route->ksnr_timeout = jiffies + route->ksnr_retry_interval;
route->ksnr_retry_interval = MIN (route->ksnr_retry_interval * 2,
SOCKNAL_MAX_RECONNECT_INTERVAL);
ksock_route_t *route;
int rc;
- snprintf (name, sizeof (name), "ksocknal_ad[%ld]", id);
+ snprintf (name, sizeof (name), "ksocknal_ad%02ld", id);
kportal_daemonize (name);
kportal_blockallsigs ();
ksocknal_find_timed_out_conn (ksock_peer_t *peer)
{
/* We're called with a shared lock on ksnd_global_lock */
- unsigned long flags;
ksock_conn_t *conn;
struct list_head *ctmp;
- ksock_tx_t *tx;
- struct list_head *ttmp;
ksock_sched_t *sched;
list_for_each (ctmp, &peer->ksnp_conns) {
conn = list_entry (ctmp, ksock_conn_t, ksnc_list);
sched = conn->ksnc_scheduler;
-
- if (conn->ksnc_rx_deadline != 0 &&
- conn->ksnc_rx_deadline <= jiffies_64)
- goto timed_out;
- spin_lock_irqsave (&sched->kss_lock, flags);
+ /* Don't need the {get,put}connsock dance to deref ksnc_sock... */
+ LASSERT (!conn->ksnc_closing);
- list_for_each (ttmp, &conn->ksnc_tx_queue) {
- tx = list_entry (ttmp, ksock_tx_t, tx_list);
- LASSERT (tx->tx_deadline != 0);
-
- if (tx->tx_deadline <= jiffies_64)
- goto timed_out_locked;
+ if (conn->ksnc_rx_started &&
+ time_after_eq (jiffies, conn->ksnc_rx_deadline)) {
+ /* Timed out incomplete incoming message */
+ atomic_inc (&conn->ksnc_refcount);
+ CERROR ("Timed out RX from "LPX64" %p\n",
+ peer->ksnp_nid, conn);
+ return (conn);
}
-#if SOCKNAL_ZC
- list_for_each (ttmp, &conn->ksnc_tx_pending) {
- tx = list_entry (ttmp, ksock_tx_t, tx_list);
- LASSERT (tx->tx_deadline != 0);
-
- if (tx->tx_deadline <= jiffies_64)
- goto timed_out_locked;
+
+ if ((!list_empty (&conn->ksnc_tx_queue) ||
+ conn->ksnc_sock->sk->wmem_queued != 0) &&
+ time_after_eq (jiffies, conn->ksnc_tx_deadline)) {
+ /* Timed out messages queued for sending, or
+ * messages buffered in the socket's send buffer */
+ atomic_inc (&conn->ksnc_refcount);
+ CERROR ("Timed out TX to "LPX64" %s%d %p\n",
+ peer->ksnp_nid,
+ list_empty (&conn->ksnc_tx_queue) ? "" : "Q ",
+ conn->ksnc_sock->sk->wmem_queued, conn);
+ return (conn);
}
-#endif
- spin_unlock_irqrestore (&sched->kss_lock, flags);
- continue;
-
- timed_out_locked:
- spin_unlock_irqrestore (&sched->kss_lock, flags);
- timed_out:
- atomic_inc (&conn->ksnc_refcount);
- return (conn);
}
return (NULL);
}
void
-ksocknal_check_peer_timeouts (struct list_head *peers)
+ksocknal_check_peer_timeouts (int idx)
{
+ struct list_head *peers = &ksocknal_data.ksnd_peers[idx];
struct list_head *ptmp;
ksock_peer_t *peer;
ksock_conn_t *conn;
if (conn != NULL) {
read_unlock (&ksocknal_data.ksnd_global_lock);
- if (ksocknal_close_conn_unlocked (conn)) {
+ if (ksocknal_close_conn_unlocked (conn, -ETIMEDOUT)) {
/* I actually closed... */
CERROR ("Timeout out conn->"LPX64" ip %x:%d\n",
peer->ksnp_nid, conn->ksnc_ipaddr,
unsigned long flags;
ksock_conn_t *conn;
int timeout;
+ int i;
int peer_index = 0;
- __u64 deadline = jiffies_64;
+ unsigned long deadline = jiffies;
kportal_daemonize ("ksocknal_reaper");
kportal_blockallsigs ();
spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags);
- while ((timeout = deadline - jiffies_64) <= 0) {
- /* Time to check for timeouts on a few more peers */
- ksocknal_check_peer_timeouts (&ksocknal_data.ksnd_peers[peer_index]);
+ /* careful with the jiffy wrap... */
+ while ((timeout = ((int)deadline - (int)jiffies)) <= 0) {
+ const int n = 4;
+ const int p = 1;
+ int chunk = ksocknal_data.ksnd_peer_hash_size;
+
+ /* Time to check for timeouts on a few more peers: I do
+ * checks every 'p' seconds on a proportion of the peer
+ * table and I need to check every connection 'n' times
+ * within a timeout interval, to ensure I detect a
+ * timeout on any connection within (n+1)/n times the
+ * timeout interval. */
+
+ if (ksocknal_data.ksnd_io_timeout > n * p)
+ chunk = (chunk * n * p) /
+ ksocknal_data.ksnd_io_timeout;
+ if (chunk == 0)
+ chunk = 1;
+
+ for (i = 0; i < chunk; i++) {
+ ksocknal_check_peer_timeouts (peer_index);
+ peer_index = (peer_index + 1) %
+ ksocknal_data.ksnd_peer_hash_size;
+ }
- peer_index = (peer_index + 1) % SOCKNAL_PEER_HASH_SIZE;
- deadline += HZ;
+ deadline += p * HZ;
}
add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait);
CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n",
atomic_read (&portal_kmemory));
- printk(KERN_INFO "Routing socket NAL unloaded (final mem %d)\n",
+ printk(KERN_INFO "Lustre: Routing socket NAL unloaded (final mem %d)\n",
atomic_read(&portal_kmemory));
}
/* flag everything initialised */
ktoenal_data.ksnd_init = SOCKNAL_INIT_ALL;
- printk(KERN_INFO"Routing TOE NAL loaded (Routing %s, initial mem %d)\n",
+ printk(KERN_INFO "Lustre: Routing TOE NAL loaded (Routing %s, initial mem %d)\n",
kpr_routing(&ktoenal_data.ksnd_router) ? "enabled" : "disabled",
pkmem);
if ((conn = ktoenal_get_conn (nid)) == NULL)
{
/* It's not a peer; try to find a gateway */
- rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, &gatewaynid);
+ rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, payload_niov,
+ &gatewaynid);
if (rc != 0)
{
CERROR ("Can't route to "LPX64": router error %d\n", nid, rc);
echo timestamp > link-stamp
DEFS =
-portals_SOURCES = $(LINKS) module.c proc.c debug.c
+portals_SOURCES = $(LINKS) module.c proc.c debug.c lwt.c
# Don't distribute any patched files.
dist-hook:
PTR_ERR(file));
GOTO(out, PTR_ERR(file));
} else {
- printk(KERN_ALERT "dumping log to %s ... writing ...\n",
+ printk(KERN_ALERT "LustreError: dumping log to %s ... writing ...\n",
debug_file_name);
}
} else {
rc = file->f_op->write(file, debug_buf, debug_off,&file->f_pos);
}
- printk("wrote %d bytes\n", rc);
+ printk("LustreError: wrote %d bytes\n", rc);
set_fs(oldfs);
rc = file->f_op->fsync(file, file->f_dentry, 1);
CERROR("cannot open %s for logging", debug_daemon_file_path);
GOTO(out1, PTR_ERR(file));
} else {
- printk(KERN_ALERT "daemon dumping log to %s ... writing ...\n",
+ printk(KERN_ALERT "LustreError: daemon dumping log to %s ... writing ...\n",
debug_daemon_file_path);
}
size, &file->f_pos);
if (rc < 0) {
printk(KERN_ALERT
- "Debug_daemon write error %d\n", rc);
+ "LustreError: Debug_daemon write error %d\n", rc);
goto out;
}
start += rc;
rc = file->f_op->fsync(file, file->f_dentry, 1);
if (rc < 0) {
printk(KERN_ALERT
- "Debug_daemon sync error %d\n", rc);
+ "LustreError: Debug_daemon sync error %d\n", rc);
goto out;
}
if (debug_daemon_state.stopped)
while (start1 < end1) {
int count = MIN(1024, end1 - start1);
- printk("%*s", count, start1);
+ printk("LustreError: %*s", count, start1);
start1 += 1024;
}
while (start2 < end2) {
int count = MIN(1024, end2 - start2);
- printk("%*s", count, start2);
+ printk("LustreError: %*s", count, start2);
start2 += 1024;
}
}
rc = kernel_thread(portals_do_debug_dumplog,
NULL, CLONE_VM | CLONE_FS | CLONE_FILES);
if (rc < 0) {
- printk(KERN_ERR "cannot start dump thread\n");
+ printk(KERN_ERR "LustreError: cannot start dump thread\n");
return;
}
sleep_on(&debug_ctlwq);
debug_daemon_state.lctl_event = 0;
rc = kernel_thread(portals_debug_daemon, NULL, 0);
if (rc < 0) {
- printk(KERN_ERR "cannot start debug daemon thread\n");
+ printk(KERN_ERR "LustreError: cannot start debug daemon thread\n");
strncpy(debug_daemon_file_path, "\0", 1);
return rc;
}
unsigned long debug_off;
if (debug_buf == NULL) {
- printk("portals_debug_msg: debug_buf is NULL!\n");
+ printk("LustreError: portals_debug_msg: debug_buf is NULL!\n");
return;
}
max_nob = debug_size - debug_off + DEBUG_OVERFLOW;
if (max_nob <= 0) {
spin_unlock_irqrestore(&portals_debug_lock, flags);
- printk("logic error in portals_debug_msg: <0 bytes to write\n");
+ printk("LustreError: logic error in portals_debug_msg: <0 bytes to write\n");
return;
}
/* Print to console, while msg is contiguous in debug_buf */
/* NB safely terminated see above */
if ((mask & D_EMERG) != 0)
- printk(KERN_EMERG "%s", debug_buf + debug_off + prefix_nob);
+ printk(KERN_EMERG "LustreError: %s",
+ debug_buf + debug_off + prefix_nob);
if ((mask & D_ERROR) != 0)
- printk(KERN_ERR "%s", debug_buf + debug_off + prefix_nob);
+ printk(KERN_ERR "LustreError: %s",
+ debug_buf + debug_off + prefix_nob);
else if (portal_printk)
- printk("<%d>%s", portal_printk, debug_buf+debug_off+prefix_nob);
+ printk("<%d>LustreError: %s", portal_printk, debug_buf+debug_off+prefix_nob);
base_offset = debug_off & 0xFFFF;
debug_off += prefix_nob + msg_nob;
void portals_debug_set_level(unsigned int debug_level)
{
- printk("Setting portals debug level to %08x\n", debug_level);
+ printk("Lustre: Setting portals debug level to %08x\n", debug_level);
portal_debug = debug_level;
}
+void portals_run_upcall(char **argv)
+{
+ int rc;
+ int argc;
+ char *envp[] = {
+ "HOME=/",
+ "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+ NULL};
+ ENTRY;
+
+ argv[0] = portals_upcall;
+ argc = 1;
+ while (argv[argc] != NULL)
+ argc++;
+
+ LASSERT(argc >= 2);
+
+ rc = call_usermodehelper(argv[0], argv, envp);
+ if (rc < 0) {
+ CERROR("Error %d invoking portals upcall %s %s%s%s%s%s%s%s%s; "
+ "check /proc/sys/portals/upcall\n",
+ rc, argv[0], argv[1],
+ argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
+ argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
+ argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
+ argc < 6 ? "" : ",...");
+ } else {
+ CERROR("Invoked portals upcall %s %s%s%s%s%s%s%s%s\n",
+ argv[0], argv[1],
+ argc < 3 ? "" : ",", argc < 3 ? "" : argv[2],
+ argc < 4 ? "" : ",", argc < 4 ? "" : argv[3],
+ argc < 5 ? "" : ",", argc < 5 ? "" : argv[4],
+ argc < 6 ? "" : ",...");
+ }
+}
+
void portals_run_lbug_upcall(char *file, const char *fn, const int line)
{
char *argv[6];
- char *envp[3];
char buf[32];
- int rc;
ENTRY;
snprintf (buf, sizeof buf, "%d", line);
- argv[0] = portals_upcall;
argv[1] = "LBUG";
argv[2] = file;
argv[3] = (char *)fn;
argv[4] = buf;
argv[5] = NULL;
- envp[0] = "HOME=/";
- envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
- envp[2] = NULL;
-
- rc = USERMODEHELPER(argv[0], argv, envp);
- if (rc < 0) {
- CERROR("Error invoking lbug upcall %s %s %s %s %s: %d; check "
- "/proc/sys/portals/upcall\n",
- argv[0], argv[1], argv[2], argv[3], argv[4], rc);
-
- } else {
- CERROR("Invoked upcall %s %s %s %s %s\n",
- argv[0], argv[1], argv[2], argv[3], argv[4]);
- }
+ portals_run_upcall (argv);
}
-
EXPORT_SYMBOL(portals_debug_dumplog);
EXPORT_SYMBOL(portals_debug_msg);
EXPORT_SYMBOL(portals_debug_set_level);
+EXPORT_SYMBOL(portals_run_upcall);
EXPORT_SYMBOL(portals_run_lbug_upcall);
}
static int
-kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid,
- ptl_nid_t hi_nid)
+kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid,
+ ptl_nid_t lo_nid, ptl_nid_t hi_nid)
{
int rc;
kpr_control_interface_t *ci;
}
static int
-kportal_del_route(ptl_nid_t target)
+kportal_del_route(int gw_nalid, ptl_nid_t gw_nid,
+ ptl_nid_t lo, ptl_nid_t hi)
{
int rc;
kpr_control_interface_t *ci;
if (ci == NULL)
return (-ENODEV);
- rc = ci->kprci_del_route (target);
+ rc = ci->kprci_del_route (gw_nalid, gw_nid, lo, hi);
+
+ PORTAL_SYMBOL_PUT(kpr_control_interface);
+ return (rc);
+}
+
+static int
+kportal_notify_router (int gw_nalid, ptl_nid_t gw_nid,
+ int alive, time_t when)
+{
+ int rc;
+ kpr_control_interface_t *ci;
+
+ ci = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface);
+ if (ci == NULL)
+ return (-ENODEV);
+
+ rc = ci->kprci_notify (gw_nalid, gw_nid, alive, when);
PORTAL_SYMBOL_PUT(kpr_control_interface);
return (rc);
static int
kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp,
- ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp)
+ ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp, int *alivep)
{
int gateway_nalid;
ptl_nid_t gateway_nid;
ptl_nid_t lo_nid;
ptl_nid_t hi_nid;
+ int alive;
int rc;
kpr_control_interface_t *ci;
if (ci == NULL)
return (-ENODEV);
- rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, &lo_nid,
- &hi_nid);
+ rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid,
+ &lo_nid, &hi_nid, &alive);
if (rc == 0) {
- CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64"\n",
- index, gateway_nalid, gateway_nid, lo_nid, hi_nid);
+ CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64", %s\n",
+ index, gateway_nalid, gateway_nid, lo_nid, hi_nid,
+ alive ? "up" : "down");
*gateway_nalidp = (__u32)gateway_nalid;
- *gateway_nidp = (__u32)gateway_nid;
- *lo_nidp = (__u32)lo_nid;
- *hi_nidp = (__u32)hi_nid;
+ *gateway_nidp = gateway_nid;
+ *lo_nidp = lo_nid;
+ *hi_nidp = hi_nid;
+ *alivep = alive;
}
PORTAL_SYMBOL_PUT (kpr_control_interface);
case IOC_PORTAL_ADD_ROUTE:
CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n",
- data->ioc_nal, data->ioc_nid, data->ioc_nid2,
- data->ioc_nid3);
+ data->ioc_nal, data->ioc_nid,
+ data->ioc_nid2, data->ioc_nid3);
err = kportal_add_route(data->ioc_nal, data->ioc_nid,
- MIN (data->ioc_nid2, data->ioc_nid3),
- MAX (data->ioc_nid2, data->ioc_nid3));
+ data->ioc_nid2, data->ioc_nid3);
break;
case IOC_PORTAL_DEL_ROUTE:
- CDEBUG (D_IOCTL, "Removing route to "LPU64"\n", data->ioc_nid);
- err = kportal_del_route (data->ioc_nid);
+ CDEBUG (D_IOCTL, "Removing routes via [%d] "LPU64" : "LPU64" - "LPU64"\n",
+ data->ioc_nal, data->ioc_nid,
+ data->ioc_nid2, data->ioc_nid3);
+ err = kportal_del_route (data->ioc_nal, data->ioc_nid,
+ data->ioc_nid2, data->ioc_nid3);
break;
+ case IOC_PORTAL_NOTIFY_ROUTER: {
+ CDEBUG (D_IOCTL, "Notifying peer [%d] "LPU64" %s @ %ld\n",
+ data->ioc_nal, data->ioc_nid,
+ data->ioc_flags ? "Enabling" : "Disabling",
+ (time_t)data->ioc_nid3);
+
+ err = kportal_notify_router (data->ioc_nal, data->ioc_nid,
+ data->ioc_flags,
+ (time_t)data->ioc_nid3);
+ break;
+ }
+
case IOC_PORTAL_GET_ROUTE:
CDEBUG (D_IOCTL, "Getting route [%d]\n", data->ioc_count);
err = kportal_get_route(data->ioc_count, &data->ioc_nal,
- &data->ioc_nid, &data->ioc_nid2,
- &data->ioc_nid3);
+ &data->ioc_nid,
+ &data->ioc_nid2, &data->ioc_nid3,
+ &data->ioc_flags);
if (err == 0)
if (copy_to_user((char *)arg, data, sizeof (*data)))
err = -EFAULT;
kportal_put_ni (data->ioc_nal);
break;
}
-
+#if LWT_SUPPORT
+ case IOC_PORTAL_LWT_CONTROL:
+ err = lwt_control (data->ioc_flags, data->ioc_misc);
+ break;
+
+ case IOC_PORTAL_LWT_SNAPSHOT:
+ err = lwt_snapshot (&data->ioc_count, &data->ioc_misc,
+ data->ioc_pbuf1, data->ioc_plen1);
+ if (err == 0 &&
+ copy_to_user((char *)arg, data, sizeof (*data)))
+ err = -EFAULT;
+ break;
+
+ case IOC_PORTAL_LWT_LOOKUP_STRING:
+ err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1,
+ data->ioc_pbuf2, data->ioc_plen2);
+ if (err == 0 &&
+ copy_to_user((char *)arg, data, sizeof (*data)))
+ err = -EFAULT;
+ break;
+#endif
default:
err = -EINVAL;
break;
rc = portals_debug_init(5 * 1024 * 1024);
if (rc < 0) {
- printk(KERN_ERR "portals_debug_init: %d\n", rc);
+ printk(KERN_ERR "LustreError: portals_debug_init: %d\n", rc);
return (rc);
}
+#if LWT_SUPPORT
+ rc = lwt_init();
+ if (rc != 0) {
+ CERROR("lwt_init: error %d\n", rc);
+ goto cleanup_debug;
+ }
+#endif
sema_init(&nal_cmd_sem, 1);
rc = misc_register(&portal_dev);
if (rc) {
CERROR("misc_register: error %d\n", rc);
- goto cleanup_debug;
+ goto cleanup_lwt;
}
rc = PtlInit();
PtlFini();
cleanup_deregister:
misc_deregister(&portal_dev);
+ cleanup_lwt:
+#if LWT_SUPPORT
+ lwt_fini();
+#endif
cleanup_debug:
portals_debug_cleanup();
return rc;
if (rc)
CERROR("misc_deregister error %d\n", rc);
+#if LWT_SUPPORT
+ lwt_fini();
+#endif
+
if (atomic_read(&portal_kmemory) != 0)
CERROR("Portals memory leaked: %d bytes\n",
atomic_read(&portal_kmemory));
rc = portals_debug_cleanup();
if (rc)
- printk(KERN_ERR "portals_debug_cleanup: %d\n", rc);
+ printk(KERN_ERR "LustreError: portals_debug_cleanup: %d\n", rc);
}
EXPORT_SYMBOL(lib_dispatch);
EXPORT_SYMBOL(portal_debug);
EXPORT_SYMBOL(portal_stack);
EXPORT_SYMBOL(portal_printk);
+EXPORT_SYMBOL(portal_cerror);
EXPORT_SYMBOL(PtlEQWait);
EXPORT_SYMBOL(PtlEQFree);
EXPORT_SYMBOL(PtlEQGet);
#define PSDEV_DEBUG 1 /* control debugging */
#define PSDEV_SUBSYSTEM_DEBUG 2 /* control debugging */
#define PSDEV_PRINTK 3 /* force all errors to console */
-#define PSDEV_DEBUG_PATH 4 /* crashdump log location */
-#define PSDEV_DEBUG_DUMP_PATH 5 /* crashdump tracelog location */
-#define PSDEV_PORTALS_UPCALL 6 /* User mode upcall script */
+#define PSDEV_CONSOLE 4 /* allow _any_ messages to console */
+#define PSDEV_DEBUG_PATH 5 /* crashdump log location */
+#define PSDEV_DEBUG_DUMP_PATH 6 /* crashdump tracelog location */
+#define PSDEV_PORTALS_UPCALL 7 /* User mode upcall script */
-#define PORTALS_PRIMARY_CTLCNT 6
+#define PORTALS_PRIMARY_CTLCNT 7
static struct ctl_table portals_table[PORTALS_PRIMARY_CTLCNT + 1] = {
{PSDEV_DEBUG, "debug", &portal_debug, sizeof(int), 0644, NULL,
&proc_dointvec},
sizeof(int), 0644, NULL, &proc_dointvec},
{PSDEV_PRINTK, "printk", &portal_printk, sizeof(int), 0644, NULL,
&proc_dointvec},
+ {PSDEV_CONSOLE, "console", &portal_cerror, sizeof(int), 0644, NULL,
+ &proc_dointvec},
{PSDEV_DEBUG_PATH, "debug_path", debug_file_path,
sizeof(debug_file_path), 0644, NULL, &proc_dostring, &sysctl_string},
{PSDEV_DEBUG_DUMP_PATH, "debug_daemon_path", debug_daemon_file_path,
int ptl_init;
unsigned int portal_subsystem_debug = ~0 - (S_PORTALS | S_QSWNAL | S_SOCKNAL | S_GMNAL);
unsigned int portal_debug = ~0;
+unsigned int portal_cerror = 1;
unsigned int portal_printk;
unsigned int portal_stack;
": simulated failure\n",
nal->ni.nid, hdr_type_string (hdr),
hdr->src_nid);
+ lib_recv (nal, private, NULL, NULL, 0, 0, PTL_HDR_LENGTH (hdr));
return (-1);
}
#include "router.h"
LIST_HEAD(kpr_routes);
+LIST_HEAD(kpr_gateways);
LIST_HEAD(kpr_nals);
unsigned long long kpr_fwd_bytes;
kprri_lookup: kpr_lookup_target,
kprri_fwd_start: kpr_forward_packet,
kprri_fwd_done: kpr_complete_packet,
+ kprri_notify: kpr_nal_notify,
kprri_shutdown: kpr_shutdown_nal,
kprri_deregister: kpr_deregister_nal,
};
kprci_add_route: kpr_add_route,
kprci_del_route: kpr_del_route,
kprci_get_route: kpr_get_route,
+ kprci_notify: kpr_sys_notify,
};
int
struct list_head *e;
kpr_nal_entry_t *ne;
- CDEBUG (D_OTHER, "Registering NAL %d\n", nalif->kprni_nalid);
+ CDEBUG (D_NET, "Registering NAL %d\n", nalif->kprni_nalid);
PORTAL_ALLOC (ne, sizeof (*ne));
if (ne == NULL)
}
+/* Task-queue handler: run the userspace ROUTER_NOTIFY upcall for a
+ * gateway liveness change queued by kpr_upcall(), then free the
+ * kmalloc'd argument block allocated there. */
void
+kpr_do_upcall (void *arg)
+{
+ kpr_upcall_t *u = (kpr_upcall_t *)arg;
+ char nalstr[10];
+ char nidstr[36];
+ char whenstr[36];
+ /* argv[0] left NULL; presumably portals_run_upcall() fills in the
+ * configured upcall path (cf. portals_run_lbug_upcall no longer
+ * setting argv[0] in this patch) — TODO confirm */
+ char *argv[] = {
+ NULL,
+ "ROUTER_NOTIFY",
+ nalstr,
+ nidstr,
+ u->kpru_alive ? "up" : "down",
+ whenstr,
+ NULL};
+
+ snprintf (nalstr, sizeof(nalstr), "%d", u->kpru_nal_id);
+ snprintf (nidstr, sizeof(nidstr), LPX64, u->kpru_nid);
+ snprintf (whenstr, sizeof(whenstr), "%ld", u->kpru_when);
+
+ portals_run_upcall (argv);
+
+ /* matches kmalloc(GFP_ATOMIC) in kpr_upcall() */
+ kfree (u);
+}
+
+/* Queue a ROUTER_NOTIFY upcall to userspace reporting that gateway
+ * 'gw_nid' on NAL 'gw_nalid' went up/down at time 'when'.  Runs the
+ * actual (blocking) upcall later via the task queue because the
+ * caller may be in interrupt/atomic context. */
+void
+kpr_upcall (int gw_nalid, ptl_nid_t gw_nid, int alive, time_t when)
+{
+ /* May be in arbitrary context */
+ kpr_upcall_t *u = kmalloc (sizeof (kpr_upcall_t), GFP_ATOMIC);
+
+ if (u == NULL) {
+ /* best-effort: drop the notification on allocation failure */
+ CERROR ("Upcall out of memory: nal %d nid "LPX64" %s\n",
+ gw_nalid, gw_nid, alive ? "up" : "down");
+ return;
+ }
+
+ u->kpru_nal_id = gw_nalid;
+ u->kpru_nid = gw_nid;
+ u->kpru_alive = alive;
+ u->kpru_when = when;
+
+ /* kpr_do_upcall() frees 'u' when it runs */
+ prepare_work (&u->kpru_tq, kpr_do_upcall, u);
+ schedule_work (&u->kpru_tq);
+}
+
+/* Record a gateway liveness change reported either by a NAL (byNal != 0)
+ * or by userspace (byNal == 0).  Updates the gateway's alive flag and
+ * timestamp under the router write lock, optionally notifies the owning
+ * NAL (for userspace reports) or queues a userspace upcall (for NAL
+ * reports).  Returns 0 on success or a negative errno. */
+int
+kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid,
+ int alive, time_t when)
+{
+ unsigned long flags;
+ int rc = -ENOENT;
+ kpr_nal_entry_t *ne = NULL;
+ kpr_gateway_entry_t *ge = NULL;
+ struct timeval now;
+ struct list_head *e;
+ struct list_head *n;
+
+ CDEBUG (D_ERROR, "%s notifying [%d] "LPX64": %s\n",
+ byNal ? "NAL" : "userspace",
+ gateway_nalid, gateway_nid, alive ? "up" : "down");
+
+ /* can't do predictions... */
+ do_gettimeofday (&now);
+ if (when > now.tv_sec) {
+ CWARN ("Ignoring prediction from %s of [%d] "LPX64" %s "
+ "%ld seconds in the future\n",
+ byNal ? "NAL" : "userspace",
+ gateway_nalid, gateway_nid,
+ alive ? "up" : "down",
+ when - now.tv_sec);
+ /* negative errno, consistent with every other error return
+ * in this file (was positive EINVAL) */
+ return (-EINVAL);
+ }
+
+ LASSERT (when <= now.tv_sec);
+
+ /* Serialise with lookups (i.e. write lock) */
+ write_lock_irqsave(&kpr_rwlock, flags);
+
+ list_for_each_safe (e, n, &kpr_gateways) {
+
+ ge = list_entry(e, kpr_gateway_entry_t, kpge_list);
+ /* nalid 0 acts as a wildcard: match on NID alone */
+ if ((gateway_nalid != 0 &&
+ ge->kpge_nalid != gateway_nalid) ||
+ ge->kpge_nid != gateway_nid)
+ continue;
+
+ rc = 0;
+ break;
+ }
+
+ if (rc != 0) {
+ /* gateway not found */
+ write_unlock_irqrestore(&kpr_rwlock, flags);
+ CDEBUG (D_NET, "Gateway not found\n");
+ return (rc);
+ }
+
+ if (when < ge->kpge_timestamp) {
+ /* out of date information */
+ write_unlock_irqrestore (&kpr_rwlock, flags);
+ CDEBUG (D_NET, "Out of date\n");
+ return (0);
+ }
+
+ /* update timestamp */
+ ge->kpge_timestamp = when;
+
+ if ((!ge->kpge_alive) == (!alive)) {
+ /* new date for old news */
+ write_unlock_irqrestore (&kpr_rwlock, flags);
+ CDEBUG (D_NET, "Old news\n");
+ return (0);
+ }
+
+ ge->kpge_alive = alive;
+ CDEBUG(D_NET, "set "LPX64" [%p] %d\n", gateway_nid, ge, alive);
+
+ if (alive) {
+ /* Reset all gateway weights so the newly-enabled gateway
+ * doesn't have to play catch-up */
+ list_for_each_safe (e, n, &kpr_gateways) {
+ kpr_gateway_entry_t *gw = list_entry(e, kpr_gateway_entry_t,
+ kpge_list);
+ atomic_set (&gw->kpge_weight, 0);
+ }
+ }
+
+ if (!byNal) {
+ /* userland notified me: notify NAL? */
+ ne = kpr_find_nal_entry_locked (ge->kpge_nalid);
+ if (ne != NULL) {
+ if (ne->kpne_shutdown ||
+ ne->kpne_interface.kprni_notify == NULL) {
+ /* no need to notify */
+ ne = NULL;
+ } else {
+ /* take a ref on this NAL until notifying
+ * it has completed... */
+ atomic_inc (&ne->kpne_refcount);
+ }
+ }
+ }
+
+ write_unlock_irqrestore(&kpr_rwlock, flags);
+
+ if (ne != NULL) {
+ /* notify outside the lock; NAL callback may block */
+ ne->kpne_interface.kprni_notify (ne->kpne_interface.kprni_arg,
+ gateway_nid, alive);
+ /* 'ne' can disappear now... */
+ atomic_dec (&ne->kpne_refcount);
+ }
+
+ if (byNal) {
+ /* It wasn't userland that notified me... */
+ CWARN ("Upcall: NAL %d NID "LPX64" is %s\n",
+ gateway_nalid, gateway_nid,
+ alive ? "alive" : "dead");
+ kpr_upcall (gateway_nalid, gateway_nid, alive, when);
+ } else {
+ CDEBUG (D_NET, " NOT Doing upcall\n");
+ }
+
+ return (0);
+}
+
+/* NAL entrypoint (kprni_notify path): a NAL reports that peer/gateway
+ * 'peer' went up or down at time 'when'; forward to kpr_do_notify()
+ * with byNal == 1 so a userspace upcall is made. */
+void
+kpr_nal_notify (void *arg, ptl_nid_t peer, int alive, time_t when)
+{
+ kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
+
+ kpr_do_notify (1, ne->kpne_interface.kprni_nalid, peer, alive, when);
+}
+
+void
kpr_shutdown_nal (void *arg)
{
unsigned long flags;
kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
- CDEBUG (D_OTHER, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid);
+ CDEBUG (D_NET, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid);
LASSERT (!ne->kpne_shutdown);
LASSERT (!in_interrupt());
unsigned long flags;
kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
- CDEBUG (D_OTHER, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid);
+ CDEBUG (D_NET, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid);
LASSERT (ne->kpne_shutdown); /* caller must have issued shutdown already */
LASSERT (atomic_read (&ne->kpne_refcount) == 0); /* can't be busy */
PORTAL_MODULE_UNUSE;
}
+/* Return non-zero if gateway 'ge1' should be preferred over 'ge2' for
+ * load balancing: i.e. ge1's accumulated weight is "before" ge2's in
+ * the wrapping comparison below (smaller effective weight). */
+int
+kpr_ge_isbetter (kpr_gateway_entry_t *ge1, kpr_gateway_entry_t *ge2)
+{
+ const int significant_bits = 0x00ffffff;
+ /* We use atomic_t to record/compare route weights for
+ * load-balancing. Here we limit ourselves to only using
+ * 'significant_bits' when we do an 'after' comparison */
+
+ int diff = (atomic_read (&ge1->kpge_weight) -
+ atomic_read (&ge2->kpge_weight)) & significant_bits;
+ int rc = (diff > (significant_bits >> 1));
+
+ /* NOTE(review): masked-difference comparison is robust to weight
+ * wrap-around, like time_after() — rc set when ge1 is lighter */
+ CDEBUG(D_NET, "[%p]"LPX64"=%d %s [%p]"LPX64"=%d\n",
+ ge1, ge1->kpge_nid, atomic_read (&ge1->kpge_weight),
+ rc ? ">" : "<",
+ ge2, ge2->kpge_nid, atomic_read (&ge2->kpge_weight));
+
+ return (rc);
+}
+
+/* Charge gateway 'ge' for forwarding a payload of 'nob' bytes by
+ * adding a scaled weight, making it less favoured in subsequent
+ * kpr_ge_isbetter() comparisons. */
+void
+kpr_update_weight (kpr_gateway_entry_t *ge, int nob)
+{
+ int weight = 1 + (nob + sizeof (ptl_hdr_t)/2)/sizeof (ptl_hdr_t);
+
+ /* We've chosen this route entry (i.e. gateway) to forward payload
+ * of length 'nob'; update the route's weight to make it less
+ * favoured. Note that the weight is 1 plus the payload size
+ * rounded and scaled to the portals header size, so we get better
+ * use of the significant bits in kpge_weight. */
+
+ CDEBUG(D_NET, "gateway [%p]"LPX64" += %d\n", ge,
+ ge->kpge_nid, weight);
+
+ atomic_add (weight, &ge->kpge_weight);
+}
int
-kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp)
+kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob,
+ ptl_nid_t *gateway_nidp)
{
- kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
- struct list_head *e;
- int rc = -ENOENT;
+ kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg;
+ struct list_head *e;
+ kpr_route_entry_t *re;
+ kpr_gateway_entry_t *ge = NULL;
+ int rc = -ENOENT;
+
+ /* Caller wants to know if 'target_nid' can be reached via a gateway
+ * ON HER OWN NETWORK */
- CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d\n", target_nid, ne->kpne_interface.kprni_nalid);
+ CDEBUG (D_NET, "lookup "LPX64" from NAL %d\n", target_nid,
+ ne->kpne_interface.kprni_nalid);
if (ne->kpne_shutdown) /* caller is shutting down */
return (-ENOENT);
/* Search routes for one that has a gateway to target_nid on the callers network */
- for (e = kpr_routes.next; e != &kpr_routes; e = e->next)
- {
- kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list);
+ list_for_each (e, &kpr_routes) {
+ re = list_entry (e, kpr_route_entry_t, kpre_list);
if (re->kpre_lo_nid > target_nid ||
re->kpre_hi_nid < target_nid)
/* found table entry */
- if (re->kpre_gateway_nalid != ne->kpne_interface.kprni_nalid) /* different NAL */
- rc = -EHOSTUNREACH;
- else
- {
- rc = 0;
- *gateway_nidp = re->kpre_gateway_nid;
- }
- break;
+ if (re->kpre_gateway->kpge_nalid != ne->kpne_interface.kprni_nalid ||
+ !re->kpre_gateway->kpge_alive) {
+ /* different NAL or gateway down */
+ rc = -EHOSTUNREACH;
+ continue;
+ }
+
+ if (ge == NULL ||
+ kpr_ge_isbetter (re->kpre_gateway, ge))
+ ge = re->kpre_gateway;
}
+ if (ge != NULL) {
+ kpr_update_weight (ge, nob);
+ *gateway_nidp = ge->kpge_nid;
+ rc = 0;
+ }
+
read_unlock (&kpr_rwlock);
- CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d: %d ("LPX64")\n",
+ /* NB can't deref 're' now; it might have been removed! */
+
+ CDEBUG (D_NET, "lookup "LPX64" from NAL %d: %d ("LPX64")\n",
target_nid, ne->kpne_interface.kprni_nalid, rc,
(rc == 0) ? *gateway_nidp : (ptl_nid_t)0);
return (rc);
}
+/* Find the registered NAL entry with id 'nal_id', or NULL if none.
+ * Caller must hold kpr_rwlock (read or write). */
+kpr_nal_entry_t *
+kpr_find_nal_entry_locked (int nal_id)
+{
+ struct list_head *e;
+
+ /* Called with kpr_rwlock held */
+
+ list_for_each (e, &kpr_nals) {
+ kpr_nal_entry_t *ne = list_entry (e, kpr_nal_entry_t, kpne_list);
+
+ if (nal_id != ne->kpne_interface.kprni_nalid) /* no match */
+ continue;
+
+ return (ne);
+ }
+
+ return (NULL);
+}
+
void
kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd)
{
- kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg;
- ptl_nid_t target_nid = fwd->kprfd_target_nid;
- int nob = fwd->kprfd_nob;
- struct list_head *e;
-
- CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d\n", fwd,
+ kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg;
+ ptl_nid_t target_nid = fwd->kprfd_target_nid;
+ int nob = fwd->kprfd_nob;
+ kpr_gateway_entry_t *ge = NULL;
+ kpr_nal_entry_t *dst_ne = NULL;
+ struct list_head *e;
+ kpr_route_entry_t *re;
+ kpr_nal_entry_t *tmp_ne;
+
+ CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %d\n", fwd,
target_nid, src_ne->kpne_interface.kprni_nalid);
LASSERT (nob >= sizeof (ptl_hdr_t)); /* at least got a packet header */
/* Search routes for one that has a gateway to target_nid NOT on the caller's network */
- for (e = kpr_routes.next; e != &kpr_routes; e = e->next)
- {
- kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list);
+ list_for_each (e, &kpr_routes) {
+ re = list_entry (e, kpr_route_entry_t, kpre_list);
if (re->kpre_lo_nid > target_nid || /* no match */
re->kpre_hi_nid < target_nid)
continue;
- CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: match "LPX64" on NAL %d\n", fwd,
- target_nid, src_ne->kpne_interface.kprni_nalid,
- re->kpre_gateway_nid, re->kpre_gateway_nalid);
+ if (re->kpre_gateway->kpge_nalid == src_ne->kpne_interface.kprni_nalid)
+ continue; /* don't route to same NAL */
- if (re->kpre_gateway_nalid == src_ne->kpne_interface.kprni_nalid)
- break; /* don't route to same NAL */
+ if (!re->kpre_gateway->kpge_alive)
+ continue; /* gateway is dead */
+
+ tmp_ne = kpr_find_nal_entry_locked (re->kpre_gateway->kpge_nalid);
- /* Search for gateway's NAL's entry */
-
- for (e = kpr_nals.next; e != &kpr_nals; e = e->next)
- {
- kpr_nal_entry_t *dst_ne = list_entry (e, kpr_nal_entry_t, kpne_list);
-
- if (re->kpre_gateway_nalid != dst_ne->kpne_interface.kprni_nalid) /* no match */
- continue;
+ if (tmp_ne == NULL ||
+ tmp_ne->kpne_shutdown) {
+ /* NAL must be registered and not shutting down */
+ continue;
+ }
- if (dst_ne->kpne_shutdown) /* don't route if NAL is shutting down */
- break;
+ if (ge == NULL ||
+ kpr_ge_isbetter (re->kpre_gateway, ge)) {
+ ge = re->kpre_gateway;
+ dst_ne = tmp_ne;
+ }
+ }
+
+ if (ge != NULL) {
+ LASSERT (dst_ne != NULL);
+
+ kpr_update_weight (ge, nob);
- fwd->kprfd_gateway_nid = re->kpre_gateway_nid;
- atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */
+ fwd->kprfd_gateway_nid = ge->kpge_nid;
+ atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */
- read_unlock (&kpr_rwlock);
+ read_unlock (&kpr_rwlock);
- CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: "LPX64" on NAL %d\n", fwd,
- target_nid, src_ne->kpne_interface.kprni_nalid,
- fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid);
+ CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %d: "
+ "to "LPX64" on NAL %d\n",
+ fwd, target_nid, src_ne->kpne_interface.kprni_nalid,
+ fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid);
- dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd);
- return;
- }
- break;
+ dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd);
+ return;
}
- read_unlock (&kpr_rwlock);
+ read_unlock (&kpr_rwlock);
out:
kpr_fwd_errors++;
- CDEBUG (D_OTHER, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd,
+ CDEBUG (D_NET, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd,
target_nid, src_ne->kpne_interface.kprni_nalid);
/* Can't find anywhere to forward to */
kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg;
kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg;
- CDEBUG (D_OTHER, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd,
+ CDEBUG (D_NET, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd,
src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error);
atomic_dec (&dst_ne->kpne_refcount); /* CAVEAT EMPTOR dst_ne can disappear now!!! */
(fwd->kprfd_callback)(fwd->kprfd_callback_arg, error);
- CDEBUG (D_OTHER, "complete(2) [%p] from NAL %d: %d\n", fwd,
+ CDEBUG (D_NET, "complete(2) [%p] from NAL %d: %d\n", fwd,
src_ne->kpne_interface.kprni_nalid, error);
atomic_dec (&kpr_queue_depth);
}
int
-kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid,
- ptl_nid_t hi_nid)
+kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid,
+ ptl_nid_t lo_nid, ptl_nid_t hi_nid)
{
- unsigned long flags;
- struct list_head *e;
- kpr_route_entry_t *re;
+ unsigned long flags;
+ struct list_head *e;
+ kpr_route_entry_t *re;
+ kpr_gateway_entry_t *ge;
+ int dup = 0;
- CDEBUG(D_OTHER, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n",
+ CDEBUG(D_NET, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n",
gateway_nalid, gateway_nid, lo_nid, hi_nid);
- LASSERT(lo_nid <= hi_nid);
+ if (gateway_nalid == PTL_NID_ANY ||
+ lo_nid == PTL_NID_ANY ||
+ hi_nid == PTL_NID_ANY ||
+ lo_nid > hi_nid)
+ return (-EINVAL);
+
+ PORTAL_ALLOC (ge, sizeof (*ge));
+ if (ge == NULL)
+ return (-ENOMEM);
+
+ ge->kpge_nalid = gateway_nalid;
+ ge->kpge_nid = gateway_nid;
+ ge->kpge_alive = 1;
+ ge->kpge_timestamp = 0;
+ ge->kpge_refcount = 0;
+ atomic_set (&ge->kpge_weight, 0);
PORTAL_ALLOC (re, sizeof (*re));
if (re == NULL)
return (-ENOMEM);
- re->kpre_gateway_nalid = gateway_nalid;
- re->kpre_gateway_nid = gateway_nid;
re->kpre_lo_nid = lo_nid;
re->kpre_hi_nid = hi_nid;
LASSERT(!in_interrupt());
write_lock_irqsave (&kpr_rwlock, flags);
- for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
- kpr_route_entry_t *re2 = list_entry(e, kpr_route_entry_t,
- kpre_list);
-
- if (re->kpre_lo_nid > re2->kpre_hi_nid ||
- re->kpre_hi_nid < re2->kpre_lo_nid)
- continue;
+ list_for_each (e, &kpr_gateways) {
+ kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t,
+ kpge_list);
+
+ if (ge2->kpge_nalid == gateway_nalid &&
+ ge2->kpge_nid == gateway_nid) {
+ PORTAL_FREE (ge, sizeof (*ge));
+ ge = ge2;
+ dup = 1;
+ break;
+ }
+ }
- CERROR ("Attempt to add duplicate routes ["LPX64" - "LPX64"]"
- "to ["LPX64" - "LPX64"]\n",
- re->kpre_lo_nid, re->kpre_hi_nid,
- re2->kpre_lo_nid, re2->kpre_hi_nid);
+ if (!dup) {
+ /* Adding a new gateway... */
+
+ list_add (&ge->kpge_list, &kpr_gateways);
- write_unlock_irqrestore (&kpr_rwlock, flags);
+ /* ...zero all gateway weights so this one doesn't have to
+ * play catch-up */
- PORTAL_FREE (re, sizeof (*re));
- return (-EINVAL);
+ list_for_each (e, &kpr_gateways) {
+ kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t,
+ kpge_list);
+ atomic_set (&ge2->kpge_weight, 0);
+ }
+
}
+ re->kpre_gateway = ge;
+ ge->kpge_refcount++;
list_add (&re->kpre_list, &kpr_routes);
write_unlock_irqrestore (&kpr_rwlock, flags);
}
int
-kpr_del_route (ptl_nid_t nid)
+kpr_sys_notify (int gateway_nalid, ptl_nid_t gateway_nid,
+ int alive, time_t when)
{
+ return (kpr_do_notify (0, gateway_nalid, gateway_nid, alive, when));
+}
+
+int
+kpr_del_route (int gw_nalid, ptl_nid_t gw_nid,
+ ptl_nid_t lo, ptl_nid_t hi)
+{
+ int specific = (lo != PTL_NID_ANY);
unsigned long flags;
+ int rc = -ENOENT;
struct list_head *e;
+ struct list_head *n;
- CDEBUG(D_OTHER, "Del route "LPX64"\n", nid);
+ CDEBUG(D_NET, "Del route [%d] "LPX64" : "LPX64" - "LPX64"\n",
+ gw_nalid, gw_nid, lo, hi);
LASSERT(!in_interrupt());
+
+ /* NB Caller may specify either all routes via the given gateway
+ * (lo/hi == PTL_NID_ANY) or a specific route entry (lo/hi are
+ * actual NIDs) */
+
+ if (specific ? (hi == PTL_NID_ANY || hi < lo) : (hi != PTL_NID_ANY))
+ return (-EINVAL);
+
write_lock_irqsave(&kpr_rwlock, flags);
- for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
- kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t,
+ list_for_each_safe (e, n, &kpr_routes) {
+ kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t,
kpre_list);
-
- if (re->kpre_lo_nid > nid || re->kpre_hi_nid < nid)
+ kpr_gateway_entry_t *ge = re->kpre_gateway;
+
+ if (ge->kpge_nalid != gw_nalid ||
+ ge->kpge_nid != gw_nid ||
+ (specific &&
+ (lo != re->kpre_lo_nid || hi != re->kpre_hi_nid)))
continue;
- list_del (&re->kpre_list);
- write_unlock_irqrestore(&kpr_rwlock, flags);
+ rc = 0;
+ if (--ge->kpge_refcount == 0) {
+ list_del (&ge->kpge_list);
+ PORTAL_FREE (ge, sizeof (*ge));
+ }
+
+ list_del (&re->kpre_list);
PORTAL_FREE(re, sizeof (*re));
- return (0);
+
+ if (specific)
+ break;
}
write_unlock_irqrestore(&kpr_rwlock, flags);
- return (-ENOENT);
+ return (rc);
}
int
-kpr_get_route(int idx, int *gateway_nalid, ptl_nid_t *gateway_nid,
- ptl_nid_t *lo_nid, ptl_nid_t *hi_nid)
+kpr_get_route (int idx, int *gateway_nalid, ptl_nid_t *gateway_nid,
+ ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, int *alive)
{
struct list_head *e;
read_lock(&kpr_rwlock);
for (e = kpr_routes.next; e != &kpr_routes; e = e->next) {
- kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t,
- kpre_list);
-
+ kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t,
+ kpre_list);
+ kpr_gateway_entry_t *ge = re->kpre_gateway;
+
if (idx-- == 0) {
- *gateway_nalid = re->kpre_gateway_nalid;
- *gateway_nid = re->kpre_gateway_nid;
+ *gateway_nalid = ge->kpge_nalid;
+ *gateway_nid = ge->kpge_nid;
+ *alive = ge->kpge_alive;
*lo_nid = re->kpre_lo_nid;
*hi_nid = re->kpre_hi_nid;
typedef struct
{
+ struct list_head kpge_list;
+ atomic_t kpge_weight;
+ time_t kpge_timestamp;
+ int kpge_alive;
+ int kpge_nalid;
+ int kpge_refcount;
+ ptl_nid_t kpge_nid;
+} kpr_gateway_entry_t;
+
+typedef struct
+{
struct list_head kpre_list;
- int kpre_gateway_nalid;
- ptl_nid_t kpre_gateway_nid;
+ kpr_gateway_entry_t *kpre_gateway;
ptl_nid_t kpre_lo_nid;
ptl_nid_t kpre_hi_nid;
} kpr_route_entry_t;
+typedef struct
+{
+ struct tq_struct kpru_tq;
+ int kpru_nal_id;
+ ptl_nid_t kpru_nid;
+ int kpru_alive;
+ time_t kpru_when;
+} kpr_upcall_t;
+
extern int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp);
-extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp);
+extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob,
+ ptl_nid_t *gateway_nidp);
+extern kpr_nal_entry_t *kpr_find_nal_entry_locked (int nal_id);
extern void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd);
extern void kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error);
+extern void kpr_nal_notify (void *arg, ptl_nid_t peer,
+ int alive, time_t when);
extern void kpr_shutdown_nal (void *arg);
extern void kpr_deregister_nal (void *arg);
extern int kpr_add_route (int gateway_nal, ptl_nid_t gateway_nid,
ptl_nid_t lo_nid, ptl_nid_t hi_nid);
-extern int kpr_del_route (ptl_nid_t nid);
+extern int kpr_del_route (int gw_nal, ptl_nid_t gw_nid,
+ ptl_nid_t lo, ptl_nid_t hi);
extern int kpr_get_route (int idx, int *gateway_nal, ptl_nid_t *gateway_nid,
- ptl_nid_t *lo_nid, ptl_nid_t *hi_nid);
+ ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, int *alive);
+extern int kpr_sys_notify (int gw_nalid, ptl_nid_t gw_nid,
+ int alive, time_t when);
extern unsigned long long kpr_fwd_bytes;
extern unsigned long kpr_fwd_packets;
magic = *(int *)(ev->mem_desc.start + ev->offset);
if(magic != 0xcafebabe) {
- printk ("Unexpected response \n");
+ printk ("LustreError: Unexpected response \n");
return 1;
}
if((i == count) || !count)
wake_up_process (client->tsk);
else
- printk ("Received response after timeout for %d\n",i);
+ printk ("LustreError: Received response after timeout for %d\n",i);
return 1;
}
pingcli_shutdown (1);
return NULL;
}
- printk ("sent msg no %d", count);
+ printk ("Lustre: sent msg no %d", count);
set_current_state (TASK_INTERRUPTIBLE);
rc = schedule_timeout (20 * args->ioc_timeout);
if (rc == 0) {
- printk (" :: timeout .....\n");
+ printk ("LustreError: :: timeout .....\n");
} else {
do_gettimeofday (&tv2);
- printk(" :: Reply in %u usec\n",
+ printk("Lustre: :: Reply in %u usec\n",
(unsigned)((tv2.tv_sec - tv1.tv_sec)
* 1000000 + (tv2.tv_usec - tv1.tv_usec)));
}
if(magic != 0xdeadbeef) {
- printk("Unexpected Packet to the server\n");
+ printk("LustreError: Unexpected Packet to the server\n");
}
memcpy (server->in_buf, &ping_bulk_magic, sizeof(ping_bulk_magic));
}
server->evnt = *ev;
- printk ("received ping from nid "LPX64" "
+ printk ("Lustre: received ping from nid "LPX64" "
"(off=%u rlen=%u mlen=%u head=%x seq=%d size=%d)\n",
ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
*((int *)(ev->mem_desc.start + ev->offset)),
set_current_state (TASK_INTERRUPTIBLE);
rc = schedule_timeout (20 * args->ioc_timeout);
if (rc == 0) {
- printk (" Time out on the server\n");
+ printk ("LustreError: Time out on the server\n");
pingcli_shutdown (2);
return NULL;
} else
- printk("Received respose from the server \n");
+ printk("Lustre: Received respose from the server \n");
pingcli_shutdown (2);
}
server->evnt = *ev;
- printk ("received ping from nid "LPX64" "
+ printk ("Lustre: received ping from nid "LPX64" "
"(off=%u rlen=%u mlen=%u head=%x)\n",
ev->initiator.nid, ev->offset, ev->rlength, ev->mlength,
*((int *)(ev->mem_desc.start + ev->offset)));
/sbin/insmod ./$PING
echo kqswnal > /tmp/nal
;;
+
+ gm)
+ /sbin/insmod portals
+ /sbin/insmod kgmnal
+ /sbin/insmod ./$PING
+ echo kgmnal > /tmp/nal
+ ;;
*)
- echo "Usage : ${0} < tcp | toe | elan >"
+ echo "Usage : ${0} < tcp | toe | elan | gm>"
exit 1;
esac
exit 0;
/sbin/insmod ./$PING nal=4
echo kqswnal > /tmp/nal
;;
+
+ gm)
+ /sbin/insmod portals
+ /sbin/insmod kgmnal
+ /sbin/insmod ./$PING nal=3
+ echo kgmnal > /tmp/nal
+ ;;
*)
- echo "Usage : ${0} < tcp | toe | elan >"
+ echo "Usage : ${0} < tcp | toe | elan | gm>"
exit 1;
esac
../utils/acceptor 9999&
.deps
routerstat
wirecheck
+gmnalnid
.*.cmd
if LIBLUSTRE
sbin_PROGRAMS = ptlctl debugctl routerstat wirecheck
else
-sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck
+sbin_PROGRAMS = acceptor ptlctl debugctl routerstat wirecheck gmnalnid
endif
lib_LIBRARIES = libptlctl.a
libptlctl_a_SOURCES = portals.c debug.c l_ioctl.c parser.c parser.h
+gmnalnid_SOURCES = gmnalnid.c
+
ptlctl_SOURCES = ptlctl.c
ptlctl_LDADD = -L. -lptlctl -lncurses # -lefence
ptlctl_DEPENDENCIES = libptlctl.a
if (!strcasecmp (str, "no") ||
!strcasecmp (str, "n") ||
!strcasecmp (str, "off") ||
+ !strcasecmp (str, "down") ||
!strcasecmp (str, "disable"))
{
*b = 0;
if (!strcasecmp (str, "yes") ||
!strcasecmp (str, "y") ||
!strcasecmp (str, "on") ||
+ !strcasecmp (str, "up") ||
!strcasecmp (str, "enable"))
{
*b = 1;
unsigned int portal_debug;
unsigned int portal_printk;
unsigned int portal_stack;
-
+unsigned int portal_cerror;
static unsigned int g_nal = 0;
} name2num_t;
static name2num_t nalnames[] = {
+ {"any", 0},
{"tcp", SOCKNAL},
{"toe", TOENAL},
{"elan", QSWNAL},
{
name2num_t *e = name2num_lookup_name (nalnames, str);
- return ((e == NULL) ? 0 : e->num);
+ return ((e == NULL) ? -1 : e->num);
}
static char *
}
+/* Parse 'str' as a TCP port number (base 0: decimal, 0x hex or 0 octal
+ * accepted).  On success store it in *port and return 0; return -1 if
+ * the string has trailing junk or the value is outside 1..65535. */
int
+ptl_parse_port (int *port, char *str)
+{
+ char *end;
+
+ *port = strtol (str, &end, 0);
+
+ if (*end == 0 && /* parsed whole string */
+ *port > 0 && *port < 65536) /* minimal sanity check */
+ return (0);
+
+ return (-1);
+}
+
+/* Parse 'str' as a time: either raw seconds-since-epoch (any strtol
+ * base-0 integer) or a local-time "YYYY-MM-DD HH:MM:SS" timestamp
+ * converted with mktime().  Store the result in *t and return 0, or
+ * return -1 on parse failure. */
+int
+ptl_parse_time (time_t *t, char *str)
+{
+ char *end;
+ int n;
+ struct tm tm;
+
+ *t = strtol (str, &end, 0);
+ if (*end == 0) /* parsed whole string */
+ return (0);
+
+ memset (&tm, 0, sizeof (tm));
+ n = sscanf (str, "%d-%d-%d %d:%d:%d",
+ &tm.tm_year, &tm.tm_mon, &tm.tm_mday,
+ &tm.tm_hour, &tm.tm_min, &tm.tm_sec);
+ if (n != 6)
+ return (-1);
+
+ tm.tm_mon--; /* convert to 0 == Jan */
+ tm.tm_year -= 1900; /* y2k quirk */
+ tm.tm_isdst = -1; /* dunno if it's daylight savings... */
+
+ *t = mktime (&tm);
+ if (*t == (time_t)-1)
+ return (-1);
+
+ return (0);
+}
+
+int
ptl_parse_ipaddr (__u32 *ipaddrp, char *str)
{
struct hostent *he;
int
ptl_parse_nid (ptl_nid_t *nidp, char *str)
{
- __u32 ipaddr;
- long lval;
+ __u32 ipaddr;
+ char *end;
+ unsigned long long ullval;
if (!strcmp (str, "_all_")) {
*nidp = PTL_NID_ANY;
return (0);
}
- if (sscanf (str, "%li", &lval) == 1)
- {
- *nidp = (ptl_nid_t)lval;
- return (0);
- }
-
- if (sscanf (str, "%lx", &lval) == 1)
- {
- *nidp = (ptl_nid_t)lval;
+ ullval = strtoull(str, &end, 0);
+ if (*end == 0) {
+ /* parsed whole string */
+ *nidp = (ptl_nid_t)ullval;
return (0);
}
if (he != NULL)
strcpy (buffer, he->h_name);
else
- sprintf (buffer, "0x"LPX64, nid);
+ sprintf (buffer, LPX64, nid);
return (buffer);
}
-int g_nal_is_compatible (char *cmd, ...)
+int g_nal_is_set ()
{
- va_list ap;
- int nal;
-
if (g_nal == 0) {
fprintf (stderr, "Error: you must run the 'network' command first.\n");
return (0);
}
-
+
+ return (1);
+}
+
+int g_nal_is_compatible (char *cmd, ...)
+{
+ va_list ap;
+ int nal;
+
+ if (!g_nal_is_set ())
+ return (0);
+
va_start (ap, cmd);
do {
if (g_nal == nal)
return (1);
-
- fprintf (stderr, "Command %s not compatible with nal %s\n",
- cmd, nal2name (g_nal));
+
+ if (cmd != NULL) {
+ /* Don't complain verbosely if we've not been passed a command
+ * name to complain about! */
+ fprintf (stderr, "Command %s not compatible with nal %s\n",
+ cmd, nal2name (g_nal));
+ }
return (0);
}
int nal;
if (argc == 2 &&
- (nal = ptl_name2nal (argv[1])) != 0) {
+ (nal = ptl_name2nal (argv[1])) >= 0) {
g_nal = nal;
return (0);
}
if (rc != 0)
break;
- printf (LPX64"@%s:%d #%d buffer %d nonagle %s xchg %s affinity %s share %d\n",
+ printf (LPX64"@%s:%d #%d buffer %d nonagle %s xchg %s "
+ "affinity %s eager %s share %d\n",
data.ioc_nid, ptl_ipaddr_2_str (data.ioc_id, buffer),
data.ioc_misc, data.ioc_count, data.ioc_size,
(data.ioc_flags & 1) ? "on" : "off",
(data.ioc_flags & 2) ? "on" : "off",
(data.ioc_flags & 4) ? "on" : "off",
+ (data.ioc_flags & 8) ? "on" : "off",
data.ioc_wait);
}
int xchange_nids = 0;
int irq_affinity = 0;
int share = 0;
+ int eager = 0;
int rc;
if (argc < 4 || argc > 5) {
- fprintf (stderr, "usage: %s nid ipaddr port [ixs]\n", argv[0]);
+ fprintf (stderr, "usage: %s nid ipaddr port [ixse]\n", argv[0]);
return 0;
}
return -1;
}
- port = atol (argv[3]);
-
+ if (ptl_parse_port (&port, argv[3]) != 0) {
+ fprintf (stderr, "Can't parse port: %s\n", argv[3]);
+ return -1;
+ }
+
if (argc > 4) {
char *opts = argv[4];
case 's':
share = 1;
break;
+ case 'e':
+ eager = 1;
+ break;
default:
fprintf (stderr, "Can't parse options: %s\n",
argv[4]);
data.ioc_misc = port;
/* only passing one buffer size! */
data.ioc_size = MAX (g_socket_rxmem, g_socket_txmem);
- data.ioc_flags = (g_socket_nonagle ? 1 : 0) |
- (xchange_nids ? 2 : 0) |
- (irq_affinity ? 4 : 0) |
- (share ? 8 : 0);
+ data.ioc_flags = (g_socket_nonagle ? 0x01 : 0) |
+ (xchange_nids ? 0x02 : 0) |
+ (irq_affinity ? 0x04 : 0) |
+ (share ? 0x08 : 0) |
+ (eager ? 0x10 : 0);
rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data);
if (rc != 0) {
if (rc != 0)
break;
- printf (LPD64"@%s:%d\n",
+ printf (LPX64"@%s:%d\n",
data.ioc_nid,
ptl_ipaddr_2_str (data.ioc_id, buffer),
data.ioc_misc);
return -1;
}
- port = atol(argv[2]);
+ if (ptl_parse_port (&port, argv[2]) != 0) {
+ fprintf (stderr, "Can't parse port: %s\n", argv[2]);
+ return -1;
+ }
+
if (argc > 3)
for (flag = argv[3]; *flag != 0; flag++)
switch (*flag)
return 0;
}
- if (!g_nal_is_compatible (argv[0], SOCKNAL, TOENAL, 0))
- return -1;
+ if (!g_nal_is_compatible (NULL, SOCKNAL, TOENAL, 0))
+ return 0;
if (argc >= 2 &&
ptl_parse_nid (&nid, argv[1]) != 0) {
return 0;
}
- if (g_nal == 0) {
- fprintf(stderr, "Error: you must run the 'network' command "
- "first.\n");
+ if (!g_nal_is_set())
return -1;
- }
if (ptl_parse_nid (&nid, argv[1]) != 0)
{
return 0;
}
- if (g_nal == 0) {
- fprintf(stderr, "Error: you must run the 'network' command first\n");
+ if (!g_nal_is_set())
return -1;
- }
-
+
PORTAL_IOC_INIT (data);
data.ioc_nal = g_nal;
rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data);
return 0;
}
- if (g_nal == 0) {
- fprintf(stderr, "Error: you must run the 'network' command "
- "first.\n");
+ if (!g_nal_is_set())
return -1;
- }
if (argc >= 2)
nidstr = argv[1];
return (0);
}
- if (g_nal == 0) {
- fprintf(stderr, "Error: you must run the 'network' command "
- "first.\n");
+ if (!g_nal_is_set())
return (-1);
- }
if (!strcmp (argv[1], "_all_"))
nid = PTL_NID_ANY;
if (Parser_bool (&enable, argv[1]) != 0)
{
fprintf (stderr, "Can't parse boolean %s\n", argv[1]);
- return (0);
+ return (-1);
}
g_socket_nonagle = !enable;
}
return (0);
}
- if (g_nal == 0) {
- fprintf(stderr, "Error: you must run the 'network' command "
- "first.\n");
+ if (!g_nal_is_set())
return (-1);
- }
if (ptl_parse_nid (&gateway_nid, argv[1]) != 0)
{
{
struct portal_ioctl_data data;
ptl_nid_t nid;
+ ptl_nid_t nid1 = PTL_NID_ANY;
+ ptl_nid_t nid2 = PTL_NID_ANY;
int rc;
if (argc < 2)
return (0);
}
+ if (!g_nal_is_set())
+ return (-1);
+
if (ptl_parse_nid (&nid, argv[1]) != 0)
{
- fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]);
+ fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]);
return (-1);
}
+ if (argc >= 3 &&
+ ptl_parse_nid (&nid1, argv[2]) != 0)
+ {
+ fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[2]);
+ return (-1);
+ }
+
+ if (argc < 4) {
+ nid2 = nid1;
+ } else {
+ if (ptl_parse_nid (&nid2, argv[3]) != 0) {
+ fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[3]);
+ return (-1);
+ }
+
+ if (nid1 > nid2) {
+ ptl_nid_t tmp = nid1;
+
+ nid1 = nid2;
+ nid2 = tmp;
+ }
+ }
+
PORTAL_IOC_INIT(data);
+ data.ioc_nal = g_nal;
data.ioc_nid = nid;
+ data.ioc_nid2 = nid1;
+ data.ioc_nid3 = nid2;
rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_DEL_ROUTE, &data);
if (rc != 0)
}
int
+jt_ptl_notify_router (int argc, char **argv)
+{
+ struct portal_ioctl_data data;
+ int enable;
+ ptl_nid_t nid;
+ int rc;
+ struct timeval now;
+ time_t when;
+
+ if (argc < 3)
+ {
+ fprintf (stderr, "usage: %s targetNID <up/down> [<time>]\n",
+ argv[0]);
+ return (0);
+ }
+
+ if (ptl_parse_nid (&nid, argv[1]) != 0)
+ {
+ fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]);
+ return (-1);
+ }
+
+ if (Parser_bool (&enable, argv[2]) != 0) {
+ fprintf (stderr, "Can't parse boolean %s\n", argv[2]);
+ return (-1);
+ }
+
+ gettimeofday(&now, NULL);
+
+ if (argc < 4) {
+ when = now.tv_sec;
+ } else if (ptl_parse_time (&when, argv[3]) != 0) {
+ fprintf(stderr, "Can't parse time %s\n"
+ "Please specify either 'YYYY-MM-DD HH:MM:SS'\n"
+ "or an absolute unix time in seconds\n", argv[3]);
+ return (-1);
+ } else if (when > now.tv_sec) {
+ fprintf (stderr, "%s specifies a time in the future\n",
+ argv[3]);
+ return (-1);
+ }
+
+ PORTAL_IOC_INIT(data);
+ data.ioc_nal = g_nal;
+ data.ioc_nid = nid;
+ data.ioc_flags = enable;
+ /* Yeuch; 'cept I need a __u64 on 64 bit machines... */
+ data.ioc_nid3 = (__u64)when;
+
+ rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NOTIFY_ROUTER, &data);
+ if (rc != 0)
+ {
+ fprintf (stderr, "IOC_PORTAL_NOTIFY_ROUTER ("LPX64") failed: %s\n",
+ nid, strerror (errno));
+ return (-1);
+ }
+
+ return (0);
+}
+
+int
jt_ptl_print_routes (int argc, char **argv)
{
char buffer[3][128];
ptl_nid_t gateway_nid;
ptl_nid_t nid1;
ptl_nid_t nid2;
-
-
+ int alive;
+
for (index = 0;;index++)
{
PORTAL_IOC_INIT(data);
gateway_nid = data.ioc_nid;
nid1 = data.ioc_nid2;
nid2 = data.ioc_nid3;
-
- printf ("%8s %18s : %s - %s\n",
+ alive = data.ioc_flags;
+
+ printf ("%8s %18s : %s - %s, %s\n",
nal2name (gateway_nal),
ptl_nid2str (buffer[0], gateway_nid),
ptl_nid2str (buffer[1], nid1),
- ptl_nid2str (buffer[2], nid2));
+ ptl_nid2str (buffer[2], nid2),
+ alive ? "up" : "down");
}
return (0);
}
+static int
+lwt_control(int enable, int clear)
+{
+ struct portal_ioctl_data data;
+ int rc;
+
+ PORTAL_IOC_INIT(data);
+ data.ioc_flags = enable;
+ data.ioc_misc = clear;
+
+ rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_CONTROL, &data);
+ if (rc == 0)
+ return (0);
+
+ fprintf(stderr, "IOC_PORTAL_LWT_CONTROL failed: %s\n",
+ strerror(errno));
+ return (-1);
+}
+
+static int
+lwt_snapshot(int *ncpu, int *totalsize, lwt_event_t *events, int size)
+{
+ struct portal_ioctl_data data;
+ int rc;
+
+ PORTAL_IOC_INIT(data);
+ data.ioc_pbuf1 = (char *)events;
+ data.ioc_plen1 = size;
+
+ rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_SNAPSHOT, &data);
+ if (rc != 0) {
+ fprintf(stderr, "IOC_PORTAL_LWT_SNAPSHOT failed: %s\n",
+ strerror(errno));
+ return (-1);
+ }
+
+ LASSERT (data.ioc_count != 0);
+ LASSERT (data.ioc_misc != 0);
+
+ if (ncpu != NULL)
+ *ncpu = data.ioc_count;
+
+ if (totalsize != NULL)
+ *totalsize = data.ioc_misc;
+
+ return (0);
+}
+
+static char *
+lwt_get_string(char *kstr)
+{
+ char *ustr;
+ struct portal_ioctl_data data;
+ int size;
+ int rc;
+
+ /* FIXME: this could maintain a symbol table since we expect to be
+ * looking up the same strings all the time... */
+
+ PORTAL_IOC_INIT(data);
+ data.ioc_pbuf1 = kstr;
+ data.ioc_plen1 = 1; /* non-zero just to fool portal_ioctl_is_invalid() */
+ data.ioc_pbuf2 = NULL;
+ data.ioc_plen2 = 0;
+
+ rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_LOOKUP_STRING, &data);
+ if (rc != 0) {
+ fprintf(stderr, "IOC_PORTAL_LWT_LOOKUP_STRING failed: %s\n",
+ strerror(errno));
+ return (NULL);
+ }
+
+ size = data.ioc_count;
+ ustr = (char *)malloc(size);
+ if (ustr == NULL) {
+ fprintf(stderr, "Can't allocate string storage of size %d\n",
+ size);
+ return (NULL);
+ }
+
+ PORTAL_IOC_INIT(data);
+ data.ioc_pbuf1 = kstr;
+ data.ioc_plen1 = 1; /* non-zero just to fool portal_ioctl_is_invalid() */
+ data.ioc_pbuf2 = ustr;
+ data.ioc_plen2 = size;
+
+ rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_LOOKUP_STRING, &data);
+ if (rc != 0) {
+ fprintf(stderr, "IOC_PORTAL_LWT_LOOKUP_STRING failed: %s\n",
+ strerror(errno));
+ return (NULL);
+ }
+
+ LASSERT(strlen(ustr) == size - 1);
+ return (ustr);
+}
+
+static void
+lwt_put_string(char *ustr)
+{
+ free(ustr);
+}
+
+static int
+lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t *e)
+{
+ char whenstr[32];
+ char *where = lwt_get_string(e->lwte_where);
+
+ if (where == NULL)
+ return (-1);
+
+ sprintf(whenstr, LPD64, e->lwte_when - t0);
+
+ fprintf(f, "%#010lx %#010lx %#010lx %#010lx: %#010lx %1d %10.6f %10.2f %s\n",
+ e->lwte_p1, e->lwte_p2, e->lwte_p3, e->lwte_p4,
+ (long)e->lwte_task, cpu, (e->lwte_when - t0) / (mhz * 1000000.0),
+ (t0 == e->lwte_when) ? 0.0 : (e->lwte_when - tlast) / mhz,
+ where);
+
+ lwt_put_string(where);
+
+ return (0);
+}
+
+double
+get_cycles_per_usec ()
+{
+ FILE *f = fopen ("/proc/cpuinfo", "r");
+ double mhz;
+ char line[64];
+
+ if (f != NULL) {
+ while (fgets (line, sizeof (line), f) != NULL)
+ if (sscanf (line, "cpu MHz : %lf", &mhz) == 1) {
+ fclose (f);
+ return (mhz);
+ }
+ fclose (f);
+ }
+
+ fprintf (stderr, "Can't read/parse /proc/cpuinfo\n");
+ return (1000.0);
+}
+
+int
+jt_ptl_lwt(int argc, char **argv)
+{
+#define MAX_CPUS 8
+ int ncpus;
+ int totalspace;
+ int nevents_per_cpu;
+ lwt_event_t *events;
+ lwt_event_t *cpu_event[MAX_CPUS + 1];
+ lwt_event_t *next_event[MAX_CPUS];
+ lwt_event_t *first_event[MAX_CPUS];
+ int cpu;
+ lwt_event_t *e;
+ int rc;
+ int i;
+ double mhz;
+ cycles_t t0;
+ cycles_t tlast;
+ FILE *f = stdout;
+
+ if (argc < 2 ||
+ (strcmp(argv[1], "start") &&
+ strcmp(argv[1], "stop"))) {
+ fprintf(stderr,
+ "usage: %s start\n"
+ " %s stop [fname]\n", argv[0], argv[0]);
+ return (-1);
+ }
+
+ if (!strcmp(argv[1], "start")) {
+ /* disable */
+ if (lwt_control(0, 0) != 0)
+ return (-1);
+
+ /* clear */
+ if (lwt_control(0, 1) != 0)
+ return (-1);
+
+ /* enable */
+ if (lwt_control(1, 0) != 0)
+ return (-1);
+
+ return (0);
+ }
+
+ if (lwt_snapshot(&ncpus, &totalspace, NULL, 0) != 0)
+ return (-1);
+
+ if (ncpus > MAX_CPUS) {
+ fprintf(stderr, "Too many cpus: %d (%d)\n", ncpus, MAX_CPUS);
+ return (-1);
+ }
+
+ events = (lwt_event_t *)malloc(totalspace);
+ if (events == NULL) {
+ fprintf(stderr, "Can't allocate %d\n", totalspace);
+ return (-1);
+ }
+
+ if (lwt_control(0, 0) != 0) { /* disable */
+ free(events);
+ return (-1);
+ }
+
+ if (lwt_snapshot(NULL, NULL, events, totalspace)) {
+ free(events);
+ return (-1);
+ }
+
+ if (argc > 2) {
+ f = fopen (argv[2], "w");
+ if (f == NULL) {
+ fprintf(stderr, "Can't open %s for writing: %s\n", argv[2], strerror (errno));
+ free(events);
+ return (-1);
+ }
+ }
+
+ mhz = get_cycles_per_usec();
+
+ /* carve events into per-cpu slices */
+ nevents_per_cpu = totalspace / (ncpus * sizeof(lwt_event_t));
+ for (cpu = 0; cpu <= ncpus; cpu++)
+ cpu_event[cpu] = &events[cpu * nevents_per_cpu];
+
+ /* find the earliest event on each cpu */
+ for (cpu = 0; cpu < ncpus; cpu++) {
+ first_event[cpu] = NULL;
+
+ for (e = cpu_event[cpu]; e < cpu_event[cpu + 1]; e++) {
+
+ if (e->lwte_where == NULL) /* not an event */
+ continue;
+
+ if (first_event[cpu] == NULL ||
+ first_event[cpu]->lwte_when > e->lwte_when)
+ first_event[cpu] = e;
+ }
+
+ next_event[cpu] = first_event[cpu];
+ }
+
+ t0 = tlast = 0;
+ for (cpu = 0; cpu < ncpus; cpu++) {
+ e = first_event[cpu];
+ if (e == NULL) /* no events this cpu */
+ continue;
+
+ if (e == cpu_event[cpu])
+ e = cpu_event[cpu + 1] - 1;
+ else
+ e = e - 1;
+
+ /* If there's an event immediately before the first one, this
+ * cpu wrapped its event buffer */
+ if (e->lwte_where == NULL)
+ continue;
+
+ /* We should only start outputting events from the most recent
+ * first event in any wrapped cpu. Events before this time on
+ * other cpus won't have any events from this CPU to interleave
+ * with. */
+ if (t0 < first_event[cpu]->lwte_when)
+ t0 = first_event[cpu]->lwte_when;
+ }
+
+ for (;;) {
+ /* find which cpu has the next event */
+ cpu = -1;
+ for (i = 0; i < ncpus; i++) {
+
+ if (next_event[i] == NULL) /* this cpu exhausted */
+ continue;
+
+ if (cpu < 0 ||
+ next_event[i]->lwte_when < next_event[cpu]->lwte_when)
+ cpu = i;
+ }
+
+ if (cpu < 0) /* all cpus exhausted */
+ break;
+
+ if (t0 == 0) {
+                        /* no wrapped cpus and this is the first ever event */
+ t0 = next_event[cpu]->lwte_when;
+ }
+
+ if (t0 <= next_event[cpu]->lwte_when) {
+ /* on or after the first event */
+ rc = lwt_print(f, t0, tlast, mhz, cpu, next_event[cpu]);
+ if (rc != 0)
+ break;
+ }
+
+ tlast = next_event[cpu]->lwte_when;
+
+ next_event[cpu]++;
+ if (next_event[cpu] == cpu_event[cpu + 1])
+ next_event[cpu] = cpu_event[cpu];
+
+ if (next_event[cpu]->lwte_where == NULL ||
+ next_event[cpu] == first_event[cpu])
+ next_event[cpu] = NULL;
+ }
+
+ if (f != stdout)
+ fclose(f);
+
+ free(events);
+ return (0);
+#undef MAX_CPUS
+}
command_t list[] = {
{"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"},
{"print_autoconns", jt_ptl_print_autoconnects, 0, "print autoconnect entries (no args)"},
- {"add_autoconn", jt_ptl_add_autoconnect, 0, "add autoconnect entry (args: nid host [ixs])"},
+ {"add_autoconn", jt_ptl_add_autoconnect, 0, "add autoconnect entry (args: nid host [ixse])"},
{"del_autoconn", jt_ptl_del_autoconnect, 0, "delete autoconnect entry (args: [nid] [host] [ks])"},
{"print_conns", jt_ptl_print_connections, 0, "print connections (no args)"},
{"connect", jt_ptl_connect, 0, "connect to a remote nid (args: host port [xi])"},
{"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"},
{"shownid", jt_ptl_shownid, 0, "print the local NID"},
{"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"},
- {"add_route", jt_ptl_add_route, 0, "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"},
- {"del_route", jt_ptl_del_route, 0, "delete an entry from the routing table (args: targetNID"},
+ {"add_route", jt_ptl_add_route, 0,
+ "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"},
+ {"del_route", jt_ptl_del_route, 0,
+ "delete all routes via a gateway from the routing table (args: gatewayNID"},
+ {"set_route", jt_ptl_notify_router, 0,
+ "enable/disable a route in the routing table (args: gatewayNID up/down [time]"},
{"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"},
{"recv_mem", jt_ptl_rxmem, 0, "Set socket receive buffer size (args: [size])"},
{"send_mem", jt_ptl_txmem, 0, "Set socket send buffer size (args: [size])"},
--- /dev/null
+#!/bin/sh
+
+# option variables
+DESTDIR=
+KERNELDIR=
+PHASE=
+TARGET=
+JOBS=1
+CONFIGURE_FLAGS=
+
+# provided by target file
+KERNEL=
+SERIES=
+CONFIG=
+VERSION=
+EXTRA_VERSION=
+
+# flat-out globals
+TOPDIR=
+TARGET_FILE=
+KERNEL_FILE=
+SERIES_FILE=
+CONFIG_FILE=
+CC=${CC:-gcc}
+
+canon()
+{
+ pushd $1 >/dev/null
+ echo $PWD
+ popd >/dev/null
+}
+TOPDIR=$(canon "${0%%${0##*/}}/..")
+
+cleanup()
+{
+ true
+}
+
+fatal()
+{
+ cleanup
+ [ "$2" ] && echo
+ [ "$2" ] && echo "${0##*/}: $2"
+ exit $1
+}
+
+list_targets()
+{
+ echo -n "Available targets:"
+ for target in $TOPDIR/kernel_patches/targets/*.target ; do
+ target_file=${target##*/}
+ echo -n " ${target_file%%.target}"
+ done
+ echo
+}
+
+
+usage()
+{
+ cat <<EOF
+Usage: ${0##*/} [OPTION]... [-- <lustre configure options>]
+
+${0##*/} has two phases. The build phase, and the install phase. The
+phase is specified with the --phase option:
+
+ --phase=build/install
+
+Options not requiring a phase:
+
+ -h, --help
+ Display this message.
+
+Options appropriate with both phases include:
+
+ -j jobs
+ This works just like the -j option to make, and is passed to make
+ when building.
+
+ --target=TARGET
+ Name of the configuration to use. The available targets are
+ listed below.
+
+Options appropriate with the build phase are:
+
+ --kerneldir=KERNELDIR
+ Directory containing linux source tarballs.
+
+ --extraversion=EXTRAVERSION
+    Overrides the target kernel's EXTRAVERSION text.
+
+Options appropriate with the install phase are:
+
+ --destdir=DESTDIR
+ Root directory to install into (like DESTDIR with auto*).
+EOF
+ list_targets
+
+ fatal "$1" "$2"
+}
+
+check_options()
+{
+ [ "$PHASE" ] || usage 1 "A phase must be specified with --phase"
+ case "$PHASE" in
+ build)
+ [ "$KERNELDIR" ] || \
+ usage 1 "A kernel directory must be specified with --kerneldir."
+ [ -d "$KERNELDIR" ] || \
+ usage 1 "$KERNELDIR is not a directory."
+ ;;
+ install)
+ [ -z "$DESTDIR" -o -d "$DESTDIR" ] || \
+ usage 1 "$DESTDIR is not a directory."
+ ;;
+ *)
+ usage 1 "Phase must be build or install."
+ ;;
+ esac
+ [ "$TARGET" ] || usage 1 "A target must be specified with --target."
+ TARGET_FILE="$TOPDIR/kernel_patches/targets/$TARGET.target"
+ [ -r "$TARGET_FILE" ] || \
+ usage 1 "Target '$TARGET' was not found. Try --list-targets."
+
+ if [ -z "$JOBS" -o "$JOBS" -lt "1" ] ; then
+ JOBS=1
+ fi
+}
+
+get_lustre_version()
+{
+ for patch in $(<"$SERIES_FILE") ; do
+ if [ "${patch#lustre_version}" = "${patch}" ] ; then
+ continue
+ fi
+ awk '/^#define LUSTRE_VERSION_VERSION /{ print $3 }' \
+ < "$TOPDIR/kernel_patches/patches/$patch" 2>/dev/null
+ break
+ done
+}
+
+load_target()
+{
+ EXTRA_VERSION_save="$EXTRA_VERSION"
+
+ . "$TARGET_FILE"
+
+ [ "$KERNEL" ] || fatal 1 "Target $TARGET did not specify a kernel."
+ [ "$SERIES" ] || fatal 1 "Target $TARGET did not specify a patch series."
+ [ "$CONFIG" ] || fatal 1 "Target $TARGET did not specify a kernel config."
+ [ "$VERSION" ] || fatal 1 "Target $TARGET did not specify the kernel version."
+
+ if [ "$KERNELDIR" ] ; then
+ KERNEL_FILE="$KERNELDIR/$KERNEL"
+ [ -r "$KERNELDIR/$KERNEL" ] || \
+ fatal 1 "Target $TARGET's kernel file $KERNEL not found in kernel directory $KERNELDIR."
+ fi
+
+ SERIES_FILE="$TOPDIR/kernel_patches/series/$SERIES"
+ [ -r "$SERIES_FILE" ] || \
+ fatal 1 "Target $TARGET's series $SERIES missing from $TOPDIR/kernel_patches/series."
+
+ CONFIG_FILE="$TOPDIR/kernel_patches/kernel_configs/$CONFIG"
+ [ -r "$CONFIG_FILE" ] || \
+ fatal 1 "Target $TARGET's config file $CONFIG missing from $TOPDIR/kernel_patches/configs."
+
+ if [ "$EXTRA_VERSION_save" ] ; then
+ EXTRA_VERSION="$EXTRA_VERSION_save"
+ else
+ EXTRA_VERSION="${EXTRA_VERSION}_$(get_lustre_version)"
+ fi
+ EXTRA_VERSION=$(echo $EXTRA_VERSION | sed -e s/-/_/g)
+}
+
+tarflags()
+{
+ case "$1" in
+ '')
+ fatal 1 "tarflags(): File name argument missing."
+ ;;
+ *.tar.gz)
+ echo 'zxf'
+ ;;
+ *.tar.bz2)
+ echo 'jxf'
+ ;;
+ *)
+ fatal 1 "tarflags(): Unrecognized tar extension in file: $1"
+ ;;
+ esac
+}
+
+untar()
+{
+ echo "Untarring ${1##*/}..."
+ tar $(tarflags $1) $1
+}
+
+
+extract_kernel()
+{
+ pushd "$TOPDIR" >/dev/null
+ untar "$KERNEL_FILE"
+ [ -d linux ] || ln -sf linux* linux
+ popd >/dev/null
+}
+
+patch_kernel()
+{
+ pushd "$TOPDIR/linux" >/dev/null
+ echo "Overriding EXTRAVERSION in kernel..."
+ perl -p -i -e "s/^EXTRAVERSION.*/EXTRAVERSION = -${EXTRA_VERSION}/" Makefile
+ echo -n "Applying patch"
+ for patch in $(<"$SERIES_FILE") ; do
+ PATCH_FILE="$TOPDIR/kernel_patches/patches/$patch"
+ [ -r "$PATCH_FILE" ] || \
+ fatal 1 "Patch file not found: $patch"
+ echo -n " $patch"
+ patch -s -p1 < "$PATCH_FILE"
+ done
+ echo
+ popd >/dev/null
+}
+
+build_kernel()
+{
+ # we need to override $CC at make time, since there is no
+ # configure
+ MAKE="make -s CC=$CC"
+ pushd "$TOPDIR/linux" >/dev/null
+ echo "Making depend in $PWD..."
+ make -s mrproper || fatal 1 "Error running make mrproper"
+ cp "$CONFIG_FILE" .config
+ $MAKE -s oldconfig_nonint || fatal 1 "Error running make oldconfig"
+ $MAKE -j $JOBS -s dep || fatal 1 "Error running make dep"
+ $MAKE -s include/linux/version.h || fatal 1 "Error making include/linux/version.h"
+ echo "Building kernel in $PWD..."
+ $MAKE -j $JOBS -s bzImage || fatal 1 "Error making bzImage."
+ $MAKE -j $JOBS -s modules || fatal 1 "Error building modules."
+ popd >/dev/null
+}
+
+configure_lustre()
+{
+ pushd "$TOPDIR" >/dev/null
+ [ -f configure ] || sh ./autogen.sh
+ ./configure --with-linux=$PWD/linux $CONFIGURE_FLAGS || \
+ fatal 1 "Error configuring Lustre."
+ popd >/dev/null
+}
+
+build_lustre()
+{
+ pushd "$TOPDIR" >/dev/null
+ make -j $JOBS -s
+ popd >/dev/null
+}
+
+install_kernel()
+{
+ FULL_VERSION="${VERSION}-${EXTRA_VERSION}"
+ pushd "$TOPDIR/linux" >/dev/null
+ mkdir -p "$DESTDIR/boot" "$DESTDIR/lib/modules"
+ INSTALL_MOD_PATH="$DESTDIR" make CC=$CC -s modules_install || \
+ fatal 1 "Error installing modules."
+ cp arch/i386/boot/bzImage "$DESTDIR/boot/vmlinuz-${FULL_VERSION}"
+ cp System.map "$DESTDIR/boot/System.map-${FULL_VERSION}"
+ cp .config "$DESTDIR/boot/config-${FULL_VERSION}"
+ popd >/dev/null
+}
+
+install_lustre()
+{
+ pushd "$TOPDIR" >/dev/null
+ make -s install "DESTDIR=$DESTDIR" || fatal 1 "Error installing Lustre."
+ popd >/dev/null
+}
+
+options=$(getopt -o hj: -l destdir:,extraversion:,help,kerneldir:,phase:,target: -- "$@")
+
+eval set -- "$options"
+
+while [ "$1" ] ; do
+ case "$1" in
+ '')
+ usage 1
+ ;;
+ --destdir)
+ DESTDIR=$2
+ shift 2
+ ;;
+ --extraversion)
+ EXTRA_VERSION=$2
+ shift 2
+ ;;
+ --help | -h)
+ usage 0
+ ;;
+ -j)
+ JOBS=$2
+ shift 2
+ ;;
+ --kerneldir)
+ KERNELDIR=$2
+ shift 2
+ ;;
+ --phase)
+ PHASE=$2
+ shift 2
+ ;;
+ --target)
+ TARGET=$2
+ shift 2
+ ;;
+ --)
+ shift
+ CONFIGURE_FLAGS=$@
+ break
+ ;;
+ *)
+ usage 1 "Unrecognized option: $1"
+ ;;
+ esac
+done
+
+check_options
+load_target
+
+case "$PHASE" in
+ build)
+ extract_kernel
+ patch_kernel
+ build_kernel
+ configure_lustre
+ build_lustre
+ ;;
+ install)
+ install_kernel
+ install_lustre
+ ;;
+esac
--- /dev/null
+Name: kernel
+Summary: The Linux Kernel
+Version: @KERNEL_VERSION@
+Release: @KERNEL_RELEASE@
+License: GPL
+Group: System Environment/Kernel
+Vendor: Cluster File Systems, Inc.
+URL: http://www.kernel.org/
+Buildroot: /var/tmp/%{name}-%{PACKAGE_VERSION}-root
+
+Source0: @LUSTRE_SOURCE@
+Source1: @KERNEL_SOURCE@
+
+%define __spec_install_post /usr/lib/rpm/brp-compress || :
+%define debug_package %{nil}
+
+%description
+The Linux Kernel, the operating system core itself.
+
+%package -n lustre-lite-utils
+Summary: Lustre utils for Linux
+Group: Applications/System
+
+%description -n lustre-lite-utils
+The Lustre Lite file system utilities.
+
+#%package -n lustre-doc
+#Summary: Sample Lustre configurations and documentation
+#Group: Documentation
+
+#%description -n lustre-doc
+#The Lustre book, sample configurations, and other documentation for
+#Lustre.
+
+%package -n lustre-ldap
+Summary: LDAP schema files for Lustre
+Group: System Environment/Daemons
+
+%description -n lustre-ldap
+LDAP schema files for Lustre.
+
+%prep
+%setup -n lustre-kernel-%{version} -q -c
+[ -d lustre ] || ln -sf lustre* lustre
+
+%build
+# if RPM_BUILD_NCPUS unset, set it
+if [ -z "$RPM_BUILD_NCPUS" ] ; then
+ RPM_BUILD_NCPUS=$(egrep -c "^cpu[0-9]+" /proc/stat || :)
+ if [ $RPM_BUILD_NCPUS -eq 0 ] ; then
+ RPM_BUILD_NCPUS=1
+ fi
+ if [ $RPM_BUILD_NCPUS -gt 8 ] ; then
+ RPM_BUILD_NCPUS=8
+ fi
+fi
+
+pushd lustre >/dev/null
+./scripts/lmake \
+ --phase build \
+ --target @LUSTRE_TARGET@ \
+ --extraversion %{release} \
+ --kerneldir $RPM_SOURCE_DIR \
+ -j $RPM_BUILD_NCPUS \
+ -- @CONFIGURE_FLAGS@
+popd >/dev/null
+
+%install
+rm -rf $RPM_BUILD_ROOT
+mkdir -p $RPM_BUILD_ROOT
+pushd lustre >/dev/null
+./scripts/lmake \
+ --phase install \
+ --target @LUSTRE_TARGET@ \
+ --extraversion %{release} \
+ --destdir $RPM_BUILD_ROOT
+popd >/dev/null
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%files
+%doc lustre/linux/COPYING lustre/linux/CREDITS lustre/linux/README
+%doc lustre/linux/REPORTING-BUGS
+%defattr(-, root, root)
+%dir /lib/modules
+/lib/modules/*
+/boot/*
+
+%files -n lustre-lite-utils
+%doc lustre/COPYING lustre/BUGS lustre/ChangeLog lustre/README lustre/doc/lustre.pdf
+%defattr(-, root, root)
+%{_sbindir}/*
+%{_bindir}/*
+%{_libdir}/lustre/python
+%{_sysconfdir}/init.d/lustre
+/usr/include/lustre
+/lib/lib*.a
+
+#%files -n lustre-doc
+#%defattr(-, root, root)
+#/usr/share/doc/lustre/COPYING
+#/usr/share/doc/lustre/lustre.pdf
+#/usr/share/doc/lustre/COPYING
+
+/usr/lib/lustre/examples
+
+%files -n lustre-ldap
+%defattr(-, root, root)
+/etc/openldap/slapd-lustre.conf
+/etc/openldap/schema/lustre.schema
+/usr/lib/lustre/lustre2ldif.xsl
+/usr/lib/lustre/top.ldif
--- /dev/null
+#! /bin/sh
+# Copyright (C) 2003 Cluster File Systems, Inc.
+# Create a Lustre configuration file
+#
+# Usage: lwizard
+#
+# Jerrifer <jerrifer@clusterfs.com>
+# wangdi <wangdi@clusterfs.com>
+
+# fatal error to exit
+fatal()
+{
+ if [ "$#" -gt "1" ]; then
+ echo
+ echo "$2"
+ exit 1
+ fi
+
+ exit 1
+}
+
+#print usage and exit
+usage()
+{
+ cat <<EOF
+Usage: ${0##*/} [OPTIONS]...
+
+${0##*/} asks the user questions about their cluster configuration, and
+writes an appropriate configuration file to config.xml.
+
+Options:
+ -o, --file=CONFIG_FILE
+ write configuration to CONFIG_FILE (default: config.xml)
+ --stripe_size=SIZE
+ specify the size (in KB) of each stripe on an OST (default: 64)
+ --stripe_cnt=COUNT
+        sepcify the number of OSTs files are striped to (default: 1)
        specify the number of OSTs files are striped to (default: 1)
+ --help
+ to get this help
+EOF
+
+ exit 0
+}
+
+# check if $1 is a number
+check_number()
+{
+ local num=$(expr "$1" : "[0-9]*$")
+ if [ $num -gt "0" ]; then
+ return 0
+ fi
+
+ return 1
+}
+
+#parse options of this shell
+get_option()
+{
+ local long_options="file:,mds_size:,ost_size:,stripe_size"
+ local options
+
+ long_options="$long_options:,stripe_cnt:,stripe_pattern"
+ options=$(getopt -o o:h --long "$long_options":,help -- "$@")
+
+ if [ $? -ne 0 ] ; then
+ usage
+ fi
+ eval set -- "$options"
+
+ while true ; do
+ case "$1" in
+ -o | --file)
+ CONFIG_FILE=$2
+ shift 2
+ ;;
+ --stripe_size)
+ STRIPE_SIZE=$(($2 * 1024))
+ check_number $STRIPE_SIZE || fatal 1 "bad stripe_size"
+ shift 2
+ ;;
+ --stripe_cnt)
+ STRIPE_CNT=$2
+ check_number $STRIPE_CNT || fatal 1 "bad stripe_cnt"
+ shift 2
+ ;;
+ -h | --help)
+ usage
+ ;;
+ --)
+ shift 1
+ break
+ esac
+ done
+}
+
+# if $1 in $2
+in_list()
+{
+ local node
+
+ for node in $2 ; do
+ [ "$1" = "$node" ] && return 0
+ done
+ return 1
+}
+
+# read line from stdin
+read_ln()
+{
+ local substr=$1
+ local default_value
+ local answer
+
+ unset ANS
+
+ [ $# -gt 1 ] && default_value=$2
+ [ $# -gt 2 ] && answer=$3
+
+ while [ -z "$ANS" ]; do
+ echo -n "$substr"
+ [ "$default_value" ] && echo -n " ($default_value)"
+ echo -n ": "
+ read ANS
+ [ -z "$ANS" -a "$default_value" ] && ANS=$default_value
+ if [ "$ANS" -a "$answer" ] ; then
+ in_list "$ANS" "$ANSWER" || ANS=""
+ fi
+ done
+
+ return 0
+}
+
+#ask user some questions to add a device
+add_device()
+{
+ local first
+ local hostname
+
+ [ $# -gt 2 ] && first=$3
+
+ if [ -z "$first" ] ; then
+ read_ln "Do you want to add another $1 (yes/no)?" "no" "$ANSWER"
+ else
+ ANS="yes"
+ fi
+
+ [ "$ANS" = "no" ] && return 1
+
+ echo "Creating $1 \"$1$2\"..."
+ read_ln "Please enter $1$2's hostname"
+ hostname=$ANS
+ read_ln "Please enter $1$2's device"
+
+ device=$ANS
+
+ DEVICE="$hostname:$device:$2:$1$2"
+
+ return 0
+}
+
+# get mds information
+add_mds()
+{
+ local id=1
+ local host_name
+
+ while :; do
+ add_device "mds" "$id" "first" || break
+ in_list "$DEVICE" "$MDS_LIST" && continue
+ MDS_LIST="$MDS_LIST $DEVICE" #add mds to MDS_LIST
+ ((id++))
+ break
+ done
+
+ return 0
+}
+
+# ask user to add ost
+add_ost()
+{
+ local first="first"
+ local id=1
+
+ while :; do
+ add_device "ost" "$id" "$first" || break
+ in_list "$DEVICE" "$OST_LIST" && continue
+ OST_LIST="$OST_LIST $DEVICE" #add ost to MDS_LIST
+ ((id++))
+ first=""
+ done
+
+ return 0
+}
+
+# ask user to add client to lustre
+add_client()
+{
+ read_ln "Please enter the mountpoint for your clients" "$DEFAULT_MNTPT"
+ CLIENT_LIST="*:$ANS:client:client"
+ return 0
+}
+
+#save node config into config file
+add_node()
+{
+ local node=$1
+ local nettype=$DEFAULT_NETTYPE
+ local config_file=$2
+
+ in_list "$node" "$NODE_LIST" && return 0
+ NODE_LIST="$NODE_LIST $node"
+
+ $LMC -m "$config_file" --add node --node "$node" || return 1
+ $LMC -m "$config_file" --add net --node "$node" --nid "$node" \
+ --nettype "$nettype" || return 1
+ return 0
+}
+
+#get hostname, device , device_id and device name
+#from mds node
+get_name_in_list()
+{
+ HOST_NAME=$(echo $1 | awk -F: '{ print $1 }')
+ DEVICE=$(echo $1 | awk -F: '{ print $2 }')
+ DEVICE_ID=$(echo $1 | awk -F: '{ print $3 }')
+ DEVICE_NAME=$(echo $1 | awk -F: '{ print $4 }')
+}
+
+# following user input to create xml config file
+create_config()
+{
+ local mds_name
+ local config_file=$1
+
+ for mds in $MDS_LIST ; do
+ get_name_in_list $mds
+ echo -n " $DEVICE_NAME"
+ add_node "$HOST_NAME" "$config_file" || return 1
+ $LMC -m "$config_file" --add mds --node "$HOST_NAME" \
+ --mds "$DEVICE_NAME" \
+ --nid "$HOST_NAME" --fstype "$DEFAULT_FSTYPE" \
+ --dev "$DEVICE" || return 1
+
+ mds_name="$DEVICE_NAME"
+ done
+
+ #add lov information FIXME --stripe_sz and
+ #--stripe_cnt should be input by user
+ echo -n " lov1"
+ $LMC -m "$config_file" --add lov --lov lov1 --mds "$mds_name" \
+ --stripe_sz "$STRIPE_SIZE" --stripe_cnt "$STRIPE_CNT" \
+ --stripe_pattern "$STRIPE_PATTERN" || return 1
+
+ for ost in $OST_LIST ; do
+ get_name_in_list $ost
+ echo -n " $DEVICE_NAME"
+ add_node "$HOST_NAME" "$config_file" || return 1
+ $LMC -m "$config_file" --add ost --node "$HOST_NAME" \
+ --ost "$DEVICE_NAME" \
+ --lov lov1 --fstype "$DEFAULT_FSTYPE" \
+ --dev "$DEVICE" || return 1
+ done
+
+ for client in $CLIENT_LIST ; do
+ get_name_in_list $client
+ echo -n " $DEVICE_NAME"
+ add_node "client" "$config_file" || return 1
+ $LMC -m "$config_file" --add mtpt --nod client \
+ --mds "$mds_name" --lov lov1 --path "$DEVICE" || return 1
+ done
+ echo
+ return 0
+}
+
+#parse options
+get_option "$@"
+
+#some default definitions
+LMC=${LMC:-"/usr/sbin/lmc"}
+CONFIG_FILE=${CONFIG_FILE:-"config.xml"}
+TMP_CONFIG_FILE=${TMP_CONFIG_FILE:-".config.xml.tmp"}
+DEFAULT_FSTYPE=${DEFAULT_FSTYPE:-"ext3"}
+DEFAULT_NETTYPE=${DEFAULT_NETTYPE:-"tcp"}
+DEFAULT_MNTPT=${DEFAULT_MNTPT:-"/mnt/lustre"}
+STRIPE_SIZE=${STRIPE_SIZE:-65536}
+STRIPE_CNT=${STRIPE_CNT:-1}
+STRIPE_PATTERN=${STRIPE_PATTERN:-0}
+ANSWER="yes no"
+
+#print program information
+cat <<EOF
+This script will help you create a Lustre configuration file.
+
+EOF
+
+#add mds to lustre
+unset $MDS_LIST
+add_mds || fatal 1 "Cannot add mds to your lustre"
+
+#add ost to lustre
+unset $OST_LIST
+add_ost || fatal 1 "Cannot add ost to your lustre"
+
+#add client to lustre
+unset $CLIENT_LIST
+add_client || fatal 1 "Cannot add client to your lustre"
+
+rm -f "$TMP_CONFIG_FILE"
+echo -n "Saving configuration to $CONFIG_FILE:"
+create_config "$TMP_CONFIG_FILE" || \
+ fatal 1 "There was an error saving the config file."
+mv -f "$TMP_CONFIG_FILE" "$CONFIG_FILE" || \
+ fatal 1 "There was an error saving the config file."
+
+cat <<EOF
+
+Your configuration has been saved to $CONFIG_FILE.
+EOF
+exit 0
--- /dev/null
#!/bin/sh
#
# this runs prep/commit against filters and generates a
# table of their write timings. it needs the names
# of the filters it will run against and needs the
# obdecho module loaded. it spews a lot of junk
# as it goes, only the last bit is really interesting;
# tee it to a log file.
#
# ex: FILTER_NAMES="ost1 ost2" sh ./filter_survey.sh
#
SRCDIR="`dirname $0`/"
export PATH=$SRCDIR/../utils:/sbin:/usr/sbin::$PATH

# state shared with cleanup(): scratch dir and the echo_client device
# numbers we attach, so the EXIT trap can tear everything down
tmp_dir=""
echo_base="f_s_$$"
echo_objs=""

# print a message to stderr and abort the run
die() {
	echo $* 1>&2
	exit 1
}

# EXIT trap: remove the scratch dir and detach every echo_client we set up
cleanup() {
	[ ! -z "$tmp_dir" ] && [ -d $tmp_dir ] && rm -rf $tmp_dir
	[ -z "$echo_objs" ] && exit 0
	for obj in $echo_objs; do
		echo cleaning up $obj
# I can't believe leading whitespace matters here.
lctl << EOF
device $obj
cleanup
detach
quit
EOF
	done
}
trap cleanup EXIT

# succeed (return 0) when $1 is NOT the name of an obdfilter device;
# the awk program exits 1 -- i.e. "false" -- when the name is found
not_a_filter() {
	lctl device_list | awk "/obdfilter $1/ {exit 1}"
	return $?
}
+
# holy crap are these confusing
# wiki table separators; each attached filter appends one more column
# to each of them (see the loop below)
sep1="||||"
sep2="||"
sep3=""
sep4="||||"

#
# build up echo_clients attached to the given filters and record
# their names and obj numbers for later use and teardown
#
last_filter="-1"
[ -z "$FILTER_NAMES" ] && die "please specify filter names to run against"
for fn in $FILTER_NAMES; do
	if not_a_filter $fn; then
		die "'$fn' isn't the name of an obdfilter device"
	fi
	en="${echo_base}_$fn"
# heredoc must stay at column 0 -- lctl is whitespace sensitive here
lctl << EOF
	newdev
	attach echo_client $en ${en}_uuid
	setup $fn
	probe
	quit
EOF
	[ $? -eq 0 ] || die "error setting up echo_client (is obdecho loaded?)"

	# the first awk field of the matching device_list line is the obj number
	obj=`lctl device_list | awk "/echo_client $en/ {print "'$1}'`
	[ -z "$obj" ] && die "couldn't find my echo_client's object number"
	echo setup echo_client name $en as object $obj
	echo_objs="$echo_objs $obj"

	last_filter=$(($last_filter + 1))
	echo_names[$last_filter]=$en

	# build up the seperators we'll use in generating the wiki
	sep1="$sep1||||"
	sep2="$sep2||"
	sep3="$sep3||"
	sep4="$sep4||"
done

tmp_dir=`mktemp -d /tmp/echo_client_survey_XXXXXX` || die "mktemp failed"

# pages (4KB each) written per thread per run
TOT_PAGES=${TOT_PAGES:-524288}
+
+throughput() {
+ local threads="$1"
+ local pages="$2"
+ local time="$3"
+ local tp=`echo 'scale=2; '$threads' * '$pages' * 4096 / ('$time' * 1024 * 1024)' | bc`
+ echo $tp
+}
+
# Block until vmstat reports no block io (fields 10/11, bi/bo, both 0)
# for three consecutive one-second samples; give up after ~10 samples.
wait_for_idle_io() {
	echo "waiting idle io via vmstat"
	vmstat 1 | awk '
	($10 == 0 && $11 == 0) {
		idle++;
		if (idle == 3) {
			print "idle for 3 seconds, must be done"
			exit
		}
	}
	(NR == 13) {
		# this diagnostic was a bare string expression (a no-op);
		# it must be printed before bailing out
		print "deletion took longer than 10s, bailing";
		exit
	} '
}
+
#
# sorry for the wild indenting. get a wide terminal, its 2003.
#
# Main measurement loop: for 1/2/4/8 threads and three prep/commit page
# strides, run test_brw (write then re-write) against every filter, once
# with an object per thread and once with a shared object, collecting
# per-thread and aggregate MB/s into wiki-markup table rows.
# NOTE(review): this uses bash arrays (echo_names, rows, ...) although the
# shebang is /bin/sh -- works only where /bin/sh is bash; confirm targets.
num_rows="0"
num_summary_rows="0"
for order_threads in `seq 0 3`; do
	nthreads=$(echo "2^$order_threads" | bc)

	for stride in 16 64 128; do
		span="<|$(($nthreads +1))>"
		row="||$span $nthreads||$span $stride||"
		sum_row="||$nthreads||$stride||"

		for t in `seq 1 $nthreads`; do
			thread_row[$t]="||"
		done

		for obj_per_thread in y n; do
			# "t" prefixes tell test_brw to vary the objid (one
			# object per thread) or the offset (shared object)
			if [ $obj_per_thread == "y" ]; then
				offset_prefix=""
				objid_prefix="t";
			else
				offset_prefix="t"
				objid_prefix="";
			fi

			# create the objects that this write/rewrite run
			# will be using
			for i in `seq 0 $last_filter`; do
				oid=`lctl --device "\$"${echo_names[$i]} create $nthreads | \
					awk '/1 is object id/ { print $6 }'`
				[ -z "$oid" ] && die "error creating object"
				oids[$i]=$oid
			done

			# iterate through write and rewrite
			for a in 1 2; do
				total_maxtime="0.0"
				pids=""
				# start a test_brw thread in the background
				# for each given filter
				for i in `seq 0 $last_filter`; do
					lctl --threads $nthreads v "\$"${echo_names[$i]} \
						test_brw ${offset_prefix}1 w v \
						$TOT_PAGES ${objid_prefix}${oids[$i]} p$stride | \
						tee $tmp_dir/$i &
					pids="$pids $!"
				done
				for p in $pids; do
					wait $p
				done

				for t in `seq 1 $nthreads`; do
					thread_row[$t]="${thread_row[$t]} ||"
				done
				# scrape each thread's "<secs>s (<MB/s>" out of the
				# tee'd output; a run's elapsed time is the max
				# over its threads
				row_tmp=""
				for i in `seq 0 $last_filter`; do
					maxtime="0.0"
					for t in `seq 1 $nthreads`; do
						f="$tmp_dir/$i"
						MS=`grep "test_brw-$t" $f | \
							awk '($8=="MB/s):"){print $6, substr($7,2);}'`
						thread_row[$t]="${thread_row[$t]}$MS||"
						time=`echo $MS | cut -d s -f 1`
						if [ `echo "$time > $maxtime" | bc` -eq "1" ]; then
							maxtime=$time;
						fi
					done
					tp=`throughput $nthreads $TOT_PAGES $maxtime`
					row_tmp="${row_tmp}<#ffffe0>$tp $maxtime||"
					sum_row="${sum_row}$tp||"

					if [ `echo "$maxtime > $total_maxtime" | bc` -eq "1" ]; then
						total_maxtime=$maxtime;
					fi
				done
				tp=`throughput $(($nthreads * $(($last_filter +1)))) $TOT_PAGES $total_maxtime`
				row="${row}<#ffffe0>${tp} $total_maxtime||${row_tmp}"
			done

			# destroy the objects from this run and wait for
			# their destruction to complete
			for i in `seq 0 $last_filter`; do
				lctl --device "\$"${echo_names[$i]} destroy ${oids[$i]} $nthreads
			done
			wait_for_idle_io
		done

		num_rows=$(($num_rows + 1))
		rows[$num_rows]="$row"

		num_summary_rows=$(($num_summary_rows + 1))
		summary[$num_summary_rows]="$sum_row"

		for t in `seq 1 $nthreads`; do
			num_rows=$(($num_rows + 1))
			rows[$num_rows]="${thread_row[$t]}"
		done
	done
done

echo done.

# emit the collected rows as wiki tables
# NOTE(review): "#eoeoff" looks like a typo for the colour "#e0e0ff" -- confirm
bg='<rowbgcolor="#eoeoff"'
echo "||$bg|2>threads writing $TOT_PAGES pages||<|2>pages per prep/commit${sep1}oid per thread${sep1}shared oid||"
echo "$sep2$bg>write${sep2}re-write${sep2}write${sep2}re-write||"
for r in `seq 1 $num_rows`; do
	echo ${rows[$r]}
done

echo summary table

echo "||$bg|2>threads||<|2>pages ${sep4}oid/thread${sep4}shared oid||"
echo "$sep3$bg>write${sep3}re-write${sep3}write${sep3}re-write||"
for r in `seq 1 $num_summary_rows`; do
	echo ${summary[$r]}
done
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#define BUFSIZE (1024*1024)
/* qsort() comparator for unsigned ints: -1 / 0 / 1 for < / == / > */
int compfunc(const void *x, const void *y)
{
        if (*(unsigned int *)x < *(unsigned int *)y) {
                return -1;
        } else if (*(unsigned int *)x == *(unsigned int *)y) {
                return 0;
        } else {
                return 1;
        }
}

/* Sort offsets ascending and collapse repeated values.
 *
 * The caller cannot learn a reduced count (the function returns void), so
 * instead of shifting entries down -- the old shift read one element past
 * the end of the array and still missed runs of three or more duplicates --
 * we compact the unique values to the front and pad the tail with the
 * largest offset.  The array stays sorted and exactly offsetcount entries
 * long; duplicates only ever appear as adjacent repeats, which the
 * verification code can skip over.
 */
void collapse_redundant(unsigned *offsets, int offsetcount)
{
        int src, dst;

        if (offsetcount <= 0)
                return;

        qsort(offsets, offsetcount, sizeof(unsigned int), compfunc);

        dst = 0;
        for (src = 1; src < offsetcount; src++) {
                if (offsets[src] != offsets[dst])
                        offsets[++dst] = offsets[src];
        }
        /* pad the unused tail so every entry remains a valid offset */
        for (src = dst + 1; src < offsetcount; src++)
                offsets[src] = offsets[dst];
}
+
/* Verify the full BUFSIZE pages of a sparse-written file: every byte must
 * be 0 except the bytes listed in offsets[], which must be '+'.
 *
 * fd       - file positioned at offset 0
 * filesize - total file size; filesize / BUFSIZE full pages are read
 * offsets  - ascending byte offsets expected to hold '+'; adjacent
 *            duplicates are tolerated and treated as one offset
 * O_number - number of entries in offsets[]
 *
 * On success returns the number of offsets consumed (the index to resume
 * from when checking the file's tail); on failure returns -1.
 */
int verify_content(int fd, int filesize, unsigned int *offsets,
                   int O_number)
{
        int i, j;
        char *filebuf;
        int focus = 0;
        int p_number;

        filebuf = malloc(BUFSIZE);
        if (filebuf == NULL) {
                fprintf(stderr, "Allocating %d bytes failed (%s)\n",
                        BUFSIZE, strerror(errno));
                return -1;
        }

        p_number = filesize / BUFSIZE;
        for (j = 0; j < p_number; j++) {
                i = read(fd, filebuf, BUFSIZE);
                if (i != BUFSIZE) {
                        fprintf(stderr,
                                "Reading file fails (%s), returning (%d)\n",
                                strerror(errno), i);
                        free(filebuf);
                        return -1;
                }

                /* Consume every offset inside page j, i.e. bytes
                 * j*BUFSIZE .. (j+1)*BUFSIZE-1.  The old bound
                 * "< (j+1)*BUFSIZE - 1" skipped an offset on a page's
                 * last byte, and the missing break let the loop swallow
                 * offsets belonging to later pages. */
                for (; focus < O_number; focus++) {
                        if (offsets[focus] >= (unsigned)(j + 1) * BUFSIZE)
                                break;
                        /* duplicate offset: already checked and zeroed */
                        if (focus > 0 && offsets[focus] == offsets[focus - 1])
                                continue;
                        if (filebuf[offsets[focus] % BUFSIZE] != '+') {
                                fprintf(stderr,
                                        "Bad content, should be '+' at %d.\n",
                                        offsets[focus]);
                                free(filebuf);
                                return -1;
                        }
                        /* '+' verified; zero it for the scan below */
                        filebuf[offsets[focus] % BUFSIZE] = 0;
                }

                /* every remaining byte of the page must be 0 */
                for (i = 0; i < BUFSIZE; i++) {
                        if (filebuf[i] != 0) {
                                fprintf(stderr,
                                        "Bad content, should be 0 at %d.\n",
                                        i + j * BUFSIZE);
                                free(filebuf);
                                return -1;
                        }
                }
        }

        free(filebuf);
        return focus;
}
+
/* Verify the final partial page of a sparse-written file (the bytes after
 * the last full BUFSIZE page): all must be 0 except the remaining entries
 * of offsets[], which must be '+'.
 *
 * fd    - positioned at the start of the final partial page
 * focus - index into offsets[] where verify_content() stopped
 * Adjacent duplicate offsets are tolerated and treated as one offset.
 * Returns 0 on success, 1 on failure.
 */
int verify_tail(int fd, int filesize, unsigned int *offsets,
                int O_number, int focus)
{
        int i;
        char *filebuf;
        int p_number;

        filebuf = malloc(BUFSIZE);
        if (filebuf == NULL) {
                fprintf(stderr, "Allocating %d bytes failed (%s)\n",
                        BUFSIZE, strerror(errno));
                return 1;
        }

        /* number of bytes in the final partial page */
        p_number = filesize % BUFSIZE;
        i = read(fd, filebuf, p_number);
        if (i != p_number) {
                fprintf(stderr, "Reading file fails (%s), returning (%d)\n",
                        strerror(errno), i);
                free(filebuf);
                return 1;
        }

        for (; focus < O_number; focus++) {
                /* duplicate offset: already checked and zeroed */
                if (focus > 0 && offsets[focus] == offsets[focus - 1])
                        continue;
                if (offsets[focus] >= (unsigned)filesize) {
                        fprintf(stderr,
                                "Error: File size <= offset %d\n",
                                offsets[focus]);
                        free(filebuf);
                        return 1;
                }
                if (filebuf[offsets[focus] % BUFSIZE] != '+') {
                        fprintf(stderr,
                                "Bad content, should be '+' at %d.\n",
                                offsets[focus]);
                        free(filebuf);
                        return 1;
                }
                /* '+' verified; zero it for the scan below */
                filebuf[offsets[focus] % BUFSIZE] = 0;
        }

        for (i = 0; i < p_number; i++) {
                if (filebuf[i] != 0) {
                        /* byte i of the tail lives at absolute offset
                         * filesize - (p_number - i); the old message
                         * subtracted one too many */
                        fprintf(stderr, "Bad content, should be 0 at %d.\n",
                                filesize - (p_number - i));
                        free(filebuf);
                        return 1;
                }
        }

        free(filebuf);
        return 0;
}
+
/* Function: verify a sparse-pwritten file (bug 1222): the characters at
 * each entry of offsets[] must be '+' and every other byte must be 0.
 * offsets[] must already be sorted ascending (see collapse_redundant()),
 * with the file's final byte (size - 1) as its last entry.
 * Return: 0 success
 *         1 failure */
int verify(char *filename, unsigned int *offsets, int O_number)
{
        int status;
        unsigned int size;
        int fd;
        struct stat Fstat;

        status = stat(filename, &Fstat);
        if (status == -1) {
                fprintf(stderr, "No such file named as %s.\n", filename);
                return 1;
        }
        size = Fstat.st_size;

        /* Because we always have '+' just before EOF,
         * qsorted offsets[] should have the (filesize-1) at the end */
        if (size != offsets[O_number - 1] + 1) {
                fprintf(stderr,
                        "Error: the final character not in the offset?\n");
                return 1;
        }

        /* now we check the integrity of the file */
        fd = open(filename, O_RDONLY);
        if (fd == -1) {
                fprintf(stderr, "Opening %s fails (%s)\n",
                        filename, strerror(errno));
                return 1;
        }

        status = verify_content(fd, size, offsets, O_number);
        if (status < 0) {
                /* don't leak the descriptor, and honour the documented
                 * 0/1 return convention (this used to return -1) */
                close(fd);
                return 1;
        }

        status = verify_tail(fd, size, offsets, O_number, status);
        close(fd);
        return status;
}
+
/* Command-line driver: verify that the file named by argv[1] contains '+'
 * exactly at the byte offsets given in argv[2..] and 0 everywhere else.
 * Exit status: 0 success, 1 failure.
 */
int main(int argc, char**argv)
{
        int i;
        char *filename;
        char *end;
        int O_number;
        unsigned int *offsets;
        int rc;

        if (argc < 3) {
                fprintf(stderr,
                        "Usage: %s <filename> <offset> [ offset ... ]\n",
                        argv[0]);
                exit(1);
        }

        filename = argv[1];
        O_number = argc - 2;
        /* no cast needed on malloc in C; check the result before use */
        offsets = malloc(sizeof(unsigned int) * O_number);
        if (offsets == NULL) {
                fprintf(stderr, "Allocating %d offsets failed (%s)\n",
                        O_number, strerror(errno));
                exit(1);
        }
        for (i = 0; i < O_number; i++) {
                offsets[i] = strtoul(argv[i + 2], &end, 10);
                if (*end) {
                        fprintf(stderr,
                                "<offset> parameter should be integer\n");
                        exit(1);
                }
        }

        /* sort and de-duplicate before verification */
        collapse_redundant(offsets, O_number);

        rc = verify(filename, offsets, O_number);
        free(offsets);
        return rc;
}
+
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#define BUFSIZE (1024*1024)
+
/* Function: pwrite character '+' to <filename> at <offset> (man pwrite)
 * Return: 0 success
 *         1 failure
 *
 * NOTE(review): the usage string advertises "<offset>(KB)" but the value
 * is passed to pwrite() unscaled, as a plain byte offset -- confirm what
 * the calling test scripts pass before changing either side. */
int main(int argc, char**argv)
{
        int p_size;
        unsigned int offset;    /* byte offset handed to pwrite() */
        char *filename;
        int fd;
        char buf[] = "+++";     /* only buf[0] is ever written */
        char *end;

        if(argc != 3) {
                fprintf(stderr, "Usage: %s <filename> <offset>(KB)\n", argv[0]);
                exit(1);
        }

        filename = argv[1];
        offset = strtoul(argv[2], &end, 10);
        if (*end) {
                fprintf(stderr, "<offset> parameter should be integer\n");
                exit(1);
        }

        /* O_CREAT without O_TRUNC: the file grows sparsely across runs */
        fd = open(filename, O_CREAT|O_RDWR, 0644);
        if (fd == -1) {
                fprintf(stderr, "Opening %s fails (%s)\n",
                        filename, strerror(errno));
                return 1;
        }

        /* write the character '+' at offset */
        p_size = pwrite(fd, buf, 1, offset);
        if (p_size != 1) {
                fprintf(stderr, "pwrite %s fails (%s)\n",
                        filename, strerror(errno));
                close(fd);
                return 1;
        }

        close(fd);
        return 0;
}
--- /dev/null
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
/* Tiny wrapper around rename(2): rename argv[1] to argv[2] and report the
 * result.  Exit status is rename()'s return value (0 on success).
 *
 * NOTE(review): strerror(errno) is printed even when rename() succeeds,
 * in which case errno is stale -- the message is only meaningful when the
 * printed rc is -1. */
int main(int argc, char *argv[])
{
        int rc;

        if (argc != 3) {
                fprintf(stderr, "usage: %s from to\n", argv[0]);
                exit(1);
        }

        rc = rename(argv[1], argv[2]);
        printf("rename returned %d: %s\n", rc, strerror(errno));

        return rc;
}
--- /dev/null
#!/bin/bash
# Smoke test for the llog_test module: load it, run attach/setup/cleanup
# of an llog_test device against mds1 through lctl, then unload it.
insmod ../obdclass/llog_test.o
# dump the module list so gdb can (re)load symbols for the new module
../utils/lctl modules > /r/tmp/ogdb-localhost.localdomain
echo "NOW reload debugging syms.."

# Using ignore_errors will allow lctl to cleanup even if the test
# fails.
../utils/lctl <<EOF
ignore_errors
newdev
attach llog_test llt_name llt_uuid
setup mds1
cleanup
detach
EOF
rmmod llog_test
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Robert Read <rread@clusterfs.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <getopt.h>
+#include <string.h>
+
+#include <liblustre.h>
+#include <linux/lustre_idl.h>
+
+#include "parser.h"
+
+extern int op_create_file(char *name, long stripe_size, int stripe_offset,
+ int stripe_count);
+extern int op_find(char *path, struct obd_uuid *obduuid, int recursive,
+ int verbose, int quiet);
+
+/* all functions */
+static int lfs_setstripe(int argc, char **argv);
+static int lfs_find(int argc, char **argv);
+
+/* all avaialable commands */
+command_t cmdlist[] = {
+ {"setstripe", lfs_setstripe, 0,
+ "blah...\n"
+ "usage: setstripe <filename> <stripe size> <stripe start> <stripe count>\n"
+ "\tstripe size: Number of bytes in each stripe (0 default)\n"
+ "\tstripe start: OST index of first stripe (-1 default)\n"
+ "\tstripe count: Number of OSTs to stripe over (0 default)"},
+ {"find", lfs_find, 0,
+ "blah...\n"
+ "usage: find [--obd <uuid>] [--quiet | --verbose] [--recursive] <dir|file> ..."},
+ {"help", Parser_help, 0, "help"},
+ {"exit", Parser_quit, 0, "quit"},
+ {"quit", Parser_quit, 0, "quit"},
+ { 0, 0, 0, NULL }
+};
+
+/* functions */
+static int lfs_setstripe(int argc, char **argv)
+{
+ int result;
+ long st_size;
+ int st_offset, st_count;
+ char *end;
+
+ if (argc != 5)
+ return CMD_HELP;
+
+ // get the stripe size
+ st_size = strtoul(argv[2], &end, 0);
+ if (*end != '\0') {
+ fprintf(stderr, "error: %s: bad stripe size '%s'\n",
+ argv[0], argv[2]);
+ return CMD_HELP;
+ }
+ // get the stripe offset
+ st_offset = strtoul(argv[3], &end, 0);
+ if (*end != '\0') {
+ fprintf(stderr, "error: %s: bad stripe offset '%s'\n",
+ argv[0], argv[3]);
+ return CMD_HELP;
+ }
+ // get the stripe count
+ st_count = strtoul(argv[4], &end, 0);
+ if (*end != '\0') {
+ fprintf(stderr, "error: %s: bad stripe count '%s'\n",
+ argv[0], argv[4]);
+ return CMD_HELP;
+ }
+
+ result = op_create_file(argv[1], st_size, st_offset, st_count);
+ if (result)
+ fprintf(stderr, "error: %s: create stripe file failed\n",
+ argv[0]);
+
+ return result;
+}
+
+static int lfs_find(int argc, char **argv)
+{
+ struct option long_opts[] = {
+ {"obd", 1, 0, 'o'},
+ {"quiet", 0, 0, 'q'},
+ {"recursive", 0, 0, 'r'},
+ {"verbose", 0, 0, 'v'},
+ {0, 0, 0, 0}
+ };
+ char short_opts[] = "ho:qrv";
+ int quiet, verbose, recursive, c, rc;
+ struct obd_uuid *obduuid = NULL;
+
+ optind = 0;
+ quiet = verbose = recursive = 0;
+ while ((c = getopt_long(argc, argv, short_opts,
+ long_opts, NULL)) != -1) {
+ switch (c) {
+ case 'o':
+ if (obduuid) {
+ fprintf(stderr, "error: %s: only one obduuid allowed",
+ argv[0]);
+ return CMD_HELP;
+ }
+ obduuid = (struct obd_uuid *)optarg;
+ break;
+ case 'q':
+ quiet++;
+ verbose = 0;
+ break;
+ case 'r':
+ recursive = 1;
+ break;
+ case 'v':
+ verbose++;
+ quiet = 0;
+ break;
+ case '?':
+ return CMD_HELP;
+ break;
+ default:
+ fprintf(stderr, "error: %s: option '%s' unrecognized\n",
+ argv[0], argv[optind - 1]);
+ return CMD_HELP;
+ break;
+ }
+ }
+
+ if (optind >= argc)
+ return CMD_HELP;
+
+ do {
+ rc = op_find(argv[optind], obduuid, recursive, verbose, quiet);
+ } while (++optind < argc && !rc);
+
+ if (rc)
+ fprintf(stderr, "error: %s: find failed\n", argv[0]);
+ return rc;
+}
+
+
+int main(int argc, char **argv)
+{
+ int rc;
+
+ setlinebuf(stdout);
+
+ Parser_init("lfs > ", cmdlist);
+
+ if (argc > 1) {
+ rc = Parser_execarg(argc - 1, argv + 1, cmdlist);
+ } else {
+ rc = Parser_commands();
+ }
+
+ return rc;
+}
--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ * Author: Peter J. Braam <braam@clusterfs.com>
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Robert Read <rread@clusterfs.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ */
+
+
+/* for O_DIRECTORY */
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <dirent.h>
+#include <stdarg.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <linux/types.h>
+#include <linux/unistd.h>
+
+#include <liblustre.h>
+#include <linux/obd.h>
+#include <linux/lustre_lib.h>
+#include <linux/lustre_lite.h>
+#include <linux/lustre_idl.h>
+#include <linux/obd_lov.h>
+
/* printf-style message to stderr, followed by ": <strerror> (<errno>)".
 * errno is captured on entry so the formatting calls can't clobber it. */
static void err_msg(char *fmt, ...)
{
        int saved_errno = errno;
        va_list ap;

        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);
        fprintf(stderr, ": %s (%d)\n", strerror(saved_errno), saved_errno);
}
+
+int op_create_file(char *name, long stripe_size, int stripe_offset,
+ int stripe_count)
+{
+ struct lov_mds_md a_striping;
+ int fd, result = 0;
+
+ /* Initialize IOCTL striping pattern structure */
+ a_striping.lmm_magic = LOV_MAGIC;
+ a_striping.lmm_stripe_size = stripe_size;
+ a_striping.lmm_stripe_offset = stripe_offset;
+ a_striping.lmm_stripe_count = stripe_count;
+
+ fd = open(name, O_CREAT | O_RDWR | O_LOV_DELAY_CREATE, 0644);
+ if (fd < 0) {
+ err_msg("unable to open '%s'",name);
+ result = -errno;
+ }
+ else if (ioctl(fd, LL_IOC_LOV_SETSTRIPE, &a_striping)) {
+ char *errmsg = "stripe already set";
+ if (errno != EEXIST && errno != EALREADY)
+ errmsg = strerror(errno);
+
+ fprintf(stderr, "error on ioctl for '%s' (%d): %s\n",
+ name, fd, errmsg);
+ result = -errno;
+ }
+ else if (close(fd) < 0) {
+ err_msg("error on close for '%s' (%d)", name, fd);
+ result = -errno;
+ }
+ return result;
+}
+
+
/* State shared by the 'lfs find' traversal routines. */
struct find_param {
        int recursive;                  /* descend into subdirectories */
        int verbose;                    /* print full striping details */
        int quiet;                      /* suppress informational output */
        struct obd_uuid *obduuid;       /* only report files on this OBD, or NULL */
        struct obd_ioctl_data data;     /* scratch header for LOV_GET_CONFIG */
        struct lov_desc desc;           /* LOV descriptor returned by the ioctl */
        int uuidslen;                   /* rounded size of the uuid array */
        char *buf;                      /* shared ioctl buffer (see prepare_find) */
        int buflen;
        struct obd_uuid *uuids;         /* aliases buf: target uuid array */
        struct lov_mds_md *lmm;         /* aliases buf: getstripe reply */
        int got_uuids;                  /* LOV config already fetched */
        int obdindex;                   /* index of obduuid, or OBD_NOT_FOUND */
        int max_ost_count;              /* targets the buffer can hold */
};

/* XXX Max obds per lov currently hardcoded to 1000 in lov/lov_obd.c */
#define MAX_LOV_UUID_COUNT 1000
#define OBD_NOT_FOUND (-1)
+
/* Size and allocate the shared ioctl buffer in *param.
 *
 * The one buffer must fit both a LOV_GET_CONFIG reply (ioctl data +
 * lov_desc + uuid array) and a getstripe reply (lov_mds_md + per-stripe
 * object info).  Both are sized for MAX_LOV_UUID_COUNT targets, then
 * scaled down to the 8192-byte ioctl ceiling.  Returns 0 on success,
 * 1 on allocation failure. */
static int prepare_find(struct find_param *param)
{
        int datalen, desclen;
        int cfglen, lmmlen;
        int max_ost_count = MAX_LOV_UUID_COUNT;

        datalen = size_round(sizeof(struct obd_ioctl_data));
        desclen = size_round(sizeof(struct lov_desc));
        param->uuidslen = size_round(max_ost_count * sizeof(struct obd_uuid));
        cfglen = datalen + desclen + param->uuidslen;
        lmmlen = lov_mds_md_size(max_ost_count);
        /* single buffer serves both request shapes; take the larger */
        if (cfglen > lmmlen)
                param->buflen = cfglen;
        else
                param->buflen = lmmlen;

        /* XXX max ioctl buffer size currently hardcoded to 8192 */
        if (param->buflen > 8192) {
                int nuuids, remaining, nluoinfos;

                /* shrink max_ost_count until both layouts fit in 8192 */
                param->buflen = 8192;
                nuuids = (param->buflen - datalen - desclen) / sizeof(struct obd_uuid);
                param->uuidslen = size_round(nuuids * sizeof(struct obd_uuid));
                remaining = nuuids * sizeof(struct obd_uuid);
                /* rounding uuidslen up may have eaten one uuid's space */
                if (param->uuidslen > remaining)
                        nuuids--;
                nluoinfos = (param->buflen - sizeof(struct lov_mds_md)) /
                        sizeof(*(param->lmm->lmm_objects));
                if (nuuids > nluoinfos)
                        max_ost_count = nluoinfos;
                else
                        max_ost_count = nuuids;

                cfglen = datalen + desclen + param->uuidslen;
                lmmlen = lov_mds_md_size(max_ost_count);
        }

        if ((param->buf = malloc(param->buflen)) == NULL) {
                err_msg("unable to allocate %d bytes of memory for ioctl's",
                        param->buflen);
                return 1;
        }

        /* lmm and uuids alias the same buffer; it is never used for both
         * purposes at the same time */
        param->lmm = (struct lov_mds_md *)param->buf;
        param->uuids = (struct obd_uuid *)param->buf;
        param->got_uuids = 0;
        param->obdindex = OBD_NOT_FOUND;
        param->max_ost_count = max_ost_count;

        return 0;
}
+
+static void cleanup_find(struct find_param *param)
+{
+ if (param->obduuid)
+ free(param->obduuid);
+ if (param->buf)
+ free(param->buf);
+}
+
+static void get_obd_uuids(DIR *dir, char *dname, struct find_param *param)
+{
+ int obdcount;
+ struct obd_uuid *uuidp;
+ int rc, i;
+
+ param->got_uuids = 1;
+ memset(¶m->data, 0, sizeof(struct obd_ioctl_data));
+ param->data.ioc_inllen1 = sizeof(struct lov_desc);
+ param->data.ioc_inlbuf1 = (char *)¶m->desc;
+ param->data.ioc_inllen2 = param->uuidslen;
+ param->data.ioc_inlbuf2 = (char *)param->uuids;
+
+ memset(¶m->desc, 0, sizeof(struct lov_desc));
+ param->desc.ld_tgt_count = param->max_ost_count;
+
+ if (obd_ioctl_pack(¶m->data, ¶m->buf, param->buflen)) {
+ fprintf(stderr, "internal buffer error from %s\n", dname);
+ return;
+ }
+
+ rc = ioctl(dirfd(dir), OBD_IOC_LOV_GET_CONFIG, param->buf);
+ if (rc) {
+ err_msg("error getting LOV config from %s", dname);
+ return;
+ }
+
+ if (obd_ioctl_unpack(¶m->data, param->buf, param->buflen)) {
+ err_msg("invalid reply from ioctl from %s", dname);
+ return;
+ }
+
+ obdcount = param->desc.ld_tgt_count;
+ if (obdcount == 0)
+ return;
+
+ if (param->obduuid) {
+ for (i = 0, uuidp = param->uuids; i < obdcount; i++, uuidp++) {
+ if (strncmp(param->obduuid->uuid, uuidp->uuid,
+ sizeof(*uuidp)) == 0) {
+ param->obdindex = i;
+ break;
+ }
+ }
+
+ if (param->obdindex == OBD_NOT_FOUND)
+ return;
+ } else if (!param->quiet) {
+ printf("OBDS:\n");
+ for (i = 0, uuidp = param->uuids; i < obdcount; i++, uuidp++)
+ printf("%4d: %s\n", i, uuidp->uuid);
+ }
+}
+
+static void process_file(DIR *dir, char *dname, char *fname, struct find_param *param)
+{
+ int rc, i;
+
+ strncpy((char *)param->lmm, fname, param->buflen);
+
+ rc = ioctl(dirfd(dir), IOC_MDC_GETSTRIPE, (void *)param->lmm);
+ if (rc) {
+ if (errno == ENODATA) {
+ if (!param->obduuid && !param->quiet)
+ fprintf(stderr,
+ "%s/%s has no stripe info\n",
+ dname, fname);
+ } else if (errno == EISDIR) {
+ fprintf(stderr, "process_file on directory %s/%s!\n",
+ dname, fname);
+ /*
+ add fname to directory list;
+ */
+ } else {
+ err_msg("IOC_MDC_GETSTRIPE ioctl failed");
+ }
+ return;
+ }
+
+ if ((param->obduuid && param->lmm->lmm_objects[param->obdindex].l_object_id) ||
+ (!param->obduuid && !param->quiet))
+ printf("%s/%s\n", dname, fname);
+
+ if (param->verbose) {
+ printf("lmm_magic: 0x%x\n", param->lmm->lmm_magic);
+ printf("lmm_object_id: "LPX64"\n", param->lmm->lmm_object_id);
+ printf("lmm_stripe_offset: %u\n", (int)param->lmm->lmm_stripe_offset);
+ printf("lmm_stripe_count: %u\n", (int)param->lmm->lmm_stripe_count);
+ printf("lmm_stripe_size: %u\n", (int)param->lmm->lmm_stripe_size);
+ printf("lmm_ost_count: %u\n", param->lmm->lmm_ost_count);
+ printf("lmm_stripe_pattern: %d\n", param->lmm->lmm_magic & 0xf);
+ }
+
+ if (param->verbose || !param->obduuid) {
+ long long oid;
+ int ost = param->lmm->lmm_stripe_offset;
+ int header = !param->quiet;
+
+ /* FIXME: temporary fix for bug 1612 */
+ if (param->lmm->lmm_ost_count == 0) {
+ oid = param->lmm->lmm_object_id;
+ if (header)
+ printf("\tobdidx\t\t objid\t\tobjid\n");
+ printf("\t%6u\t%14llu\t%#13llx\n", 0, oid, oid);
+ } else
+ for (i = 0; i < param->lmm->lmm_ost_count; i++, ost++) {
+ ost %= param->lmm->lmm_ost_count;
+ if ((oid = param->lmm->lmm_objects[ost].l_object_id)) {
+ if (header) {
+ printf("\tobdidx\t\t objid\t\tobjid\n");
+ header = 0;
+ }
+ printf("\t%6u\t%14llu\t%#13llx%s\n", ost,
+ oid, oid, param->obdindex == ost ? " *" : "");
+ }
+ }
+ printf("\n");
+ }
+}
+
+
+static void process_dir(DIR *dir, char *dname, struct find_param *param)
+{
+ struct dirent64 *dirp;
+ DIR *subdir;
+ char path[1024];
+
+ if (!param->got_uuids)
+ get_obd_uuids(dir, dname, param);
+
+ /* Handle the contents of the directory */
+ while ((dirp = readdir64(dir)) != NULL) {
+ if (!strcmp(dirp->d_name, ".") || !strcmp(dirp->d_name, ".."))
+ continue;
+
+ switch (dirp->d_type) {
+ case DT_UNKNOWN:
+ err_msg("\"%s\" is UNKNOWN type %d", dirp->d_name,
+ dirp->d_type);
+ /* If we cared we could stat the file to determine
+ * type and continue on here, but we don't since we
+ * know d_type should be valid for lustre and this
+ * tool only makes sense for lustre filesystems. */
+ return;
+ break;
+ case DT_DIR:
+ if (!param->recursive)
+ break;
+ strcpy(path, dname);
+ strcat(path, "/");
+ strcat(path, dirp->d_name);
+ subdir = opendir(path);
+ if (subdir == NULL) {
+ err_msg("\"%.40s\" opendir failed", path);
+ break;
+ }
+ process_dir(subdir, path, param);
+ closedir(subdir);
+ break;
+ case DT_REG:
+ process_file(dir, dname, dirp->d_name, param);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
/* Dispatch a user-supplied path: directories are walked with
 * process_dir(); anything else is split into parent dir + leaf and
 * handed to process_file() (the getstripe ioctl runs on the parent). */
static void process_path(char *path, struct find_param *param)
{
        char *fname, *dname;
        DIR *dir;

        fname = strrchr(path, '/');
        if (fname != NULL && fname[1] == '\0') {
                /* Trailing '/', it must be a dir */
                *fname = '\0';
                dir = opendir(path);
                if (dir == NULL) {
                        err_msg("\"%.40s\" opendir failed", path);
                } else {
                        process_dir(dir, path, param);
                        closedir(dir);
                }
        } else if ((dir = opendir(path)) != NULL) {
                /* No trailing '/', but it is still a dir */
                process_dir(dir, path, param);
                closedir(dir);
        } else {
                /* It must be a file (or other non-directory) */
                if (fname == NULL) {
                        dname = ".";
                        fname = path;
                } else {
                        /* split in place: path becomes the parent dir */
                        *fname = '\0';
                        fname++;
                        dname = path;
                }
                dir = opendir(dname);
                if (dir == NULL) {
                        err_msg("\"%.40s\" opendir failed", dname);
                } else {
                        /* files can be given before any dir was seen, so
                         * the LOV config may still need fetching here */
                        if (!param->got_uuids)
                                get_obd_uuids(dir, dname, param);
                        process_file(dir, dname, fname, param);
                        closedir(dir);
                }
        }
}
+
+
+int op_find(char *path, struct obd_uuid *obduuid, int recursive,
+ int verbose, int quiet)
+{
+ struct find_param param;
+ int ret = 0;
+
+ memset(¶m, 0, sizeof(param));
+ param.recursive = recursive;
+ param.verbose = verbose;
+ param.quiet = quiet;
+ if (obduuid) {
+ param.obduuid = (struct obd_uuid*)malloc(sizeof(struct obd_uuid));
+ if (param.obduuid == NULL) {
+ ret = 1;
+ goto out;
+ }
+ memcpy(param.obduuid, obduuid, sizeof(struct obd_uuid));
+ }
+
+ ret = prepare_find(¶m);
+ if (ret)
+ goto out;
+
+ process_path(path, ¶m);
+out:
+ cleanup_find(¶m);
+ return ret;
+}
+
--- /dev/null
+#! /bin/sh
+# Copyright (C) 2003 Cluster File Systems, Inc.
+# Create a Lustre configuration file
+#
+# Usage: lwizard
+#
+# Jerrifer <jerrifer@clusterfs.com>
+# wangdi <wangdi@clusterfs.com>
+
# fatal [exit_code] [message] -- print optional message and exit.
fatal()
{
	local rc=1

	if [ "$#" -gt "1" ]; then
		echo
		echo "$2"
	fi

	# honour the requested exit code; the old version hardcoded
	# "exit 1" and silently ignored its first argument
	[ "$#" -gt "0" ] && rc="$1"
	exit $rc
}
+
#print usage and exit
usage()
{
	# ${0##*/} is the program's basename; the second occurrence was
	# written "${##*/}" (missing the 0) and expanded to nothing
	cat <<EOF
Usage: ${0##*/} [OPTIONS]...

${0##*/} asks the user questions about their cluster configuration, and
writes an appropriate configuration file to config.xml.

Options:
  -o, --file=CONFIG_FILE
        write configuration to CONFIG_FILE (default: config.xml)
  --stripe_size=SIZE
        specify the size (in KB) of each stripe on an OST (default: 64)
  --stripe_cnt=COUNT
        specify the number of OSTs files are striped to (default: 1)
  --help
        to get this help
EOF

	exit 0
}
+
# succeed (return 0) only when $1 is a non-empty string of decimal digits
check_number()
{
	case "$1" in
	""|*[!0-9]*)
		return 1
		;;
	esac
	return 0
}
+
#parse options of this shell
get_option()
{
	local long_options="file:,mds_size:,ost_size:,stripe_size"
	local options

	long_options="$long_options:,stripe_cnt:,stripe_pattern"
	options=$(getopt -o o:h --long "$long_options":,help -- "$@")

	if [ $? -ne 0 ] ; then
		usage
	fi
	eval set -- "$options"

	while true ; do
		case "$1" in
		-o | --file)
			CONFIG_FILE=$2
			shift 2
			;;
		--stripe_size)
			STRIPE_SIZE=$(($2 * 1024))
			check_number $STRIPE_SIZE || fatal 1 "bad stripe_size"
			shift 2
			;;
		--stripe_cnt)
			STRIPE_CNT=$2
			check_number $STRIPE_CNT || fatal 1 "bad stripe_cnt"
			shift 2
			;;
		--stripe_pattern)
			# accepted by the getopt spec but previously never
			# handled, which made this loop spin forever
			STRIPE_PATTERN=$2
			check_number $STRIPE_PATTERN || fatal 1 "bad stripe_pattern"
			shift 2
			;;
		-h | --help)
			usage
			;;
		--)
			shift 1
			break
			;;
		*)
			# any other declared-but-unhandled option (e.g.
			# --mds_size) used to hang here; fail loudly instead
			fatal 1 "unhandled option: $1"
			;;
		esac
	done
}
+
# succeed (return 0) when $1 appears as a word of the list $2
in_list()
{
	local item

	for item in $2 ; do
		if [ "$item" = "$1" ]; then
			return 0
		fi
	done
	return 1
}
+
# read line from stdin
# read_ln <prompt> [default] [answer-flag]
# Prompt on stdout and read one line into ANS.  An empty reply takes the
# default (when given); when a third argument is present the reply must be
# one of the words in $ANSWER ("yes no") or the question is re-asked.
read_ln()
{
	local substr=$1
	local default_value
	local answer

	unset ANS

	[ $# -gt 1 ] && default_value=$2
	[ $# -gt 2 ] && answer=$3

	while [ -z "$ANS" ]; do
		echo -n "$substr"
		[ "$default_value" ] && echo -n " ($default_value)"
		echo -n ": "
		read ANS
		[ -z "$ANS" -a "$default_value" ] && ANS=$default_value
		# validate against the allowed answers, if any were requested
		if [ "$ANS" -a "$answer" ] ; then
			in_list "$ANS" "$ANSWER" || ANS=""
		fi
	done

	return 0
}
+
#ask user some questions to add a device
# add_device <type> <id> [first]
# Prompt for one device of <type> ("mds"/"ost").  When "first" is passed
# the "add another?" question is skipped.  On success sets
# DEVICE="hostname:device:id:name" and returns 0; returns 1 when the user
# answers "no" (i.e. is done adding devices of this type).
add_device()
{
	local first
	local hostname

	[ $# -gt 2 ] && first=$3

	if [ -z "$first" ] ; then
		read_ln "Do you want to add another $1 (yes/no)?" "no" "$ANSWER"
	else
		ANS="yes"
	fi

	[ "$ANS" = "no" ] && return 1

	echo "Creating $1 \"$1$2\"..."
	read_ln "Please enter $1$2's hostname"
	hostname=$ANS
	read_ln "Please enter $1$2's device"

	device=$ANS

	DEVICE="$hostname:$device:$2:$1$2"

	return 0
}
+
# get mds information
# prompt for the (single) mds and append its descriptor to MDS_LIST
# (removed the unused "host_name" local)
add_mds()
{
	local id=1

	while :; do
		add_device "mds" "$id" "first" || break
		in_list "$DEVICE" "$MDS_LIST" && continue
		MDS_LIST="$MDS_LIST $DEVICE" #add mds to MDS_LIST
		((id++))
		break
	done

	return 0
}
+
# repeatedly prompt for OSTs; every unique descriptor is appended to
# OST_LIST, and the loop ends when the user declines to add another
add_ost()
{
	local prompt_first="first"
	local ost_id=1

	while :; do
		add_device "ost" "$ost_id" "$prompt_first" || break
		if in_list "$DEVICE" "$OST_LIST"; then
			continue
		fi
		OST_LIST="$OST_LIST $DEVICE" #append the new ost
		((ost_id++))
		prompt_first=""
	done

	return 0
}
+
# ask user to add client to lustre
# prompt for the client mountpoint and record a single wildcard ("*")
# client entry -- every node mounts at the same path
add_client()
{
	read_ln "Please enter the mountpoint for your clients" "$DEFAULT_MNTPT"
	CLIENT_LIST="*:$ANS:client:client"
	return 0
}
+
#save node config into config file
# add_node <hostname> <config_file>
# Add the node and its network definition to the config via lmc, once per
# host (NODE_LIST remembers hosts already added).  Returns 1 on lmc error.
add_node()
{
	local node=$1
	local nettype=$DEFAULT_NETTYPE
	local config_file=$2

	in_list "$node" "$NODE_LIST" && return 0
	NODE_LIST="$NODE_LIST $node"

	$LMC -m "$config_file" --add node --node "$node" || return 1
	$LMC -m "$config_file" --add net --node "$node" --nid "$node" \
		--nettype "$nettype" || return 1
	return 0
}
+
# split a "host:device:id:name" descriptor (as built by add_device) into
# the HOST_NAME, DEVICE, DEVICE_ID and DEVICE_NAME globals
get_name_in_list()
{
	HOST_NAME=$(echo $1 | cut -d: -f1)
	DEVICE=$(echo $1 | cut -d: -f2)
	DEVICE_ID=$(echo $1 | cut -d: -f3)
	DEVICE_NAME=$(echo $1 | cut -d: -f4)
}
+
# following user input to create xml config file
# drives lmc to emit MDS_LIST/OST_LIST/CLIENT_LIST into $1, echoing each
# device name as it is added; returns 1 as soon as any lmc call fails
create_config()
{
	local mds_name
	local config_file=$1

	for mds in $MDS_LIST ; do
		get_name_in_list $mds
		echo -n " $DEVICE_NAME"
		add_node "$HOST_NAME" "$config_file" || return 1
		$LMC -m "$config_file" --add mds --node "$HOST_NAME" \
			--mds "$DEVICE_NAME" \
			--nid "$HOST_NAME" --fstype "$DEFAULT_FSTYPE" \
			--dev "$DEVICE" || return 1

		mds_name="$DEVICE_NAME"
	done

	#add lov information FIXME --stripe_sz and
	#--stripe_cnt should be input by user
	echo -n " lov1"
	$LMC -m "$config_file" --add lov --lov lov1 --mds "$mds_name" \
		--stripe_sz "$STRIPE_SIZE" --stripe_cnt "$STRIPE_CNT" \
		--stripe_pattern "$STRIPE_PATTERN" || return 1

	for ost in $OST_LIST ; do
		get_name_in_list $ost
		echo -n " $DEVICE_NAME"
		add_node "$HOST_NAME" "$config_file" || return 1
		$LMC -m "$config_file" --add ost --node "$HOST_NAME" \
			--ost "$DEVICE_NAME" \
			--lov lov1 --fstype "$DEFAULT_FSTYPE" \
			--dev "$DEVICE" || return 1
	done

	for client in $CLIENT_LIST ; do
		get_name_in_list $client
		echo -n " $DEVICE_NAME"
		add_node "client" "$config_file" || return 1
		# was "--nod client": a typo for the --node option
		$LMC -m "$config_file" --add mtpt --node client \
			--mds "$mds_name" --lov lov1 --path "$DEVICE" || return 1
	done
	echo
	return 0
}
+
#parse options
get_option "$@"

#some default definitions
LMC=${LMC:-"/usr/sbin/lmc"}
CONFIG_FILE=${CONFIG_FILE:-"config.xml"}
TMP_CONFIG_FILE=${TMP_CONFIG_FILE:-".config.xml.tmp"}
DEFAULT_FSTYPE=${DEFAULT_FSTYPE:-"ext3"}
DEFAULT_NETTYPE=${DEFAULT_NETTYPE:-"tcp"}
DEFAULT_MNTPT=${DEFAULT_MNTPT:-"/mnt/lustre"}
STRIPE_SIZE=${STRIPE_SIZE:-65536}
STRIPE_CNT=${STRIPE_CNT:-1}
STRIPE_PATTERN=${STRIPE_PATTERN:-0}
ANSWER="yes no"

#print program information
cat <<EOF
This script will help you create a Lustre configuration file.

EOF

#add mds to lustre
# "unset MDS_LIST" clears the variable itself; the old "unset \$MDS_LIST"
# tried to unset a variable *named by* its (usually empty) value
unset MDS_LIST
add_mds || fatal 1 "Cannot add mds to your lustre"

#add ost to lustre
unset OST_LIST
add_ost || fatal 1 "Cannot add ost to your lustre"

#add client to lustre
unset CLIENT_LIST
add_client || fatal 1 "Cannot add client to your lustre"

# write to a temp file first so a failed run never clobbers an existing config
rm -f "$TMP_CONFIG_FILE"
echo -n "Saving configuration to $CONFIG_FILE:"
create_config "$TMP_CONFIG_FILE" || \
	fatal 1 "There was an error saving the config file."
mv -f "$TMP_CONFIG_FILE" "$CONFIG_FILE" || \
	fatal 1 "There was an error saving the config file."

cat <<EOF

Your configuration has been saved to $CONFIG_FILE.
EOF
exit 0