Whamcloud - gitweb
Highmem deadlock avoidance (server version):
author: adilger <adilger>
Fri, 8 Nov 2002 18:25:53 +0000 (18:25 +0000)
committer: adilger <adilger>
Fri, 8 Nov 2002 18:25:53 +0000 (18:25 +0000)
- reserve highmem pages before we start doing kmaps to avoid deadlocks
  with multiple threads reserving vectors of pages

lustre/include/linux/lustre_net.h
lustre/include/linux/obd_support.h
lustre/mds/handler.c
lustre/obdclass/class_obd.c
lustre/obdclass/genops.c
lustre/obdecho/echo.c
lustre/osc/osc_request.c
lustre/ost/ost_handler.c

index 7382e60..5e2fc03 100644 (file)
  * is left in them.
  */
 
+#define LDLM_NUM_THREADS        4
 #define LDLM_NEVENTS   1024
 #define LDLM_NBUFS     10
 #define LDLM_BUFSIZE   (64 * 1024)
 #define LDLM_MAXREQSIZE        1024
 
+#define MDT_NUM_THREADS 8
 #define MDS_NEVENTS    1024
 #define MDS_NBUFS      10
 #define MDS_BUFSIZE    (64 * 1024)
 #define MDS_MAXREQSIZE 1024
 
+#define OST_NUM_THREADS 6
 #define OST_NEVENTS    min(num_physpages / 16, 32768UL)
 #define OST_NBUFS      min(OST_NEVENTS / 128, 256UL)
 #define OST_BUFSIZE    ((OST_NEVENTS > 4096UL ? 128 : 64) * 1024)
index 0df7a46..5d927d6 100644 (file)
 #ifndef _OBD_SUPPORT
 #define _OBD_SUPPORT
 
+#include <linux/config.h>
 #include <linux/autoconf.h>
 #include <linux/slab.h>
+#include <linux/highmem.h>
 #include <linux/kp30.h>
 
 /* global variables */
@@ -127,8 +129,8 @@ do {                                                                         \
 #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0))
 #define ll_bdevname(a) __bdevname((a))
 #define ll_lock_kernel lock_kernel()
-#else 
-#define ll_lock_kernel 
+#else
+#define ll_lock_kernel
 #define ll_bdevname(a) bdevname((a))
 #endif
 
@@ -174,4 +176,11 @@ do {                                                                    \
         (ptr) = (void *)0xdeadbeef;                                     \
 } while (0)
 
+#ifdef CONFIG_HIGHMEM
+extern void obd_highmem_get(int count);
+extern void obd_highmem_put(int count);
+#else
+#define obd_highmem_get(count) do {} while (0)
+#define obd_highmem_put(count) do {} while (0)
+#endif
 #endif
index 60bbce3..7f34755 100644 (file)
@@ -1468,7 +1468,6 @@ int mds_detach(struct obd_device *dev)
 
 }
 
-#define MDT_NUM_THREADS 8
 static int mdt_setup(struct obd_device *obddev, obd_count len, void *buf)
 {
         int i;
index 81a862d..ae29ee0 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/poll.h>
 #include <linux/init.h>
 #include <linux/list.h>
+#include <linux/highmem.h>
 #include <asm/io.h>
 #include <asm/ioctls.h>
 #include <asm/system.h>
@@ -85,14 +86,16 @@ static int obd_class_release(struct inode * inode, struct file * file)
 }
 
 
-inline void obd_data2conn(struct lustre_handle *conn, struct obd_ioctl_data *data)
+static inline void obd_data2conn(struct lustre_handle *conn,
+                                 struct obd_ioctl_data *data)
 {
         conn->addr = data->ioc_addr;
         conn->cookie = data->ioc_cookie;
 }
 
 
-inline void obd_conn2data(struct obd_ioctl_data *data, struct lustre_handle *conn)
+static inline void obd_conn2data(struct obd_ioctl_data *data,
+                                 struct lustre_handle *conn)
 {
         data->ioc_addr = conn->addr;
         data->ioc_cookie = conn->cookie;
@@ -364,7 +367,7 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
                 if (OBP(obd, attach))
                         err = OBP(obd,attach)(obd, sizeof(*data), data);
                 if (err) {
-                        if(data->ioc_inlbuf2)                                                
+                        if(data->ioc_inlbuf2)
                                 OBD_FREE(obd->obd_name, strlen(obd->obd_name)+1);
                         obd->obd_type = NULL;
 
@@ -500,7 +503,6 @@ static int obd_class_ioctl (struct inode * inode, struct file * filp,
         }
 
         case OBD_IOC_GETATTR: {
-
                 obd_data2conn(&conn, data);
                 err = obd_getattr(&conn, &data->ioc_obdo1, NULL);
                 if (!err)
@@ -662,6 +664,62 @@ static struct miscdevice obd_psdev = {
 
 void (*class_signal_connection_failure)(struct ptlrpc_connection *);
 
+#ifdef CONFIG_HIGHMEM
+#warning "using highmem accounting for deadlock avoidance"
+/* Allow at most 3/4 of the highmem mappings to be consumed by vector I/O
+ * requests.  This avoids deadlocks on servers which have a lot of clients
+ * doing vector I/O.  We don't need to do this for non-vector I/O requests
+ * because singleton requests will just block on the kmap itself and never
+ * deadlock waiting for additional kmaps to complete.
+ */
+#define OBD_HIGHMEM_MAX (LAST_PKMAP * 3 / 4)
+static atomic_t obd_highmem_count = ATOMIC_INIT(OBD_HIGHMEM_MAX);
+static DECLARE_WAIT_QUEUE_HEAD(obd_highmem_waitq);
+
+void obd_highmem_get(int count)
+{       /* Reserve 'count' units of the obd_highmem_count budget before the
+         * caller starts kmap()ing a vector of pages.  The budget starts at
+         * OBD_HIGHMEM_MAX and is handed back with obd_highmem_put().
+         *
+         * A request for exactly one unit is charged unconditionally and
+         * never sleeps here, so the counter may go negative.  A larger
+         * request sleeps until the whole amount can be taken in one step.
+         *
+         * NOTE(review): a 'count' larger than the counter can ever reach
+         * (presumably OBD_HIGHMEM_MAX, if gets and puts stay balanced) would
+         * leave the wait_event() condition below permanently false --
+         * confirm that callers bound their vector size.
+         */
+        //CERROR("getting %d kmap counts (%d/%d)\n", count,
+        //       atomic_read(&obd_highmem_count), OBD_HIGHMEM_MAX);
+        if (count == 1)         /* singleton: charge it without waiting */
+                atomic_dec(&obd_highmem_count);
+        else while (atomic_add_negative(-count, &obd_highmem_count)) { /*
+                 * The subtraction in the loop condition left the counter
+                 * negative, so the reservation does not fit right now: undo
+                 * it, report, sleep, and then try the subtraction again.
+                 *
+                 * The two statics below are shared by every caller and are
+                 * updated without any locking visible here, so the numbers
+                 * in the message are presumably only approximate. */
+                static long next_show = 0;      /* jiffies after which the
+                                                 * next CERROR may print */
+                static int skipped = 0;         /* passes not reported since
+                                                 * the last CERROR */
+
+                CDEBUG(D_OTHER, "negative kmap reserved count: %d\n",
+                       atomic_read(&obd_highmem_count));
+                atomic_add(count, &obd_highmem_count); /* undo failed take */
+
+                if (time_after(jiffies, next_show)) {
+                        CERROR("blocking %s (and %d others) for kmaps\n",
+                               current->comm, skipped);
+                        next_show = jiffies + 5*HZ;    /* rate limit: one
+                                                        * message per 5*HZ */
+                        skipped = 0;
+                } else
+                        skipped++;
+                wait_event(obd_highmem_waitq,
+                           atomic_read(&obd_highmem_count) >= count); /*
+                 * Seeing enough room here is only a hint: the reservation
+                 * itself is made by the loop condition, which may fail again
+                 * if the counter dropped in the meantime. */
+        }
+}
+
+void obd_highmem_put(int count)
+{       /* Hand 'count' units back to the obd_highmem_count budget and wake
+         * the tasks sleeping on obd_highmem_waitq so that they can re-check
+         * whether their own reservation now fits. */
+        atomic_add(count, &obd_highmem_count);
+        /* Wake up sleepers.  Sadly, this wakes up all of the tasks at once.
+         * We should have something smarter here, like:
+         *
+         *      while (atomic_read(&obd_highmem_count) > 0)
+         *              wake_up_nr(obd_highmem_waitq, 1);
+         *
+         * although we would need to set somewhere (probably obd_class_init):
+         *
+         *      obd_highmem_waitq.flags |= WQ_EXCLUSIVE;
+         *
+         * For now the wait_event() condition will handle this OK, I believe.
+         */
+        wake_up(&obd_highmem_waitq);
+}
+
+EXPORT_SYMBOL(obd_highmem_get);
+EXPORT_SYMBOL(obd_highmem_put);
+#endif
+
 EXPORT_SYMBOL(obd_dev);
 EXPORT_SYMBOL(obdo_cachep);
 EXPORT_SYMBOL(obd_memory);
@@ -715,13 +773,13 @@ static int __init init_obdclass(void)
                 obd->obd_minor = i;
 
         err = obd_init_caches();
-        
+
         if (err)
                 return err;
         obd_sysctl_init();
-        
+
         err=lprocfs_reg_main();
-        
+
         return 0;
 }
 
@@ -742,7 +800,7 @@ static void __exit cleanup_obdclass(void)
 
         obd_cleanup_caches();
         obd_sysctl_clean();
-        
+
         err = lprocfs_dereg_main();
 
         CERROR("obd memory leaked: %ld bytes\n", obd_memory);
index c996e62..fe40668 100644 (file)
@@ -92,7 +92,7 @@ int ll_sync_brw_cb(struct brw_cb_data *brw_cbd, int err, int phase)
                 if (atomic_dec_and_test(&brw_cbd->brw_refcount))
                         OBD_FREE(brw_cbd, sizeof(*brw_cbd));
                 RETURN(err);
-        } else                
+        } else
                 LBUG();
         EXIT;
         return 0;
index 9d7800e..e0e74dd 100644 (file)
@@ -12,8 +12,8 @@
  * and Andreas Dilger <adilger@clusterfs.com>
  */
 
-static char rcsid[] __attribute ((unused)) = "$Id: echo.c,v 1.44 2002/11/02 02:41:31 thantry Exp $";
-#define OBDECHO_VERSION "$Revision: 1.44 $"
+static char rcsid[] __attribute ((unused)) = "$Id: echo.c,v 1.45 2002/11/08 18:25:53 adilger Exp $";
+#define OBDECHO_VERSION "$Revision: 1.45 $"
 
 #define EXPORT_SYMTAB
 
@@ -272,6 +272,8 @@ int echo_preprw(int cmd, struct lustre_handle *conn, int objcount,
 
         *desc_private = (void *)DESC_PRIV;
 
+        obd_highmem_get(niocount);
+
         for (i = 0; i < objcount; i++, obj++) {
                 int gfp_mask = (obj->ioo_id & 1) ? GFP_HIGHUSER : GFP_KERNEL;
                 int verify = obj->ioo_id != 0;
@@ -319,6 +321,7 @@ preprw_cleanup:
                 __free_pages(r->page, 0);
                 atomic_dec(&obd->u.echo.eo_prep);
         }
+        obd_highmem_put(niocount);
         memset(res, 0, sizeof(*res) * niocount);
 
         return rc;
@@ -381,6 +384,7 @@ int echo_commitrw(int cmd, struct lustre_handle *conn, int objcount,
                                                  r->offset, obj->ioo_id);
 
                         kunmap(page);
+                        obd_highmem_put(1);
                         __free_pages(page, 0);
                         atomic_dec(&obd->u.echo.eo_prep);
                 }
@@ -396,6 +400,7 @@ commitrw_cleanup:
                 struct page *page = r->page;
 
                 kunmap(page);
+                obd_highmem_put(1);
                 __free_pages(page, 0);
                 atomic_dec(&obd->u.echo.eo_prep);
         }
index e8b81ac..b79e612 100644 (file)
@@ -326,6 +326,7 @@ static void unmap_and_decref_bulk_desc(void *data)
                 bulk = list_entry(tmp, struct ptlrpc_bulk_page, bp_link);
 
                 kunmap(bulk->bp_page);
+                obd_highmem_put(1);
         }
 
         ptlrpc_bulk_decref(desc);
@@ -412,6 +413,8 @@ static int osc_brw_read(struct lustre_handle *conn, struct lov_stripe_md *lsm,
         xid = ++connection->c_xid_out;       /* single xid for all pages */
         spin_unlock(&connection->c_lock);
 
+        obd_highmem_get(page_count);
+
         for (mapped = 0; mapped < page_count; mapped++) {
                 struct ptlrpc_bulk_page *bulk = ptlrpc_prep_bulk_page(desc);
                 if (bulk == NULL)
@@ -474,6 +477,7 @@ out_req:
 out_unmap:
         while (mapped-- > 0)
                 kunmap(pga[mapped].pg);
+        obd_highmem_put(page_count);
         OBD_FREE(cb_data, sizeof(*cb_data));
 out_desc:
         ptlrpc_bulk_decref(desc);
@@ -534,6 +538,8 @@ static int osc_brw_write(struct lustre_handle *conn, struct lov_stripe_md *md,
         cb_data->obd_data = local;
         cb_data->obd_size = page_count * sizeof(*local);
 
+        obd_highmem_get(page_count);
+
         for (mapped = 0; mapped < page_count; mapped++) {
                 local[mapped].addr = kmap(pga[mapped].pg);
 
@@ -606,6 +612,8 @@ out_unmap:
         while (mapped-- > 0)
                 kunmap(pga[mapped].pg);
 
+        obd_highmem_put(page_count);
+
         OBD_FREE(local, page_count * sizeof(*local));
 out_cb:
         OBD_FREE(cb_data, sizeof(*cb_data));
index f3e3081..e1bbaec 100644 (file)
@@ -575,8 +575,6 @@ out:
         return 0;
 }
 
-#define OST_NUM_THREADS 6
-
 /* mount the file system (secretly) */
 static int ost_setup(struct obd_device *obddev, obd_count len, void *buf)
 {
@@ -613,7 +611,7 @@ static int ost_setup(struct obd_device *obddev, obd_count len, void *buf)
 
         ost->ost_service = ptlrpc_init_svc(OST_NEVENTS, OST_NBUFS,
                                            OST_BUFSIZE, OST_MAXREQSIZE,
-                                           OST_REQUEST_PORTAL, OSC_REPLY_PORTAL, 
+                                           OST_REQUEST_PORTAL, OSC_REPLY_PORTAL,
                                            "self", ost_handle, "ost");
         if (!ost->ost_service) {
                 CERROR("failed to start service\n");