land clio.

author nikita <nikita>

Fri, 7 Nov 2008 23:54:43 +0000 (23:54 +0000)

committer nikita <nikita>

Fri, 7 Nov 2008 23:54:43 +0000 (23:54 +0000)
author nikita <nikita>
Fri, 7 Nov 2008 23:54:43 +0000 (23:54 +0000)
committer nikita <nikita>
Fri, 7 Nov 2008 23:54:43 +0000 (23:54 +0000)
diff --git a/lustre/ChangeLog b/lustre/ChangeLog

index bd543fd..c7d0d77 100644 (file)
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -14,6 +14,10 @@ tbd  Sun Microsystems, Inc.
         * File join has been disabled in this release, refer to Bugzilla 16929.
  
  Severity   : enhancement
+Bugzilla   : 14166
+Description: New client IO stack (CLIO).
+
+Severity   : enhancement
  Bugzilla   : 15393
  Description: Commit on sharing. Eliminate inter-client dependencies between
              uncommitted transactions by doing transaction commits.
diff --git a/lustre/autoMakefile.am b/lustre/autoMakefile.am

index 55aa1b6..3ad4024 100644 (file)
--- a/lustre/autoMakefile.am
+++ b/lustre/autoMakefile.am
@@ -42,7 +42,7 @@ ALWAYS_SUBDIRS := include lvfs obdclass ldlm ptlrpc osc lov obdecho \
  
  SERVER_SUBDIRS := obdfilter ost mds mgs mdt cmm mdd osd
  
-CLIENT_SUBDIRS := mdc lmv llite
+CLIENT_SUBDIRS := mdc lmv llite lclient
  
  QUOTA_SUBDIRS := quota
  
diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4

index 4899310..2a87e8f 100644 (file)
--- a/lustre/autoconf/lustre-core.m4
+++ b/lustre/autoconf/lustre-core.m4
@@ -258,9 +258,9 @@ LB_LINUX_TRY_COMPILE([
  # LC_FUNC_REGISTER_CACHE
  #
  # if register_cache() is defined by kernel
-# 
+#
  # There are two ways to shrink one customized cache in linux kernels. For the
-# kernels are prior than 2.6.5(?), register_cache() is used, and for latest 
+# kernels prior to 2.6.5(?), register_cache() is used, and for the latest
  # kernels, set_shrinker() is used instead.
  #
  AC_DEFUN([LC_FUNC_REGISTER_CACHE],
@@ -342,7 +342,7 @@ LB_LINUX_TRY_COMPILE([
          AC_MSG_RESULT([yes])
          AC_DEFINE(HAVE_DEV_SET_RDONLY, 1, [kernel has new dev_set_rdonly])
  ],[
-        AC_MSG_RESULT([no, Linux kernel source needs to be patches by lustre 
+        AC_MSG_RESULT([no, Linux kernel source needs to be patches by lustre
  kernel patches from Lustre version 1.4.3 or above.])
  ])
  ])
@@ -580,7 +580,7 @@ AC_DEFUN([LC_BIT_SPINLOCK_H],
  #
  # LC_POSIX_ACL_XATTR
  #
-# If we have xattr_acl.h 
+# If we have xattr_acl.h
  #
  AC_DEFUN([LC_XATTR_ACL],
  [LB_CHECK_FILE([$LINUX/include/linux/xattr_acl.h],[
@@ -729,7 +729,7 @@ AC_DEFUN([LC_CONFIG_SUNRPC],
  #
  AC_DEFUN([LC_CONFIG_GSS_KEYRING],
  [AC_MSG_CHECKING([whether to enable gss keyring backend])
- AC_ARG_ENABLE([gss_keyring], 
+ AC_ARG_ENABLE([gss_keyring],
                [AC_HELP_STRING([--disable-gss-keyring],
                                 [disable gss keyring backend])],
                [],[enable_gss_keyring='yes'])
@@ -757,7 +757,7 @@ m4_pattern_allow(AC_KERBEROS_V5)
  #
  AC_DEFUN([LC_CONFIG_GSS],
  [AC_MSG_CHECKING([whether to enable gss/krb5 support])
- AC_ARG_ENABLE([gss], 
+ AC_ARG_ENABLE([gss],
                 [AC_HELP_STRING([--enable-gss], [enable gss/krb5 support])],
                 [],[enable_gss='no'])
   AC_MSG_RESULT([$enable_gss])
@@ -949,7 +949,7 @@ LB_LINUX_TRY_COMPILE([
          AC_MSG_RESULT(no)
  ])
  ])
- 
+
  #
  # LC_STATFS_DENTRY_PARAM
  # starting from 2.6.18 linux kernel uses dentry instead of
@@ -990,7 +990,7 @@ LB_LINUX_TRY_COMPILE([
  ])
  ])
  
-# 
+#
  # LC_INVALIDATEPAGE_RETURN_INT
  # more 2.6 api changes.  return type for the invalidatepage
  # address_space_operation is 'void' in new kernels but 'int' in old
@@ -1048,7 +1048,7 @@ LB_LINUX_TRY_COMPILE([
  #include <linux/fs.h>
  ],[
         struct inode i;
-       i.i_blksize = 0; 
+       i.i_blksize = 0;
  ],[
         AC_MSG_RESULT(yes)
         AC_DEFINE(HAVE_INODE_BLKSIZE, 1,
@@ -1086,37 +1086,37 @@ LB_LINUX_TRY_COMPILE([
  EXTRA_KCFLAGS="$tmp_flags"
  ])
  
-# LC_GENERIC_FILE_WRITE
-# 2.6.19 introduce do_sync_write instead of
-# generic_file_write
-AC_DEFUN([LC_GENERIC_FILE_WRITE],
-[AC_MSG_CHECKING([use generic_file_write])
+# LC_FILE_WRITEV
+# 2.6.19 replaced writev with aio_write
+AC_DEFUN([LC_FILE_WRITEV],
+[AC_MSG_CHECKING([writev in fops])
  LB_LINUX_TRY_COMPILE([
          #include <linux/fs.h>
  ],[
-        int result = generic_file_read(NULL, NULL, 0, 0);
+        struct file_operations *fops;
+        fops->writev = NULL;
  ],[
          AC_MSG_RESULT(yes)
-        AC_DEFINE(HAVE_GENERIC_FILE_WRITE, 1,
-                [use generic_file_write])
+        AC_DEFINE(HAVE_FILE_WRITEV, 1,
+                [use fops->writev])
  ],[
         AC_MSG_RESULT(no)
  ])
  ])
  
  # LC_GENERIC_FILE_READ
-# 2.6.19 need to use do_sync_read instead of
-# generic_file_read
-AC_DEFUN([LC_GENERIC_FILE_READ],
-[AC_MSG_CHECKING([use generic_file_read])
+# 2.6.19 replaced readv with aio_read
+AC_DEFUN([LC_FILE_READV],
+[AC_MSG_CHECKING([readv in fops])
  LB_LINUX_TRY_COMPILE([
          #include <linux/fs.h>
  ],[
-        int result = generic_file_read(NULL, NULL, 0, 0);
+        struct file_operations *fops;
+        fops->readv = NULL;
  ],[
          AC_MSG_RESULT(yes)
-        AC_DEFINE(HAVE_GENERIC_FILE_READ, 1,
-                [use generic_file_read])
+        AC_DEFINE(HAVE_FILE_READV, 1,
+                [use fops->readv])
  ],[
          AC_MSG_RESULT(no)
  ])
@@ -1140,7 +1140,7 @@ LB_LINUX_TRY_COMPILE([
  ])
  
  # LC_CANCEL_DIRTY_PAGE
-# 2.6.20 introduse cancel_dirty_page instead of 
+# 2.6.20 introduced cancel_dirty_page instead of
  # clear_page_dirty.
  AC_DEFUN([LC_CANCEL_DIRTY_PAGE],
  [AC_MSG_CHECKING([kernel has cancel_dirty_page])
@@ -1348,7 +1348,7 @@ LB_LINUX_TRY_COMPILE([
          int i = unregister_blkdev(0,NULL);
  ],[
          AC_MSG_RESULT([yes])
-        AC_DEFINE(HAVE_UNREGISTER_BLKDEV_RETURN_INT, 1, 
+        AC_DEFINE(HAVE_UNREGISTER_BLKDEV_RETURN_INT, 1,
                  [unregister_blkdev return int])
  ],[
          AC_MSG_RESULT([no])
@@ -1467,7 +1467,7 @@ AC_TRY_RUN([
  #include <linux/autoconf.h>
  #include <linux/types.h>
  #undef __KERNEL__
-// block include 
+// block include
  #define __LINUX_POSIX_ACL_H
  
  # ifdef CONFIG_FS_POSIX_ACL
@@ -1504,7 +1504,7 @@ CFLAGS="$tmp_flags"
  ])
  
  #
-# check for crypto API 
+# check for crypto API
  #
  AC_DEFUN([LC_ASYNC_BLOCK_CIPHER],
  [AC_MSG_CHECKING([if kernel has block cipher support])
@@ -1637,15 +1637,15 @@ AC_DEFUN([LC_PROG_LINUX],
           # 2.6.19
           LC_INODE_BLKSIZE
           LC_VFS_READDIR_U64_INO
-         LC_GENERIC_FILE_READ
-         LC_GENERIC_FILE_WRITE
+         LC_FILE_WRITEV
+         LC_FILE_READV
  
           # 2.6.20
           LC_CANCEL_DIRTY_PAGE
  
           # raid5-zerocopy patch
           LC_PAGE_CONSTANT
-                 
+               
          # 2.6.22
           LC_INVALIDATE_BDEV_2ARG
           LC_ASYNC_BLOCK_CIPHER
@@ -1765,7 +1765,7 @@ LC_CONFIG_LIBLUSTRE_RECOVERY
  
  AC_DEFUN([LC_CONFIG_LRU_RESIZE],
  [AC_MSG_CHECKING([whether to enable lru self-adjusting])
-AC_ARG_ENABLE([lru_resize], 
+AC_ARG_ENABLE([lru_resize],
         AC_HELP_STRING([--enable-lru-resize],
                         [enable lru resize support]),
         [],[enable_lru_resize='yes'])
@@ -1781,7 +1781,7 @@ fi
  # whether to enable quota support
  #
  AC_DEFUN([LC_CONFIG_QUOTA],
-[AC_ARG_ENABLE([quota], 
+[AC_ARG_ENABLE([quota],
         AC_HELP_STRING([--enable-quota],
                         [enable quota support]),
         [],[enable_quota='default'])
@@ -1814,7 +1814,7 @@ fi
  #
  AC_DEFUN([LC_CONFIG_SPLIT],
  [AC_MSG_CHECKING([whether to enable split support])
-AC_ARG_ENABLE([split], 
+AC_ARG_ENABLE([split],
         AC_HELP_STRING([--enable-split],
                         [enable split support]),
         [],[enable_split='no'])
@@ -1823,7 +1823,7 @@ if test x$enable_split != xno; then
     AC_DEFINE(HAVE_SPLIT_SUPPORT, 1, [enable split support])
  fi
  ])
- 
+
  AC_DEFUN([LC_QUOTA_READ],
  [AC_MSG_CHECKING([if kernel supports quota_read])
  LB_LINUX_TRY_COMPILE([
@@ -1866,7 +1866,7 @@ LB_LINUX_TRY_COMPILE([
  #
  # LC_FUNC_RCU
  #
-# kernels prior than 2.6.0(?) have no RCU supported; in kernel 2.6.5(SUSE), 
+# kernels prior to 2.6.0(?) have no RCU support; in kernel 2.6.5(SUSE),
  # call_rcu takes three parameters.
  #
  AC_DEFUN([LC_FUNC_RCU],
@@ -1887,7 +1887,7 @@ LB_LINUX_TRY_COMPILE([
                  AC_DEFINE(HAVE_CALL_RCU_PARAM, 1, [call_rcu takes three parameters])
                  AC_MSG_RESULT([yes])
          ],[
-                AC_MSG_RESULT([no]) 
+                AC_MSG_RESULT([no])
          ])
  ],[
          AC_MSG_RESULT([no])
@@ -1895,7 +1895,7 @@ LB_LINUX_TRY_COMPILE([
  ])
  
  # LC_SECURITY_PLUG  # for SLES10 SP2
-# check security plug in sles10 sp2 kernel 
+# check security plug in sles10 sp2 kernel
  AC_DEFUN([LC_SECURITY_PLUG],
  [AC_MSG_CHECKING([If kernel has security plug support])
  LB_LINUX_TRY_COMPILE([
@@ -2085,6 +2085,7 @@ lustre/liblustre/Makefile
  lustre/liblustre/tests/Makefile
  lustre/llite/Makefile
  lustre/llite/autoMakefile
+lustre/lclient/Makefile
  lustre/lov/Makefile
  lustre/lov/autoMakefile
  lustre/lvfs/Makefile
diff --git a/lustre/cmm/cmm_device.c b/lustre/cmm/cmm_device.c

index 4308533..d2c435d 100644 (file)
--- a/lustre/cmm/cmm_device.c
+++ b/lustre/cmm/cmm_device.c
@@ -401,7 +401,7 @@ static struct lu_device *cmm_device_alloc(const struct lu_env *env,
                  if (!m->cmm_fld) {
                          cmm_device_free(env, l);
                          l = ERR_PTR(-ENOMEM);
-        }
+                }
          }
          RETURN(l);
  }
@@ -448,14 +448,14 @@ static int cmm_device_init(const struct lu_env *env, struct lu_device *d,
          ls = cmm2lu_dev(m)->ld_site;
          lu_site2md(ls)->ms_client_fld = m->cmm_fld;
          err = cmm_procfs_init(m, name);
-        
+
          RETURN(err);
  }
  
  static struct lu_device *cmm_device_fini(const struct lu_env *env,
                                           struct lu_device *ld)
  {
-       struct cmm_device *cm = lu2cmm_dev(ld);
+        struct cmm_device *cm = lu2cmm_dev(ld);
          struct mdc_device *mc, *tmp;
          struct lu_site *ls;
          ENTRY;
diff --git a/lustre/cmm/cmm_object.c b/lustre/cmm/cmm_object.c

index 2289be3..3309687 100644 (file)
--- a/lustre/cmm/cmm_object.c
+++ b/lustre/cmm/cmm_object.c
@@ -116,8 +116,8 @@ struct lu_object *cmm_object_alloc(const struct lu_env *env,
                  struct cml_object *clo;
  
                  OBD_ALLOC_PTR(clo);
-               if (clo != NULL) {
-                       lo = &clo->cmm_obj.cmo_obj.mo_lu;
+                if (clo != NULL) {
+                        lo = &clo->cmm_obj.cmo_obj.mo_lu;
                          lu_object_init(lo, NULL, ld);
                          clo->cmm_obj.cmo_obj.mo_ops = &cml_mo_ops;
                          clo->cmm_obj.cmo_obj.mo_dir_ops = &cml_dir_ops;
@@ -127,8 +127,8 @@ struct lu_object *cmm_object_alloc(const struct lu_env *env,
                  struct cmr_object *cro;
  
                  OBD_ALLOC_PTR(cro);
-               if (cro != NULL) {
-                       lo = &cro->cmm_obj.cmo_obj.mo_lu;
+                if (cro != NULL) {
+                        lo = &cro->cmm_obj.cmo_obj.mo_lu;
                          lu_object_init(lo, NULL, ld);
                          cro->cmm_obj.cmo_obj.mo_ops = &cmr_mo_ops;
                          cro->cmm_obj.cmo_obj.mo_dir_ops = &cmr_dir_ops;
@@ -199,9 +199,9 @@ static int cml_object_print(const struct lu_env *env, void *cookie,
  }
  
  static const struct lu_object_operations cml_obj_ops = {
-       .loo_object_init    = cml_object_init,
-       .loo_object_free    = cml_object_free,
-       .loo_object_print   = cml_object_print
+        .loo_object_init    = cml_object_init,
+        .loo_object_free    = cml_object_free,
+        .loo_object_print   = cml_object_print
  };
  
  /* CMM local md_object operations */
@@ -831,9 +831,9 @@ static int cmr_object_print(const struct lu_env *env, void *cookie,
  }
  
  static const struct lu_object_operations cmr_obj_ops = {
-       .loo_object_init    = cmr_object_init,
-       .loo_object_free    = cmr_object_free,
-       .loo_object_print   = cmr_object_print
+        .loo_object_init    = cmr_object_init,
+        .loo_object_free    = cmr_object_free,
+        .loo_object_print   = cmr_object_print
  };
  
  /* CMM remote md_object operations. All are invalid */
diff --git a/lustre/cmm/mdc_device.c b/lustre/cmm/mdc_device.c

index 8c75a6c..5507962 100644 (file)
--- a/lustre/cmm/mdc_device.c
+++ b/lustre/cmm/mdc_device.c
@@ -89,7 +89,7 @@ static int mdc_obd_update(struct obd_device *host,
                  CDEBUG(D_INFO, "Update connect_flags: "LPX64"\n",
                         conn_data->ocd_connect_flags);
          }
-        
+
          RETURN(rc);
  }
  /* MDC OBD is set up already and connected to the proper MDS
@@ -146,9 +146,9 @@ static int mdc_obd_add(const struct lu_env *env,
                  ocd->ocd_ibits_known = MDS_INODELOCK_UPDATE;
                  ocd->ocd_connect_flags = OBD_CONNECT_VERSION |
                                           OBD_CONNECT_ACL |
-                                         OBD_CONNECT_LCL_CLIENT | 
+                                         OBD_CONNECT_LCL_CLIENT |
                                           OBD_CONNECT_MDS_CAPA |
-                                         OBD_CONNECT_OSS_CAPA | 
+                                         OBD_CONNECT_OSS_CAPA |
                                           OBD_CONNECT_IBITS |
                                           OBD_CONNECT_MDS_MDS |
                                           OBD_CONNECT_FID |
@@ -173,7 +173,7 @@ static int mdc_obd_add(const struct lu_env *env,
                                  mdc->obd_upcall.onu_upcall = mdc_obd_update;
                          }
                  }
-                
+
                  if (rc) {
                          obd_disconnect(desc->cl_exp);
                          desc->cl_exp = NULL;
@@ -205,7 +205,7 @@ static int mdc_obd_del(const struct lu_env *env, struct mdc_device *mc,
                  mdc_obd->obd_force = mdt_obd->obd_force;
                  mdc_obd->obd_fail = 0;
          }
-        
+
          rc = obd_fid_fini(desc->cl_exp);
          if (rc)
                  CERROR("Fid fini error %d\n", rc);
@@ -246,7 +246,7 @@ static int mdc_process_config(const struct lu_env *env,
  }
  
  static const struct lu_device_operations mdc_lu_ops = {
-       .ldo_object_alloc   = mdc_object_alloc,
+        .ldo_object_alloc   = mdc_object_alloc,
          .ldo_process_config = mdc_process_config
  };
  
@@ -254,12 +254,12 @@ void cmm_mdc_init_ea_size(const struct lu_env *env, struct mdc_device *mc,
                        int max_mdsize, int max_cookiesize)
  {
          struct obd_device *obd = class_exp2obd(mc->mc_desc.cl_exp);
-       
+
          obd->u.cli.cl_max_mds_easize = max_mdsize;
          obd->u.cli.cl_max_mds_cookiesize = max_cookiesize;
  }
  
-static int mdc_device_init(const struct lu_env *env, struct lu_device *ld, 
+static int mdc_device_init(const struct lu_env *env, struct lu_device *ld,
                             const char *name, struct lu_device *next)
  {
          return 0;
@@ -286,10 +286,9 @@ static struct lu_device *mdc_device_alloc(const struct lu_env *env,
          } else {
                  md_device_init(&mc->mc_md_dev, ldt);
                  mc->mc_md_dev.md_ops = &mdc_md_ops;
-               ld = mdc2lu_dev(mc);
+                ld = mdc2lu_dev(mc);
                  ld->ld_ops = &mdc_lu_ops;
                  sema_init(&mc->mc_fid_sem, 1);
-
          }
  
          RETURN (ld);
@@ -300,7 +299,7 @@ static struct lu_device *mdc_device_free(const struct lu_env *env,
  {
          struct mdc_device *mc = lu2mdc_dev(ld);
  
-       LASSERTF(atomic_read(&ld->ld_ref) == 0,
+        LASSERTF(atomic_read(&ld->ld_ref) == 0,
                   "Refcount = %i\n", atomic_read(&ld->ld_ref));
          LASSERT(list_empty(&mc->mc_linkage));
          md_device_fini(&mc->mc_md_dev);
diff --git a/lustre/cmm/mdc_object.c b/lustre/cmm/mdc_object.c

index 36c7678..2e884ba 100644 (file)
--- a/lustre/cmm/mdc_object.c
+++ b/lustre/cmm/mdc_object.c
@@ -176,7 +176,7 @@ static int mdc_req2attr_update(const struct lu_env *env,
                  LASSERT(ma->ma_capa != NULL);
                  *ma->ma_capa = *capa;
          }
-                
+
          if ((body->valid & OBD_MD_FLEASIZE) || (body->valid & OBD_MD_FLDIREA)) {
                  if (body->eadatasize == 0) {
                          CERROR("No size defined for easize field\n");
@@ -189,7 +189,7 @@ static int mdc_req2attr_update(const struct lu_env *env,
                          RETURN(-EPROTO);
  
                  LASSERT(ma->ma_lmm != NULL);
-                LASSERT(ma->ma_lmm_size >= body->eadatasize); 
+                LASSERT(ma->ma_lmm_size >= body->eadatasize);
                  ma->ma_lmm_size = body->eadatasize;
                  memcpy(ma->ma_lmm, md, ma->ma_lmm_size);
                  ma->ma_valid |= MA_LOV;
@@ -207,7 +207,7 @@ static int mdc_req2attr_update(const struct lu_env *env,
                          RETURN(-EPROTO);
                  }
  
-                cookie = req_capsule_server_sized_get(&req->rq_pill, 
+                cookie = req_capsule_server_sized_get(&req->rq_pill,
                                                        &RMF_LOGCOOKIES,
                                                        body->aclsize);
                  if (cookie == NULL)
@@ -226,7 +226,7 @@ static int mdc_req2attr_update(const struct lu_env *env,
                          RETURN(-EPROTO);
                  }
  
-                acl = req_capsule_server_sized_get(&req->rq_pill, 
+                acl = req_capsule_server_sized_get(&req->rq_pill,
                                                     &RMF_ACL,
                                                     body->aclsize);
                  if (acl == NULL)
@@ -349,7 +349,7 @@ static int mdc_object_create(const struct lu_env *env,
          mci = mdc_info_init(env);
          mci->mci_opdata.op_bias = MDS_CROSS_REF;
          mci->mci_opdata.op_fid2 = *lu_object_fid(&mo->mo_lu);
-        
+
          /* Parent fid is needed to create dotdot on the remote node. */
          mci->mci_opdata.op_fid1 = *(spec->u.sp_pfid);
          mci->mci_opdata.op_mod_time = la->la_ctime;
@@ -572,7 +572,7 @@ static int mdc_rename_tgt(const struct lu_env *env, struct md_object *mo_p,
  
          RETURN(rc);
  }
-/* 
+/*
   * Return resulting fid in sfid
   * 0: fids are not relatives
   * fid: fid at which search stopped
@@ -594,7 +594,7 @@ static int mdc_is_subdir(const struct lu_env *env, struct md_object *mo,
                  body = req_capsule_server_get(&mci->mci_req->rq_pill,
                                                &RMF_MDT_BODY);
                  LASSERT(body->valid & OBD_MD_FLID);
-        
+
                  CDEBUG(D_INFO, "Remote mdo_is_subdir(), new src "DFID"\n",
                         PFID(&body->fid1));
                  *sfid = body->fid1;
diff --git a/lustre/doc/lock-ordering b/lustre/doc/lock-ordering

new file mode 100644 (file)

index 0000000..3bea748
--- /dev/null
+++ b/lustre/doc/lock-ordering
@@ -0,0 +1,309 @@
+/* this is dot(1) input file for lock-ordering diagram */
+/* it should be passed through C preprocessor first */
+/* cpp -P -DFITPAGE lock-ordering | tred | dot -Tps | gv -media a4 - */
+
+/*
+sb->s_umount
+    libcfs_nidstring_lock
+    obd_dev_lock
+    g_uuid_lock
+    obd_types_lock
+    type->obd_type_lock
+    obd->obd_dev_lock
+    handle_base_lock
+    bucket->lock
+    _lprocfs_lock
+    the_lnet.ln_lock
+        request->rq_lock
+    ptlrpc_all_services_lock
+    service->srv_lock
+    shrinker_rwsem
+    conn_lock
+        hash_body->lchb_hash_tables[i].lhb_lock
+    hash_body->lchb_hash_tables[i].lhb_lock
+    imp->imp_lock
+    olg->olg_lock
+    cli->cl_sem
+        handle_base_lock
+        bucket->lock
+        obd->obd_dev_lock
+            ref->lf_guard
+            hash_body->lchb_hash_tables[i].lhb_lock
+        h->h_lock
+        _lprocfs_lock
+        imp->imp_lock
+            h->h_lock
+        policy_lock
+        null_sec.ps_lock
+        ptlrpc_last_xid_lock
+        set->set_new_req_lock
+    h->h_lock
+    ns->ns_hash_lock
+    ns->ns_unused_lock
+    lock->l_lock
+    null_sec.ps_lock
+    ptlrpc_last_xid_lock
+    request->rq_lock
+    ksocknal_data.ksnd_global_lock
+    at->at_lock
+    fld->lcf_lock
+    obd->obd_pool_lock
+    obd->obd_osfs_lock
+    lov->lov_qos.lq_rw_sem
+    sbi->ll_lco.lco_lock
+    cache->fci_lock
+    inode_lock
+    dcache_lock
+        dentry->d_lock
+slock-AF_INET/1
+    ksocknal_data.ksnd_global_lock
+        ksocknal_data.ksnd_connd_lock
+        kss->kss_lock
+pl->pl_lock
+    obd->obd_pool_lock
+inode->i_mutex
+    ns->ns_unused_lock
+    ns->ns_hash_lock
+    imp->imp_lock
+    null_sec.ps_lock
+    ptlrpc_last_xid_lock
+    bucket->lock
+    lock->l_lock
+        res->lr_lock
+            ns->ns_unused_lock
+            bucket->lock
+                h->h_lock
+            res->lr_lock/1
+            inode_lock
+            osc_ast_guard_class
+                ref->lf_guard
+    ksocknal_data.ksnd_global_lock
+    at->at_lock
+    h->h_lock
+    blp->blp_lock
+    cache->fci_lock
+    obd->obd_pool_lock
+    fld->lcf_lock
+    pl->pl_lock
+    lu_site_guard_class
+    files_lock
+lov->lo_type_guard
+    h->coh_lock_guard
+    ref->lf_guard
+    cl_lock_guard_class
+        ref->lf_guard
+        cl_lock_guard_class#2
+            cl_lock_guard_class#2
+            ref->lf_guard
+            ns->ns_hash_lock
+            ns->ns_unused_lock
+            imp->imp_lock
+            null_sec.ps_lock
+            ptlrpc_last_xid_lock
+            handle_base_lock
+            bucket->lock
+            lock->l_lock
+            set->set_new_req_lock
+            h->h_lock
+        h->coh_lock_guard
+        h->coh_page_guard
+
+*/
+#define CATTR fontsize=8 /*, fontname=Helvetica */
+#define NATTR CATTR
+#define EATTR CATTR
+
+#define SYSATTR color=yellow, style=filled
+#define PSEUDOATTR color=pink, style=filled, peripheries=2
+
+#define BLOCKATTR shape=ellipse
+#define SPINATTR shape=box
+
+#define CONDATTR color=blue, peripheries=2, BLOCKATTR
+
+#define MARKBLOCK(name) /* name -> schedulable [style=invis, weight=0] */
+
+#define SBLOCK(name, l) name [label=l, NATTR, BLOCKATTR, SYSATTR]; MARKBLOCK(name)
+
+#define SPSEUDO(name) name [NATTR, BLOCKATTR, PSEUDOATTR]; MARKBLOCK(name)
+
+#define LBLOCK(name, l) name [label=l, NATTR, BLOCKATTR]; MARKBLOCK(name)
+
+#define RCOND(name, l) name [label=l, NATTR, CONDATTR]; MARKBLOCK(name)
+
+#define MARKSPIN(name) /* schedulable -> name [style=invis, weight=0] */
+
+#define SSPIN(name, l) name [label=l, NATTR, SYSATTR, SPINATTR]; MARKSPIN(name)
+#define LSPIN(name, l) name [label=l, NATTR, SPINATTR]; MARKSPIN(name)
+
+#define ARC(from, to, func, ...) from -> to [EATTR, label=func, ## __VA_ARGS__]
+
+digraph locks {
+
+    subgraph blocking {
+        SBLOCK(sb__s_umount, "sb->s_umount")
+        LBLOCK(_lprocfs_lock, "_lprocfs_lock")
+        LBLOCK(cli__cl_sem, "cli->cl_sem")
+        SBLOCK(shrinker_rwsem, "shrinker_rwsem")
+        LBLOCK(lov__lov_qos_lq_rw_sem, "lov->lov_qos.lq_rw_sem")
+        SBLOCK(inode__i_mutex, "inode->i_mutex")
+        LBLOCK(lov__lo_type_guard, "lov->lo_type_guard")
+        LBLOCK(cl_lock_guard_class, "cl_lock_guard_class")
+    }
+
+    subgraph spin {
+        LSPIN(h__coh_lock_guard, "h->coh_lock_guard")
+        LSPIN(h__coh_page_guard, "h->coh_page_guard")
+        LSPIN(libcfs_nidstring_lock, "libcfs_nidstring_lock")
+        LSPIN(obd_dev_lock, "obd_dev_lock")
+        LSPIN(g_uuid_lock, "g_uuid_lock")
+        LSPIN(obd_types_lock, "obd_types_lock")
+        LSPIN(obd_type__obd_type_lock, "obd_type->obd_type_lock")
+        LSPIN(obd__obd_dev_lock, "obd->obd_dev_lock")
+        LSPIN(handle_base_lock, "handle_base_lock")
+        LSPIN(bucket__lock, "bucket->lock")
+        LSPIN(the_lnet_ln_lock, "the_lnet.ln_lock")
+        LSPIN(request__rq_lock, "request->rq_lock")
+        LSPIN(hash_body__lchb_hash_tables_i__lhb_lock, "hash_body->lchb_hash_tables[i].lhb_lock")
+        LSPIN(imp__imp_lock, "imp->imp_lock")
+        LSPIN(ref__lf_guard, "ref->lf_guard")
+        LSPIN(h__h_lock, "h->h_lock")
+        LSPIN(null_sec_ps_lock, "null_sec.ps_lock")
+        LSPIN(set__set_new_req_lock, "set->set_new_req_lock")
+        LSPIN(ns__ns_hash_lock, "ns->ns_hash_lock")
+        LSPIN(ns__ns_unused_lock, "ns->ns_unused_lock")
+        LSPIN(lock__l_lock, "lock->l_lock")
+        LSPIN(ksocknal_data_ksnd_global_lock, "ksocknal_data.ksnd_global_lock")
+        LSPIN(at__at_lock, "at->at_lock")
+        LSPIN(fld__lcf_lock, "fld->lcf_lock")
+        LSPIN(obd__obd_pool_lock, "obd->obd_pool_lock")
+        LSPIN(service__srv_lock, "service->srv_lock")
+        LSPIN(obd__obd_osfs_lock, "obd->obd_osfs_lock")
+        LSPIN(sbi__ll_lco_lco_lock, "sbi->ll_lco.lco_lock")
+        LSPIN(cache__fci_lock, "cache->fci_lock")
+        SSPIN(inode_lock, "inode_lock")
+        SSPIN(dcache_lock, "dcache_lock")
+        SSPIN(dentry__d_lock, "dentry->d_lock")
+        LSPIN(ksocknal_data_ksnd_connd_lock, "ksocknal_data.ksnd_connd_lock")
+        LSPIN(kss__kss_lock, "kss->kss_lock")
+        LSPIN(pl__pl_lock, "pl->pl_lock")
+        LSPIN(osc_ast_guard_class, "osc_ast_guard_class")
+        LSPIN(blp__blp_lock, "blp->blp_lock")
+        LSPIN(lu_site_guard_class, "lu_site_guard_class")
+        SSPIN(files_lock, "files_lock")
+        LSPIN(ptlrpc_all_services_lock, "ptlrpc_all_services_lock")
+        LSPIN(conn_lock, "conn_lock")
+        LSPIN(olg__olg_lock, "olg->olg_lock")
+        LSPIN(policy_lock, "policy_lock")
+        LSPIN(ptlrpc_last_xid_lock, "ptlrpc_last_xid_lock")
+    }
+
+ARC(sb__s_umount, libcfs_nidstring_lock, "")
+ARC(sb__s_umount, obd_dev_lock, "")
+ARC(sb__s_umount, g_uuid_lock, "")
+ARC(sb__s_umount, obd_types_lock, "")
+ARC(sb__s_umount, obd_type__obd_type_lock, "") /* must name the node declared via LSPIN in subgraph spin */
+ARC(sb__s_umount, obd__obd_dev_lock, "")
+ARC(sb__s_umount, handle_base_lock, "")
+ARC(sb__s_umount, bucket__lock, "")
+ARC(sb__s_umount, _lprocfs_lock, "")
+ARC(sb__s_umount, the_lnet_ln_lock, "")
+ARC(sb__s_umount, ptlrpc_all_services_lock, "")
+ARC(sb__s_umount, service__srv_lock, "")
+ARC(sb__s_umount, shrinker_rwsem, "")
+ARC(sb__s_umount, conn_lock, "")
+ARC(sb__s_umount, hash_body__lchb_hash_tables_i__lhb_lock, "")
+ARC(sb__s_umount, imp__imp_lock, "")
+ARC(sb__s_umount, olg__olg_lock, "")
+ARC(sb__s_umount, cli__cl_sem, "")
+ARC(sb__s_umount, h__h_lock, "")
+ARC(sb__s_umount, ns__ns_hash_lock, "")
+ARC(sb__s_umount, ns__ns_unused_lock, "")
+ARC(sb__s_umount, lock__l_lock, "")
+ARC(sb__s_umount, null_sec_ps_lock, "")
+ARC(sb__s_umount, ptlrpc_last_xid_lock, "")
+ARC(sb__s_umount, request__rq_lock, "")
+ARC(sb__s_umount, ksocknal_data_ksnd_global_lock, "")
+ARC(sb__s_umount, at__at_lock, "")
+ARC(sb__s_umount, fld__lcf_lock, "")
+ARC(sb__s_umount, obd__obd_pool_lock, "")
+ARC(sb__s_umount, obd__obd_osfs_lock, "")
+ARC(sb__s_umount, lov__lov_qos_lq_rw_sem, "")
+ARC(sb__s_umount, sbi__ll_lco_lco_lock, "")
+ARC(sb__s_umount, cache__fci_lock, "")
+ARC(sb__s_umount, inode_lock, "")
+ARC(sb__s_umount, dcache_lock, "")
+
+ARC(the_lnet_ln_lock, request__rq_lock, "")
+ARC(conn_lock, hash_body__lchb_hash_tables_i__lhb_lock, "")
+ARC(cli__cl_sem, handle_base_lock, "")
+ARC(cli__cl_sem, bucket__lock, "")
+ARC(cli__cl_sem, obd__obd_dev_lock, "")
+ARC(cli__cl_sem, h__h_lock, "")
+ARC(cli__cl_sem, _lprocfs_lock, "")
+ARC(cli__cl_sem, imp__imp_lock, "")
+ARC(cli__cl_sem, policy_lock, "")
+ARC(cli__cl_sem, null_sec_ps_lock, "")
+ARC(cli__cl_sem, ptlrpc_last_xid_lock, "")
+ARC(cli__cl_sem, set__set_new_req_lock, "")
+
+ARC(obd__obd_dev_lock, ref__lf_guard, "")
+ARC(obd__obd_dev_lock, hash_body__lchb_hash_tables_i__lhb_lock, "")
+ARC(imp__imp_lock, h__h_lock, "")
+
+ARC(dcache_lock, dentry__d_lock, "")
+
+ARC(ksocknal_data_ksnd_global_lock, ksocknal_data_ksnd_connd_lock, "")
+ARC(ksocknal_data_ksnd_global_lock, kss__kss_lock, "")
+ARC(pl__pl_lock, obd__obd_pool_lock, "")
+
+ARC(inode__i_mutex, ns__ns_unused_lock, "")
+ARC(inode__i_mutex, ns__ns_hash_lock, "")
+ARC(inode__i_mutex, imp__imp_lock, "")
+ARC(inode__i_mutex, null_sec_ps_lock, "")
+ARC(inode__i_mutex, ptlrpc_last_xid_lock, "")
+ARC(inode__i_mutex, bucket__lock, "")
+ARC(inode__i_mutex, lock__l_lock, "")
+ARC(inode__i_mutex, ksocknal_data_ksnd_global_lock, "")
+ARC(inode__i_mutex, at__at_lock, "")
+ARC(inode__i_mutex, h__h_lock, "")
+ARC(inode__i_mutex, blp__blp_lock, "")
+ARC(inode__i_mutex, cache__fci_lock, "")
+ARC(inode__i_mutex, obd__obd_pool_lock, "")
+ARC(inode__i_mutex, fld__lcf_lock, "")
+ARC(inode__i_mutex, pl__pl_lock, "")
+ARC(inode__i_mutex, lu_site_guard_class, "")
+ARC(inode__i_mutex, files_lock, "")
+
+ARC(lock__l_lock, res__lr_lock, "")
+ARC(res__lr_lock, ns__ns_unused_lock, "")
+ARC(res__lr_lock, bucket__lock, "")
+ARC(res__lr_lock, res__lr_lock, "")
+ARC(res__lr_lock, inode_lock, "")
+ARC(res__lr_lock, osc_ast_guard_class, "")
+
+ARC(osc_ast_guard_class, ref__lf_guard, "")
+ARC(bucket__lock, h__h_lock, "")
+
+ARC(cl_lock_guard_class, cl_lock_guard_class, "")
+ARC(cl_lock_guard_class, ref__lf_guard, "")
+ARC(cl_lock_guard_class, ns__ns_hash_lock, "")
+ARC(cl_lock_guard_class, ns__ns_unused_lock, "")
+ARC(cl_lock_guard_class, imp__imp_lock, "")
+ARC(cl_lock_guard_class, null_sec_ps_lock, "")
+ARC(cl_lock_guard_class, ptlrpc_last_xid_lock, "")
+ARC(cl_lock_guard_class, handle_base_lock, "")
+ARC(cl_lock_guard_class, bucket__lock, "")
+ARC(cl_lock_guard_class, lock__l_lock, "")
+ARC(cl_lock_guard_class, set__set_new_req_lock, "")
+ARC(cl_lock_guard_class, h__h_lock, "")
+ARC(cl_lock_guard_class, ref__lf_guard, "")
+ARC(cl_lock_guard_class, cl_lock_guard_class, "")
+ARC(cl_lock_guard_class, h__coh_lock_guard, "")
+ARC(cl_lock_guard_class, h__coh_page_guard, "")
+
+ARC(lov__lo_type_guard, h__coh_lock_guard, "")
+ARC(lov__lo_type_guard, ref__lf_guard, "")
+ARC(lov__lo_type_guard, cl_lock_guard_class, "")
+
+}
diff --git a/lustre/fld/fld_request.c b/lustre/fld/fld_request.c

index cdb5110..39fb13b 100644 (file)
--- a/lustre/fld/fld_request.c
+++ b/lustre/fld/fld_request.c
@@ -68,7 +68,7 @@
  #include <lustre_mdc.h>
  #include "fld_internal.h"
  
-/* TODO: these 3 functions are copies of flow-control code from mdc_lib.c 
+/* TODO: these 3 functions are copies of flow-control code from mdc_lib.c
   * It should be common thing. The same about mdc RPC lock */
  static int fld_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw)
  {
@@ -105,7 +105,7 @@ static void fld_exit_request(struct client_obd *cli)
          spin_lock(&cli->cl_loi_list_lock);
          cli->cl_r_in_flight--;
          list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
-                
+
                  if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
                          /* No free request slots anymore */
                          break;
@@ -606,7 +606,7 @@ int fld_client_lookup(struct lu_client_fld *fld,
                  /*
                   * insert the 'inflight' sequence. No need to protect that,
                   * we are trying to reduce numbers of RPC but not restrict
-                 * to them exactly one 
+                 * to them exactly one
                   */
                  fld_cache_insert_inflight(fld->lcf_cache, seq);
                  rc = fld_client_rpc(target->ft_exp,
@@ -619,7 +619,7 @@ int fld_client_lookup(struct lu_client_fld *fld,
                   * The current solution for IGIF is to bind it to mds0.
                   * In the future, this should be fixed once IGIF can be found
                   * in FLD.
-                 */ 
+                 */
                  md_fld.mf_mds = 0;
                  rc = 0;
          }
diff --git a/lustre/include/Makefile.am b/lustre/include/Makefile.am

index 9803237..d18e1a9 100644 (file)
--- a/lustre/include/Makefile.am
+++ b/lustre/include/Makefile.am
@@ -41,9 +41,9 @@ EXTRA_DIST = ioctl.h liblustre.h lprocfs_status.h lustre_cfg.h        \
               lustre_fsfilt.h lustre_ha.h lustre_handles.h lustre_import.h \
               lustre_lib.h lustre_sec.h lustre_lite.h lustre_log.h lustre_mds.h \
               lustre_mdc.h lustre_net.h lustre_quota.h lustre_ucache.h lvfs.h \
-             class_hash.h obd_cache.h obd_class.h obd_echo.h obd.h obd_lov.h \
+             class_hash.h obd_cache.h obd_class.h obd.h obd_lov.h \
              obd_ost.h obd_support.h lustre_ver.h lu_object.h lu_time.h  \
               md_object.h dt_object.h lustre_param.h lustre_mdt.h \
               lustre_fid.h lustre_fld.h lustre_req_layout.h lustre_capa.h \
               lustre_idmap.h lustre_eacl.h interval_tree.h obd_cksum.h \
-            lu_ref.h lustre_acl.h lustre_cache.h
+            lu_ref.h cl_object.h lustre_acl.h lclient.h
diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h

new file mode 100644 (file)

index 0000000..3343139
--- /dev/null
+++ b/lustre/include/cl_object.h
@@ -0,0 +1,3033 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+#ifndef _LUSTRE_CL_OBJECT_H
+#define _LUSTRE_CL_OBJECT_H
+
+/** \defgroup clio clio
+ *
+ * Client objects implement io operations and cache pages.
+ *
+ * Examples: lov and osc are implementations of cl interface.
+ *
+ * Big Theory Statement.
+ *
+ * Layered objects.
+ *
+ * Client implementation is based on the following data-types:
+ *
+ *   - cl_object
+ *
+ *   - cl_page
+ *
+ *   - cl_lock     represents an extent lock on an object.
+ *
+ *   - cl_io       represents high-level i/o activity such as whole read/write
+ *                 system call, or write-out of pages from under the lock being
+ *                 canceled. cl_io has sub-ios that can be stopped and resumed
+ *                 independently, thus achieving high degree of transfer
+ *                 parallelism. Single cl_io can be advanced forward by
+ *                 the multiple threads (although in the most usual case of
+ *                 read/write system call it is associated with the single user
+ *                 thread, that issued the system call).
+ *
+ *   - cl_req      represents a collection of pages for a transfer. cl_req is
+ *                 constructed by req-forming engine that tries to saturate
+ *                 transport with large and continuous transfers.
+ *
+ * Terminology
+ *
+ *     - to avoid confusion high-level I/O operation like read or write system
+ *     call is referred to as "an io", whereas low-level I/O operation, like
+ *     RPC, is referred to as "a transfer"
+ *
+ *     - "generic code" means generic (not file system specific) code in the
+ *     hosting environment. "cl-code" means code (mostly in cl_*.c files) that
+ *     is not layer specific.
+ *
+ * Locking.
+ *
+ *  - i_mutex
+ *      - PG_locked
+ *          - cl_object_header::coh_page_guard
+ *          - cl_object_header::coh_lock_guard
+ *          - lu_site::ls_guard
+ *
+ * See the top comment in cl_object.c for the description of overall locking and
+ * reference-counting design.
+ *
+ * See comments below for the description of i/o, page, and dlm-locking
+ * design.
+ *
+ * @{
+ */
+
+/*
+ * super-class definitions.
+ */
+#include <lu_object.h>
+#include <lvfs.h>
+#ifdef __KERNEL__
+#        include <linux/mutex.h>
+#        include <linux/radix-tree.h>
+#endif
+
+struct inode;
+
+struct cl_device;
+struct cl_device_operations;
+
+struct cl_object;
+struct cl_object_page_operations;
+struct cl_object_lock_operations;
+
+struct cl_page;
+struct cl_page_slice;
+struct cl_lock;
+struct cl_lock_slice;
+
+struct cl_lock_operations;
+struct cl_page_operations;
+
+struct cl_io;
+struct cl_io_slice;
+
+struct cl_req;
+struct cl_req_slice;
+
+/**
+ * Operations for each data device in the client stack.
+ *
+ * \see vvp_cl_ops, lov_cl_ops, lovsub_cl_ops, osc_cl_ops
+ */
+struct cl_device_operations {
+        /**
+         * Initialize cl_req. This method is called top-to-bottom on all
+         * devices in the stack to give them a chance to allocate layer-private
+         * data, and to attach them to the cl_req by calling
+         * cl_req_slice_add().
+         *
+         * \see osc_req_init(), lov_req_init(), lovsub_req_init()
+         * \see ccc_req_init()
+         */
+        int (*cdo_req_init)(const struct lu_env *env, struct cl_device *dev,
+                            struct cl_req *req);
+};
+
+/**
+ * Device in the client stack.
+ *
+ * \see ccc_device, lov_device, lovsub_device, osc_device
+ */
+struct cl_device {
+        /** Super-class. */
+        struct lu_device                   cd_lu_dev;
+        /** Per-layer operation vector. */
+        const struct cl_device_operations *cd_ops;
+};
+
+/** \addtogroup cl_object cl_object
+ * @{ */
+/**
+ * "Data attributes" of cl_object. Data attributes can be updated
+ * independently for a sub-object, and top-object's attributes are calculated
+ * from sub-objects' ones.
+ */
+struct cl_attr {
+        /** Object size, in bytes */
+        loff_t cat_size;
+        /**
+         * Known minimal size, in bytes.
+         *
+         * This is only valid when at least one DLM lock is held.
+         */
+        loff_t cat_kms;
+        /** Modification time. Measured in seconds since epoch. */
+        time_t cat_mtime;
+        /** Access time. Measured in seconds since epoch. */
+        time_t cat_atime;
+        /** Change time. Measured in seconds since epoch. */
+        time_t cat_ctime;
+        /**
+         * Blocks allocated to this cl_object on the server file system.
+         *
+         * \todo XXX An interface for block size is needed.
+         */
+        __u64  cat_blocks;
+        /**
+         * User identifier for quota purposes.
+         */
+        uid_t  cat_uid;
+        /**
+         * Group identifier for quota purposes.
+         */
+        gid_t  cat_gid;
+};
+
+/**
+ * Fields in cl_attr that are being set.
+ */
+enum cl_attr_valid {
+        CAT_SIZE   = 1 << 0,
+        CAT_KMS    = 1 << 1,
+        CAT_MTIME  = 1 << 3,
+        CAT_ATIME  = 1 << 4,
+        CAT_CTIME  = 1 << 5,
+        CAT_BLOCKS = 1 << 6,
+        CAT_UID    = 1 << 7,
+        CAT_GID    = 1 << 8
+};
+
+/**
+ * Sub-class of lu_object with methods common for objects on the client
+ * stacks.
+ *
+ * cl_object: represents a regular file system object, both a file and a
+ *    stripe. cl_object is based on lu_object: it is identified by a fid,
+ *    layered, cached, hashed, and lrued. Important distinction with the server
+ *    side, where md_object and dt_object are used, is that cl_object "fans out"
+ *    at the lov/sns level: depending on the file layout, single file is
+ *    represented as a set of "sub-objects" (stripes). At the implementation
+ *    level, struct lov_object contains an array of cl_objects. Each sub-object
+ *    is a full-fledged cl_object, having its fid, living in the lru and hash
+ *    table.
+ *
+ *    This leads to the next important difference with the server side: on the
+ *    client, it's quite usual to have objects with different sequences of
+ *    layers. For example, a typical top-object is composed of the following
+ *    layers:
+ *
+ *        - vvp
+ *        - lov
+ *
+ *    whereas its sub-objects are composed of
+ *
+ *        - lovsub
+ *        - osc
+ *
+ *    layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep
+ *    track of the object-subobject relationship.
+ *
+ *    Sub-objects are not cached independently: when top-object is about to
+ *    be discarded from the memory, all its sub-objects are torn-down and
+ *    destroyed too.
+ *
+ * \see ccc_object, lov_object, lovsub_object, osc_object
+ */
+struct cl_object {
+        /** super class */
+        struct lu_object                   co_lu;
+        /** per-object-layer operations */
+        const struct cl_object_operations *co_ops;
+};
+
+/**
+ * Description of the client object configuration. This is used for the
+ * creation of a new client object that is identified by more state than
+ * just a fid.
+ */
+struct cl_object_conf {
+        /** Super-class. */
+        struct lu_object_conf     coc_lu;
+        union {
+                /**
+                 * Object layout. This is consumed by lov.
+                 */
+                struct lustre_md *coc_md;
+                /**
+                 * Description of particular stripe location in the
+                 * cluster. This is consumed by osc.
+                 */
+                struct lov_oinfo *coc_oinfo;
+        } u;
+        /**
+         * VFS inode. This is consumed by vvp.
+         */
+        struct inode             *coc_inode;
+};
+
+/**
+ * Operations implemented for each cl object layer.
+ *
+ * \see vvp_ops, lov_ops, lovsub_ops, osc_ops
+ */
+struct cl_object_operations {
+        /**
+         * Initialize page slice for this layer. Called top-to-bottom through
+         * every object layer when a new cl_page is instantiated. Layer
+         * keeping private per-page data, or requiring its own page operations
+         * vector should allocate these data here, and attach them to the page
+         * by calling cl_page_slice_add(). \a vmpage is locked (in the VM
+         * sense). Optional.
+         *
+         * \retval NULL success.
+         *
+         * \retval ERR_PTR(errno) failure code.
+         *
+         * \retval valid-pointer pointer to already existing referenced page
+         *         to be used instead of newly created.
+         */
+        struct cl_page *(*coo_page_init)(const struct lu_env *env,
+                                         struct cl_object *obj,
+                                         struct cl_page *page,
+                                         cfs_page_t *vmpage);
+        /**
+         * Initialize lock slice for this layer. Called top-to-bottom through
+         * every object layer when a new cl_lock is instantiated. Layer
+         * keeping private per-lock data, or requiring its own lock operations
+         * vector should allocate these data here, and attach them to the lock
+         * by calling cl_lock_slice_add(). Mandatory.
+         */
+        int  (*coo_lock_init)(const struct lu_env *env,
+                              struct cl_object *obj, struct cl_lock *lock,
+                              const struct cl_io *io);
+        /**
+         * Initialize io state for a given layer.
+         *
+         * Called top-to-bottom once per io existence to initialize io
+         * state. If layer wants to keep some state for this type of io, it
+         * has to embed struct cl_io_slice in lu_env::le_ses, and register
+         * slice with cl_io_slice_add(). It is guaranteed that all threads
+         * participating in this io share the same session.
+         */
+        int  (*coo_io_init)(const struct lu_env *env,
+                            struct cl_object *obj, struct cl_io *io);
+        /**
+         * Fill portion of \a attr that this layer controls. This method is
+         * called top-to-bottom through all object layers.
+         *
+         * \pre cl_object_header::coh_attr_guard of the top-object is locked.
+         *
+         * \return   0: to continue
+         * \return +ve: to stop iterating through layers (but 0 is returned
+         * from enclosing cl_object_attr_get())
+         * \return -ve: to signal error
+         */
+        int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj,
+                            struct cl_attr *attr);
+        /**
+         * Update attributes.
+         *
+         * \a valid is a bitmask composed from enum #cl_attr_valid, and
+         * indicating what attributes are to be set.
+         *
+         * \pre cl_object_header::coh_attr_guard of the top-object is locked.
+         *
+         * \return the same convention as for
+         * cl_object_operations::coo_attr_get() is used.
+         */
+        int (*coo_attr_set)(const struct lu_env *env, struct cl_object *obj,
+                            const struct cl_attr *attr, unsigned valid);
+        /**
+         * Update object configuration. Called top-to-bottom to modify object
+         * configuration.
+         *
+         * XXX error conditions and handling.
+         */
+        int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj,
+                            const struct cl_object_conf *conf);
+        /**
+         * Glimpse ast. Executed when glimpse ast arrives for a lock on this
+         * object. Layers are supposed to fill parts of \a lvb that will be
+         * shipped to the glimpse originator as a glimpse result.
+         *
+         * \see ccc_object_glimpse(), lovsub_object_glimpse(),
+         * \see osc_object_glimpse()
+         */
+        int (*coo_glimpse)(const struct lu_env *env,
+                           const struct cl_object *obj, struct ost_lvb *lvb);
+};
+
+/**
+ * Extended header for client object.
+ */
+struct cl_object_header {
+        /** Standard lu_object_header. cl_object::co_lu::lo_header points
+         * here. */
+        struct lu_object_header  coh_lu;
+        /** \name locks
+         * \todo XXX move locks below to the separate cache-lines, they are
+         * mostly useless otherwise.
+         */
+        /** @{ */
+        /** Lock protecting page tree. */
+        spinlock_t               coh_page_guard;
+        /** Lock protecting lock list. */
+        spinlock_t               coh_lock_guard;
+        /** @} locks */
+        /** Radix tree of cl_page's, cached for this object. */
+        struct radix_tree_root   coh_tree;
+        /** # of pages in radix tree. */
+        unsigned long            coh_pages;
+        /** List of cl_lock's granted for this object. */
+        struct list_head         coh_locks;
+
+        /**
+         * Parent object. It is assumed that an object has a well-defined
+         * parent, but not a well-defined child (there may be multiple
+         * sub-objects, for the same top-object). cl_object_header::coh_parent
+         * field allows certain code to be written generically, without
+         * limiting possible cl_object layouts unduly.
+         */
+        struct cl_object_header *coh_parent;
+        /**
+         * Protects consistency between cl_attr of parent object and
+         * attributes of sub-objects, that the former is calculated ("merged")
+         * from.
+         *
+         * \todo XXX this can be read/write lock if needed.
+         */
+        spinlock_t               coh_attr_guard;
+        /**
+         * Number of objects above this one: 0 for a top-object, 1 for its
+         * sub-object, etc.
+         */
+        unsigned                 coh_nesting;
+};
+
+/**
+ * Helper macro: iterate over all layers of the object \a obj, assigning every
+ * layer top-to-bottom to \a slice.
+ */
+#define cl_object_for_each(slice, obj)                                  \
+        list_for_each_entry((slice),                                    \
+                            &(obj)->co_lu.lo_header->loh_layers,        \
+                            co_lu.lo_linkage)
+/**
+ * Helper macro: iterate over all layers of the object \a obj, assigning every
+ * layer bottom-to-top to \a slice.
+ */
+#define cl_object_for_each_reverse(slice, obj)                          \
+        list_for_each_entry_reverse((slice),                            \
+                                    &(obj)->co_lu.lo_header->loh_layers, \
+                                    co_lu.lo_linkage)
+/** @} cl_object */
+
+#ifndef pgoff_t
+#define pgoff_t unsigned long
+#endif
+
+#define CL_PAGE_EOF ((pgoff_t)~0ull)
+
+/** \addtogroup cl_page cl_page
+ * @{ */
+
+/** \struct cl_page
+ * Layered client page.
+ *
+ * cl_page: represents a portion of a file, cached in the memory. All pages
+ *    of the given file are of the same size, and are kept in the radix tree
+ *    hanging off the cl_object. cl_page doesn't fan out, but as sub-objects
+ *    of the top-level file object are first class cl_objects, they have their
+ *    own radix trees of pages and hence page is implemented as a sequence of
+ *    struct cl_page's, linked into a double-linked list through
+ *    cl_page::cp_parent and cl_page::cp_child pointers, each residing in the
+ *    corresponding radix tree at the corresponding logical offset.
+ *
+ * cl_page is associated with VM page of the hosting environment (struct
+ *    page in Linux kernel, for example), cfs_page_t. It is assumed, that this
+ *    association is implemented by one of cl_page layers (top layer in the
+ *    current design) that
+ *
+ *        - intercepts per-VM-page call-backs made by the environment (e.g.,
+ *          memory pressure),
+ *
+ *        - translates state (page flag bits) and locking between lustre and
+ *          environment.
+ *
+ *    The association between cl_page and cfs_page_t is immutable and
+ *    established when cl_page is created.
+ *
+ * cl_page can be "owned" by a particular cl_io (see below), guaranteeing
+ *    this io an exclusive access to this page w.r.t. other io attempts and
+ *    various events changing page state (such as transfer completion, or
+ *    eviction of the page from the memory). Note, that in general cl_io
+ *    cannot be identified with a particular thread, and page ownership is not
+ *    exactly equal to the current thread holding a lock on the page. Layer
+ *    implementing association between cl_page and cfs_page_t has to implement
+ *    ownership on top of available synchronization mechanisms.
+ *
+ *    While lustre client maintains the notion of page ownership by io,
+ *    hosting MM/VM usually has its own page concurrency control
+ *    mechanisms. For example, in Linux, page access is synchronized by the
+ *    per-page PG_locked bit-lock, and generic kernel code (generic_file_*())
+ *    takes care to acquire and release such locks as necessary around the
+ *    calls to the file system methods (->readpage(), ->prepare_write(),
+ *    ->commit_write(), etc.). This leads to the situation when there are two
+ *    different ways to own a page in the client:
+ *
+ *        - client code explicitly, voluntarily owns the page (cl_page_own());
+ *
+ *        - VM locks a page and then calls the client, that has "to assume"
+ *          the ownership from the VM (cl_page_assume()).
+ *
+ *    Dual methods to release ownership are cl_page_disown() and
+ *    cl_page_unassume().
+ *
+ * cl_page is reference counted (cl_page::cp_ref). When reference counter
+ *    drops to 0, the page is returned to the cache, unless it is in
+ *    cl_page_state::CPS_FREEING state, in which case it is immediately
+ *    destroyed.
+ *
+ *    The general logic guaranteeing the absence of "existential races" for
+ *    pages is the following:
+ *
+ *        - there are fixed known ways for a thread to obtain a new reference
+ *          to a page:
+ *
+ *            - by doing a lookup in the cl_object radix tree, protected by the
+ *              spin-lock;
+ *
+ *            - by starting from VM-locked cfs_page_t and following some
+ *              hosting environment method (e.g., following ->private pointer in
+ *              the case of Linux kernel), see cl_vmpage_page();
+ *
+ *        - when the page enters cl_page_state::CPS_FREEING state, all these
+ *          ways are severed with the proper synchronization
+ *          (cl_page_delete());
+ *
+ *        - entry into cl_page_state::CPS_FREEING is serialized by the VM page
+ *          lock;
+ *
+ *        - no new references to the page in cl_page_state::CPS_FREEING state
+ *          are allowed (checked in cl_page_get()).
+ *
+ *    Together this guarantees that when the last reference to a
+ *    cl_page_state::CPS_FREEING page is released, it is safe to destroy the
+ *    page, as no references to it exist at that point, and no new ones can
+ *    be acquired.
+ *
+ * cl_page is a state machine. States are enumerated in enum
+ *    cl_page_state. Possible state transitions are enumerated in
+ *    cl_page_state_set(). State transition process (i.e., actual changing of
+ *    cl_page::cp_state field) is protected by the lock on the underlying VM
+ *    page.
+ *
+ * Linux Kernel implementation.
+ *
+ *    Binding between cl_page and cfs_page_t (which is a typedef for
+ *    struct page) is implemented in the vvp layer. cl_page is attached to the
+ *    ->private pointer of the struct page, together with the setting of
+ *    PG_private bit in page->flags, and acquiring additional reference on the
+ *    struct page (much like struct buffer_head, or any similar file system
+ *    private data structures).
+ *
+ *    PG_locked lock is used to implement both ownership and transfer
+ *    synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}}
+ *    states. No additional references are acquired for the duration of the
+ *    transfer.
+ *
+ * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where
+ *          write-out is "protected" by the special PG_writeback bit.
+ */
+
+/**
+ * States of cl_page. cl_page.c assumes particular order here.
+ *
+ * The page state machine is rather crude, as it doesn't recognize finer page
+ * states like "dirty" or "up to date". This is because such states are not
+ * always well defined for the whole stack (see, for example, the
+ * implementation of the read-ahead, that hides page up-to-dateness to track
+ * cache hits accurately). Such sub-states are maintained by the layers that
+ * are interested in them.
+ */
+enum cl_page_state {
+        /**
+         * Page is in the cache, un-owned. Page leaves cached state in the
+         * following cases:
+         *
+         *     - [cl_page_state::CPS_OWNED] io comes across the page and
+         *     owns it;
+         *
+         *     - [cl_page_state::CPS_PAGEOUT] page is dirty, the
+         *     req-formation engine decides that it wants to include this page
+         *     into a cl_req being constructed, and yanks it from the cache;
+         *
+         *     - [cl_page_state::CPS_FREEING] VM callback is executed to
+         *     evict the page from the memory;
+         *
+         * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+         */
+        CPS_CACHED,
+        /**
+         * Page is exclusively owned by some cl_io. Page may end up in this
+         * state as a result of
+         *
+         *     - io creating new page and immediately owning it;
+         *
+         *     - [cl_page_state::CPS_CACHED] io finding existing cached page
+         *     and owning it;
+         *
+         *     - [cl_page_state::CPS_OWNED] io finding existing owned page
+         *     and waiting for owner to release the page;
+         *
+         * Page leaves owned state in the following cases:
+         *
+         *     - [cl_page_state::CPS_CACHED] io decides to leave the page in
+         *     the cache, doing nothing;
+         *
+         *     - [cl_page_state::CPS_PAGEIN] io starts read transfer for
+         *     this page;
+         *
+         *     - [cl_page_state::CPS_PAGEOUT] io starts immediate write
+         *     transfer for this page;
+         *
+         *     - [cl_page_state::CPS_FREEING] io decides to destroy this
+         *     page (e.g., as part of truncate or extent lock cancellation).
+         *
+         * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL
+         */
+        CPS_OWNED,
+        /**
+         * Page is being written out, as a part of a transfer. This state is
+         * entered when req-formation logic decided that it wants this page to
+         * be sent through the wire _now_. Specifically, it means that once
+         * this state is achieved, transfer completion handler (with either
+         * success or failure indication) is guaranteed to be executed against
+         * this page independently of any locks and any scheduling decisions
+         * made by the hosting environment (that effectively means that the
+         * page is never put into cl_page_state::CPS_PAGEOUT state "in
+         * advance". This property is mentioned, because it is important when
+         * reasoning about possible dead-locks in the system). The page can
+         * enter this state as a result of
+         *
+         *     - [cl_page_state::CPS_OWNED] an io requesting an immediate
+         *     write-out of this page, or
+         *
+         *     - [cl_page_state::CPS_CACHED] req-forming engine deciding
+         *     that it has enough dirty pages cached to issue a "good"
+         *     transfer.
+         *
+         * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer
+         * is completed---it is moved into cl_page_state::CPS_CACHED state.
+         *
+         * Underlying VM page is locked for the duration of transfer.
+         *
+         * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+         */
+        CPS_PAGEOUT,
+        /**
+         * Page is being read in, as a part of a transfer. This is quite
+         * similar to the cl_page_state::CPS_PAGEOUT state, except that
+         * read-in is always "immediate"---there is no such thing as a sudden
+         * construction of read cl_req from cached, presumably not up to date,
+         * pages.
+         *
+         * Underlying VM page is locked for the duration of transfer.
+         *
+         * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+         */
+        CPS_PAGEIN,
+        /**
+         * Page is being destroyed. This state is entered when client decides
+         * that page has to be deleted from its host object, as, e.g., a part
+         * of truncate.
+         *
+         * Once this state is reached, there is no way to escape it.
+         *
+         * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+         */
+        CPS_FREEING,
+        CPS_NR
+};
+
+enum cl_page_type {
+        /** Host page, the page is from the host inode which the cl_page
+         * belongs to. */
+        CPT_CACHEABLE = 1,
+
+        /** Transient page, the transient cl_page is used to bind a cl_page
+         *  to a vmpage that does not belong to the same object as the cl_page.
+         *  It is used in DirectIO, lockless IO and liblustre. */
+        CPT_TRANSIENT,
+};
+
+/**
+ * Flags maintained for every cl_page.
+ */
+enum cl_page_flags {
+        /**
+         * Set when pagein completes. Used for debugging (read completes at
+         * most once for a page).
+         */
+        CPF_READ_COMPLETED = 1 << 0
+};
+
+/**
+ * Fields are protected by the lock on cfs_page_t, except for atomics and
+ * immutables.
+ *
+ * \invariant Data type invariants are in cl_page_invariant(). Basically:
+ * cl_page::cp_parent and cl_page::cp_child are a well-formed double-linked
+ * list, consistent with the parent/child pointers in the cl_page::cp_obj and
+ * cl_page::cp_owner (when set).
+ */
+struct cl_page {
+        /** Reference counter. */
+        atomic_t                 cp_ref;
+        /** An object this page is a part of. Immutable after creation. */
+        struct cl_object        *cp_obj;
+        /** Logical page index within the object. Immutable after creation. */
+        pgoff_t                  cp_index;
+        /** List of slices. Immutable after creation. */
+        struct list_head         cp_layers;
+        /** Parent page, NULL for top-level page. Immutable after creation. */
+        struct cl_page          *cp_parent;
+        /** Lower-layer page. NULL for bottommost page. Immutable after
+         * creation. */
+        struct cl_page          *cp_child;
+        /**
+         * Page state. This field is const to avoid accidental update, it is
+         * modified only internally within cl_page.c. Protected by a VM lock.
+         */
+        const enum cl_page_state cp_state;
+        /**
+         * Linkage of pages within some group. Protected by
+         * cl_page::cp_mutex. */
+        struct list_head         cp_batch;
+        /** Mutex serializing membership of a page in a batch. */
+        struct mutex             cp_mutex;
+        /** Linkage of pages within cl_req. */
+        struct list_head         cp_flight;
+        /** Transfer error. */
+        int                      cp_error;
+
+        /**
+         * Page type. Only CPT_TRANSIENT is used so far. Immutable after
+         * creation.
+         */
+        enum cl_page_type        cp_type;
+
+        /**
+         * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned
+         * by sub-io. Protected by a VM lock.
+         */
+        struct cl_io            *cp_owner;
+        /**
+         * Owning IO request in cl_page_state::CPS_PAGEOUT and
+         * cl_page_state::CPS_PAGEIN states. This field is maintained only in
+         * the top-level pages. Protected by a VM lock.
+         */
+        struct cl_req           *cp_req;
+        /** List of references to this page, for debugging. */
+        struct lu_ref            cp_reference;
+        /** Link to an object, for debugging. */
+        struct lu_ref_link      *cp_obj_ref;
+        /** Link to a queue, for debugging. */
+        struct lu_ref_link      *cp_queue_ref;
+        /** Per-page flags from enum cl_page_flags. Protected by a VM lock. */
+        unsigned                 cp_flags;
+};
+
+/**
+ * Per-layer part of cl_page.
+ *
+ * \see ccc_page, lov_page, osc_page
+ */
+struct cl_page_slice {
+        struct cl_page                  *cpl_page;
+        /**
+         * Object slice corresponding to this page slice. Immutable after
+         * creation.
+         */
+        struct cl_object                *cpl_obj;
+        const struct cl_page_operations *cpl_ops;
+        /** Linkage into cl_page::cp_layers. Immutable after creation. */
+        struct list_head                 cpl_linkage;
+};
+
+/**
+ * Lock mode. For the client extent locks.
+ *
+ * \warning: cl_lock_mode_match() assumes particular ordering here.
+ * \ingroup cl_lock
+ */
+enum cl_lock_mode {
+        /**
+         * Mode of a lock that protects no data, and exists only as a
+         * placeholder. This is used for `glimpse' requests. A phantom lock
+         * might get promoted to real lock at some point.
+         */
+        CLM_PHANTOM,
+        CLM_READ,
+        CLM_WRITE
+};
+
+/**
+ * Requested transfer type.
+ * \ingroup cl_req
+ */
+enum cl_req_type {
+        CRT_READ,
+        CRT_WRITE,
+        CRT_NR
+};
+
+/**
+ * Per-layer page operations.
+ *
+ * Methods taking an \a io argument are for the activity happening in the
+ * context of given \a io. Page is assumed to be owned by that io, except for
+ * the obvious cases (like cl_page_operations::cpo_own()).
+ *
+ * \see vvp_page_ops, lov_page_ops, osc_page_ops
+ */
+struct cl_page_operations {
+        /**
+         * cl_page<->cfs_page_t methods. Only one layer in the stack has to
+         * implement these. Current code assumes that this functionality is
+         * provided by the topmost layer, see cl_page_disown0() as an example.
+         */
+
+        /**
+         * \return the underlying VM page. Optional.
+         */
+        cfs_page_t *(*cpo_vmpage)(const struct lu_env *env,
+                                  const struct cl_page_slice *slice);
+        /**
+         * Called when \a io acquires this page into the exclusive
+         * ownership. When this method returns, it is guaranteed that the
+         * page is not owned by other io, and no transfer is going on against
+         * it. Optional.
+         *
+         * \see cl_page_own()
+         * \see vvp_page_own(), lov_page_own()
+         */
+        void (*cpo_own)(const struct lu_env *env,
+                        const struct cl_page_slice *slice, struct cl_io *io);
+        /** Called when ownership is yielded. Optional.
+         *
+         * \see cl_page_disown()
+         * \see vvp_page_disown()
+         */
+        void (*cpo_disown)(const struct lu_env *env,
+                           const struct cl_page_slice *slice, struct cl_io *io);
+        /**
+         * Called for a page that is already "owned" by \a io from VM point of
+         * view. Optional.
+         *
+         * \see cl_page_assume()
+         * \see vvp_page_assume(), lov_page_assume()
+         */
+        void (*cpo_assume)(const struct lu_env *env,
+                           const struct cl_page_slice *slice, struct cl_io *io);
+        /** Dual to cl_page_operations::cpo_assume(). Optional. Called
+         * bottom-to-top when IO releases a page without actually unlocking
+         * it.
+         *
+         * \see cl_page_unassume()
+         * \see vvp_page_unassume()
+         */
+        void (*cpo_unassume)(const struct lu_env *env,
+                             const struct cl_page_slice *slice,
+                             struct cl_io *io);
+        /**
+         * Announces that page contains valid data and user space can look at
+         * them without client's involvement from now on. Effectively marks
+         * the page up-to-date. Optional.
+         *
+         * \see cl_page_export()
+         * \see vvp_page_export()
+         */
+        void  (*cpo_export)(const struct lu_env *env,
+                            const struct cl_page_slice *slice);
+        /**
+         * Unmaps page from the user space (if it is mapped).
+         *
+         * \see cl_page_unmap()
+         * \see vvp_page_unmap()
+         */
+        int (*cpo_unmap)(const struct lu_env *env,
+                         const struct cl_page_slice *slice, struct cl_io *io);
+        /**
+         * Checks whether underlying VM page is locked (in the suitable
+         * sense). Used for assertions.
+         *
+         * \retval    -EBUSY: page is protected by a lock of a given mode;
+         * \retval  -ENODATA: page is not protected by a lock;
+         * \retval         0: this layer cannot decide. (Should never happen.)
+         */
+        int (*cpo_is_vmlocked)(const struct lu_env *env,
+                               const struct cl_page_slice *slice);
+        /**
+         * Page destruction.
+         */
+
+        /**
+         * Called when page is truncated from the object. Optional.
+         *
+         * \see cl_page_discard()
+         * \see vvp_page_discard(), osc_page_discard()
+         */
+        void (*cpo_discard)(const struct lu_env *env,
+                            const struct cl_page_slice *slice,
+                            struct cl_io *io);
+        /**
+         * Called when page is removed from the cache, and is about to be
+         * destroyed. Optional.
+         *
+         * \see cl_page_delete()
+         * \see vvp_page_delete(), osc_page_delete()
+         */
+        void (*cpo_delete)(const struct lu_env *env,
+                           const struct cl_page_slice *slice);
+        /** Destructor. Frees resources and slice itself. */
+        void (*cpo_fini)(const struct lu_env *env,
+                         struct cl_page_slice *slice);
+
+        /**
+         * Checks whether the page is protected by a cl_lock. This is a
+         * per-layer method, because certain layers have ways to check for the
+         * lock much more efficiently than through the generic locks scan, or
+         * implement locking mechanisms separate from cl_lock, e.g.,
+         * LL_FILE_GROUP_LOCKED in vvp. If \a pending is true, check for locks
+         * being canceled, or scheduled for cancellation as soon as the last
+         * user goes away, too.
+         *
+         * \retval    -EBUSY: page is protected by a lock of a given mode;
+         * \retval  -ENODATA: page is not protected by a lock;
+         * \retval         0: this layer cannot decide.
+         *
+         * \see cl_page_is_under_lock()
+         */
+        int (*cpo_is_under_lock)(const struct lu_env *env,
+                                 const struct cl_page_slice *slice,
+                                 struct cl_io *io);
+
+        /**
+         * Optional debugging helper. Prints given page slice.
+         *
+         * \see cl_page_print()
+         */
+        int (*cpo_print)(const struct lu_env *env,
+                         const struct cl_page_slice *slice,
+                         void *cookie, lu_printer_t p);
+        /**
+         * \name transfer
+         *
+         * Transfer methods. See comment on cl_req for a description of
+         * transfer formation and life-cycle.
+         *
+         * @{
+         */
+        /**
+         * Request type dependent vector of operations.
+         *
+         * Transfer operations depend on transfer mode (cl_req_type). To avoid
+         * passing transfer mode to each and every of these methods, and to
+         * avoid branching on request type inside of the methods, separate
+         * methods for cl_req_type::CRT_READ and cl_req_type::CRT_WRITE are
+         * provided. That is, method invocation usually looks like
+         *
+         *       slice->cpl_ops->io[req->crq_type].cpo_method(env, slice, ...);
+         */
+        struct {
+                /**
+                 * Called when a page is submitted for a transfer as a part of
+                 * cl_page_list.
+                 *
+                 * \return    0         : page is eligible for submission;
+                 * \return    -EALREADY : skip this page;
+                 * \return    -ve       : error.
+                 *
+                 * \see cl_page_prep()
+                 */
+                int  (*cpo_prep)(const struct lu_env *env,
+                                 const struct cl_page_slice *slice,
+                                 struct cl_io *io);
+                /**
+                 * Completion handler. This is guaranteed to be eventually
+                 * fired after cl_page_operations::cpo_prep() or
+                 * cl_page_operations::cpo_make_ready() call.
+                 *
+                 * This method can be called in a non-blocking context. It is
+                 * guaranteed however, that the page involved and its object
+                 * are pinned in memory (and, hence, calling cl_page_put() is
+                 * safe).
+                 *
+                 * \see cl_page_completion()
+                 */
+                void (*cpo_completion)(const struct lu_env *env,
+                                       const struct cl_page_slice *slice,
+                                       int ioret);
+                /**
+                 * Called when cached page is about to be added to the
+                 * cl_req as a part of req formation.
+                 *
+                 * \return    0       : proceed with this page;
+                 * \return    -EAGAIN : skip this page;
+                 * \return    -ve     : error.
+                 *
+                 * \see cl_page_make_ready()
+                 */
+                int  (*cpo_make_ready)(const struct lu_env *env,
+                                       const struct cl_page_slice *slice);
+                /**
+                 * Announce that this page is to be written out
+                 * opportunistically, that is, page is dirty, it is not
+                 * necessary to start write-out transfer right now, but
+                 * eventually page has to be written out.
+                 *
+                 * Main caller of this is the write path (see
+                 * vvp_io_commit_write()), using this method to build a
+                 * "transfer cache" from which large transfers are then
+                 * constructed by the req-formation engine.
+                 *
+                 * \todo XXX it would make sense to add page-age tracking
+                 * semantics here, and to oblige the req-formation engine to
+                 * send the page out before it gets too old.
+                 *
+                 * \see cl_page_cache_add()
+                 */
+                int  (*cpo_cache_add)(const struct lu_env *env,
+                                      const struct cl_page_slice *slice,
+                                      struct cl_io *io);
+        } io[CRT_NR];
+        /**
+         * Tell transfer engine that only [from, to] part of a page should be
+         * transmitted.
+         *
+         * This is used for immediate transfers.
+         *
+         * \todo XXX this is not very good interface. It would be much better
+         * if all transfer parameters were supplied as arguments to
+         * cl_io_operations::cio_submit() call, but it is not clear how to do
+         * this for page queues.
+         *
+         * \see cl_page_clip()
+         */
+        void (*cpo_clip)(const struct lu_env *env,
+                         const struct cl_page_slice *slice,
+                         int from, int to);
+        /**
+         * \pre  the page was queued for transferring.
+         * \post page is removed from client's pending list, or -EBUSY
+         *       is returned if it has already been in transferring.
+         *
+         * This is one of the few page operations that:
+         * 0. is called from top level;
+         * 1. doesn't have vmpage locked;
+         * 2. requires every layer to synchronize execution of its
+         *    ->cpo_cancel() with completion handlers. Osc uses client obd
+         *    lock for this purpose. Since there are no vvp_page_cancel() and
+         *    lov_page_cancel(), cpo_cancel is de facto protected by that lock.
+         *
+         * \see osc_page_cancel().
+         */
+        int (*cpo_cancel)(const struct lu_env *env,
+                          const struct cl_page_slice *slice);
+        /** @} transfer */
+};
+
+/**
+ * Helper macro, dumping detailed information about \a page into a log.
+ */
+#define CL_PAGE_DEBUG(mask, env, page, format, ...)                     \
+do {                                                                    \
+        static DECLARE_LU_CDEBUG_PRINT_INFO(__info, mask);              \
+                                                                        \
+        if (cdebug_show(mask, DEBUG_SUBSYSTEM)) {                       \
+                cl_page_print(env, &__info, lu_cdebug_printer, page);   \
+                CDEBUG(mask, format , ## __VA_ARGS__);                  \
+        }                                                               \
+} while (0)
+
+/**
+ * Helper macro, dumping shorter information about \a page into a log.
+ */
+#define CL_PAGE_HEADER(mask, env, page, format, ...)                    \
+do {                                                                    \
+        static DECLARE_LU_CDEBUG_PRINT_INFO(__info, mask);              \
+                                                                        \
+        if (cdebug_show(mask, DEBUG_SUBSYSTEM)) {                       \
+                cl_page_header_print(env, &__info, lu_cdebug_printer, page); \
+                CDEBUG(mask, format , ## __VA_ARGS__);                  \
+        }                                                               \
+} while (0)
+
+/** @} cl_page */
+
+/** \addtogroup cl_lock cl_lock
+ * @{ */
+/** \struct cl_lock
+ *
+ * Extent locking on the client.
+ *
+ * LAYERING
+ *
+ * The locking model of the new client code is built around
+ *
+ *        struct cl_lock
+ *
+ * data-type representing an extent lock on a regular file. cl_lock is a
+ * layered object (much like cl_object and cl_page), it consists of a header
+ * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to
+ * cl_lock::cll_layers list through cl_lock_slice::cls_linkage.
+ *
+ * All locks for a given object are linked into cl_object_header::coh_locks
+ * list (protected by cl_object_header::coh_lock_guard spin-lock) through
+ * cl_lock::cll_linkage. Currently this list is not sorted in any way. We can
+ * sort it by starting lock offset, or use an altogether different data
+ * structure like a tree.
+ *
+ * Typical cl_lock consists of the two layers:
+ *
+ *     - vvp_lock (vvp specific data), and
+ *     - lov_lock (lov specific data).
+ *
+ * lov_lock contains an array of sub-locks. Each of these sub-locks is a
+ * normal cl_lock: it has a header (struct cl_lock) and a list of layers:
+ *
+ *     - lovsub_lock, and
+ *     - osc_lock
+ *
+ * Each sub-lock is associated with a cl_object (representing stripe
+ * sub-object or the file to which top-level cl_lock is associated), and is
+ * linked into that cl_object::coh_locks. In this respect cl_lock is similar to
+ * cl_object (that at lov layer also fans out into multiple sub-objects), and
+ * is different from cl_page, that doesn't fan out (there is usually exactly
+ * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock
+ * a "top-lock" and its lovsub-osc portion a "sub-lock".
+ *
+ * LIFE CYCLE
+ *
+ * cl_lock is reference counted. When reference counter drops to 0, lock is
+ * placed in the cache, except when lock is in CLS_FREEING state. CLS_FREEING
+ * lock is destroyed when last reference is released. Referencing between
+ * top-lock and its sub-locks is described in the lov documentation module.
+ *
+ * STATE MACHINE
+ *
+ * Also, cl_lock is a state machine. This requires some clarification. One of
+ * the goals of client IO re-write was to make IO path non-blocking, or at
+ * least to make it easier to make it non-blocking in the future. Here
+ * `non-blocking' means that when a system call (read, write, truncate)
+ * reaches a situation where it has to wait for a communication with the
+ * server, it should --instead of waiting-- remember its current state and
+ * switch to some other work.  E.g., instead of waiting for a lock enqueue,
+ * client should proceed doing IO on the next stripe, etc. Obviously this is
+ * rather radical redesign, and it is not planned to be fully implemented at
+ * this time, instead we are putting some infrastructure in place, that would
+ * make it easier to do asynchronous non-blocking IO in the
+ * future. Specifically, where old locking code goes to sleep (waiting for
+ * enqueue, for example), new code returns cl_lock_transition::CLO_WAIT. When
+ * enqueue reply comes, its completion handler signals that lock state-machine
+ * is ready to transit to the next state. There is some generic code in
+ * cl_lock.c that sleeps, waiting for these signals. As a result, for users of
+ * this cl_lock.c code, it looks like locking is done in normal blocking
+ * fashion, and at the same time it is possible to switch to the non-blocking
+ * locking (simply by returning cl_lock_transition::CLO_WAIT from cl_lock.c
+ * functions).
+ *
+ * For a description of state machine states and transitions see enum
+ * cl_lock_state.
+ *
+ * There are two ways to restrict a set of states which lock might move to:
+ *
+ *     - placing a "hold" on a lock guarantees that lock will not be moved
+ *       into cl_lock_state::CLS_FREEING state until hold is released. Hold
+ *       can be only acquired on a lock that is not in
+ *       cl_lock_state::CLS_FREEING. All holds on a lock are counted in
+ *       cl_lock::cll_holds. Hold protects lock from cancellation and
+ *       destruction. Requests to cancel and destroy a lock on hold will be
+ *       recorded, but only honored when last hold on a lock is released;
+ *
+ *     - placing a "user" on a lock guarantees that lock will not leave
+ *       cl_lock_state::CLS_NEW, cl_lock_state::CLS_QUEUING,
+ *       cl_lock_state::CLS_ENQUEUED and cl_lock_state::CLS_HELD set of
+ *       states, once it enters this set. That is, if a user is added onto a
+ *       lock in a state not from this set, it doesn't immediately enforce
+ *       lock to move to this set, but once lock enters this set it will
+ *       remain there until all users are removed. Lock users are counted in
+ *       cl_lock::cll_users.
+ *
+ *       User is used to assure that lock is not canceled or destroyed while
+ *       it is being enqueued, or actively used by some IO.
+ *
+ *       Currently, a user always comes with a hold (cl_lock_invariant()
+ *       checks that a number of holds is not less than a number of users).
+ *
+ * CONCURRENCY
+ *
+ * This is how lock state-machine operates. struct cl_lock contains a mutex
+ * cl_lock::cll_guard that protects struct fields.
+ *
+ *     - mutex is taken, and cl_lock::cll_state is examined.
+ *
+ *     - for every state there are possible target states where lock can move
+ *       into. They are tried in order. Attempts to move into next state are
+ *       done by _try() functions in cl_lock.c:cl_{enqueue,unuse,wait}_try().
+ *
+ *     - if the transition can be performed immediately, state is changed,
+ *       and mutex is released.
+ *
+ *     - if the transition requires blocking, _try() function returns
+ *       cl_lock_transition::CLO_WAIT. Caller unlocks mutex and goes to
+ *       sleep, waiting for possibility of lock state change. It is woken
+ *       up when some event occurs, that makes lock state change possible
+ *       (e.g., the reception of the reply from the server), and repeats
+ *       the loop.
+ *
+ * Top-lock and sub-lock have separate mutexes and the latter has to be taken
+ * first to avoid dead-lock.
+ *
+ * To see an example of interaction of all these issues, take a look at the
+ * lov_cl.c:lov_lock_enqueue() function. It is called as a part of
+ * cl_enqueue_try(), and tries to advance top-lock to ENQUEUED state, by
+ * advancing state-machines of its sub-locks (lov_lock_enqueue_one()). Note
+ * also, that it uses trylock to grab sub-lock mutex to avoid dead-lock. It
+ * also has to handle CEF_ASYNC enqueue, when sub-locks enqueues have to be
+ * done in parallel, rather than one after another (this is used for glimpse
+ * locks, that cannot dead-lock).
+ *
+ * INTERFACE AND USAGE
+ *
+ * struct cl_lock_operations provides a number of call-backs that are invoked
+ * when events of interest occur. Layers can intercept and handle glimpse,
+ * blocking, cancel ASTs and a reception of the reply from the server.
+ *
+ * One important difference with the old client locking model is that new
+ * client has a representation for the top-lock, whereas in the old code only
+ * sub-locks existed as real data structures and file-level locks are
+ * represented by "request sets" that are created and destroyed on each and
+ * every lock creation.
+ *
+ * Top-locks are cached, and can be found in the cache by the system calls. It
+ * is possible that top-lock is in cache, but some of its sub-locks were
+ * canceled and destroyed. In that case top-lock has to be enqueued again
+ * before it can be used.
+ *
+ * Overall process of the locking during IO operation is as follows:
+ *
+ *     - once parameters for IO are setup in cl_io, cl_io_operations::cio_lock()
+ *       is called on each layer. Responsibility of this method is to add locks,
+ *       needed by a given layer into cl_io.ci_lockset.
+ *
+ *     - once locks for all layers were collected, they are sorted to avoid
+ *       dead-locks (cl_io_locks_sort()), and enqueued.
+ *
+ *     - when all locks are acquired, IO is performed;
+ *
+ *     - locks are released into cache.
+ *
+ * Striping introduces major additional complexity into locking. The
+ * fundamental problem is that it is generally unsafe to actively use (hold)
+ * two locks on the different OST servers at the same time, as this introduces
+ * inter-server dependency and can lead to cascading evictions.
+ *
+ * Basic solution is to sub-divide large read/write IOs into smaller pieces so
+ * that no multi-stripe locks are taken (note that this design abandons POSIX
+ * read/write semantics). Such pieces ideally can be executed concurrently. At
+ * the same time, certain types of IO cannot be sub-divided, without
+ * sacrificing correctness. This includes:
+ *
+ *  - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee
+ *  atomicity;
+ *
+ *  - ftruncate(fd, offset), where [offset, EOF] lock has to be taken.
+ *
+ * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where
+ * buf is a part of memory mapped Lustre file, a lock or locks protecting buf
+ * has to be held together with the usual lock on [offset, offset + count].
+ *
+ * As multi-stripe locks have to be allowed, it makes sense to cache them, so
+ * that, for example, a sequence of O_APPEND writes can proceed quickly
+ * without going down to the individual stripes to do lock matching. On the
+ * other hand, multi-stripe locks shouldn't be used by normal read/write
+ * calls. To achieve this, every layer can implement ->clo_fits_into() method,
+ * that is called by lock matching code (cl_lock_lookup()), and that can be
+ * used to selectively disable matching of certain locks for certain IOs. For
+ * example, lov layer implements lov_lock_fits_into() that allows multi-stripe
+ * locks to be matched only for truncates and O_APPEND writes.
+ *
+ * Interaction with DLM
+ *
+ * In the expected setup, cl_lock is ultimately backed up by a collection of
+ * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is
+ * implemented in osc layer, that also matches DLM events (ASTs, cancellation,
+ * etc.) into cl_lock_operation calls. See struct osc_lock for a more detailed
+ * description of interaction with DLM.
+ */
+
+/**
+ * Lock description.
+ */
+struct cl_lock_descr {
+        /** Object this lock is granted for. */
+        struct cl_object *cld_obj;
+        /** Index of the first page protected by this lock. */
+        pgoff_t           cld_start;
+        /** Index of the last page (inclusive) protected by this lock. */
+        pgoff_t           cld_end;
+        /** Lock mode. */
+        enum cl_lock_mode cld_mode;
+};
+
+#define DDESCR "%s(%d):[%lu, %lu]"
+#define PDESCR(descr)                                                   \
+        cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode,        \
+        (descr)->cld_start, (descr)->cld_end
+
+const char *cl_lock_mode_name(const enum cl_lock_mode mode);
+
+/**
+ * Lock state-machine states.
+ *
+ * \htmlonly
+ * <pre>
+ *
+ * Possible state transitions:
+ *
+ *              +------------------>NEW
+ *              |                    |
+ *              |                    | cl_enqueue_try()
+ *              |                    |
+ *              |    cl_unuse_try()  V
+ *              |  +--------------QUEUING (*)
+ *              |  |                 |
+ *              |  |                 | cl_enqueue_try()
+ *              |  |                 |
+ *              |  | cl_unuse_try()  V
+ *    sub-lock  |  +-------------ENQUEUED (*)
+ *    canceled  |  |                 |
+ *              |  |                 | cl_wait_try()
+ *              |  |                 |
+ *              |  |                (R)
+ *              |  |                 |
+ *              |  |                 V
+ *              |  |                HELD<---------+
+ *              |  |                 |            |
+ *              |  |                 |            |
+ *              |  |  cl_unuse_try() |            |
+ *              |  |                 |            |
+ *              |  |                 V            | cached
+ *              |  +------------>UNLOCKING (*)    | lock found
+ *              |                    |            |
+ *              |     cl_unuse_try() |            |
+ *              |                    |            |
+ *              |                    |            | cl_use_try()
+ *              |                    V            |
+ *              +------------------CACHED---------+
+ *                                   |
+ *                                  (C)
+ *                                   |
+ *                                   V
+ *                                FREEING
+ *
+ * Legend:
+ *
+ *         In states marked with (*) transition to the same state (i.e., a loop
+ *         in the diagram) is possible.
+ *
+ *         (R) is the point where Receive call-back is invoked: it allows layers
+ *         to handle arrival of lock reply.
+ *
+ *         (C) is the point where Cancellation call-back is invoked.
+ *
+ *         Transition to FREEING state is possible from any other state in the
+ *         diagram in case of unrecoverable error.
+ * </pre>
+ * \endhtmlonly
+ *
+ * These states are for individual cl_lock object. Top-lock and its sub-locks
+ * can be in the different states. Another way to say this is that we have
+ * nested state-machines.
+ *
+ * Separate QUEUING and ENQUEUED states are needed to support non-blocking
+ * operation for locks with multiple sub-locks. Imagine lock on a file F, that
+ * intersects 3 stripes S0, S1, and S2. To enqueue F client has to send
+ * enqueue to S0, wait for its completion, then send enqueue for S1, wait for
+ * its completion and at last enqueue lock for S2, and wait for its
+ * completion. In that case, top-lock is in QUEUING state while S0, S1 are
+ * handled, and is in ENQUEUED state after enqueue to S2 has been sent (note
+ * that in this case, sub-locks move from state to state, and top-lock remains
+ * in the same state).
+ *
+ * Separate UNLOCKING state is needed to maintain an invariant that in HELD
+ * state lock is immediately ready for use.
+ */
+enum cl_lock_state {
+        /**
+         * Lock that wasn't yet enqueued
+         */
+        CLS_NEW,
+        /**
+         * Enqueue is in progress, blocking for some intermediate interaction
+         * with the other side.
+         */
+        CLS_QUEUING,
+        /**
+         * Lock is fully enqueued, waiting for server to reply when it is
+         * granted.
+         */
+        CLS_ENQUEUED,
+        /**
+         * Lock granted, actively used by some IO.
+         */
+        CLS_HELD,
+        /**
+         * Lock is in the transition from CLS_HELD to CLS_CACHED. Lock is in
+         * this state only while cl_unuse() is executing against it.
+         */
+        CLS_UNLOCKING,
+        /**
+         * Lock granted, not used.
+         */
+        CLS_CACHED,
+        /**
+         * Lock is being destroyed.
+         */
+        CLS_FREEING,
+        CLS_NR
+};
+
+enum cl_lock_flags {
+        /**
+         * lock has been cancelled. This flag is never cleared once set (by
+         * cl_lock_cancel0()).
+         */
+        CLF_CANCELLED  = 1 << 0,
+        /** cancellation is pending for this lock. */
+        CLF_CANCELPEND = 1 << 1,
+        /** destruction is pending for this lock. */
+        CLF_DOOMED     = 1 << 2,
+        /** State update is pending. */
+        CLF_STATE      = 1 << 3
+};
+
+/**
+ * Lock closure.
+ *
+ * Lock closure is a collection of locks (both top-locks and sub-locks) that
+ * might be updated as a result of an operation on a certain lock (which lock
+ * this is a closure of).
+ *
+ * Closures are needed to guarantee dead-lock freedom in the presence of
+ *
+ *     - nested state-machines (top-lock state-machine composed of sub-lock
+ *       state-machines), and
+ *
+ *     - shared sub-locks.
+ *
+ * Specifically, many operations, such as lock enqueue, wait, unlock,
+ * etc. start from a top-lock, and then operate on sub-locks of this
+ * top-lock, holding a top-lock mutex. When sub-lock state changes as a result
+ * of such operation, this change has to be propagated to all top-locks that
+ * share this sub-lock. Obviously, no natural lock ordering (e.g.,
+ * top-to-bottom or bottom-to-top) captures this scenario, so try-locking has
+ * to be used. Lock closure systematizes this try-and-repeat logic.
+ */
+struct cl_lock_closure {
+        /**
+         * Lock that is mutexed when closure construction is started. When
+         * closure is in `wait' mode (cl_lock_closure::clc_wait), mutex on
+         * origin is released before waiting.
+         */
+        struct cl_lock   *clc_origin;
+        /**
+         * List of enclosed locks, so far. Locks are linked here through
+         * cl_lock::cll_inclosure.
+         */
+        struct list_head  clc_list;
+        /**
+         * True iff closure is in a `wait' mode. This determines what
+         * cl_lock_enclosure() does when a lock L to be added to the closure
+         * is currently mutexed by some other thread.
+         *
+         * If cl_lock_closure::clc_wait is not set, then closure construction
+         * fails with CLO_REPEAT immediately.
+         *
+         * In wait mode, cl_lock_enclosure() waits until next attempt to build
+         * a closure might succeed. To this end it releases an origin mutex
+         * (cl_lock_closure::clc_origin), that has to be the only lock mutex
+         * owned by the current thread, and then waits on L mutex (by grabbing
+         * it and immediately releasing), before returning CLO_REPEAT to the
+         * caller.
+         */
+        int               clc_wait;
+        /** Number of locks in the closure. */
+        int               clc_nr;
+};
+
+/**
+ * Layered client lock.
+ */
+struct cl_lock {
+        /** Reference counter. */
+        atomic_t              cll_ref;
+        /** List of slices. Immutable after creation. */
+        struct list_head      cll_layers;
+        /**
+         * Linkage into cl_lock::cll_descr::cld_obj::coh_locks list. Protected
+         * by cl_lock::cll_descr::cld_obj::coh_lock_guard.
+         */
+        struct list_head      cll_linkage;
+        /**
+         * Parameters of this lock. Protected by
+         * cl_lock::cll_descr::cld_obj::coh_lock_guard nested within
+         * cl_lock::cll_guard. Modified only on lock creation and in
+         * cl_lock_modify().
+         */
+        struct cl_lock_descr  cll_descr;
+        /** Protected by cl_lock::cll_guard. */
+        enum cl_lock_state    cll_state;
+        /** signals state changes. */
+        cfs_waitq_t           cll_wq;
+        /**
+         * Recursive lock, most fields in cl_lock{} are protected by this.
+         *
+         * Locking rules: this mutex is never held across network
+         * communication, except when lock is being canceled.
+         *
+         * Lock ordering: a mutex of a sub-lock is taken first, then a mutex
+         * on a top-lock. Other direction is implemented through a
+         * try-lock-repeat loop. Mutexes of unrelated locks can be taken only
+         * by try-locking.
+         *
+         * \see osc_lock_enqueue_wait(), lov_lock_cancel(), lov_sublock_wait().
+         */
+        struct mutex          cll_guard;
+        cfs_task_t           *cll_guarder;
+        int                   cll_depth;
+
+        int                   cll_error;
+        /**
+         * Number of holds on a lock. A hold prevents a lock from being
+         * canceled and destroyed. Protected by cl_lock::cll_guard.
+         *
+         * \see cl_lock_hold(), cl_lock_unhold(), cl_lock_release()
+         */
+        int                   cll_holds;
+         /**
+          * Number of lock users. Valid in cl_lock_state::CLS_HELD state
+          * only. Lock user pins lock in CLS_HELD state. Protected by
+          * cl_lock::cll_guard.
+          *
+          * \see cl_wait(), cl_unuse().
+          */
+        int                   cll_users;
+        /**
+         * Flag bit-mask. Values from enum cl_lock_flags. Updates are
+         * protected by cl_lock::cll_guard.
+         */
+        unsigned long         cll_flags;
+        /**
+         * A linkage into a list of locks in a closure.
+         *
+         * \see cl_lock_closure
+         */
+        struct list_head      cll_inclosure;
+        /**
+         * A list of references to this lock, for debugging.
+         */
+        struct lu_ref         cll_reference;
+        /**
+         * A list of holds on this lock, for debugging.
+         */
+        struct lu_ref         cll_holders;
+        /**
+         * A reference for cl_lock::cll_descr::cld_obj. For debugging.
+         */
+        struct lu_ref_link   *cll_obj_ref;
+#ifdef CONFIG_LOCKDEP
+        /* "dep_map" name is assumed by lockdep.h macros. */
+        struct lockdep_map    dep_map;
+#endif
+};
+
+/**
+ * Per-layer part of cl_lock
+ *
+ * Every layer contributes one slice to a lock; slices are chained through
+ * cl_lock_slice::cls_linkage into cl_lock::cll_layers, and the layer's
+ * methods are reached through cl_lock_slice::cls_ops.
+ *
+ * NOTE(review): cls_lock is presumably the back pointer to the lock this
+ * slice belongs to -- confirm.
+ *
+ * \see ccc_lock, lov_lock, lovsub_lock, osc_lock
+ */
+struct cl_lock_slice {
+        struct cl_lock                  *cls_lock;
+        /** Object slice corresponding to this lock slice. Immutable after
+         * creation. */
+        struct cl_object                *cls_obj;
+        const struct cl_lock_operations *cls_ops;
+        /** Linkage into cl_lock::cll_layers. Immutable after creation. */
+        struct list_head                 cls_linkage;
+};
+
+/**
+ * Possible (non-error) return values of ->clo_{enqueue,wait,unuse}().
+ *
+ * Zero (nothing more to do) and negative error codes are the other results
+ * of these methods, see the "statemachine" commentary in struct
+ * cl_lock_operations.
+ *
+ * NOTE: lov_subresult() depends on ordering here.
+ */
+enum cl_lock_transition {
+        /** operation cannot be completed immediately. Wait for state change. */
+        CLO_WAIT   = 1,
+        /** operation had to release lock mutex, restart. */
+        CLO_REPEAT = 2
+};
+
+/**
+ * Per-layer lock operations, reached through cl_lock_slice::cls_ops.
+ *
+ * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops
+ */
+struct cl_lock_operations {
+        /**
+         * \name statemachine
+         *
+         * State machine transitions. The methods in this group are called to
+         * transfer lock from one state to another, as described in the
+         * commentary above enum #cl_lock_state.
+         *
+         * \retval 0          this layer has nothing more to do before
+         *                       transition to the target state happens;
+         *
+         * \retval CLO_REPEAT method had to release and re-acquire cl_lock
+         *                    mutex, repeat invocation of transition method
+         *                    across all layers;
+         *
+         * \retval CLO_WAIT   this layer cannot move to the target state
+         *                    immediately, as it has to wait for certain event
+         *                    (e.g., the communication with the server). It
+         *                    is guaranteed, that when the state transfer
+         *                    becomes possible, cl_lock::cll_wq wait-queue
+         *                    is signaled. Caller can wait for this event by
+         *                    calling cl_lock_state_wait();
+         *
+         * \retval -ve        failure, abort state transition, move the lock
+         *                    into cl_lock_state::CLS_FREEING state, and set
+         *                    cl_lock::cll_error.
+         *
+         * Once all layers voted to agree to transition (by returning 0), lock
+         * is moved into corresponding target state. All state transition
+         * methods are optional.
+         */
+        /** @{ */
+        /**
+         * Attempts to enqueue the lock. Called top-to-bottom.
+         *
+         * \see ccc_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(),
+         * \see osc_lock_enqueue()
+         */
+        int  (*clo_enqueue)(const struct lu_env *env,
+                            const struct cl_lock_slice *slice,
+                            struct cl_io *io, __u32 enqflags);
+        /**
+         * Attempts to wait for enqueue result. Called top-to-bottom.
+         *
+         * \see ccc_lock_wait(), lov_lock_wait(), osc_lock_wait()
+         */
+        int  (*clo_wait)(const struct lu_env *env,
+                         const struct cl_lock_slice *slice);
+        /**
+         * Attempts to unlock the lock. Called bottom-to-top. In addition to
+         * usual return values of lock state-machine methods, this can return
+         * -ESTALE to indicate that lock cannot be returned to the cache, and
+         * has to be re-initialized.
+         *
+         * \see ccc_lock_unlock(), lov_lock_unlock(), osc_lock_unlock()
+         */
+        int  (*clo_unuse)(const struct lu_env *env,
+                          const struct cl_lock_slice *slice);
+        /**
+         * Notifies layer that a cached lock is starting to be used.
+         *
+         * \pre lock->cll_state == CLS_CACHED
+         *
+         * \see lov_lock_use(), osc_lock_use()
+         */
+        int  (*clo_use)(const struct lu_env *env,
+                        const struct cl_lock_slice *slice);
+        /** @} statemachine */
+        /**
+         * A method invoked when lock state is changed (as a result of state
+         * transition). This is used, for example, to track when the state of
+         * a sub-lock changes, to propagate this change to the corresponding
+         * top-lock. Optional.
+         *
+         * \see lovsub_lock_state()
+         */
+        void (*clo_state)(const struct lu_env *env,
+                          const struct cl_lock_slice *slice,
+                          enum cl_lock_state st);
+        /**
+         * Returns true, iff given lock is suitable for the given io, idea
+         * being, that there are certain "unsafe" locks, e.g., ones acquired
+         * for O_APPEND writes, that we don't want to re-use for a normal
+         * write, to avoid the danger of cascading evictions. Optional. Runs
+         * under cl_object_header::coh_lock_guard.
+         *
+         * XXX this should take more information about lock needed by
+         * io. Probably lock description or something similar.
+         *
+         * \see lov_fits_into()
+         */
+        int (*clo_fits_into)(const struct lu_env *env,
+                             const struct cl_lock_slice *slice,
+                             const struct cl_lock_descr *need,
+                             const struct cl_io *io);
+        /**
+         * \name ast
+         * Asynchronous System Traps. All of them are optional, all are
+         * executed bottom-to-top.
+         */
+        /** @{ */
+
+        /**
+         * Cancellation callback. Cancel a lock voluntarily, or under
+         * the request of server.
+         */
+        void (*clo_cancel)(const struct lu_env *env,
+                           const struct cl_lock_slice *slice);
+        /**
+         * Lock weighting ast. Executed to estimate how precious this lock
+         * is. The sum of results across all layers is used to determine
+         * whether lock is worth keeping in cache given present memory usage.
+         *
+         * \see osc_lock_weigh(), vvp_lock_weigh(), lovsub_lock_weigh().
+         */
+        unsigned long (*clo_weigh)(const struct lu_env *env,
+                                   const struct cl_lock_slice *slice);
+        /** @} ast */
+
+        /**
+         * NOTE(review): presumably lets a layer add the locks it depends on
+         * to \a closure (see struct cl_lock_closure) -- confirm against
+         * lovsub_lock_closure().
+         *
+         * \see lovsub_lock_closure()
+         */
+        int (*clo_closure)(const struct lu_env *env,
+                           const struct cl_lock_slice *slice,
+                           struct cl_lock_closure *closure);
+        /**
+         * Executed top-to-bottom when lock description changes (e.g., as a
+         * result of server granting more generous lock than was requested).
+         *
+         * \see lovsub_lock_modify()
+         */
+        int (*clo_modify)(const struct lu_env *env,
+                          const struct cl_lock_slice *slice,
+                          const struct cl_lock_descr *updated);
+        /**
+         * Notifies layers (bottom-to-top) that lock is going to be
+         * destroyed. Responsibility of layers is to prevent new references on
+         * this lock from being acquired once this method returns.
+         *
+         * This can be called multiple times due to the races.
+         *
+         * \see cl_lock_delete()
+         * \see osc_lock_delete(), lovsub_lock_delete()
+         */
+        void (*clo_delete)(const struct lu_env *env,
+                           const struct cl_lock_slice *slice);
+        /**
+         * Destructor. Frees resources and the slice.
+         *
+         * \see ccc_lock_fini(), lov_lock_fini(), lovsub_lock_fini(),
+         * \see osc_lock_fini()
+         */
+        void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice);
+        /**
+         * Optional debugging helper. Prints given lock slice.
+         */
+        int (*clo_print)(const struct lu_env *env,
+                         void *cookie, lu_printer_t p,
+                         const struct cl_lock_slice *slice);
+};
+
+#define CL_LOCK_DEBUG(mask, env, lock, format, ...)                     \
+do {                                                                    \
+        static DECLARE_LU_CDEBUG_PRINT_INFO(__info, mask);              \
+        /* print lock, then message, only if cdebug_show() is true */   \
+        if (cdebug_show(mask, DEBUG_SUBSYSTEM)) {                       \
+                cl_lock_print(env, &__info, lu_cdebug_printer, lock);   \
+                CDEBUG(mask, format , ## __VA_ARGS__);                  \
+        }                                                               \
+} while (0)
+
+/** @} cl_lock */
+
+/** \addtogroup cl_page_list cl_page_list
+ * Page list used to perform collective operations on a group of pages.
+ *
+ * Pages are added to the list one by one. cl_page_list acquires a reference
+ * for every page in it. Page list is used to perform collective operations on
+ * pages:
+ *
+ *     - submit pages for an immediate transfer,
+ *
+ *     - own pages on behalf of certain io (waiting for each page in turn),
+ *
+ *     - discard pages.
+ *
+ * When list is finalized, it releases references on all pages it still has.
+ *
+ * NOTE(review): pl_pages is presumably the head of the list of pages, pl_nr
+ * the number of pages currently on it, and pl_owner the task that owns the
+ * list -- confirm against the cl_page_list_*() helpers.
+ *
+ * \todo XXX concurrency control.
+ *
+ * @{
+ */
+struct cl_page_list {
+        unsigned         pl_nr;
+        struct list_head pl_pages;
+        cfs_task_t      *pl_owner;
+};
+
+/** \addtogroup cl_page_list cl_page_list
+ * A 2-queue of pages. A convenience data-type for common use case, 2-queue
+ * contains an incoming page list (c2_qin) and an outgoing page list
+ * (c2_qout). For example, cl_io_operations::cio_submit() takes pages from
+ * c2_qin and moves the successfully submitted ones into c2_qout.
+ */
+struct cl_2queue {
+        struct cl_page_list c2_qin;
+        struct cl_page_list c2_qout;
+};
+
+/** @} cl_page_list */
+
+/** \addtogroup cl_io cl_io
+ * @{ */
+/** \struct cl_io
+ * I/O
+ *
+ * cl_io represents a high level I/O activity like
+ * read(2)/write(2)/truncate(2) system call, or cancellation of an extent
+ * lock.
+ *
+ * cl_io is a layered object, much like cl_{object,page,lock} but with one
+ * important distinction. We want to minimize number of calls to the allocator
+ * in the fast path, e.g., in the case of read(2) when everything is cached:
+ * client already owns the lock over region being read, and data are cached
+ * due to read-ahead. To avoid allocation of cl_io layers in such situations,
+ * per-layer io state is stored in the session, associated with the io, see
+ * struct {vvp,lov,osc}_io for example. Sessions allocation is amortized
+ * by using free-lists, see cl_env_get().
+ *
+ * There is a small predefined number of possible io types, enumerated in enum
+ * cl_io_type.
+ *
+ * cl_io is a state machine, that can be advanced concurrently by the multiple
+ * threads. It is up to these threads to control the concurrency and,
+ * specifically, to detect when io is done, and its state can be safely
+ * released.
+ *
+ * For read/write io the overall execution plan is as follows:
+ *
+ *     (0) initialize io state through all layers;
+ *
+ *     (1) loop: prepare chunk of work to do
+ *
+ *     (2) call all layers to collect locks they need to process current chunk
+ *
+ *     (3) sort all locks to avoid dead-locks, and acquire them
+ *
+ *     (4) process the chunk: call per-page methods
+ *         (cl_io_operations::cio_read_page() for read,
+ *         cl_io_operations::cio_prepare_write(),
+ *         cl_io_operations::cio_commit_write() for write)
+ *
+ *     (5) release locks
+ *
+ *     (6) repeat loop.
+ *
+ * To implement the "parallel IO mode", lov layer creates sub-io's (lazily to
+ * address allocation efficiency issues mentioned above), and returns with the
+ * special error condition from per-page method when current sub-io has to
+ * block. This causes io loop to be repeated, and lov switches to the next
+ * sub-io in its cl_io_operations::cio_iter_init() implementation.
+ */
+
+/**
+ * IO types.
+ *
+ * CIT_OP_NR is not an io type: it is the number of types and sizes the
+ * cl_io_operations::op[] vector, so it has to stay last.
+ */
+enum cl_io_type {
+        /** read system call */
+        CIT_READ,
+        /** write system call */
+        CIT_WRITE,
+        /** truncate system call */
+        CIT_TRUNC,
+        /**
+         * page fault handling
+         */
+        CIT_FAULT,
+        /**
+         * Miscellaneous io. This is used for occasional io activity that
+         * doesn't fit into other types. Currently this is used for:
+         *
+         *     - cancellation of an extent lock. This io exists as a context
+         *     to write dirty pages from under the lock being canceled back
+         *     to the server;
+         *
+         *     - VM induced page write-out. An io context for writing page out
+         *     for memory cleansing;
+         *
+         *     - glimpse. An io context to acquire glimpse lock.
+         *
+         * CIT_MISC io is used simply as a context in which locks and pages
+         * are manipulated. Such io has no internal "process", that is,
+         * cl_io_loop() is never called for it.
+         */
+        CIT_MISC,
+        CIT_OP_NR
+};
+
+/**
+ * States of cl_io state machine, stored in cl_io::ci_state.
+ *
+ * NOTE(review): enumerators appear to be listed in the order in which an io
+ * passes through them during one iteration of the io loop -- confirm before
+ * relying on their numeric order.
+ */
+enum cl_io_state {
+        /** Not initialized. */
+        CIS_ZERO,
+        /** Initialized. */
+        CIS_INIT,
+        /** IO iteration started. */
+        CIS_IT_STARTED,
+        /** Locks taken. */
+        CIS_LOCKED,
+        /** Actual IO is in progress. */
+        CIS_IO_GOING,
+        /** IO for the current iteration finished. */
+        CIS_IO_FINISHED,
+        /** Locks released. */
+        CIS_UNLOCKED,
+        /** Iteration completed. */
+        CIS_IT_ENDED,
+        /** cl_io finalized. */
+        CIS_FINI
+};
+
+/**
+ * IO state private for a layer.
+ *
+ * This is usually embedded into layer session data, rather than allocated
+ * dynamically.
+ *
+ * NOTE(review): cis_io is presumably the back pointer to the io this slice
+ * belongs to -- confirm.
+ *
+ * \see vvp_io, lov_io, osc_io, ccc_io
+ */
+struct cl_io_slice {
+        struct cl_io                  *cis_io;
+        /** corresponding object slice. Immutable after creation. */
+        struct cl_object              *cis_obj;
+        /** io operations. Immutable after creation. */
+        const struct cl_io_operations *cis_iop;
+        /**
+         * linkage into a list of all slices for a given cl_io, hanging off
+         * cl_io::ci_layers. Immutable after creation.
+         */
+        struct list_head               cis_linkage;
+};
+
+
+/**
+ * Per-layer io operations, reached through cl_io_slice::cis_iop.
+ *
+ * The op[] vector holds one set of state transition methods per io type
+ * (enum cl_io_type). NOTE(review): req_op[] is presumably indexed by enum
+ * cl_req_type in the same way -- confirm.
+ *
+ * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops
+ */
+struct cl_io_operations {
+        /**
+         * Vector of io state transition methods for every io type.
+         *
+         * \see cl_page_operations::io
+         */
+        struct {
+                /**
+                 * Prepare io iteration at a given layer.
+                 *
+                 * Called top-to-bottom at the beginning of each iteration of
+                 * "io loop" (if it makes sense for this type of io). Here
+                 * layer selects what work it will do during this iteration.
+                 *
+                 * \see cl_io_operations::cio_iter_fini()
+                 */
+                int (*cio_iter_init) (const struct lu_env *env,
+                                      const struct cl_io_slice *slice);
+                /**
+                 * Finalize io iteration.
+                 *
+                 * Called bottom-to-top at the end of each iteration of "io
+                 * loop". Here layers can decide whether IO has to be
+                 * continued.
+                 *
+                 * \see cl_io_operations::cio_iter_init()
+                 */
+                void (*cio_iter_fini) (const struct lu_env *env,
+                                       const struct cl_io_slice *slice);
+                /**
+                 * Collect locks for the current iteration of io.
+                 *
+                 * Called top-to-bottom to collect all locks necessary for
+                 * this iteration. This method shouldn't actually enqueue
+                 * anything, instead it should post a lock through
+                 * cl_io_lock_add(). Once all locks are collected, they are
+                 * sorted and enqueued in the proper order.
+                 */
+                int  (*cio_lock) (const struct lu_env *env,
+                                  const struct cl_io_slice *slice);
+                /**
+                 * Finalize unlocking.
+                 *
+                 * Called bottom-to-top to finish layer specific unlocking
+                 * functionality, after generic code released all locks
+                 * acquired by cl_io_operations::cio_lock().
+                 */
+                void  (*cio_unlock)(const struct lu_env *env,
+                                    const struct cl_io_slice *slice);
+                /**
+                 * Start io iteration.
+                 *
+                 * Once all locks are acquired, called top-to-bottom to
+                 * commence actual IO. In the current implementation,
+                 * top-level vvp_io_{read,write}_start() does all the work
+                 * synchronously by calling generic_file_*(), so other layers
+                 * are called when everything is done.
+                 */
+                int  (*cio_start)(const struct lu_env *env,
+                                  const struct cl_io_slice *slice);
+                /**
+                 * Called top-to-bottom at the end of io loop. Here layer
+                 * might wait for an unfinished asynchronous io.
+                 */
+                void (*cio_end)  (const struct lu_env *env,
+                                  const struct cl_io_slice *slice);
+                /**
+                 * Called bottom-to-top to notify layers that read/write IO
+                 * iteration finished, with \a nob bytes transferred.
+                 */
+                void (*cio_advance)(const struct lu_env *env,
+                                    const struct cl_io_slice *slice,
+                                    size_t nob);
+                /**
+                 * Called once per io, bottom-to-top to release io resources.
+                 */
+                void (*cio_fini) (const struct lu_env *env,
+                                  const struct cl_io_slice *slice);
+        } op[CIT_OP_NR];
+        struct {
+                /**
+                 * Submit pages from \a queue->c2_qin for IO, and move
+                 * successfully submitted pages into \a queue->c2_qout. Return
+                 * non-zero if failed to submit even a single page. If
+                 * submission failed after some pages were moved into \a
+                 * queue->c2_qout, completion callback with non-zero ioret is
+                 * executed on them.
+                 */
+                int  (*cio_submit)(const struct lu_env *env,
+                                   const struct cl_io_slice *slice,
+                                   enum cl_req_type crt,
+                                   struct cl_2queue *queue);
+        } req_op[CRT_NR];
+        /**
+         * Read missing page.
+         *
+         * Called by a top-level cl_io_operations::op[CIT_READ]::cio_start()
+         * method, when it hits not-up-to-date page in the range. Optional.
+         *
+         * \pre io->ci_type == CIT_READ
+         */
+        int (*cio_read_page)(const struct lu_env *env,
+                             const struct cl_io_slice *slice,
+                             const struct cl_page_slice *page);
+        /**
+         * Prepare write of a \a page. Called bottom-to-top by a top-level
+         * cl_io_operations::op[CIT_WRITE]::cio_start() to prepare page for
+         * receiving data from user-level buffer.
+         *
+         * \pre io->ci_type == CIT_WRITE
+         *
+         * \see vvp_io_prepare_write(), lov_io_prepare_write(),
+         * osc_io_prepare_write().
+         */
+        int (*cio_prepare_write)(const struct lu_env *env,
+                                 const struct cl_io_slice *slice,
+                                 const struct cl_page_slice *page,
+                                 unsigned from, unsigned to);
+        /**
+         * Commit write of a \a page. NOTE(review): presumably the counterpart
+         * of cio_prepare_write(), called once data have been copied into the
+         * [\a from, \a to) part of the page -- confirm against callers.
+         *
+         * \pre io->ci_type == CIT_WRITE
+         *
+         * \see vvp_io_commit_write(), lov_io_commit_write(),
+         * osc_io_commit_write().
+         */
+        int (*cio_commit_write)(const struct lu_env *env,
+                                const struct cl_io_slice *slice,
+                                const struct cl_page_slice *page,
+                                unsigned from, unsigned to);
+        /**
+         * Optional debugging helper. Print given io slice.
+         */
+        int (*cio_print)(const struct lu_env *env, void *cookie,
+                         lu_printer_t p, const struct cl_io_slice *slice);
+};
+
+/**
+ * Flags to lock enqueue procedure. These are bit-flags that can be combined,
+ * see cl_io_lock_link::cill_enq_flags.
+ * \ingroup cl_lock
+ */
+enum cl_enq_flags {
+        /**
+         * instruct server to not block, if conflicting lock is found. Instead
+         * -EWOULDBLOCK is returned immediately.
+         */
+        CEF_NONBLOCK     = 0x00000001,
+        /**
+         * take lock asynchronously (out of order), as it cannot
+         * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing.
+         */
+        CEF_ASYNC        = 0x00000002,
+        /**
+         * tell the server to instruct (through a flag in the blocking ast) an
+         * owner of the conflicting lock, that it can drop dirty pages
+         * protected by this lock, without sending them to the server.
+         */
+        CEF_DISCARD_DATA = 0x00000004
+};
+
+/**
+ * Link between lock and io. Intermediate structure is needed, because the
+ * same lock can be part of multiple io's simultaneously.
+ *
+ * NOTE(review): cill_descr presumably describes the lock this io needs, and
+ * cill_lock points to the matching cl_lock once it is found or created --
+ * confirm against cl_io_lock_add() and the lockset code.
+ */
+struct cl_io_lock_link {
+        /** linkage into one of cl_lockset lists. */
+        struct list_head     cill_linkage;
+        struct cl_lock_descr cill_descr;
+        struct cl_lock      *cill_lock;
+        /**
+         * flags to enqueue lock for this IO. A combination of bit-flags from
+         * enum cl_enq_flags.
+         */
+        __u32                cill_enq_flags;
+        /** optional destructor */
+        void               (*cill_fini)(const struct lu_env *env,
+                                        struct cl_io_lock_link *link);
+};
+
+/**
+ * Lock-set represents a collection of locks, that io needs at a
+ * time. Generally speaking, client tries to avoid holding multiple locks when
+ * possible, because
+ *
+ *      - holding extent locks over multiple ost's introduces the danger of
+ *        "cascading timeouts";
+ *
+ *      - holding multiple locks over the same ost is still dead-lock prone,
+ *        see comment in osc_lock_enqueue(),
+ *
+ * but there are certain situations where this is unavoidable:
+ *
+ *      - O_APPEND writes have to take [0, EOF] lock for correctness;
+ *
+ *      - truncate has to take [new-size, EOF] lock for correctness;
+ *
+ *      - SNS has to take locks across full stripe for correctness;
+ *
+ *      - in the case when user level buffer, supplied to {read,write}(file0),
+ *        is a part of a memory mapped lustre file, client has to take dlm
+ *        locks on file0, and all files that back up the buffer (or a part of
+ *        the buffer, that is being processed in the current chunk, in any
+ *        case, there are situations where at least 2 locks are necessary).
+ *
+ * In such cases we at least try to take locks in the same consistent
+ * order. To this end, all locks are first collected, then sorted, and then
+ * enqueued.
+ *
+ * The lists below hold struct cl_io_lock_link entries, linked through
+ * cl_io_lock_link::cill_linkage.
+ */
+struct cl_lockset {
+        /** locks to be acquired. */
+        struct list_head cls_todo;
+        /** locks currently being processed. */
+        struct list_head cls_curr;
+        /** locks acquired. */
+        struct list_head cls_done;
+};
+
+/**
+ * Lock requirements (demand) for IO, stored in cl_io::ci_lockreq. The
+ * natural name would be cl_io_lock_req, but 'req' is always read as
+ * 'request' :-)
+ */
+enum cl_io_lock_dmd {
+        /** Always lock data (e.g., O_APPEND). */
+        CILR_MANDATORY = 0,
+        /** Layers are free to decide between local and global locking. */
+        CILR_MAYBE,
+        /** Never lock: there is no cache (e.g., liblustre). */
+        CILR_NEVER
+};
+
+struct cl_io_rw_common { /* embedded in cl_rd_io and cl_wr_io, see cl_io::u */
+        loff_t      crw_pos;      /* file position? -- TODO confirm */
+        size_t      crw_count;    /* byte count? -- TODO confirm */
+        int         crw_nonblock; /* non-blocking flag? -- TODO confirm */
+};
+
+/**
+ * State for io.
+ *
+ * cl_io is shared by all threads participating in this IO (in current
+ * implementation only one thread advances IO, but parallel IO design and
+ * concurrent copy_*_user() require multiple threads acting on the same IO).
+ * It is up to these threads to serialize their activities, including updates
+ * to mutable cl_io fields.
+ *
+ * cl_io::u holds the parameters specific to an io type. NOTE(review): the
+ * union member in use is presumably selected by cl_io::ci_type, with ci_rw
+ * overlaying the common head of ci_rd and ci_wr -- confirm.
+ *
+ * NOTE(review): ci_queue, ci_nob, ci_result and ci_continue carry no
+ * documentation here; ci_nob presumably counts bytes transferred (cf.
+ * cl_io_operations::cio_advance()) -- confirm.
+ */
+struct cl_io {
+        /** type of this IO. Immutable after creation. */
+        enum cl_io_type                ci_type;
+        /** current state of cl_io state machine. */
+        enum cl_io_state               ci_state;
+        /** main object this io is against. Immutable after creation. */
+        struct cl_object              *ci_obj;
+        /**
+         * Upper layer io, of which this io is a part. Immutable after
+         * creation.
+         */
+        struct cl_io                  *ci_parent;
+        /** List of slices. Immutable after creation. */
+        struct list_head               ci_layers;
+        /** list of locks (to be) acquired by this io. */
+        struct cl_lockset              ci_lockset;
+        /** lock requirements, this is just a help info for sublayers. */
+        enum cl_io_lock_dmd            ci_lockreq;
+        union {
+                struct cl_rd_io {
+                        struct cl_io_rw_common rd;
+                        int                    rd_is_sendfile;
+                } ci_rd;
+                struct cl_wr_io {
+                        struct cl_io_rw_common wr;
+                        int                    wr_append;
+                } ci_wr;
+                struct cl_io_rw_common ci_rw;
+                struct cl_truncate_io {
+                        /** new size to which file is truncated */
+                        size_t           tr_size;
+                        struct obd_capa *tr_capa;
+                } ci_truncate;
+                struct cl_fault_io {
+                        /** page index within file. */
+                        pgoff_t         ft_index;
+                        /** number of valid bytes on a faulted page. */
+                        int             ft_nob;
+                        /** writable page? */
+                        int             ft_writable;
+                        /** page of an executable? */
+                        int             ft_executable;
+                        /** resulting page */
+                        struct cl_page *ft_page;
+                } ci_fault;
+        } u;
+        struct cl_2queue     ci_queue;
+        size_t               ci_nob;
+        int                  ci_result;
+        int                  ci_continue;
+        /**
+         * Number of pages owned by this IO. For invariant checking.
+         */
+        unsigned             ci_owned_nr;
+};
+
+/** @} cl_io */
+
+/** \addtogroup cl_req cl_req
+ * @{ */
+/** \struct cl_req
+ * Transfer.
+ *
+ * There are two possible modes of transfer initiation on the client:
+ *
+ *     - immediate transfer: this is started when a high level io wants a page
+ *       or a collection of pages to be transferred right away. Examples:
+ *       read-ahead, synchronous read in the case of non-page aligned write,
+ *       page write-out as a part of extent lock cancellation, page write-out
+ *       as a part of memory cleansing. Immediate transfer can be both
+ *       cl_req_type::CRT_READ and cl_req_type::CRT_WRITE;
+ *
+ *     - opportunistic transfer (cl_req_type::CRT_WRITE only), that happens
+ *       when io wants to transfer a page to the server some time later, when
+ *       it can be done efficiently. Example: pages dirtied by the write(2)
+ *       path.
+ *
+ * In any case, transfer takes place in the form of a cl_req, which is a
+ * representation for a network RPC.
+ *
+ * Pages queued for an opportunistic transfer are cached until it is decided
+ * that efficient RPC can be composed of them. This decision is made by "a
+ * req-formation engine", currently implemented as a part of osc
+ * layer. Req-formation depends on many factors: the size of the resulting
+ * RPC, whether or not multi-object RPCs are supported by the server,
+ * max-rpc-in-flight limitations, size of the dirty cache, etc.
+ *
+ * For the immediate transfer io submits a cl_page_list, that req-formation
+ * engine slices into cl_req's, possibly adding cached pages to some of
+ * the resulting req's.
+ *
+ * Whenever a page from cl_page_list is added to a newly constructed req, its
+ * cl_page_operations::cpo_prep() layer methods are called. At that moment,
+ * page state is atomically changed from cl_page_state::CPS_OWNED to
+ * cl_page_state::CPS_PAGEOUT or cl_page_state::CPS_PAGEIN, cl_page::cp_owner
+ * is zeroed, and cl_page::cp_req is set to the
+ * req. cl_page_operations::cpo_prep() method at the particular layer might
+ * return -EALREADY to indicate that it does not need to submit this page
+ * at all. This is possible, for example, if page, submitted for read,
+ * became up-to-date in the meantime; and for write, the page doesn't have
+ * dirty bit marked. \see cl_io_submit_rw()
+ *
+ * Whenever a cached page is added to a newly constructed req, its
+ * cl_page_operations::cpo_make_ready() layer methods are called. At that
+ * moment, page state is atomically changed from cl_page_state::CPS_CACHED to
+ * cl_page_state::CPS_PAGEOUT, and cl_page::cp_req is set to
+ * req. cl_page_operations::cpo_make_ready() method at the particular layer
+ * might return -EAGAIN to indicate that this page is not eligible for the
+ * transfer right now.
+ *
+ * FUTURE
+ *
+ * Plan is to divide transfers into "priority bands" (indicated when
+ * submitting cl_page_list, and queuing a page for the opportunistic transfer)
+ * and allow glueing of cached pages to immediate transfers only within single
+ * band. This would make high priority transfers (like lock cancellation or
+ * memory pressure induced write-out) really high priority.
+ *
+ */
+
+/**
+ * Per-transfer attributes.
+ */
+struct cl_req_attr {
+        /** Generic attributes for the server consumption. */
+        struct obdo     *cra_oa;
+        /** Capability. */
+        struct obd_capa *cra_capa;
+};
+
+/**
+ * Transfer request operations definable at every layer.
+ *
+ * Concurrency: transfer formation engine synchronizes calls to all transfer
+ * methods.
+ */
+struct cl_req_operations {
+        /**
+         * Invoked top-to-bottom by cl_req_prep() when transfer formation is
+         * complete (all pages are added).
+         *
+         * \see osc_req_prep()
+         */
+        int  (*cro_prep)(const struct lu_env *env,
+                         const struct cl_req_slice *slice);
+        /**
+         * Called top-to-bottom to fill in the fields of the struct obdo
+         * carried by \a attr (cl_req_attr::cra_oa). This is called twice
+         * with different flags, see bug 10150 and osc_build_req().
+         *
+         * \param obj an object from cl_req whose attributes are to be set in
+         *            \a attr.
+         *
+         * \param attr per-transfer attributes; its cl_req_attr::cra_oa is
+         *             the struct obdo where attributes are placed
+         *
+         * \param flags struct obdo fields to be filled.
+         */
+        void (*cro_attr_set)(const struct lu_env *env,
+                             const struct cl_req_slice *slice,
+                             const struct cl_object *obj,
+                             struct cl_req_attr *attr, obd_valid flags);
+        /**
+         * Called top-to-bottom from cl_req_completion() to notify layers that
+         * transfer completed. Has to free all state allocated by
+         * cl_device_operations::cdo_req_init().
+         */
+        void (*cro_completion)(const struct lu_env *env,
+                               const struct cl_req_slice *slice, int ioret);
+};
+
+/**
+ * A per-object state that (potentially multi-object) transfer request keeps.
+ */
+struct cl_req_obj {
+        /** object itself */
+        struct cl_object   *ro_obj;
+        /** reference to cl_req_obj::ro_obj. For debugging. */
+        struct lu_ref_link *ro_obj_ref;
+        /* something else? Number of pages for a given object? */
+};
+
+/**
+ * Transfer request.
+ *
+ * Transfer requests are not reference counted, because IO sub-system owns
+ * them exclusively and knows when to free them.
+ *
+ * Life cycle.
+ *
+ * cl_req is created by cl_req_alloc() that calls
+ * cl_device_operations::cdo_req_init() device methods to allocate per-req
+ * state in every layer.
+ *
+ * Then pages are added (cl_req_page_add()), req keeps track of all objects it
+ * contains pages for.
+ *
+ * Once all pages were collected, cl_req_operations::cro_prep() method is
+ * called top-to-bottom. At that point layers can modify req, let it pass, or
+ * deny it completely. This is to support things like SNS that have transfer
+ * ordering requirements invisible to the individual req-formation engine.
+ *
+ * On transfer completion (or transfer timeout, or failure to initiate the
+ * transfer of an allocated req), cl_req_operations::cro_completion() method
+ * is called, after execution of cl_page_operations::cpo_completion() of all
+ * req's pages.
+ */
+struct cl_req {
+        enum cl_req_type    crq_type;
+        /** A list of pages being transferred */
+        struct list_head    crq_pages;
+        /** Number of pages in cl_req::crq_pages */
+        unsigned            crq_nrpages;
+        /** An array of objects whose pages are in cl_req::crq_pages */
+        struct cl_req_obj  *crq_o;
+        /** Number of elements in cl_req::crq_o[] */
+        unsigned            crq_nrobjs;
+        struct list_head    crq_layers;
+};
+
+/**
+ * Per-layer state for request.
+ */
+struct cl_req_slice {
+        struct cl_req    *crs_req;
+        struct cl_device *crs_dev;
+        struct list_head  crs_linkage;
+        const struct cl_req_operations *crs_ops;
+};
+
+/* @} cl_req */
+
+/**
+ * Stats for a generic cache (similar to inode, lu_object, etc. caches).
+ */
+struct cache_stats {
+        const char    *cs_name;
+        /** how many entities were created at all */
+        atomic_t       cs_created;
+        /** how many cache lookups were performed */
+        atomic_t       cs_lookup;
+        /** how many times cache lookup resulted in a hit */
+        atomic_t       cs_hit;
+        /** how many entities are in the cache right now */
+        atomic_t       cs_total;
+        /** how many entities in the cache are actively used (and cannot be
+         * evicted) right now */
+        atomic_t       cs_busy;
+};
+
+/** These are not exported so far */
+void cache_stats_init (struct cache_stats *cs, const char *name);
+int  cache_stats_print(const struct cache_stats *cs,
+                       char *page, int count, int header);
+
+/**
+ * Client-side site. This represents particular client stack. "Global"
+ * variables should (directly or indirectly) be added here to allow multiple
+ * clients to co-exist in the single address space.
+ */
+struct cl_site {
+        struct lu_site        cs_lu;
+        /**
+         * Statistical counters. Atomics do not scale, something better like
+         * per-cpu counters is needed.
+         *
+         * These are exported as /proc/fs/lustre/llite/.../site
+         *
+         * When interpreting keep in mind that both sub-locks (and sub-pages)
+         * and top-locks (and top-pages) are accounted here.
+         */
+        struct cache_stats    cs_pages;
+        struct cache_stats    cs_locks;
+        atomic_t              cs_pages_state[CPS_NR];
+        atomic_t              cs_locks_state[CLS_NR];
+};
+
+int  cl_site_init (struct cl_site *s, struct cl_device *top);
+void cl_site_fini (struct cl_site *s);
+void cl_stack_fini(const struct lu_env *env, struct cl_device *cl);
+
+/**
+ * Output client site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int cl_site_stats_print(const struct cl_site *s, char *page, int count);
+
+/**
+ * \name helpers
+ *
+ * Type conversion and accessory functions.
+ */
+/** @{ */
+
+static inline struct cl_site *lu2cl_site(const struct lu_site *site)
+{
+        return container_of(site, struct cl_site, cs_lu);
+}
+
+static inline int lu_device_is_cl(const struct lu_device *d)
+{
+        return d->ld_type->ldt_tags & LU_DEVICE_CL;
+}
+
+static inline struct cl_device *lu2cl_dev(const struct lu_device *d)
+{
+        LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d));
+        return container_of0(d, struct cl_device, cd_lu_dev);
+}
+
+static inline struct lu_device *cl2lu_dev(struct cl_device *d)
+{
+        return &d->cd_lu_dev;
+}
+
+static inline struct cl_object *lu2cl(const struct lu_object *o)
+{
+        LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev));
+        return container_of0(o, struct cl_object, co_lu);
+}
+
+static inline const struct cl_object_conf *
+lu2cl_conf(const struct lu_object_conf *conf)
+{
+        return container_of0(conf, struct cl_object_conf, coc_lu);
+}
+
+static inline struct cl_object *cl_object_next(const struct cl_object *obj)
+{
+        return obj ? lu2cl(lu_object_next(&obj->co_lu)) : NULL;
+}
+
+/**
+ * Maps the lu_device referenced by \a o (o->co_lu.lo_dev) to the cl_device
+ * embedding it as cl_device::cd_lu_dev, via container_of0().
+ *
+ * NOTE(review): the assertion below tolerates a NULL or IS_ERR() \a o, yet
+ * the return statement dereferences \a o unconditionally; callers therefore
+ * have to pass a valid object. Confirm whether the assertion should be
+ * tightened.
+ */
+static inline struct cl_device *cl_object_device(const struct cl_object *o)
+{
+        LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev));
+        return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev);
+}
+
+static inline struct cl_object_header *luh2coh(const struct lu_object_header *h)
+{
+        return container_of0(h, struct cl_object_header, coh_lu);
+}
+
+static inline struct cl_site *cl_object_site(const struct cl_object *obj)
+{
+        return lu2cl_site(obj->co_lu.lo_dev->ld_site);
+}
+
+static inline
+struct cl_object_header *cl_object_header(const struct cl_object *obj)
+{
+        return luh2coh(obj->co_lu.lo_header);
+}
+
+static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t)
+{
+        return lu_device_init(&d->cd_lu_dev, t);
+}
+
+static inline void cl_device_fini(struct cl_device *d)
+{
+        lu_device_fini(&d->cd_lu_dev);
+}
+
+void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
+                       struct cl_object *obj,
+                       const struct cl_page_operations *ops);
+void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
+                       struct cl_object *obj,
+                       const struct cl_lock_operations *ops);
+void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
+                     struct cl_object *obj, const struct cl_io_operations *ops);
+void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
+                      struct cl_device *dev,
+                      const struct cl_req_operations *ops);
+/** @} helpers */
+
+/** \defgroup cl_object cl_object
+ * @{ */
+struct cl_object *cl_object_top (struct cl_object *o);
+struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd,
+                                 const struct lu_fid *fid,
+                                 const struct cl_object_conf *c);
+
+int  cl_object_header_init(struct cl_object_header *h);
+void cl_object_header_fini(struct cl_object_header *h);
+void cl_object_put        (const struct lu_env *env, struct cl_object *o);
+void cl_object_get        (struct cl_object *o);
+void cl_object_attr_lock  (struct cl_object *o);
+void cl_object_attr_unlock(struct cl_object *o);
+int  cl_object_attr_get   (const struct lu_env *env, struct cl_object *obj,
+                           struct cl_attr *attr);
+int  cl_object_attr_set   (const struct lu_env *env, struct cl_object *obj,
+                           const struct cl_attr *attr, unsigned valid);
+int  cl_object_glimpse    (const struct lu_env *env, struct cl_object *obj,
+                           struct ost_lvb *lvb);
+int  cl_conf_set          (const struct lu_env *env, struct cl_object *obj,
+                           const struct cl_object_conf *conf);
+void cl_object_prune      (const struct lu_env *env, struct cl_object *obj);
+void cl_object_kill       (const struct lu_env *env, struct cl_object *obj);
+
+/**
+ * Returns true, iff \a o0 and \a o1 are slices of the same object.
+ */
+static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1)
+{
+        return cl_object_header(o0) == cl_object_header(o1);
+}
+
+/** @} cl_object */
+
+/** \defgroup cl_page cl_page
+ * @{ */
+struct cl_page       *cl_page_lookup(struct cl_object_header *hdr,
+                                     pgoff_t index);
+void                  cl_page_gang_lookup(const struct lu_env *env,
+                                          struct cl_object *obj,
+                                          struct cl_io *io,
+                                          pgoff_t start, pgoff_t end,
+                                          struct cl_page_list *plist);
+struct cl_page *cl_page_find        (const struct lu_env *env,
+                                     struct cl_object *obj,
+                                     pgoff_t idx, struct page *vmpage,
+                                     enum cl_page_type type);
+void            cl_page_get         (struct cl_page *page);
+void            cl_page_put         (const struct lu_env *env,
+                                     struct cl_page *page);
+void            cl_page_print       (const struct lu_env *env, void *cookie,
+                                     lu_printer_t printer,
+                                     const struct cl_page *pg);
+void            cl_page_header_print(const struct lu_env *env, void *cookie,
+                                     lu_printer_t printer,
+                                     const struct cl_page *pg);
+cfs_page_t     *cl_page_vmpage      (const struct lu_env *env,
+                                     struct cl_page *page);
+struct cl_page *cl_vmpage_page      (cfs_page_t *vmpage, struct cl_object *obj);
+struct cl_page *cl_page_top         (struct cl_page *page);
+int             cl_is_page          (const void *addr);
+
+const struct cl_page_slice *cl_page_at(const struct cl_page *page,
+                                       const struct lu_device_type *dtype);
+
+/**
+ * \name ownership
+ *
+ * Functions dealing with the ownership of page by io.
+ */
+/** @{ */
+
+int  cl_page_own        (const struct lu_env *env,
+                         struct cl_io *io, struct cl_page *page);
+void cl_page_assume     (const struct lu_env *env,
+                         struct cl_io *io, struct cl_page *page);
+void cl_page_unassume   (const struct lu_env *env,
+                         struct cl_io *io, struct cl_page *pg);
+void cl_page_disown     (const struct lu_env *env,
+                         struct cl_io *io, struct cl_page *page);
+int  cl_page_is_owned   (const struct cl_page *pg, const struct cl_io *io);
+
+/** @} ownership */
+
+/**
+ * \name transfer
+ *
+ * Functions dealing with the preparation of a page for a transfer, and
+ * tracking transfer state.
+ */
+/** @{ */
+int  cl_page_prep       (const struct lu_env *env, struct cl_io *io,
+                         struct cl_page *pg, enum cl_req_type crt);
+void cl_page_completion (const struct lu_env *env,
+                         struct cl_page *pg, enum cl_req_type crt, int ioret);
+int  cl_page_make_ready (const struct lu_env *env, struct cl_page *pg,
+                         enum cl_req_type crt);
+int  cl_page_cache_add  (const struct lu_env *env, struct cl_io *io,
+                         struct cl_page *pg, enum cl_req_type crt);
+void cl_page_clip       (const struct lu_env *env, struct cl_page *pg,
+                         int from, int to);
+int  cl_page_cancel     (const struct lu_env *env, struct cl_page *page);
+
+/** @} transfer */
+
+
+/**
+ * \name helper routines
+ * Functions to discard, delete and export a cl_page.
+ */
+/** @{ */
+void    cl_page_discard      (const struct lu_env *env, struct cl_io *io,
+                              struct cl_page *pg);
+void    cl_page_delete       (const struct lu_env *env, struct cl_page *pg);
+int     cl_page_unmap        (const struct lu_env *env, struct cl_io *io,
+                              struct cl_page *pg);
+int     cl_page_is_vmlocked  (const struct lu_env *env,
+                              const struct cl_page *pg);
+void    cl_page_export       (const struct lu_env *env, struct cl_page *pg);
+int     cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
+                              struct cl_page *page);
+loff_t  cl_offset            (const struct cl_object *obj, pgoff_t idx);
+pgoff_t cl_index             (const struct cl_object *obj, loff_t offset);
+int     cl_page_size         (const struct cl_object *obj);
+int     cl_pages_prune       (const struct lu_env *env, struct cl_object *obj);
+
+void cl_lock_print      (const struct lu_env *env, void *cookie,
+                         lu_printer_t printer, const struct cl_lock *lock);
+void cl_lock_descr_print(const struct lu_env *env, void *cookie,
+                         lu_printer_t printer,
+                         const struct cl_lock_descr *descr);
+/** @} helper */
+
+/** @} cl_page */
+
+/** \defgroup cl_lock cl_lock
+ * @{ */
+
+struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io,
+                             const struct cl_lock_descr *need,
+                             const char *scope, const void *source);
+struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
+                             const struct cl_lock_descr *need,
+                             const char *scope, const void *source);
+struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
+                                const struct cl_lock_descr *need,
+                                __u32 enqflags,
+                                const char *scope, const void *source);
+struct cl_lock *cl_lock_at_page(const struct lu_env *env, struct cl_object *obj,
+                                struct cl_page *page, struct cl_lock *except,
+                                int pending, int canceld);
+
+const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
+                                       const struct lu_device_type *dtype);
+
+void  cl_lock_get       (struct cl_lock *lock);
+void  cl_lock_get_trust (struct cl_lock *lock);
+void  cl_lock_put       (const struct lu_env *env, struct cl_lock *lock);
+void  cl_lock_hold_add  (const struct lu_env *env, struct cl_lock *lock,
+                         const char *scope, const void *source);
+void  cl_lock_unhold    (const struct lu_env *env, struct cl_lock *lock,
+                         const char *scope, const void *source);
+void  cl_lock_release   (const struct lu_env *env, struct cl_lock *lock,
+                         const char *scope, const void *source);
+void  cl_lock_user_add  (const struct lu_env *env, struct cl_lock *lock);
+int   cl_lock_user_del  (const struct lu_env *env, struct cl_lock *lock);
+int   cl_lock_compatible(const struct cl_lock *lock1,
+                         const struct cl_lock *lock2);
+
+/** \name statemachine statemachine
+ * Interface to lock state machine consists of 3 parts:
+ *
+ *     - "try" functions that attempt to effect a state transition. If state
+ *     transition is not possible right now (e.g., if it has to wait for some
+ *     asynchronous event to occur), these functions return
+ *     cl_lock_transition::CLO_WAIT.
+ *
+ *     - "non-try" functions that implement synchronous blocking interface on
+ *     top of non-blocking "try" functions. These functions repeatedly call
+ *     corresponding "try" versions, and if state transition is not possible
+ *     immediately, wait for lock state change.
+ *
+ *     - methods from cl_lock_operations, called by "try" functions. Lock can
+ *     be advanced to the target state only when all layers voted that they
+ *     are ready for this transition. "Try" functions call methods under lock
+ *     mutex. If a layer had to release a mutex, it re-acquires it and returns
+ *     cl_lock_transition::CLO_REPEAT, causing "try" function to call all
+ *     layers again.
+ *
+ * TRY              NON-TRY      METHOD                            FINAL STATE
+ *
+ * cl_enqueue_try() cl_enqueue() cl_lock_operations::clo_enqueue() CLS_ENQUEUED
+ *
+ * cl_wait_try()    cl_wait()    cl_lock_operations::clo_wait()    CLS_HELD
+ *
+ * cl_unuse_try()   cl_unuse()   cl_lock_operations::clo_unuse()   CLS_CACHED
+ *
+ * cl_use_try()     NONE         cl_lock_operations::clo_use()     CLS_HELD
+ *
+ * @{ */
+
+int   cl_enqueue    (const struct lu_env *env, struct cl_lock *lock,
+                     struct cl_io *io, __u32 flags);
+int   cl_wait       (const struct lu_env *env, struct cl_lock *lock);
+void  cl_unuse      (const struct lu_env *env, struct cl_lock *lock);
+int   cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
+                     struct cl_io *io, __u32 flags);
+int   cl_unuse_try  (const struct lu_env *env, struct cl_lock *lock);
+int   cl_wait_try   (const struct lu_env *env, struct cl_lock *lock);
+int   cl_use_try    (const struct lu_env *env, struct cl_lock *lock);
+/** @} statemachine */
+
+void cl_lock_signal      (const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_state_wait  (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_state_set   (const struct lu_env *env, struct cl_lock *lock,
+                          enum cl_lock_state state);
+int  cl_queue_match      (const struct list_head *queue,
+                          const struct cl_lock_descr *need);
+
+void cl_lock_mutex_get  (const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_mutex_try  (const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_mutex_put  (const struct lu_env *env, struct cl_lock *lock);
+int  cl_lock_is_mutexed (struct cl_lock *lock);
+int  cl_lock_nr_mutexed (const struct lu_env *env);
+int  cl_lock_page_out   (const struct lu_env *env, struct cl_lock *lock,
+                         int discard);
+int  cl_lock_ext_match  (const struct cl_lock_descr *has,
+                         const struct cl_lock_descr *need);
+int  cl_lock_descr_match(const struct cl_lock_descr *has,
+                         const struct cl_lock_descr *need);
+int  cl_lock_mode_match (enum cl_lock_mode has, enum cl_lock_mode need);
+int  cl_lock_modify     (const struct lu_env *env, struct cl_lock *lock,
+                         const struct cl_lock_descr *desc);
+
+void cl_lock_closure_init (const struct lu_env *env,
+                           struct cl_lock_closure *closure,
+                           struct cl_lock *origin, int wait);
+void cl_lock_closure_fini (struct cl_lock_closure *closure);
+int  cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
+                           struct cl_lock_closure *closure);
+void cl_lock_disclosure   (const struct lu_env *env,
+                           struct cl_lock_closure *closure);
+int  cl_lock_enclosure    (const struct lu_env *env, struct cl_lock *lock,
+                           struct cl_lock_closure *closure);
+
+void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock);
+void cl_lock_error (const struct lu_env *env, struct cl_lock *lock, int error);
+void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int wait);
+int  cl_is_lock    (const void *addr);
+
+unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock);
+
+/** @} cl_lock */
+
+/** \defgroup cl_io cl_io
+ * @{ */
+
+int   cl_io_init         (const struct lu_env *env, struct cl_io *io,
+                          enum cl_io_type iot, struct cl_object *obj);
+int   cl_io_sub_init     (const struct lu_env *env, struct cl_io *io,
+                          enum cl_io_type iot, struct cl_object *obj);
+int   cl_io_rw_init      (const struct lu_env *env, struct cl_io *io,
+                          enum cl_io_type iot, loff_t pos, size_t count);
+int   cl_io_loop         (const struct lu_env *env, struct cl_io *io);
+
+void  cl_io_fini         (const struct lu_env *env, struct cl_io *io);
+int   cl_io_iter_init    (const struct lu_env *env, struct cl_io *io);
+void  cl_io_iter_fini    (const struct lu_env *env, struct cl_io *io);
+int   cl_io_lock         (const struct lu_env *env, struct cl_io *io);
+void  cl_io_unlock       (const struct lu_env *env, struct cl_io *io);
+int   cl_io_start        (const struct lu_env *env, struct cl_io *io);
+void  cl_io_end          (const struct lu_env *env, struct cl_io *io);
+int   cl_io_lock_add     (const struct lu_env *env, struct cl_io *io,
+                          struct cl_io_lock_link *link);
+int   cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
+                           struct cl_lock_descr *descr);
+int   cl_io_read_page    (const struct lu_env *env, struct cl_io *io,
+                          struct cl_page *page);
+int   cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
+                          struct cl_page *page, unsigned from, unsigned to);
+int   cl_io_commit_write (const struct lu_env *env, struct cl_io *io,
+                          struct cl_page *page, unsigned from, unsigned to);
+int   cl_io_submit_rw    (const struct lu_env *env, struct cl_io *io,
+                          enum cl_req_type iot, struct cl_2queue *queue);
+void  cl_io_rw_advance   (const struct lu_env *env, struct cl_io *io,
+                          size_t nob);
+int   cl_io_cancel       (const struct lu_env *env, struct cl_io *io,
+                          struct cl_page_list *queue);
+int   cl_io_is_going     (const struct lu_env *env);
+
+/**
+ * True, iff \a io is an O_APPEND write(2).
+ */
+static inline int cl_io_is_append(const struct cl_io *io)
+{
+        return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append;
+}
+
+int cl_io_is_sendfile(const struct cl_io *io);
+
+struct cl_io *cl_io_top(struct cl_io *io);
+
+void cl_io_print(const struct lu_env *env, void *cookie,
+                 lu_printer_t printer, const struct cl_io *io);
+
+/**
+ * Zeroes the part of the structure pointed to by \a foo_io that follows its
+ * member \a base, leaving \a base itself untouched.
+ *
+ * CLASSERT() checks that \a base is located at offset 0 of the structure, so
+ * that "everything after base" is exactly sizeof(*foo_io) - sizeof(base)
+ * bytes. \a foo_io is evaluated once (copied into a local).
+ */
+#define CL_IO_SLICE_CLEAN(foo_io, base)                                 \
+do {                                                                    \
+        typeof(foo_io) __foo_io = (foo_io);                             \
+                                                                        \
+        CLASSERT(offsetof(typeof(*__foo_io), base) == 0);               \
+        memset(&__foo_io->base + 1, 0,                                  \
+               (sizeof *__foo_io) - sizeof __foo_io->base);             \
+} while (0)
+
+/** @} cl_io */
+
+/** \defgroup cl_page_list cl_page_list
+ * @{ */
+
+/**
+ * Iterate over pages in a page list.
+ */
+#define cl_page_list_for_each(page, list)                               \
+        list_for_each_entry((page), &(list)->pl_pages, cp_batch)
+
+/**
+ * Iterate over pages in a page list, taking possible removals into account.
+ */
+#define cl_page_list_for_each_safe(page, temp, list)                    \
+        list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch)
+
+void cl_page_list_init   (struct cl_page_list *plist);
+void cl_page_list_add    (struct cl_page_list *plist, struct cl_page *page);
+void cl_page_list_move   (struct cl_page_list *dst, struct cl_page_list *src,
+                          struct cl_page *page);
+void cl_page_list_splice (struct cl_page_list *list,
+                          struct cl_page_list *head);
+void cl_page_list_del    (const struct lu_env *env,
+                          struct cl_page_list *plist, struct cl_page *page);
+void cl_page_list_disown (const struct lu_env *env,
+                          struct cl_io *io, struct cl_page_list *plist);
+int  cl_page_list_own    (const struct lu_env *env,
+                          struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_assume (const struct lu_env *env,
+                          struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_discard(const struct lu_env *env,
+                          struct cl_io *io, struct cl_page_list *plist);
+int  cl_page_list_unmap  (const struct lu_env *env,
+                          struct cl_io *io, struct cl_page_list *plist);
+void cl_page_list_fini   (const struct lu_env *env, struct cl_page_list *plist);
+
+void cl_2queue_init     (struct cl_2queue *queue);
+void cl_2queue_add      (struct cl_2queue *queue, struct cl_page *page);
+void cl_2queue_disown   (const struct lu_env *env,
+                         struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_assume   (const struct lu_env *env,
+                         struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_discard  (const struct lu_env *env,
+                         struct cl_io *io, struct cl_2queue *queue);
+void cl_2queue_fini     (const struct lu_env *env, struct cl_2queue *queue);
+void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page);
+
+/** @} cl_page_list */
+
+/** \defgroup cl_req cl_req
+ * @{ */
+struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
+                            enum cl_req_type crt, int nr_objects);
+
+void cl_req_page_add  (const struct lu_env *env, struct cl_req *req,
+                       struct cl_page *page);
+void cl_req_page_done (const struct lu_env *env, struct cl_page *page);
+int  cl_req_prep      (const struct lu_env *env, struct cl_req *req);
+void cl_req_attr_set  (const struct lu_env *env, struct cl_req *req,
+                       struct cl_req_attr *attr, obd_valid flags);
+void cl_req_completion(const struct lu_env *env, struct cl_req *req, int ioret);
+
+/** \defgroup cl_sync_io cl_sync_io
+ * @{ */
+
+/**
+ * Anchor for synchronous transfer. This is allocated on a stack by thread
+ * doing synchronous transfer, and a pointer to this structure is set up in
+ * every page submitted for transfer. Transfer completion routine updates
+ * anchor and wakes up waiting thread when transfer is complete.
+ */
+struct cl_sync_io {
+        /** number of pages yet to be transferred. */
+        atomic_t             csi_sync_nr;
+        /** completion to be signaled when transfer is complete. */
+        struct completion    csi_sync_completion;
+        /** error code. */
+        int                  csi_sync_rc;
+};
+
+void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages);
+int  cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
+                     struct cl_page_list *queue, struct cl_sync_io *anchor);
+void cl_sync_io_note(struct cl_sync_io *anchor, int ioret);
+
+/** @} cl_sync_io */
+
+/** @} cl_req */
+
+/** \defgroup cl_env cl_env
+ *
+ * lu_env handling for a client.
+ *
+ * lu_env is an environment within which lustre code executes. Its major part
+ * is lu_context---a fast memory allocation mechanism that is used to conserve
+ * precious kernel stack space. Originally lu_env was designed for a server,
+ * where
+ *
+ *     - there is a (mostly) fixed number of threads, and
+ *
+ *     - call chains have no non-lustre portions inserted between lustre code.
+ *
+ * On a client both of these assumptions fail, because every user thread can
+ * potentially execute lustre code as part of a system call, and lustre calls
+ * into VFS or MM that call back into lustre.
+ *
+ * To deal with that, cl_env wrapper functions implement the following
+ * optimizations:
+ *
+ *     - allocation and destruction of environment is amortized by caching no
+ *     longer used environments instead of destroying them;
+ *
+ *     - there is a notion of "current" environment, attached to the kernel
+ *     data structure representing current thread (current->journal_info in
+ *     Linux kernel). Top-level lustre code allocates an environment and makes
+ *     it current, then calls into non-lustre code, that in turn calls lustre
+ *     back. Low-level lustre code thus called can fetch environment created
+ *     by the top-level code and reuse it, avoiding additional environment
+ *     allocation.
+ *
+ * \see lu_env, lu_context, lu_context_key
+ * @{ */
+
+struct cl_env_nest {
+        int   cen_refcheck;
+        void *cen_cookie;
+};
+
+struct lu_env *cl_env_peek       (int *refcheck);
+struct lu_env *cl_env_get        (int *refcheck);
+struct lu_env *cl_env_alloc      (int *refcheck, __u32 tags);
+struct lu_env *cl_env_nested_get (struct cl_env_nest *nest);
+void           cl_env_put        (struct lu_env *env, int *refcheck);
+void           cl_env_nested_put (struct cl_env_nest *nest, struct lu_env *env);
+void          *cl_env_reenter    (void);
+void           cl_env_reexit     (void *cookie);
+void           cl_env_implant    (struct lu_env *env, int *refcheck);
+void           cl_env_unplant    (struct lu_env *env, int *refcheck);
+unsigned       cl_env_cache_purge(unsigned nr);
+
+/** @} cl_env */
+
+/*
+ * Misc
+ */
+void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr);
+void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb);
+
+struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
+                                struct lu_device_type *ldt,
+                                struct lu_device *next);
+/** @} clio */
+
+#endif /* _LINUX_CL_OBJECT_H */
diff --git a/lustre/include/dt_object.h b/lustre/include/dt_object.h

index b1fa210..fbbb9ad 100644 (file)
--- a/lustre/include/dt_object.h
+++ b/lustre/include/dt_object.h
@@ -184,7 +184,7 @@ extern const struct dt_index_features dt_directory_features;
  
  /**
   * This is a general purpose dt allocation hint.
- * It now contains the parent object. 
+ * It now contains the parent object.
   * It can contain any allocation hint in the future.
   */
  struct dt_allocation_hint {
@@ -287,7 +287,7 @@ struct dt_object_operations {
           * postcondition: ergo(result == 0, dt_object_exists(dt));
           */
          int   (*do_create)(const struct lu_env *env, struct dt_object *dt,
-                           struct lu_attr *attr, 
+                           struct lu_attr *attr,
                             struct dt_allocation_hint *hint,
                             struct thandle *th);
  
@@ -414,7 +414,7 @@ struct dt_index_operations {
  };
  
  struct dt_device {
-        struct lu_device             dd_lu_dev;
+        struct lu_device                   dd_lu_dev;
          const struct dt_device_operations *dd_ops;
  
          /**
@@ -422,7 +422,7 @@ struct dt_device {
           * way, because callbacks are supposed to be added/deleted only during
           * single-threaded start-up shut-down procedures.
           */
-        struct list_head             dd_txn_callbacks;
+        struct list_head                   dd_txn_callbacks;
  };
  
  int  dt_device_init(struct dt_device *dev, struct lu_device_type *t);
@@ -440,7 +440,7 @@ static inline struct dt_device * lu2dt_dev(struct lu_device *l)
  }
  
  struct dt_object {
-        struct lu_object             do_lu;
+        struct lu_object                   do_lu;
          const struct dt_object_operations *do_ops;
          const struct dt_body_operations   *do_body_ops;
          const struct dt_index_operations  *do_index_ops;
diff --git a/lustre/include/lclient.h b/lustre/include/lclient.h

new file mode 100644 (file)

index 0000000..dfd7f65
--- /dev/null
+++ b/lustre/include/lclient.h
@@ -0,0 +1,375 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Definitions shared between vvp and liblustre, and other clients in the
+ * future.
+ *
+ *   Author: Oleg Drokin <oleg.drokin@sun.com>
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#ifndef LCLIENT_H
+#define LCLIENT_H
+
+int cl_glimpse_size(struct inode *inode);
+int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io,
+                    struct inode *inode, struct cl_object *clob);
+
+/**
+ * Common IO arguments for various VFS I/O interfaces.
+ */
+struct ccc_io_args {
+        int           cia_is_sendfile;
+#ifndef HAVE_FILE_WRITEV
+        struct kiocb *cia_iocb;
+#endif
+        struct iovec *cia_iov;
+        unsigned long cia_nrsegs;
+        read_actor_t  cia_actor;
+        void         *cia_target;
+};
+
+/**
+ * Locking policy for truncate.
+ */
+enum ccc_trunc_lock_type {
+        /** Locking is done by server */
+        TRUNC_NOLOCK,
+        /** Extent lock is enqueued */
+        TRUNC_EXTENT,
+        /** Existing local extent lock is used */
+        TRUNC_MATCH
+};
+
+/**
+ * IO state private to vvp or slp layers.
+ */
+struct ccc_io {
+        /** super class */
+        struct cl_io_slice     cui_cl;
+        struct cl_io_lock_link cui_link;
+        /**
+         * I/O vector information to or from which read/write is going.
+         */
+        struct iovec *cui_iov;
+        unsigned long cui_nrsegs;
+        /**
+         * Total iov count for the remaining IO.
+         */
+        unsigned long cui_tot_nrsegs;
+        /**
+         * Old length for iov that was truncated partially.
+         */
+        size_t cui_iov_olen;
+        /**
+         * Total size of the remaining IO.
+         */
+        size_t cui_tot_count;
+
+        union {
+                struct {
+                        int                      cui_locks_released;
+                        enum ccc_trunc_lock_type cui_local_lock;
+                } trunc;
+        } u;
+        /**
+         * True iff io is processing glimpse right now.
+         */
+        int                  cui_glimpse;
+        /**
+         * File descriptor against which IO is done.
+         */
+        struct ll_file_data *cui_fd;
+#ifndef HAVE_FILE_WRITEV
+        struct kiocb *cui_iocb;
+#endif
+};
+
+extern struct lu_context_key ccc_key;
+extern struct lu_context_key ccc_session_key;
+
+struct ccc_thread_info {
+        struct cl_lock_descr cti_descr;
+        struct cl_io         cti_io;
+        struct cl_sync_io    cti_sync_io;
+        struct cl_attr       cti_attr;
+};
+
+static inline struct ccc_thread_info *ccc_env_info(const struct lu_env *env)
+{
+        struct ccc_thread_info      *info;
+
+        info = lu_context_key_get(&env->le_ctx, &ccc_key);
+        LASSERT(info != NULL);
+        return info;
+}
+
+struct ccc_session {
+        struct ccc_io cs_ios;
+};
+
+static inline struct ccc_session *ccc_env_session(const struct lu_env *env)
+{
+        struct ccc_session *ses;
+
+        ses = lu_context_key_get(env->le_ses, &ccc_session_key);
+        LASSERT(ses != NULL);
+        return ses;
+}
+
+static inline struct ccc_io *ccc_env_io(const struct lu_env *env)
+{
+        return &ccc_env_session(env)->cs_ios;
+}
+
+/**
+ * ccc-private object state.
+ */
+struct ccc_object {
+        struct cl_object_header cob_header;
+        struct cl_object        cob_cl;
+        struct inode           *cob_inode;
+
+        /**
+         * A list of dirty pages pending IO in the cache. Used by
+         * SOM. Protected by ll_inode_info::lli_lock.
+         *
+         * \see ccc_page::cpg_pending_linkage
+         */
+        struct list_head        cob_pending_list;
+
+        /**
+         * Access to this counter is protected by inode->i_sem. Now that
+         * the lifetime of transient pages must be covered by inode sem,
+         * we don't need to hold any additional lock.
+         */
+        int                     cob_transient_pages;
+        /**
+         * Number of outstanding mmaps on this file.
+         *
+         * \see ll_vm_open(), ll_vm_close().
+         */
+        atomic_t                cob_mmap_cnt;
+};
+
+/**
+ * ccc-private page state.
+ */
+struct ccc_page {
+        struct cl_page_slice cpg_cl;
+        int                  cpg_defer_uptodate;
+        int                  cpg_ra_used;
+        int                  cpg_write_queued;
+        /**
+         * Non-empty iff this page is already counted in
+         * ccc_object::cob_pending_list. Protected by
+         * ll_inode_info::lli_lock. This list is only used as a flag,
+         * that is, never iterated through, only checked for list_empty(), but
+         * having a list is useful for debugging.
+         */
+        struct list_head     cpg_pending_linkage;
+        /** VM page */
+        cfs_page_t          *cpg_page;
+        struct cl_sync_io   *cpg_sync_io;
+        /**
+         * checksum for paranoid I/O debugging enabled by
+         * ENABLE_LLITE_CHECKSUM configuration option.
+         *
+         * XXX This cannot be implemented reliably because checksum cannot be
+         * updated from ->set_page_dirty() that is called without page VM
+         * lock.
+         */
+        __u32                cpg_checksum;
+};
+
+static inline struct ccc_page *cl2ccc_page(const struct cl_page_slice *slice)
+{
+        return container_of(slice, struct ccc_page, cpg_cl);
+}
+
+struct cl_page    *ccc_vmpage_page_transient(cfs_page_t *vmpage);
+
+struct ccc_device {
+        struct cl_device    cdv_cl;
+        struct super_block *cdv_sb;
+        struct cl_device   *cdv_next;
+};
+
+struct ccc_lock {
+        struct cl_lock_slice clk_cl;
+};
+
+struct ccc_req {
+        struct cl_req_slice  crq_cl;
+};
+
+void *ccc_key_init        (const struct lu_context *ctx,
+                           struct lu_context_key *key);
+void  ccc_key_fini        (const struct lu_context *ctx,
+                           struct lu_context_key *key, void *data);
+void *ccc_session_key_init(const struct lu_context *ctx,
+                           struct lu_context_key *key);
+void  ccc_session_key_fini(const struct lu_context *ctx,
+                           struct lu_context_key *key, void *data);
+
+int              ccc_device_init  (const struct lu_env *env,
+                                   struct lu_device *d,
+                                   const char *name, struct lu_device *next);
+struct lu_device *ccc_device_fini (const struct lu_env *env,
+                                   struct lu_device *d);
+struct lu_device *ccc_device_alloc(const struct lu_env *env,
+                                   struct lu_device_type *t,
+                                   struct lustre_cfg *cfg,
+                                   const struct lu_device_operations *luops,
+                                   const struct cl_device_operations *clops);
+struct lu_device *ccc_device_free (const struct lu_env *env,
+                                   struct lu_device *d);
+struct lu_object *ccc_object_alloc(const struct lu_env *env,
+                                   const struct lu_object_header *hdr,
+                                   struct lu_device *dev,
+                                   const struct cl_object_operations *clops,
+                                   const struct lu_object_operations *luops);
+
+int ccc_req_init(const struct lu_env *env, struct cl_device *dev,
+                 struct cl_req *req);
+void ccc_umount(const struct lu_env *env, struct cl_device *dev);
+int ccc_global_init(struct lu_device_type *device_type);
+void ccc_global_fini(struct lu_device_type *device_type);
+int ccc_object_init0(const struct lu_env *env,struct ccc_object *vob,
+                     const struct cl_object_conf *conf);
+int ccc_object_init(const struct lu_env *env, struct lu_object *obj,
+                    const struct lu_object_conf *conf);
+void ccc_object_free(const struct lu_env *env, struct lu_object *obj);
+int ccc_lock_init(const struct lu_env *env, struct cl_object *obj,
+                  struct cl_lock *lock, const struct cl_io *io,
+                  const struct cl_lock_operations *lkops);
+int ccc_attr_set(const struct lu_env *env, struct cl_object *obj,
+                 const struct cl_attr *attr, unsigned valid);
+int ccc_object_glimpse(const struct lu_env *env,
+                       const struct cl_object *obj, struct ost_lvb *lvb);
+int ccc_conf_set(const struct lu_env *env, struct cl_object *obj,
+                 const struct cl_object_conf *conf);
+cfs_page_t *ccc_page_vmpage(const struct lu_env *env,
+                            const struct cl_page_slice *slice);
+int ccc_page_is_under_lock(const struct lu_env *env,
+                           const struct cl_page_slice *slice, struct cl_io *io);
+int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice);
+void ccc_transient_page_verify(const struct cl_page *page);
+void ccc_transient_page_own(const struct lu_env *env,
+                            const struct cl_page_slice *slice,
+                            struct cl_io *io);
+void ccc_transient_page_assume(const struct lu_env *env,
+                               const struct cl_page_slice *slice,
+                               struct cl_io *io);
+void ccc_transient_page_unassume(const struct lu_env *env,
+                                 const struct cl_page_slice *slice,
+                                 struct cl_io *io);
+void ccc_transient_page_disown(const struct lu_env *env,
+                               const struct cl_page_slice *slice,
+                               struct cl_io *io);
+void ccc_transient_page_discard(const struct lu_env *env,
+                                const struct cl_page_slice *slice,
+                                struct cl_io *io);
+int ccc_transient_page_prep(const struct lu_env *env,
+                            const struct cl_page_slice *slice,
+                            struct cl_io *io);
+void ccc_lock_fini(const struct lu_env *env,struct cl_lock_slice *slice);
+int ccc_lock_enqueue(const struct lu_env *env,const struct cl_lock_slice *slice,
+                     struct cl_io *io, __u32 enqflags);
+int ccc_lock_unuse(const struct lu_env *env,const struct cl_lock_slice *slice);
+int ccc_lock_wait(const struct lu_env *env,const struct cl_lock_slice *slice);
+int ccc_lock_fits_into(const struct lu_env *env,
+                       const struct cl_lock_slice *slice,
+                       const struct cl_lock_descr *need,
+                       const struct cl_io *io);
+void ccc_lock_state(const struct lu_env *env,
+                    const struct cl_lock_slice *slice,
+                    enum cl_lock_state state);
+
+void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios);
+int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
+                          __u32 enqflags, enum cl_lock_mode mode,
+                          pgoff_t start, pgoff_t end);
+int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io,
+                    __u32 enqflags, enum cl_lock_mode mode,
+                    loff_t start, loff_t end);
+void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios);
+int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
+                  struct cl_io *io, loff_t pos, int vfslock);
+void ccc_req_completion(const struct lu_env *env,
+                        const struct cl_req_slice *slice, int ioret);
+void ccc_req_attr_set(const struct lu_env *env,const struct cl_req_slice *slice,
+                      const struct cl_object *obj,
+                      struct cl_req_attr *oa, obd_valid flags);
+
+struct lu_device   *ccc2lu_dev      (struct ccc_device *vdv);
+struct lu_object   *ccc2lu          (struct ccc_object *vob);
+struct ccc_device  *lu2ccc_dev      (const struct lu_device *d);
+struct ccc_device  *cl2ccc_dev      (const struct cl_device *d);
+struct ccc_object  *lu2ccc          (const struct lu_object *obj);
+struct ccc_object  *cl2ccc          (const struct cl_object *obj);
+struct ccc_lock    *cl2ccc_lock     (const struct cl_lock_slice *slice);
+struct ccc_io      *cl2ccc_io       (const struct lu_env *env,
+                                     const struct cl_io_slice *slice);
+struct ccc_req     *cl2ccc_req      (const struct cl_req_slice *slice);
+cfs_page_t         *cl2vm_page      (const struct cl_page_slice *slice);
+struct inode       *ccc_object_inode(const struct cl_object *obj);
+struct ccc_object  *cl_inode2ccc    (struct inode *inode);
+
+int cl_setattr_do_truncate(struct inode *inode, loff_t size,
+                           struct obd_capa *capa);
+int cl_setattr_ost(struct inode *inode, struct obd_capa *capa);
+
+struct cl_page *ccc_vmpage_page_transient(cfs_page_t *vmpage);
+int ccc_object_invariant(const struct cl_object *obj);
+int cl_inode_init(struct inode *inode, struct lustre_md *md);
+void cl_inode_fini(struct inode *inode);
+int cl_local_size(struct inode *inode);
+
+#ifdef INVARIANT_CHECK
+# define CLOBINVRNT(env, clob, expr)                                    \
+  do {                                                                  \
+          if (unlikely(!(expr))) {                                      \
+                  LU_OBJECT_DEBUG(D_ERROR, (env), &(clob)->co_lu, #expr "\n"); \
+                  LINVRNT(0);                                           \
+          }                                                             \
+  } while (0)
+#else /* !INVARIANT_CHECK */
+# define CLOBINVRNT(env, clob, expr)                                    \
+        ((void)sizeof(env), (void)sizeof(clob), (void)sizeof !!(expr))
+#endif /* !INVARIANT_CHECK */
+
+
+#endif /* LCLIENT_H */
diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h

index 13b2e26..09bf725 100644 (file)
--- a/lustre/include/liblustre.h
+++ b/lustre/include/liblustre.h
@@ -71,7 +71,7 @@ typedef unsigned short umode_t;
  /*
   * The inter_module_get implementation is specific to liblustre, so this needs
   * to stay here for now.
- */ 
+ */
  static inline void inter_module_put(void *a)
  {
          return;
@@ -251,6 +251,7 @@ struct task_struct {
          int ngroups;
          gid_t *groups;
          __u32 cap_effective;
+        void *journal_info;
  };
  
  
@@ -378,7 +379,7 @@ void *liblustre_register_wait_callback(const char *name,
  void liblustre_deregister_wait_callback(void *notifier);
  int liblustre_wait_event(int timeout);
  
-void *liblustre_register_idle_callback(const char *name, 
+void *liblustre_register_idle_callback(const char *name,
                                         int (*fn)(void *arg), void *arg);
  void liblustre_deregister_idle_callback(void *notifier);
  void liblustre_wait_idle(void);
@@ -484,10 +485,10 @@ void posix_acl_release(struct posix_acl *acl)
  }
  
  #ifdef LIBLUSTRE_POSIX_ACL
-# ifndef posix_acl_xattr_entry 
+# ifndef posix_acl_xattr_entry
  #  define posix_acl_xattr_entry xattr_acl_entry
  # endif
-# ifndef posix_acl_xattr_header 
+# ifndef posix_acl_xattr_header
  #  define posix_acl_xattr_header xattr_acl_header
  # endif
  # ifndef posix_acl_xattr_size
diff --git a/lustre/include/linux/lustre_acl.h b/lustre/include/linux/lustre_acl.h

index f5d07a5..713341e 100644 (file)
--- a/lustre/include/linux/lustre_acl.h
+++ b/lustre/include/linux/lustre_acl.h
@@ -76,7 +76,11 @@
  
  # define LUSTRE_POSIX_ACL_MAX_ENTRIES   (32)
  
+#ifdef __KERNEL__
  # define LUSTRE_POSIX_ACL_MAX_SIZE   XATTR_ACL_SIZE
+#else
+# define LUSTRE_POSIX_ACL_MAX_SIZE   0
+#endif
  
  # else /* CONFIG_FS_POSIX_ACL */
  # define LUSTRE_POSIX_ACL_MAX_SIZE      0
diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h

index 4bd7b0c..7f3f0da 100644 (file)
--- a/lustre/include/linux/lustre_compat25.h
+++ b/lustre/include/linux/lustre_compat25.h
@@ -339,7 +339,7 @@ int filemap_fdatawrite_range(struct address_space *mapping,
  #endif
  
  #ifdef HAVE_VFS_KERN_MOUNT
-static inline 
+static inline
  struct vfsmount *
  ll_kern_mount(const char *fstype, int flags, const char *name, void *data)
  {
@@ -355,45 +355,6 @@ ll_kern_mount(const char *fstype, int flags, const char *name, void *data)
  #define ll_kern_mount(fstype, flags, name, data) do_kern_mount((fstype), (flags), (name), (data))
  #endif
  
-#ifndef HAVE_GENERIC_FILE_READ
-static inline
-ssize_t
-generic_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
-{
-        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
-        struct kiocb kiocb;
-        ssize_t ret;
-
-        init_sync_kiocb(&kiocb, filp);
-        kiocb.ki_pos = *ppos;
-        kiocb.ki_left = len;
-
-        ret = generic_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
-        *ppos = kiocb.ki_pos;
-        return ret;
-}
-#endif
-
-#ifndef HAVE_GENERIC_FILE_WRITE
-static inline
-ssize_t
-generic_file_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
-{
-        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
-        struct kiocb kiocb;
-        ssize_t ret;
-
-        init_sync_kiocb(&kiocb, filp);
-        kiocb.ki_pos = *ppos;
-        kiocb.ki_left = len;
-
-        ret = generic_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
-        *ppos = kiocb.ki_pos;
-
-        return ret;
-}
-#endif
-
  #ifdef HAVE_STATFS_DENTRY_PARAM
  #define ll_do_statfs(sb, sfs) (sb)->s_op->statfs((sb)->s_root, (sfs))
  #else
@@ -426,7 +387,7 @@ static inline u32 get_sb_time_gran(struct super_block *sb)
  #ifdef HAVE_UNREGISTER_BLKDEV_RETURN_INT
  #define ll_unregister_blkdev(a,b)       unregister_blkdev((a),(b))
  #else
-static inline 
+static inline
  int ll_unregister_blkdev(unsigned int dev, const char *name)
  {
          unregister_blkdev(dev, name);
@@ -542,7 +503,7 @@ struct blkcipher_desc {
  
  extern struct ll_crypto_cipher *ll_crypto_alloc_blkcipher(
                              const char * algname, u32 type, u32 mask);
-static inline 
+static inline
  struct ll_crypto_hash *ll_crypto_alloc_hash(const char *alg, u32 type, u32 mask)
  {
          char        buf[CRYPTO_MAX_ALG_NAME + 1];
@@ -568,10 +529,10 @@ static inline int ll_crypto_hash_update(struct hash_desc *desc,
  {
          struct scatterlist *sl = sg;
          unsigned int        count;
-                /* 
+                /*
                   * This way is very weakness. We must ensure that
                   * the sum of sg[0..i]->length isn't greater than nbytes.
-                 * In the upstream kernel the crypto_hash_update() also 
+                 * In the upstream kernel the crypto_hash_update() also
                   * via the nbytes computed the count of sg[...].
                   * The old style is more safely. but it gone.
                   */
@@ -617,7 +578,7 @@ static inline int ll_crypto_hmac(struct crypto_tfm *tfm,
  #define ll_vfs_mknod(dir,entry,mnt,mode,dev)            \
                  vfs_mknod(dir,entry,mnt,mode,dev)
  #define ll_security_inode_unlink(dir,entry,mnt)         \
-                security_inode_unlink(dir,entry,mnt)     
+                security_inode_unlink(dir,entry,mnt)
  #define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \
                  vfs_rename(old,old_dir,mnt,new,new_dir,mnt1)
  #else
@@ -627,7 +588,7 @@ static inline int ll_crypto_hmac(struct crypto_tfm *tfm,
  #define ll_vfs_link(old,mnt,dir,new,mnt1)       vfs_link(old,dir,new)
  #define ll_vfs_unlink(inode,entry,mnt)          vfs_unlink(inode,entry)
  #define ll_vfs_mknod(dir,entry,mnt,mode,dev)    vfs_mknod(dir,entry,mode,dev)
-#define ll_security_inode_unlink(dir,entry,mnt) security_inode_unlink(dir,entry)     
+#define ll_security_inode_unlink(dir,entry,mnt) security_inode_unlink(dir,entry)
  #define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \
                  vfs_rename(old,old_dir,new,new_dir)
  #endif
diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h

index ccb001d..2e21b08 100644 (file)
--- a/lustre/include/linux/obd_support.h
+++ b/lustre/include/linux/obd_support.h
@@ -45,9 +45,12 @@
  #ifndef AUTOCONF_INCLUDED
  #include <linux/config.h>
  #endif
+#include <linux/seq_file.h>
+#include <linux/module.h>
  #include <linux/autoconf.h>
  #include <linux/slab.h>
  #include <linux/highmem.h>
+#include <linux/swap.h>
  #endif
  #include <libcfs/libcfs.h>
  #include <linux/lustre_compat25.h>
diff --git a/lustre/include/lu_object.h b/lustre/include/lu_object.h

index 9dd557f..9c2f283 100644 (file)
--- a/lustre/include/lu_object.h
+++ b/lustre/include/lu_object.h
@@ -143,7 +143,7 @@ struct lu_device_operations {
           *  repeatedly, until no new objects are created.
           *
           * \post ergo(!IS_ERR(result), result->lo_dev == d &&
-         *                                      result->lo_ops != NULL);
+         *                             result->lo_ops != NULL);
           */
          struct lu_object *(*ldo_object_alloc)(const struct lu_env *env,
                                                const struct lu_object_header *h,
@@ -177,7 +177,7 @@ typedef int (*lu_printer_t)(const struct lu_env *env,
                              void *cookie, const char *format, ...)
          __attribute__ ((format (printf, 3, 4)));
  
-/*
+/**
   * Operations specific for particular lu_object.
   */
  struct lu_object_operations {
@@ -247,7 +247,7 @@ struct lu_device {
           *
           * \todo XXX which means that atomic_t is probably too small.
           */
-        atomic_t                     ld_ref;
+        atomic_t                           ld_ref;
          /**
           * Pointer to device type. Never modified once set.
           */
@@ -259,11 +259,11 @@ struct lu_device {
          /**
           * Stack this device belongs to.
           */
-        struct lu_site              *ld_site;
-        struct proc_dir_entry       *ld_proc_entry;
+        struct lu_site                    *ld_site;
+        struct proc_dir_entry             *ld_proc_entry;
  
          /** \todo XXX: temporary back pointer into obd. */
-        struct obd_device           *ld_obd;
+        struct obd_device                 *ld_obd;
          /**
           * A list of references to this object, for debugging.
           */
@@ -292,11 +292,11 @@ struct lu_device_type {
          /**
           * Tag bits. Taken from enum lu_device_tag. Never modified once set.
           */
-        __u32                             ldt_tags;
+        __u32                                   ldt_tags;
          /**
           * Name of this class. Unique system-wide. Never modified once set.
           */
-        char                             *ldt_name;
+        char                                   *ldt_name;
          /**
           * Operations for this type.
           */
@@ -304,11 +304,11 @@ struct lu_device_type {
          /**
           * \todo XXX: temporary pointer to associated obd_type.
           */
-        struct obd_type                  *ldt_obd_type;
+        struct obd_type                        *ldt_obd_type;
          /**
           * \todo XXX: temporary: context tags used by obd_*() calls.
           */
-        __u32                             ldt_ctx_tags;
+        __u32                                   ldt_ctx_tags;
          /**
           * Number of existing device type instances.
           */
@@ -437,34 +437,34 @@ enum la_valid {
          LA_BLKSIZE = 1 << 12,
  };
  
-/*
+/**
   * Layer in the layered object.
   */
  struct lu_object {
-        /*
+        /**
           * Header for this object.
           */
-        struct lu_object_header     *lo_header;
-        /*
+        struct lu_object_header           *lo_header;
+        /**
           * Device for this layer.
           */
-        struct lu_device            *lo_dev;
-        /*
+        struct lu_device                  *lo_dev;
+        /**
           * Operations for this object.
           */
          const struct lu_object_operations *lo_ops;
-        /*
+        /**
           * Linkage into list of all layers.
           */
-        struct list_head             lo_linkage;
-        /*
+        struct list_head                   lo_linkage;
+        /**
           * Depth. Top level layer depth is 0.
           */
-        int                          lo_depth;
-        /*
+        int                                lo_depth;
+        /**
           * Flags from enum lu_object_flags.
           */
-        unsigned long                lo_flags;
+        unsigned long                      lo_flags;
          /**
           * Link to the device, for debugging.
           */
@@ -472,7 +472,7 @@ struct lu_object {
  };
  
  enum lu_object_header_flags {
-        /*
+        /**
           * Don't keep this object in cache. Object will be destroyed as soon
           * as last reference to it is released. This flag cannot be cleared
           * once set.
@@ -483,14 +483,14 @@ enum lu_object_header_flags {
  enum lu_object_header_attr {
          LOHA_EXISTS   = 1 << 0,
          LOHA_REMOTE   = 1 << 1,
-        /*
+        /**
           * UNIX file type is stored in S_IFMT bits.
           */
-        LOHA_FT_START = 1 << 12, /* S_IFIFO */
-        LOHA_FT_END   = 1 << 15, /* S_IFREG */
+        LOHA_FT_START = 1 << 12, /**< S_IFIFO */
+        LOHA_FT_END   = 1 << 15, /**< S_IFREG */
  };
  
-/*
+/**
   * "Compound" object, consisting of multiple layers.
   *
   * Compound object with given fid is unique with given lu_site.
@@ -506,33 +506,33 @@ struct lu_object_header {
           * Object flags from enum lu_object_header_flags. Set and checked
           * atomically.
           */
-        unsigned long     loh_flags;
+        unsigned long       loh_flags;
          /**
           * Object reference count. Protected by lu_site::ls_guard.
           */
-        atomic_t          loh_ref;
+        atomic_t            loh_ref;
          /**
           * Fid, uniquely identifying this object.
           */
-        struct lu_fid     loh_fid;
+        struct lu_fid       loh_fid;
          /**
           * Common object attributes, cached for efficiency. From enum
           * lu_object_header_attr.
           */
-        __u32             loh_attr;
+        __u32               loh_attr;
          /**
           * Linkage into per-site hash table. Protected by lu_site::ls_guard.
           */
-        struct hlist_node loh_hash;
+        struct hlist_node   loh_hash;
          /**
           * Linkage into per-site LRU list. Protected by lu_site::ls_guard.
           */
-        struct list_head  loh_lru;
+        struct list_head    loh_lru;
          /**
           * Linkage into list of layers. Never modified once set (except lately
           * during object destruction). No locking is necessary.
           */
-        struct list_head  loh_layers;
+        struct list_head    loh_layers;
          /**
           * A list of references to this object, for debugging.
           */
@@ -608,6 +608,7 @@ struct lu_site {
           * Top-level device for this stack.
           */
          struct lu_device     *ls_top_dev;
+
          /**
           * Wait-queue signaled when an object in this site is ultimately
           * destroyed (lu_object_free()). It is used by lu_object_find() to
@@ -666,10 +667,10 @@ void lu_device_get        (struct lu_device *d);
  void lu_device_put        (struct lu_device *d);
  int  lu_device_init       (struct lu_device *d, struct lu_device_type *t);
  void lu_device_fini       (struct lu_device *d);
-int lu_object_header_init(struct lu_object_header *h);
+int  lu_object_header_init(struct lu_object_header *h);
  void lu_object_header_fini(struct lu_object_header *h);
  int  lu_object_init       (struct lu_object *o,
-                   struct lu_object_header *h, struct lu_device *d);
+                           struct lu_object_header *h, struct lu_device *d);
  void lu_object_fini       (struct lu_object *o);
  void lu_object_add_top    (struct lu_object_header *h, struct lu_object *o);
  void lu_object_add        (struct lu_object *before, struct lu_object *o);
@@ -801,20 +802,20 @@ int lu_cdebug_printer(const struct lu_env *env,
   * Print object description followed by a user-supplied message.
   */
  #define LU_OBJECT_DEBUG(mask, env, object, format, ...)                 \
-({                                                                      \
+do {                                                                    \
          static DECLARE_LU_CDEBUG_PRINT_INFO(__info, mask);              \
                                                                          \
          if (cdebug_show(mask, DEBUG_SUBSYSTEM)) {                       \
-        lu_object_print(env, &__info, lu_cdebug_printer, object);       \
-        CDEBUG(mask, format , ## __VA_ARGS__);                          \
+                lu_object_print(env, &__info, lu_cdebug_printer, object); \
+                CDEBUG(mask, format , ## __VA_ARGS__);                  \
          }                                                               \
-})
+} while (0)
  
  /**
   * Print short object description followed by a user-supplied message.
   */
  #define LU_OBJECT_HEADER(mask, env, object, format, ...)                \
-({                                                                      \
+do {                                                                    \
          static DECLARE_LU_CDEBUG_PRINT_INFO(__info, mask);              \
                                                                          \
          if (cdebug_show(mask, DEBUG_SUBSYSTEM)) {                       \
@@ -823,10 +824,10 @@ int lu_cdebug_printer(const struct lu_env *env,
                  lu_cdebug_printer(env, &__info, "\n");                  \
                  CDEBUG(mask, format , ## __VA_ARGS__);                  \
          }                                                               \
-})
+} while (0)
  
  void lu_object_print       (const struct lu_env *env, void *cookie,
-                     lu_printer_t printer, const struct lu_object *o);
+                            lu_printer_t printer, const struct lu_object *o);
  void lu_object_header_print(const struct lu_env *env, void *cookie,
                              lu_printer_t printer,
                              const struct lu_object_header *hdr);
@@ -975,6 +976,10 @@ struct lu_context {
           * keys were registered.
           */
          unsigned               lc_version;
+        /**
+         * Debugging cookie.
+         */
+        unsigned               lc_cookie;
  };
  
  /**
@@ -1167,50 +1172,52 @@ void  lu_context_key_revive  (struct lu_context_key *key);
  
  #define LU_KEY_INIT_GENERIC(mod)                                        \
          static void mod##_key_init_generic(struct lu_context_key *k, ...) \
-        {                                                                        \
+        {                                                               \
                  struct lu_context_key *key = k;                         \
-                va_list args;                                                    \
-                                                                                 \
-                va_start(args, k);                                               \
-                do {                                                             \
-                        LU_CONTEXT_KEY_INIT(key);                                \
+                va_list args;                                           \
+                                                                        \
+                va_start(args, k);                                      \
+                do {                                                    \
+                        LU_CONTEXT_KEY_INIT(key);                       \
                          key = va_arg(args, struct lu_context_key *);    \
-                } while (key != NULL);                                           \
-                va_end(args);                                                    \
+                } while (key != NULL);                                  \
+                va_end(args);                                           \
          }
  
-#define LU_TYPE_INIT(mod, ...)                                         \
+#define LU_TYPE_INIT(mod, ...)                                          \
          LU_KEY_INIT_GENERIC(mod)                                        \
-        static int mod##_type_init(struct lu_device_type *t)           \
-        {                                                              \
+        static int mod##_type_init(struct lu_device_type *t)            \
+        {                                                               \
                  mod##_key_init_generic(__VA_ARGS__, NULL);              \
                  return lu_context_key_register_many(__VA_ARGS__, NULL); \
-        }                                                              \
+        }                                                               \
          struct __##mod##_dummy_type_init {;}
  
-#define LU_TYPE_FINI(mod, ...)                                         \
-        static void mod##_type_fini(struct lu_device_type *t)          \
-        {                                                              \
+#define LU_TYPE_FINI(mod, ...)                                          \
+        static void mod##_type_fini(struct lu_device_type *t)           \
+        {                                                               \
                  lu_context_key_degister_many(__VA_ARGS__, NULL);        \
-        }                                                              \
+        }                                                               \
          struct __##mod##_dummy_type_fini {;}
  
  #define LU_TYPE_START(mod, ...)                                 \
          static void mod##_type_start(struct lu_device_type *t)  \
          {                                                       \
+                lu_context_key_revive_many(__VA_ARGS__, NULL);  \
          }                                                       \
          struct __##mod##_dummy_type_start {;}
  
  #define LU_TYPE_STOP(mod, ...)                                  \
          static void mod##_type_stop(struct lu_device_type *t)   \
          {                                                       \
+                lu_context_key_quiesce_many(__VA_ARGS__, NULL); \
          }                                                       \
          struct __##mod##_dummy_type_stop {;}
  
  
  
-#define LU_TYPE_INIT_FINI(mod, ...)                                 \
-        LU_TYPE_INIT(mod, __VA_ARGS__);                             \
+#define LU_TYPE_INIT_FINI(mod, ...)             \
+        LU_TYPE_INIT(mod, __VA_ARGS__);         \
          LU_TYPE_FINI(mod, __VA_ARGS__);         \
          LU_TYPE_START(mod, __VA_ARGS__);        \
          LU_TYPE_STOP(mod, __VA_ARGS__)
@@ -1245,8 +1252,9 @@ struct lu_env {
          struct lu_context *le_ses;
  };
  
-int  lu_env_init(struct lu_env *env, struct lu_context *ses, __u32 tags);
-void lu_env_fini(struct lu_env *env);
+int  lu_env_init  (struct lu_env *env, __u32 tags);
+void lu_env_fini  (struct lu_env *env);
+int  lu_env_refill(struct lu_env *env);
  
  /** @} lu_context */
  
diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h

index ff040ba..63dab1b 100644 (file)
--- a/lustre/include/lustre/lustre_idl.h
+++ b/lustre/include/lustre/lustre_idl.h
@@ -293,7 +293,7 @@ static inline void fid_zero(struct lu_fid *fid)
  /**
   * Check if a fid is igif or not.
   * \param fid the fid to be tested.
- * \return true if the fid is a igif; otherwise false. 
+ * \return true if the fid is a igif; otherwise false.
   */
  static inline int fid_is_igif(const struct lu_fid *fid)
  {
@@ -303,7 +303,7 @@ static inline int fid_is_igif(const struct lu_fid *fid)
  /**
   * Check if a fid is idif or not.
   * \param fid the fid to be tested.
- * \return true if the fid is a idif; otherwise false. 
+ * \return true if the fid is a idif; otherwise false.
   */
  static inline int fid_is_idif(const struct lu_fid *fid)
  {
@@ -324,7 +324,7 @@ static inline ino_t lu_igif_ino(const struct lu_fid *fid)
   * Get inode generation from a igif.
   * \param fid a igif to get inode generation from.
   * \return inode generation for the igif.
- */ 
+ */
  static inline __u32 lu_igif_gen(const struct lu_fid *fid)
  {
          return fid_oid(fid);
@@ -390,7 +390,6 @@ static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src)
   *
   * Variable size, first byte contains the length of the whole record.
   */
-
  struct lu_fid_pack {
          char fp_len;
          char fp_area[sizeof(struct lu_fid)];
@@ -947,7 +946,7 @@ extern void lustre_swab_obd_statfs (struct obd_statfs *os);
  #define OBD_BRW_CHECK           0x10
  #define OBD_BRW_FROM_GRANT      0x20 /* the osc manages this under llite */
  #define OBD_BRW_GRANTED         0x40 /* the ost manages this */
-#define OBD_BRW_DROP            0x80 /* drop the page after IO */
+#define OBD_BRW_NOCACHE         0x80 /* this page is a part of non-cached IO */
  #define OBD_BRW_NOQUOTA        0x100
  #define OBD_BRW_SRVLOCK        0x200 /* Client holds no lock over this page */
  
@@ -976,7 +975,7 @@ extern void lustre_swab_niobuf_remote (struct niobuf_remote *nbr);
  
  /* lock value block communicated between the filter and llite */
  
-/* OST_LVB_ERR_INIT is needed because the return code in rc is 
+/* OST_LVB_ERR_INIT is needed because the return code in rc is
   * negative, i.e. because ((MASK + rc) & MASK) != MASK. */
  #define OST_LVB_ERR_INIT 0xffbadbad80000000ULL
  #define OST_LVB_ERR_MASK 0xffbadbad00000000ULL
diff --git a/lustre/include/lustre_cache.h b/lustre/include/lustre_cache.h

deleted file mode 100644 (file)

index 5bff0a2..0000000
--- a/lustre/include/lustre_cache.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
- *
- * GPL HEADER END
- */
-/*
- * Copyright  2008 Sun Microsystems, Inc. All rights reserved
- * Use is subject to license terms.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- */
-
-#ifndef LUSTRE_CACHE_H
-#define LUSTRE_CACHE_H
-#include <obd.h>
-#include <lustre/lustre_idl.h>
-#include <lustre_dlm.h>
-
-struct lustre_cache;
-struct osc_async_page;
-struct page_removal_cb_element {
-        struct list_head        prce_list;
-        obd_page_removal_cb_t   prce_callback;
-        atomic_t                prce_refcnt;
-};
-
-typedef int (*cache_iterate_extents_cb_t)(struct lustre_cache *,
-                                          struct lustre_handle *,
-                                          struct osc_async_page *,
-                                          void *);
-typedef int (*cache_iterate_locks_cb_t)(struct lustre_cache *,
-                                        struct ldlm_res_id *,
-                                        struct lustre_handle *, void *);
-
-struct lustre_cache {
-        struct list_head         lc_locks_list;
-        spinlock_t               lc_locks_list_lock;
-        struct list_head         lc_page_removal_callback_list;
-        rwlock_t                 lc_page_removal_cb_lock; /* iterate vs modify list */
-        struct obd_device       *lc_obd;
-        obd_pin_extent_cb        lc_pin_extent_cb;
-};
-
-int cache_add_lock(struct lustre_cache *cache, struct lustre_handle *lockh);
-int cache_add_extent(struct lustre_cache *cache, struct ldlm_res_id *res,
-                     struct osc_async_page *extent,
-                     struct lustre_handle *lockh);
-void cache_remove_extent(struct lustre_cache *, struct osc_async_page *);
-int cache_add_extent_removal_cb(struct lustre_cache *cache,
-                                obd_page_removal_cb_t func_cb,
-                                obd_pin_extent_cb pin_cb);
-int cache_del_extent_removal_cb(struct lustre_cache *cache,
-                                obd_page_removal_cb_t func_cb);
-int cache_iterate_extents(struct lustre_cache *cache, struct lustre_handle *lockh,
-                          cache_iterate_extents_cb_t cb_func, void *data);
-int cache_remove_lock(struct lustre_cache *cache, struct lustre_handle *lockh);
-int cache_iterate_locks(struct lustre_cache *cache, struct ldlm_res_id *res,
-                        cache_iterate_locks_cb_t cb_fun, void *data);
-struct lustre_cache *cache_create(struct obd_device *obd);
-int cache_destroy(struct lustre_cache *cache);
-
-
-#endif /* LUSTRE_CACHE_H */
diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h

index 1bd1c53..aa0e358 100644 (file)
--- a/lustre/include/lustre_dlm.h
+++ b/lustre/include/lustre_dlm.h
@@ -261,13 +261,13 @@ struct ldlm_pool_ops {
          int (*po_setup)(struct ldlm_pool *pl, int limit);
  };
  
-/** 
- * One second for pools thread check interval. Each pool has own period. 
+/**
+ * One second for pools thread check interval. Each pool has own period.
   */
  #define LDLM_POOLS_THREAD_PERIOD (1)
  
-/** 
- * 5% margin for modest pools. See ldlm_pool.c for details. 
+/**
+ * 5% margin for modest pools. See ldlm_pool.c for details.
   */
  #define LDLM_POOLS_MODEST_MARGIN (5)
  
@@ -432,7 +432,6 @@ struct ldlm_namespace {
  
          unsigned int           ns_max_unused;
          unsigned int           ns_max_age;
-
           /**
            * Seconds.
            */
@@ -546,7 +545,7 @@ struct ldlm_interval_tree {
  };
  
  struct ldlm_lock {
-        /** 
+        /**
           * Must be first in the structure.
           */
          struct portals_handle    l_handle;
@@ -554,34 +553,34 @@ struct ldlm_lock {
           * Lock reference count.
           */
          atomic_t                 l_refc;
-        /** 
+        /**
           * Internal spinlock protects l_resource.  we should hold this lock
           * first before grabbing res_lock.
           */
          spinlock_t               l_lock;
-        /** 
-         * ldlm_lock_change_resource() can change this. 
+        /**
+         * ldlm_lock_change_resource() can change this.
           */
          struct ldlm_resource    *l_resource;
-        /** 
+        /**
           * Protected by ns_hash_lock. List item for client side lru list.
           */
          struct list_head         l_lru;
-        /** 
-         * Protected by lr_lock, linkage to resource's lock queues. 
+        /**
+         * Protected by lr_lock, linkage to resource's lock queues.
           */
          struct list_head         l_res_link;
-        /** 
-         * Tree node for ldlm_extent. 
+        /**
+         * Tree node for ldlm_extent.
           */
          struct ldlm_interval    *l_tree_node;
-        /** 
+        /**
           * Protected by per-bucket exp->exp_lock_hash locks. Per export hash
           * of locks.
           */
          struct hlist_node        l_exp_hash;
-        /** 
-         * Protected by lr_lock. Requested mode. 
+        /**
+         * Protected by lr_lock. Requested mode.
           */
          ldlm_mode_t              l_req_mode;
          /**
@@ -633,27 +632,27 @@ struct ldlm_lock {
           */
          __u8                  l_destroyed;
  
-        /** 
+        /**
           * If the lock is granted, a process sleeps on this waitq to learn when
           * it's no longer in use.  If the lock is not granted, a process sleeps
-         * on this waitq to learn when it becomes granted. 
+         * on this waitq to learn when it becomes granted.
           */
          cfs_waitq_t           l_waitq;
  
          struct timeval        l_enqueued_time;
  
          /**
-         * Jiffies. Should be converted to time if needed. 
+         * Jiffies. Should be converted to time if needed.
           */
          cfs_time_t            l_last_used;
  
          struct ldlm_extent    l_req_extent;
  
-        /* 
-         * Client-side-only members. 
+        /*
+         * Client-side-only members.
           */
-         
-        /** 
+
+        /**
           * Temporary storage for an LVB received during an enqueue operation.
           */
          __u32                 l_lvb_len;
@@ -666,43 +665,43 @@ struct ldlm_lock {
  
          struct list_head      l_cache_locks_list;
  
-        /* 
-         * Server-side-only members. 
+        /*
+         * Server-side-only members.
           */
  
-        /* connection cookie for the client originated the opeation */
+        /** Connection cookie of the client that originated the operation. */
          __u64                 l_client_cookie;
  
-        /** 
+        /**
           * Protected by elt_lock. Callbacks pending.
           */
          struct list_head      l_pending_chain;
  
          cfs_time_t            l_callback_timeout;
  
-        /** 
-         * Pid which created this lock. 
+        /**
+         * Pid which created this lock.
           */
          __u32                 l_pid;
  
-        /** 
-         * For ldlm_add_ast_work_item(). 
+        /**
+         * For ldlm_add_ast_work_item().
           */
          struct list_head      l_bl_ast;
-        /** 
-         * For ldlm_add_ast_work_item(). 
+        /**
+         * For ldlm_add_ast_work_item().
           */
          struct list_head      l_cp_ast;
-        /** 
-         * For ldlm_add_ast_work_item(). 
+        /**
+         * For ldlm_add_ast_work_item().
           */
          struct list_head      l_rk_ast;
  
          struct ldlm_lock     *l_blocking_lock;
          int                   l_bl_ast_run;
  
-        /** 
-         * Protected by lr_lock, linkages to "skip lists". 
+        /**
+         * Protected by lr_lock, linkages to "skip lists".
           */
          struct list_head      l_sl_mode;
          struct list_head      l_sl_policy;
@@ -871,7 +870,6 @@ void ldlm_lock2handle(const struct ldlm_lock *lock,
                        struct lustre_handle *lockh);
  struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, int flags);
  void ldlm_cancel_callback(struct ldlm_lock *);
-int ldlm_lock_set_data(struct lustre_handle *, void *data);
  int ldlm_lock_remove_from_lru(struct ldlm_lock *);
  
  static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h)
@@ -959,12 +957,10 @@ int  ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode);
  void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode);
  void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode);
  void ldlm_lock_allow_match(struct ldlm_lock *lock);
-int ldlm_lock_fast_match(struct ldlm_lock *, int, obd_off, obd_off, void **);
-void ldlm_lock_fast_release(void *, int);
  ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, int flags,
                              const struct ldlm_res_id *, ldlm_type_t type,
                              ldlm_policy_data_t *, ldlm_mode_t mode,
-                            struct lustre_handle *);
+                            struct lustre_handle *, int unref);
  struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode,
                                          __u32 *flags);
  void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode);
@@ -1016,7 +1012,7 @@ int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *,
                                const struct ldlm_res_id *);
  
  #define LDLM_RESOURCE_ADDREF(res) do {                                  \
-        lu_ref_add(&(res)->lr_reference, __FUNCTION__, cfs_current());  \
+        lu_ref_add_atomic(&(res)->lr_reference, __FUNCTION__, cfs_current());  \
  } while (0)
  
  #define LDLM_RESOURCE_DELREF(res) do {                                  \
diff --git a/lustre/include/lustre_lite.h b/lustre/include/lustre_lite.h

index 439d0dd..34bb9f1 100644 (file)
--- a/lustre/include/lustre_lite.h
+++ b/lustre/include/lustre_lite.h
@@ -165,15 +165,15 @@ static inline int ll_ocd_update(struct obd_device *host,
          RETURN(result);
  }
  
-/*      
+/*
   * Chain of hash overflow pages.
- */            
+ */
  struct ll_dir_chain {
          /* XXX something. Later */
  };
-        
+
  static inline void ll_dir_chain_init(struct ll_dir_chain *chain)
-{       
+{
  }
  
  static inline void ll_dir_chain_fini(struct ll_dir_chain *chain)
diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h

index 34b5831..558930d 100644 (file)
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -213,7 +213,7 @@ union ptlrpc_async_args {
           * a pointer to it here.  The pointer_arg ensures this struct is at
           * least big enough for that. */
          void      *pointer_arg[9];
-        __u64      space[4];
+        __u64      space[5];
  };
  
  struct ptlrpc_request_set;
@@ -803,7 +803,11 @@ enum ptlrpcd_ctl_flags {
           * Ptlrpc thread stop force flag. This will cause also
           * aborting any inflight rpcs handled by thread.
           */
-        LIOD_STOP_FORCE  = 1 << 2
+        LIOD_STOP_FORCE  = 1 << 2,
+        /**
+         * This is a recovery ptlrpc thread.
+         */
+        LIOD_RECOVERY    = 1 << 3
  };
  
  /* ptlrpc/events.c */
@@ -1214,10 +1218,25 @@ void ping_evictor_stop(void);
  int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req);
  
  /* ptlrpc/ptlrpcd.c */
-int ptlrpcd_start(char *name, struct ptlrpcd_ctl *pc);
+
+/**
+ * Ptlrpcd scope is a set of two threads: ptlrpcd-foo and ptlrpcd-foo-rcv,
+ * these threads are used to asynchronously send requests queued with
+ * ptlrpcd_add_req(req, PSCOPE_FOO), and to handle completion call-backs for
+ * such requests. Multiple scopes are needed to avoid dead-locks.
+ */
+enum ptlrpcd_scope {
+        /** Scope of bulk read-write rpcs. */
+        PSCOPE_BRW,
+        /** Everything else. */
+        PSCOPE_OTHER,
+        PSCOPE_NR
+};
+
+int ptlrpcd_start(const char *name, struct ptlrpcd_ctl *pc);
  void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force);
  void ptlrpcd_wake(struct ptlrpc_request *req);
-void ptlrpcd_add_req(struct ptlrpc_request *req);
+void ptlrpcd_add_req(struct ptlrpc_request *req, enum ptlrpcd_scope scope);
  int ptlrpcd_addref(void);
  void ptlrpcd_decref(void);
  
diff --git a/lustre/include/md_object.h b/lustre/include/md_object.h

index 556728d..c2fc95c 100644 (file)
--- a/lustre/include/md_object.h
+++ b/lustre/include/md_object.h
@@ -88,7 +88,9 @@ struct md_ucred {
         struct md_identity *mu_identity;
  };
  
-#define MD_CAPAINFO_MAX 5
+enum {
+        MD_CAPAINFO_MAX = 5
+};
  
  /** there are at most 5 fids in one operation, see rename, NOTE the last one
   * is a temporary one used for is_subdir() */
@@ -335,9 +337,9 @@ struct md_upcall {
  };
  
  struct md_device {
-        struct lu_device             md_lu_dev;
+        struct lu_device                   md_lu_dev;
          const struct md_device_operations *md_ops;
-        struct md_upcall             md_upcall;
+        struct md_upcall                   md_upcall;
  };
  
  static inline void md_upcall_init(struct md_device *m, void *upcl)
@@ -377,7 +379,7 @@ static inline int md_do_upcall(const struct lu_env *env, struct md_device *m,
  }
  
  struct md_object {
-        struct lu_object             mo_lu;
+        struct lu_object                   mo_lu;
          const struct md_object_operations *mo_ops;
          const struct md_dir_operations    *mo_dir_ops;
  };
@@ -454,12 +456,12 @@ static inline struct md_site *lu_site2md(const struct lu_site *s)
  
  static inline int md_device_init(struct md_device *md, struct lu_device_type *t)
  {
-       return lu_device_init(&md->md_lu_dev, t);
+        return lu_device_init(&md->md_lu_dev, t);
  }
  
  static inline void md_device_fini(struct md_device *md)
  {
-       lu_device_fini(&md->md_lu_dev);
+        lu_device_fini(&md->md_lu_dev);
  }
  
  static inline struct md_object *md_object_find_slice(const struct lu_env *env,
diff --git a/lustre/include/obd.h b/lustre/include/obd.h

index b42e2c3..15d4273 100644 (file)
--- a/lustre/include/obd.h
+++ b/lustre/include/obd.h
@@ -175,7 +175,7 @@ struct lov_stripe_md {
  
  struct obd_info;
  
-typedef int (*obd_enqueue_update_f)(struct obd_info *oinfo, int rc);
+typedef int (*obd_enqueue_update_f)(void *cookie, int rc);
  
  /* obd info for a particular level (lov, osc). */
  struct obd_info {
@@ -239,52 +239,6 @@ struct brw_page {
          obd_flag flag;
  };
  
-enum async_flags {
-        ASYNC_READY = 0x1, /* ap_make_ready will not be called before this
-                              page is added to an rpc */
-        ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */
-        ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called
-                                     to give the caller a chance to update
-                                     or cancel the size of the io */
-        ASYNC_GROUP_SYNC = 0x8,  /* ap_completion will not be called, instead
-                                    the page is accounted for in the
-                                    obd_io_group given to
-                                    obd_queue_group_io */
-};
-
-struct obd_async_page_ops {
-        int  (*ap_make_ready)(void *data, int cmd);
-        int  (*ap_refresh_count)(void *data, int cmd);
-        void (*ap_fill_obdo)(void *data, int cmd, struct obdo *oa);
-        void (*ap_update_obdo)(void *data, int cmd, struct obdo *oa,
-                               obd_valid valid);
-        int  (*ap_completion)(void *data, int cmd, struct obdo *oa, int rc);
-        struct obd_capa *(*ap_lookup_capa)(void *data, int cmd);
-};
-
-/* the `oig' is passed down from a caller of obd rw methods.  the callee
- * records enough state such that the caller can sleep on the oig and
- * be woken when all the callees have finished their work */
-struct obd_io_group {
-        spinlock_t      oig_lock;
-        atomic_t        oig_refcount;
-        int             oig_pending;
-        int             oig_rc;
-        struct list_head oig_occ_list;
-        cfs_waitq_t     oig_waitq;
-};
-
-/* the oig callback context lets the callee of obd rw methods register
- * for callbacks from the caller. */
-struct oig_callback_context {
-        struct list_head occ_oig_item;
-        /* called when the caller has received a signal while sleeping.
-         * callees of this method are encouraged to abort their state
-         * in the oig.  This may be called multiple times. */
-        void (*occ_interrupted)(struct oig_callback_context *occ);
-        unsigned long interrupted:1;
-};
-
  /* Individual type definitions */
  
  struct ost_server_data;
@@ -296,11 +250,6 @@ struct obd_device_target {
          struct lustre_quota_ctxt  obt_qctxt;
  };
  
-typedef void (*obd_pin_extent_cb)(void *data);
-typedef int (*obd_page_removal_cb_t)(void *data, int discard);
-typedef int (*obd_lock_cancel_cb)(struct ldlm_lock *,struct ldlm_lock_desc *,
-                                   void *, int);
-
  /* llog contexts */
  enum llog_ctxt_id {
          LLOG_CONFIG_ORIG_CTXT  =  0,
@@ -426,7 +375,6 @@ struct filter_obd {
  
  struct mdc_rpc_lock;
  struct obd_import;
-struct lustre_cache;
  struct client_obd {
          struct rw_semaphore      cl_sem;
          struct obd_uuid          cl_target_uuid;
@@ -448,6 +396,7 @@ struct client_obd {
          /* the grant values are protected by loi_list_lock below */
          long                     cl_dirty;         /* all _dirty_ in bytes */
          long                     cl_dirty_max;     /* allowed w/o rpc */
+        long                     cl_dirty_transit; /* dirty synchronous */
          long                     cl_avail_grant;   /* bytes of credit for ost */
          long                     cl_lost_grant;    /* lost credits (trunc) */
          struct list_head         cl_cache_waiters; /* waiting for cache/grant */
@@ -521,10 +470,6 @@ struct client_obd {
          struct lu_client_seq    *cl_seq;
  
          atomic_t                 cl_resends; /* resend count */
-
-        /* Cache of triples */
-        struct lustre_cache     *cl_cache;
-        obd_lock_cancel_cb       cl_ext_lock_cancel_cb;
  };
  #define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid)
  
@@ -639,6 +584,7 @@ struct echo_client_obd {
          struct obd_export   *ec_exp;   /* the local connection to osc/lov */
          spinlock_t           ec_lock;
          struct list_head     ec_objects;
+        struct list_head     ec_locks;
          int                  ec_nstripes;
          __u64                ec_unique;
  };
@@ -734,9 +680,6 @@ struct lov_obd {
          __u32                   lov_death_row;/* tgts scheduled to be deleted */
          __u32                   lov_tgt_size;   /* size of tgts array */
          int                     lov_connects;
-        obd_page_removal_cb_t   lov_page_removal_cb;
-        obd_pin_extent_cb       lov_page_pin_cb;
-        obd_lock_cancel_cb      lov_lock_cancel_cb;
          int                     lov_pool_count;
          lustre_hash_t          *lov_pools_hash_body; /* used for key access */
          struct list_head        lov_pool_list; /* used for sequential access */
@@ -801,8 +744,10 @@ struct niobuf_local {
  #define LUSTRE_CMM_NAME         "cmm"
  #define LUSTRE_MDD_NAME         "mdd"
  #define LUSTRE_OSD_NAME         "osd"
+#define LUSTRE_VVP_NAME         "vvp"
  #define LUSTRE_LMV_NAME         "lmv"
  #define LUSTRE_CMM_MDC_NAME     "cmm-mdc"
+#define LUSTRE_SLP_NAME         "slp"
  
  /* obd device type names */
   /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */
@@ -1281,47 +1226,6 @@ struct obd_ops {
          int (*o_brw)(int rw, struct obd_export *exp, struct obd_info *oinfo,
                       obd_count oa_bufs, struct brw_page *pgarr,
                       struct obd_trans_info *oti);
-        int (*o_brw_async)(int rw, struct obd_export *exp,
-                           struct obd_info *oinfo, obd_count oa_bufs,
-                           struct brw_page *pgarr, struct obd_trans_info *oti,
-                           struct ptlrpc_request_set *);
-        int (*o_prep_async_page)(struct obd_export *exp,
-                                 struct lov_stripe_md *lsm,
-                                 struct lov_oinfo *loi,
-                                 cfs_page_t *page, obd_off offset,
-                                 struct obd_async_page_ops *ops, void *data,
-                                 void **res, int nocache,
-                                 struct lustre_handle *lockh);
-        int (*o_reget_short_lock)(struct obd_export *exp,
-                                  struct lov_stripe_md *lsm,
-                                  void **res, int rw,
-                                  obd_off start, obd_off end,
-                                  void **cookie);
-        int (*o_release_short_lock)(struct obd_export *exp,
-                                    struct lov_stripe_md *lsm, obd_off end,
-                                    void *cookie, int rw);
-        int (*o_queue_async_io)(struct obd_export *exp,
-                                struct lov_stripe_md *lsm,
-                                struct lov_oinfo *loi, void *cookie,
-                                int cmd, obd_off off, int count,
-                                obd_flag brw_flags, obd_flag async_flags);
-        int (*o_queue_group_io)(struct obd_export *exp,
-                                struct lov_stripe_md *lsm,
-                                struct lov_oinfo *loi,
-                                struct obd_io_group *oig,
-                                void *cookie, int cmd, obd_off off, int count,
-                                obd_flag brw_flags, obd_flag async_flags);
-        int (*o_trigger_group_io)(struct obd_export *exp,
-                                  struct lov_stripe_md *lsm,
-                                  struct lov_oinfo *loi,
-                                  struct obd_io_group *oig);
-        int (*o_set_async_flags)(struct obd_export *exp,
-                                struct lov_stripe_md *lsm,
-                                struct lov_oinfo *loi, void *cookie,
-                                obd_flag async_flags);
-        int (*o_teardown_async_page)(struct obd_export *exp,
-                                     struct lov_stripe_md *lsm,
-                                     struct lov_oinfo *loi, void *cookie);
          int (*o_merge_lvb)(struct obd_export *exp, struct lov_stripe_md *lsm,
                             struct ost_lvb *lvb, int kms_only);
          int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm,
@@ -1355,9 +1259,6 @@ struct obd_ops {
          int (*o_enqueue)(struct obd_export *, struct obd_info *oinfo,
                           struct ldlm_enqueue_info *einfo,
                           struct ptlrpc_request_set *rqset);
-        int (*o_match)(struct obd_export *, struct lov_stripe_md *, __u32 type,
-                       ldlm_policy_data_t *, __u32 mode, int *flags, void *data,
-                       struct lustre_handle *lockh);
          int (*o_change_cbdata)(struct obd_export *, struct lov_stripe_md *,
                                 ldlm_iterator_t it, void *data);
          int (*o_cancel)(struct obd_export *, struct lov_stripe_md *md,
@@ -1396,15 +1297,6 @@ struct obd_ops {
  
          int (*o_ping)(struct obd_export *exp);
  
-        int (*o_register_page_removal_cb)(struct obd_export *exp,
-                                          obd_page_removal_cb_t cb,
-                                          obd_pin_extent_cb pin_cb);
-        int (*o_unregister_page_removal_cb)(struct obd_export *exp,
-                                            obd_page_removal_cb_t cb);
-        int (*o_register_lock_cancel_cb)(struct obd_export *exp,
-                                       obd_lock_cancel_cb cb);
-        int (*o_unregister_lock_cancel_cb)(struct obd_export *exp,
-                                         obd_lock_cancel_cb cb);
          /* pools methods */
          int (*o_pool_new)(struct obd_device *obd, char *poolname);
          int (*o_pool_del)(struct obd_device *obd, char *poolname);
diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h

index 045e3ff..3e885b2 100644 (file)
--- a/lustre/include/obd_class.h
+++ b/lustre/include/obd_class.h
@@ -92,13 +92,6 @@ struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid,
                                             int *next);
  struct obd_device * class_num2obd(int num);
  
-int oig_init(struct obd_io_group **oig);
-int oig_add_one(struct obd_io_group *oig, struct oig_callback_context *occ);
-void oig_complete_one(struct obd_io_group *oig,
-                      struct oig_callback_context *occ, int rc);
-void oig_release(struct obd_io_group *oig);
-int oig_wait(struct obd_io_group *oig);
-
  char *obd_export_nid2str(struct obd_export *exp);
  
  int obd_export_evict_by_nid(struct obd_device *obd, const char *nid);
@@ -107,6 +100,7 @@ int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid);
  int obd_zombie_impexp_init(void);
  void obd_zombie_impexp_stop(void);
  void obd_zombie_impexp_cull(void);
+void obd_zombie_barrier(void);
  
  /* obd_config.c */
  int class_process_config(struct lustre_cfg *lcfg);
@@ -411,26 +405,39 @@ static inline int obd_set_info_async(struct obd_export *exp, obd_count keylen,
          RETURN(rc);
  }
  
-#ifdef __KERNEL__
+/*
+ * obd-lu integration.
+ *
+ * Functionality is being moved into the new lu_device-based layering, but
+ * some pieces of the configuration process are still based on obd devices.
+ *
+ * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully
+ * subsume ->o_setup() methods of obd devices they replace. The same holds for
+ * lu_device_operations::ldo_process_config() and ->o_process_config(). As a
+ * result, obd_setup() and obd_process_config() branch and call one XOR the
+ * other.
+ *
+ * Yet neither lu_device_type_operations::ldto_device_fini() nor
+ * lu_device_type_operations::ldto_device_free() fully implement the
+ * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence,
+ * obd_precleanup() and obd_cleanup() call both lu_device and obd operations.
+ */
+
  #define DECLARE_LU_VARS(ldt, d)                 \
-        struct lu_device_type *ldt;             \
+        struct lu_device_type *ldt;       \
          struct lu_device *d
-#else
-#define DECLARE_LU_VARS(ldt, d)                                 \
-        extern void __placeholder_to_put_a_semicolon(void)
-#endif
+
  static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg)
  {
          int rc;
          DECLARE_LU_VARS(ldt, d);
          ENTRY;
  
-#ifdef __KERNEL__
          ldt = obd->obd_type->typ_lu;
          if (ldt != NULL) {
                  struct lu_env env;
  
-                rc = lu_env_init(&env, NULL, ldt->ldt_ctx_tags);
+                rc = lu_env_init(&env, ldt->ldt_ctx_tags);
                  if (rc == 0) {
                          d = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg);
                          lu_env_fini(&env);
@@ -441,9 +448,7 @@ static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg)
                          } else
                                  rc = PTR_ERR(d);
                  }
-        } else
-#endif
-        {
+        } else {
                  OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP);
                  OBD_COUNTER_INCREMENT(obd, setup);
                  rc = OBP(obd, setup)(obd, cfg);
@@ -459,29 +464,23 @@ static inline int obd_precleanup(struct obd_device *obd,
          ENTRY;
  
          OBD_CHECK_DEV(obd);
-#ifdef __KERNEL__
          ldt = obd->obd_type->typ_lu;
          d = obd->obd_lu_dev;
          if (ldt != NULL && d != NULL) {
                  if (cleanup_stage == OBD_CLEANUP_EXPORTS) {
                          struct lu_env env;
  
-                        rc = lu_env_init(&env, NULL, ldt->ldt_ctx_tags);
+                        rc = lu_env_init(&env, ldt->ldt_ctx_tags);
                          if (rc == 0) {
                                  ldt->ldt_ops->ldto_device_fini(&env, d);
                                  lu_env_fini(&env);
                          }
-                } else {
-                        rc = 0;
                  }
-        } else
-#endif
-        {
-                OBD_CHECK_DT_OP(obd, precleanup, 0);
-                rc = OBP(obd, precleanup)(obd, cleanup_stage);
          }
-
+        OBD_CHECK_DT_OP(obd, precleanup, 0);
          OBD_COUNTER_INCREMENT(obd, precleanup);
+
+        rc = OBP(obd, precleanup)(obd, cleanup_stage);
          RETURN(rc);
  }
  
@@ -493,25 +492,22 @@ static inline int obd_cleanup(struct obd_device *obd)
  
          OBD_CHECK_DEV(obd);
  
-#ifdef __KERNEL__
          ldt = obd->obd_type->typ_lu;
          d = obd->obd_lu_dev;
          if (ldt != NULL && d != NULL) {
                  struct lu_env env;
  
-                rc = lu_env_init(&env, NULL, ldt->ldt_ctx_tags);
+                rc = lu_env_init(&env, ldt->ldt_ctx_tags);
                  if (rc == 0) {
                          ldt->ldt_ops->ldto_device_free(&env, d);
                          lu_env_fini(&env);
                          obd->obd_lu_dev = NULL;
                  }
-        } else
-#endif
-        {
-                OBD_CHECK_DT_OP(obd, cleanup, 0);
-                rc = OBP(obd, cleanup)(obd);
          }
+        OBD_CHECK_DT_OP(obd, cleanup, 0);
          OBD_COUNTER_INCREMENT(obd, cleanup);
+
+        rc = OBP(obd, cleanup)(obd);
          RETURN(rc);
  }
  
@@ -524,20 +520,17 @@ obd_process_config(struct obd_device *obd, int datalen, void *data)
  
          OBD_CHECK_DEV(obd);
  
-#ifdef __KERNEL__
          ldt = obd->obd_type->typ_lu;
          d = obd->obd_lu_dev;
          if (ldt != NULL && d != NULL) {
                  struct lu_env env;
  
-                rc = lu_env_init(&env, NULL, ldt->ldt_ctx_tags);
+                rc = lu_env_init(&env, ldt->ldt_ctx_tags);
                  if (rc == 0) {
                          rc = d->ld_ops->ldo_process_config(&env, d, data);
                          lu_env_fini(&env);
                  }
-        } else
-#endif
-        {
+        } else {
                  OBD_CHECK_DT_OP(obd, process_config, -EOPNOTSUPP);
                  rc = OBP(obd, process_config)(obd, datalen, data);
          }
@@ -624,9 +617,13 @@ static inline int obd_alloc_memmd(struct obd_export *exp,
  static inline int obd_free_memmd(struct obd_export *exp,
                                   struct lov_stripe_md **mem_tgt)
  {
+        int rc;
+
          LASSERT(mem_tgt);
          LASSERT(*mem_tgt);
-        return obd_unpackmd(exp, mem_tgt, NULL, 0);
+        rc = obd_unpackmd(exp, mem_tgt, NULL, 0);
+        *mem_tgt = NULL;
+        return rc;
  }
  
  static inline int obd_checkmd(struct obd_export *exp,
@@ -811,10 +808,8 @@ static inline int obd_connect(const struct lu_env *env,
                                void *localdata)
  {
          int rc;
-#ifdef LIBCFS_DEBUG
          __u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition
                                                     * check */
-#endif
          ENTRY;
  
          OBD_CHECK_DEV_ACTIVE(obd);
@@ -836,10 +831,8 @@ static inline int obd_reconnect(const struct lu_env *env,
                                  void *localdata)
  {
          int rc;
-#ifdef LIBCFS_DEBUG
          __u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition
                                                     * check */
-#endif
  
          ENTRY;
  
@@ -1200,230 +1193,6 @@ static inline int obd_brw(int cmd, struct obd_export *exp,
          RETURN(rc);
  }
  
-static inline int obd_brw_async(int cmd, struct obd_export *exp,
-                                struct obd_info *oinfo, obd_count oa_bufs,
-                                struct brw_page *pg, struct obd_trans_info *oti,
-                                struct ptlrpc_request_set *set)
-{
-        int rc;
-        ENTRY;
-
-        EXP_CHECK_DT_OP(exp, brw_async);
-        EXP_COUNTER_INCREMENT(exp, brw_async);
-
-        if (!(cmd & OBD_BRW_RWMASK)) {
-                CERROR("obd_brw: cmd must be OBD_BRW_READ or OBD_BRW_WRITE\n");
-                LBUG();
-        }
-
-        rc = OBP(exp->exp_obd, brw_async)(cmd, exp, oinfo, oa_bufs, pg,oti,set);
-        RETURN(rc);
-}
-
-static inline int obd_brw_rqset(int cmd, struct obd_export *exp,
-                                struct obdo *oa, struct lov_stripe_md *lsm,
-                                obd_count oa_bufs, struct brw_page *pg,
-                                struct obd_trans_info *oti,
-                                struct obd_capa *ocapa)
-{
-        struct ptlrpc_request_set *set = NULL;
-        struct obd_info oinfo = { { { 0 } } };
-        int rc = 0;
-        ENTRY;
-
-        set =  ptlrpc_prep_set();
-        if (set == NULL)
-                RETURN(-ENOMEM);
-
-        oinfo.oi_oa = oa;
-        oinfo.oi_md = lsm;
-        oinfo.oi_capa = ocapa;
-        rc = obd_brw_async(cmd, exp, &oinfo, oa_bufs, pg, oti, set);
-        if (rc == 0) {
-                rc = ptlrpc_set_wait(set);
-                if (rc)
-                        CERROR("error from callback: rc = %d\n", rc);
-        } else {
-                CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR,
-                       "error from obd_brw_async: rc = %d\n", rc);
-        }
-        ptlrpc_set_destroy(set);
-        RETURN(rc);
-}
-
-static inline  int obd_prep_async_page(struct obd_export *exp,
-                                       struct lov_stripe_md *lsm,
-                                       struct lov_oinfo *loi,
-                                       cfs_page_t *page, obd_off offset,
-                                       struct obd_async_page_ops *ops,
-                                       void *data, void **res, int nocache,
-                                       struct lustre_handle *lockh)
-{
-        int ret;
-        ENTRY;
-
-        EXP_CHECK_DT_OP(exp, prep_async_page);
-        EXP_COUNTER_INCREMENT(exp, prep_async_page);
-
-        ret = OBP(exp->exp_obd, prep_async_page)(exp, lsm, loi, page, offset,
-                                                 ops, data, res, nocache,
-                                                 lockh);
-        RETURN(ret);
-}
-
-/**
- * Checks if requested extent lock is compatible with a lock under the page.
- *
- * Checks if the lock under \a page is compatible with a read or write lock
- * (specified by \a rw) for an extent [\a start , \a end].
- *
- * \param exp obd export (lov or osc)
- * \param lsm striping information for the file
- * \param res async_page placeholder
- * \param rw OBD_BRW_READ if requested for reading,
- *           OBD_BRW_WRITE if requested for writing
- * \param start start of the requested extent
- * \param end end of the requested extent
- * \param cookie transparent parameter for passing locking context
- *
- * \post result == 1, *cookie == context, appropriate lock is referenced or
- *
- * \retval 1 owned lock is reused for the request
- * \retval 0 no lock reused for the request
- * \retval -ENOTSUPP reget_short_lock is not exported at this layer
- *
- * \see obd_release_short_lock
- */
-static inline int obd_reget_short_lock(struct obd_export *exp,
-                                       struct lov_stripe_md *lsm,
-                                       void **res, int rw,
-                                       obd_off start, obd_off end,
-                                       void **cookie)
-{
-        ENTRY;
-
-        EXP_CHECK_DT_OP(exp, reget_short_lock);
-        EXP_COUNTER_INCREMENT(exp, reget_short_lock);
-
-        RETURN(OBP(exp->exp_obd, reget_short_lock)(exp, lsm, res, rw,
-                                                   start, end, cookie));
-}
-
-
-/**
- * Releases a reference to a lock taken in a "fast" way.
- *
- * Releases a read or write (specified by \a rw) lock
- * referenced by \a cookie.
- *
- * \param exp obd export (lov or osc)
- * \param lsm striping information for the file
- * \param end end of the locked extent
- * \param rw OBD_BRW_READ if requested for reading,
- *           OBD_BRW_WRITE if requested for writing
- * \param cookie transparent parameter for passing locking context
- *
- * \post appropriate lock is dereferenced
- *
- * \see obd_reget_short_lock
- */
-static inline int obd_release_short_lock(struct obd_export *exp,
-                                         struct lov_stripe_md *lsm, obd_off end,
-                                         void *cookie, int rw)
-{
-        ENTRY;
-
-        EXP_CHECK_DT_OP(exp, release_short_lock);
-        EXP_COUNTER_INCREMENT(exp, release_short_lock);
-
-        RETURN(OBP(exp->exp_obd, release_short_lock)(exp, lsm, end,
-                                                     cookie, rw));
-}
-
-static inline int obd_queue_async_io(struct obd_export *exp,
-                                     struct lov_stripe_md *lsm,
-                                     struct lov_oinfo *loi, void *cookie,
-                                     int cmd, obd_off off, int count,
-                                     obd_flag brw_flags, obd_flag async_flags)
-{
-        int rc;
-        ENTRY;
-
-        EXP_CHECK_DT_OP(exp, queue_async_io);
-        EXP_COUNTER_INCREMENT(exp, queue_async_io);
-        LASSERT(cmd & OBD_BRW_RWMASK);
-
-        rc = OBP(exp->exp_obd, queue_async_io)(exp, lsm, loi, cookie, cmd, off,
-                                               count, brw_flags, async_flags);
-        RETURN(rc);
-}
-
-static inline int obd_set_async_flags(struct obd_export *exp,
-                                      struct lov_stripe_md *lsm,
-                                      struct lov_oinfo *loi, void *cookie,
-                                      obd_flag async_flags)
-{
-        int rc;
-        ENTRY;
-
-        EXP_CHECK_DT_OP(exp, set_async_flags);
-        EXP_COUNTER_INCREMENT(exp, set_async_flags);
-
-        rc = OBP(exp->exp_obd, set_async_flags)(exp, lsm, loi, cookie,
-                                                async_flags);
-        RETURN(rc);
-}
-
-static inline int obd_queue_group_io(struct obd_export *exp,
-                                     struct lov_stripe_md *lsm,
-                                     struct lov_oinfo *loi,
-                                     struct obd_io_group *oig,
-                                     void *cookie, int cmd, obd_off off,
-                                     int count, obd_flag brw_flags,
-                                     obd_flag async_flags)
-{
-        int rc;
-        ENTRY;
-
-        EXP_CHECK_DT_OP(exp, queue_group_io);
-        EXP_COUNTER_INCREMENT(exp, queue_group_io);
-        LASSERT(cmd & OBD_BRW_RWMASK);
-
-        rc = OBP(exp->exp_obd, queue_group_io)(exp, lsm, loi, oig, cookie,
-                                               cmd, off, count, brw_flags,
-                                               async_flags);
-        RETURN(rc);
-}
-
-static inline int obd_trigger_group_io(struct obd_export *exp,
-                                       struct lov_stripe_md *lsm,
-                                       struct lov_oinfo *loi,
-                                       struct obd_io_group *oig)
-{
-        int rc;
-        ENTRY;
-
-        EXP_CHECK_DT_OP(exp, trigger_group_io);
-        EXP_COUNTER_INCREMENT(exp, trigger_group_io);
-
-        rc = OBP(exp->exp_obd, trigger_group_io)(exp, lsm, loi, oig);
-        RETURN(rc);
-}
-
-static inline int obd_teardown_async_page(struct obd_export *exp,
-                                          struct lov_stripe_md *lsm,
-                                          struct lov_oinfo *loi, void *cookie)
-{
-        int rc;
-        ENTRY;
-
-        EXP_CHECK_DT_OP(exp, teardown_async_page);
-        EXP_COUNTER_INCREMENT(exp, teardown_async_page);
-
-        rc = OBP(exp->exp_obd, teardown_async_page)(exp, lsm, loi, cookie);
-        RETURN(rc);
-}
-
  static inline int obd_preprw(int cmd, struct obd_export *exp, struct obdo *oa,
                               int objcount, struct obd_ioobj *obj,
                               struct niobuf_remote *remote, int *pages,
@@ -1536,21 +1305,6 @@ static inline int obd_enqueue(struct obd_export *exp,
          RETURN(rc);
  }
  
-static inline int obd_match(struct obd_export *exp, struct lov_stripe_md *ea,
-                            __u32 type, ldlm_policy_data_t *policy, __u32 mode,
-                            int *flags, void *data, struct lustre_handle *lockh)
-{
-        int rc;
-        ENTRY;
-
-        EXP_CHECK_DT_OP(exp, match);
-        EXP_COUNTER_INCREMENT(exp, match);
-
-        rc = OBP(exp->exp_obd, match)(exp, ea, type, policy, mode, flags, data,
-                                      lockh);
-        RETURN(rc);
-}
-
  static inline int obd_change_cbdata(struct obd_export *exp,
                                      struct lov_stripe_md *lsm,
                                      ldlm_iterator_t it, void *data)
@@ -1769,6 +1523,7 @@ static inline int obd_register_observer(struct obd_device *obd,
          RETURN(0);
  }
  
+#if 0
  static inline int obd_register_page_removal_cb(struct obd_export *exp,
                                                 obd_page_removal_cb_t cb,
                                                 obd_pin_extent_cb pin_cb)
@@ -1821,6 +1576,7 @@ static inline int obd_unregister_lock_cancel_cb(struct obd_export *exp,
          rc = OBP(exp->exp_obd, unregister_lock_cancel_cb)(exp, cb);
          RETURN(rc);
  }
+#endif
  
  /* metadata helpers */
  static inline int md_getstatus(struct obd_export *exp,
diff --git a/lustre/include/obd_ost.h b/lustre/include/obd_ost.h

index e72400a..b849ff0 100644 (file)
--- a/lustre/include/obd_ost.h
+++ b/lustre/include/obd_ost.h
@@ -36,7 +36,7 @@
   * lustre/include/obd_ost.h
   *
   * Data structures for object storage targets and client: OST & OSC's
- * 
+ *
   * See also lustre_idl.h for wire formats of requests.
   */
  
@@ -54,21 +54,34 @@ struct osc_brw_async_args {
          struct brw_page  **aa_ppga;
          struct client_obd *aa_cli;
          struct list_head   aa_oaps;
+        struct cl_req     *aa_clerq;
  };
  
  struct osc_async_args {
          struct obd_info   *aa_oi;
  };
  
+struct osc_punch_args {
+        struct obdo         *pa_oa;
+        obd_enqueue_update_f pa_upcall;
+        void                *pa_cookie;
+};
+
  struct osc_enqueue_args {
-        struct obd_export       *oa_exp;
-        struct obd_info         *oa_oi;
-        struct ldlm_enqueue_info*oa_ei;
+        struct obd_export        *oa_exp;
+        int                      *oa_flags;
+        obd_enqueue_update_f      oa_upcall;
+        void                     *oa_cookie;
+        struct ost_lvb           *oa_lvb;
+        struct lustre_handle     *oa_lockh;
+        struct ldlm_enqueue_info *oa_ei;
  };
  
+#if 0
  int osc_extent_blocking_cb(struct ldlm_lock *lock,
                             struct ldlm_lock_desc *new, void *data,
                             int flag);
+#endif
  
  /** 
   * Build DLM resource name from object id & group for osc-ost extent lock.
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h

index 2a8389d..25ab060 100644 (file)
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -77,6 +77,7 @@ extern unsigned int ldlm_timeout;         /* seconds */
  extern unsigned int obd_sync_filter;
  extern unsigned int obd_max_dirty_pages;
  extern atomic_t obd_dirty_pages;
+extern atomic_t obd_dirty_transit_pages;
  extern cfs_waitq_t obd_race_waitq;
  extern int obd_race_state;
  extern unsigned int obd_alloc_fail_rate;
diff --git a/lustre/kernel_patches/patches/2.6-rhel4-kgdb-ga.patch b/lustre/kernel_patches/patches/2.6-rhel4-kgdb-ga.patch

index ceaaa20..b88a2bd 100644 (file)
--- a/lustre/kernel_patches/patches/2.6-rhel4-kgdb-ga.patch
+++ b/lustre/kernel_patches/patches/2.6-rhel4-kgdb-ga.patch
@@ -1573,7 +1573,7 @@ Index: linux/MAINTAINERS
  ===================================================================
  --- linux.orig/MAINTAINERS
  +++ linux/MAINTAINERS
-@@ -1242,6 +1242,12 @@ W:      http://sf.net/projects/kernel-janitor
+@@ -1247,6 +1247,12 @@ W:      http://sf.net/projects/kernel-janitor
   W:    http://developer.osdl.org/rddunlap/kj-patches/
   S:    Maintained
   
@@ -1590,7 +1590,7 @@ Index: linux/arch/i386/Kconfig
  ===================================================================
  --- linux.orig/arch/i386/Kconfig
  +++ linux/arch/i386/Kconfig
-@@ -1250,6 +1250,14 @@ menu "Executable file formats"
+@@ -1292,6 +1292,14 @@ menu "Executable file formats"
   
   source "fs/Kconfig.binfmt"
   
@@ -1800,7 +1800,7 @@ Index: linux/arch/i386/Makefile
  ===================================================================
  --- linux.orig/arch/i386/Makefile
  +++ linux/arch/i386/Makefile
-@@ -98,6 +98,9 @@ core-$(CONFIG_X86_ES7000)    := arch/i386/m
+@@ -102,6 +102,9 @@ core-$(CONFIG_X86_ES7000)  := arch/i386/m
   # default subarch .h files
   mflags-y += -Iinclude/asm-i386/mach-default
   
@@ -1855,7 +1855,7 @@ Index: linux/arch/i386/kernel/entry.S
   
   #define __RESTORE_INT_REGS \
         popl %ebx;      \
-@@ -357,6 +370,7 @@ need_resched:
+@@ -361,6 +374,7 @@ need_resched:
         # sysenter call handler stub
   ENTRY(sysenter_entry)
         movl TSS_sysenter_esp0(%esp),%esp
@@ -1863,7 +1863,7 @@ Index: linux/arch/i386/kernel/entry.S
   sysenter_past_esp:
         sti
         pushl $(__USER_DS)
-@@ -437,6 +451,19 @@ syscall_exit:
+@@ -441,6 +455,19 @@ syscall_exit:
         testw $_TIF_ALLWORK_MASK, %cx   # current->work
         jne syscall_exit_work
   restore_all:
@@ -4382,7 +4382,7 @@ Index: linux/arch/i386/kernel/traps.c
                 if (notify_die(DIE_GPF, "general protection fault", regs,
                                 error_code, 13, SIGSEGV) == NOTIFY_STOP)
                         return;
-@@ -829,8 +874,18 @@ asmlinkage void do_debug(struct pt_regs 
+@@ -835,8 +880,18 @@ asmlinkage void do_debug(struct pt_regs 
                  * allowing programs to debug themselves without the ptrace()
                  * interface.
                  */
@@ -4401,7 +4401,7 @@ Index: linux/arch/i386/kernel/traps.c
                 if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE)
                         goto clear_TF;
         }
-@@ -842,6 +897,17 @@ asmlinkage void do_debug(struct pt_regs 
+@@ -848,6 +903,17 @@ asmlinkage void do_debug(struct pt_regs 
         info.si_errno = 0;
         info.si_code = TRAP_BRKPT;
         
@@ -4419,7 +4419,7 @@ Index: linux/arch/i386/kernel/traps.c
         /* If this is a kernel mode trap, save the user PC on entry to 
          * the kernel, that's what the debugger can make sense of.
          */
-@@ -856,6 +922,7 @@ clear_dr7:
+@@ -862,6 +928,7 @@ clear_dr7:
         __asm__("movl %0,%%db7"
                 : /* no output */
                 : "r" (0));
@@ -4427,7 +4427,7 @@ Index: linux/arch/i386/kernel/traps.c
         return;
   
   debug_vm86:
-@@ -1151,6 +1218,12 @@ static void __init set_task_gate(unsigne
+@@ -1157,6 +1224,12 @@ static void __init set_task_gate(unsigne
   {
         _set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
   }
@@ -4440,7 +4440,7 @@ Index: linux/arch/i386/kernel/traps.c
   
   
   void __init trap_init(void)
-@@ -1169,7 +1242,11 @@ void __init trap_init(void)
+@@ -1175,7 +1248,11 @@ void __init trap_init(void)
         set_trap_gate(0,&divide_error);
         set_intr_gate(1,&debug);
         set_intr_gate(2,&nmi);
@@ -5051,7 +5051,7 @@ Index: linux/drivers/serial/8250.c
  ===================================================================
  --- linux.orig/drivers/serial/8250.c
  +++ linux/drivers/serial/8250.c
-@@ -880,7 +880,7 @@ receive_chars(struct uart_8250_port *up,
+@@ -882,7 +882,7 @@ receive_chars(struct uart_8250_port *up,
                 if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) {
                         tty->flip.work.func((void *)tty);
                         if (tty->flip.count >= TTY_FLIPBUF_SIZE)
@@ -5060,7 +5060,7 @@ Index: linux/drivers/serial/8250.c
                 }
                 ch = serial_inp(up, UART_RX);
                 *tty->flip.char_buf_ptr = ch;
-@@ -1241,12 +1241,21 @@ static void serial8250_break_ctl(struct 
+@@ -1245,12 +1245,21 @@ static void serial8250_break_ctl(struct 
         spin_unlock_irqrestore(&up->port.lock, flags);
   }
   
@@ -5082,7 +5082,7 @@ Index: linux/drivers/serial/8250.c
         up->capabilities = uart_config[up->port.type].flags;
         up->mcr = 0;
   
-@@ -1877,6 +1886,10 @@ static void __init serial8250_register_p
+@@ -1881,6 +1890,10 @@ static void __init serial8250_register_p
         for (i = 0; i < UART_NR; i++) {
                 struct uart_8250_port *up = &serial8250_ports[i];
   
@@ -5093,7 +5093,7 @@ Index: linux/drivers/serial/8250.c
                 up->port.line = i;
                 up->port.ops = &serial8250_pops;
                 init_timer(&up->timer);
-@@ -2160,6 +2173,31 @@ void serial8250_resume_port(int line)
+@@ -2181,6 +2194,31 @@ void serial8250_resume_port(int line)
         uart_resume_port(&serial8250_reg, &serial8250_ports[line].port);
   }
   
@@ -6317,20 +6317,12 @@ Index: linux/include/linux/spinlock.h
  +                SET_WHO(x, current)       \
         } while (0)
   
- /* without debugging, spin_is_locked on UP always says
-@@ -151,6 +162,7 @@ typedef struct {
-               (x)->lock = 1; \
-               (x)->owner = __FILE__; \
-               (x)->oline = __LINE__; \
-+                SET_WHO(x, current)       \
-               1; \
-       })
- 
+ #define spin_is_locked(x) \
  Index: linux/kernel/pid.c
  ===================================================================
  --- linux.orig/kernel/pid.c
  +++ linux/kernel/pid.c
-@@ -276,6 +276,9 @@ int pid_alive(struct task_struct *p)
+@@ -320,6 +320,9 @@ struct pid *find_ge_pid(int nr)
    * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
    * more.
    */
@@ -6340,7 +6332,7 @@ Index: linux/kernel/pid.c
   void __init pidhash_init(void)
   {
         int i, j, pidhash_size;
-@@ -297,6 +300,9 @@ void __init pidhash_init(void)
+@@ -341,6 +344,9 @@ void __init pidhash_init(void)
                 for (j = 0; j < pidhash_size; j++)
                         INIT_HLIST_HEAD(&pid_hash[i][j]);
         }
@@ -6354,7 +6346,7 @@ Index: linux/kernel/sched.c
  ===================================================================
  --- linux.orig/kernel/sched.c
  +++ linux/kernel/sched.c
-@@ -3190,6 +3190,13 @@ out_unlock:
+@@ -3207,6 +3207,13 @@ out_unlock:
   
   EXPORT_SYMBOL(set_user_nice);
   
diff --git a/lustre/kernel_patches/patches/2.6-rhel5-kgdb-ga.patch b/lustre/kernel_patches/patches/2.6-rhel5-kgdb-ga.patch

new file mode 100644 (file)

index 0000000..6e38859
--- /dev/null
+++ b/lustre/kernel_patches/patches/2.6-rhel5-kgdb-ga.patch
@@ -0,0 +1,19200 @@
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/Documentation/DocBook/Makefile linux-2.6.18-53.1.14.kgdb/Documentation/DocBook/Makefile
+--- linux-2.6.18-53.1.14/Documentation/DocBook/Makefile        2008-03-06 05:54:50.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/Documentation/DocBook/Makefile   2008-06-10 15:37:25.000000000 +0400
+@@ -11,7 +11,7 @@ DOCBOOKS := wanbook.xml z8530book.xml mc
+           procfs-guide.xml writing_usb_driver.xml \
+           kernel-api.xml journal-api.xml lsm.xml utrace.xml usb.xml \
+           gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
+-          genericirq.xml
++          genericirq.xml kgdb.xml
+ 
+ ###
+ # The build process is as follows (targets):
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/Documentation/DocBook/kgdb.tmpl linux-2.6.18-53.1.14.kgdb/Documentation/DocBook/kgdb.tmpl
+--- linux-2.6.18-53.1.14/Documentation/DocBook/kgdb.tmpl       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/Documentation/DocBook/kgdb.tmpl  2008-06-10 15:38:50.000000000 +0400
+@@ -0,0 +1,250 @@
++<?xml version="1.0" encoding="UTF-8"?>
++<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
++      "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
++
++<book id="kgdbInternals">
++ <bookinfo>
++  <title>KGDB Internals</title>
++
++  <authorgroup>
++   <author>
++    <firstname>Tom</firstname>
++    <surname>Rini</surname>
++    <affiliation>
++     <address>
++      <email>trini@kernel.crashing.org</email>
++     </address>
++    </affiliation>
++   </author>
++  </authorgroup>
++
++  <authorgroup>
++   <author>
++    <firstname>Amit S.</firstname>
++    <surname>Kale</surname>
++    <affiliation>
++     <address>
++      <email>amitkale@linsyssoft.com</email>
++     </address>
++    </affiliation>
++   </author>
++  </authorgroup>
++
++  <copyright>
++   <year>2004-2005</year>
++   <holder>MontaVista Software, Inc.</holder>
++  </copyright>
++  <copyright>
++   <year>2004</year>
++   <holder>Amit S. Kale</holder>
++  </copyright>
++
++  <legalnotice>
++   <para>
++   This file is licensed under the terms of the GNU General Public License
++   version 2. This program is licensed "as is" without any warranty of any
++   kind, whether express or implied.
++   </para>
++
++  </legalnotice>
++ </bookinfo>
++
++<toc></toc>
++  <chapter id="Introduction">
++    <title>Introduction</title>
++    <para>
++    kgdb is a source level debugger for the Linux kernel. It is used along
++    with gdb to debug a Linux kernel. Kernel developers can debug a kernel
++    similar to application programs with the use of kgdb. It makes it
++    possible to place breakpoints in kernel code, step through the code
++    and observe variables.
++    </para>
++    <para>
++    Two machines are required for using kgdb. One of these machines is a
++    development machine and the other is a test machine. The machines are
++    typically connected through a serial line, a null-modem cable which
++    connects their serial ports.  It is also possible, however, to use an
++    ethernet connection between the machines.  The kernel to be debugged
++    runs on the test machine. gdb runs on the development machine. The
++    serial line or ethernet connection is used by gdb to communicate to
++    the kernel being debugged.
++    </para>
++  </chapter>
++  <chapter id="CompilingAKernel">
++    <title>Compiling a kernel</title>
++    <para>
++    To enable <symbol>CONFIG_KGDB</symbol>, look under "Kernel debugging"
++    and then select "KGDB: kernel debugging with remote gdb".
++    </para>
++    <para>
++    The first choice for I/O is <symbol>CONFIG_KGDB_ONLY_MODULES</symbol>.
++    This means that you will only be able to use KGDB after loading a
++    kernel module that defines how you want to be able to talk with
++    KGDB.  There are two other choices (more on some architectures) that
++    can be enabled as modules later, if not picked here.
++    </para>
++    <para>The first of these is <symbol>CONFIG_KGDB_8250_NOMODULE</symbol>.
++    This has sub-options such as <symbol>CONFIG_KGDB_SIMPLE_SERIAL</symbol>
++    which toggles choosing the serial port by ttyS number or by specifying
++    a port and IRQ number.
++    </para>
++    <para>
++    The second of these choices on most systems for I/O is
++    <symbol>CONFIG_KGDBOE</symbol>. This requires that the machine to be
++    debugged has an ethernet card which supports the netpoll API, such as
++    the cards supported by <symbol>CONFIG_E100</symbol>.  There are no
++    sub-options for this, but a kernel command line option is required.
++    </para>
++  </chapter>
++  <chapter id="BootingTheKernel">
++    <title>Booting the kernel</title>
++    <para>
++    The Kernel command line option <constant>kgdbwait</constant> makes kgdb
++    wait for gdb connection during booting of a kernel.  If the
++    <symbol>CONFIG_KGDB_8250</symbol> driver is used (or if applicable,
++    another serial driver) this breakpoint will happen very early on, before
++    console output.  If you wish to change serial port information and you
++    have enabled both <symbol>CONFIG_KGDB_8250</symbol> and
++    <symbol>CONFIG_KGDB_SIMPLE_SERIAL</symbol> then you must pass the option
++    <constant>kgdb8250=&lt;io or mmio&gt;,&lt;address&gt;,&lt;baud
++    rate&gt;,&lt;irq&gt;</constant> before <constant>kgdbwait</constant>.
++    The values <constant>io</constant> and <constant>mmio</constant> indicate
++    whether the address being passed next needs to be memory mapped
++    (<constant>mmio</constant>) or not. The <constant>address</constant> must
++    be passed in hex; it is the hardware address and will be remapped if
++    passed as <constant>mmio</constant>. The values
++    <constant>baud rate</constant> and <constant>irq</constant> are base-10.
++    The supported values for <constant>baud rate</constant> are
++    <constant>9600</constant>, <constant>19200</constant>,
++    <constant>38400</constant>, <constant>57600</constant>, and
++    <constant>115200</constant>.
++    </para>
++    <para>
++    To have KGDB stop the kernel and wait, with the compiled values for the
++    serial driver, pass in: <constant>kgdbwait</constant>.
++    </para>
++    <para>
++    To specify the values of the SH SCI(F) serial port at boot:
++    <constant>kgdbsci=0,115200</constant>.
++    </para>
++    <para>
++    To specify the values of the serial port at boot:
++    <constant>kgdb8250=io,3f8,115200,3</constant>.
++    On IA64 this could also be:
++    <constant>kgdb8250=mmio,0xff5e0000,115200,74</constant>
++    And to have KGDB also stop the kernel and wait for GDB to connect, pass in
++    <constant>kgdbwait</constant> after this argument.
++    </para>
++    <para>
++    To configure the <symbol>CONFIG_KGDBOE</symbol> driver, pass in
++    <constant>kgdboe=[src-port]@&lt;src-ip&gt;/[dev],[tgt-port]@&lt;tgt-ip&gt;/[tgt-macaddr]</constant>
++    where:
++    <itemizedlist>
++      <listitem><para>src-port (optional): source for UDP packets (defaults to <constant>6443</constant>)</para></listitem>
++      <listitem><para>src-ip: source IP to use (interface address)</para></listitem>
++      <listitem><para>dev (optional): network interface (<constant>eth0</constant>)</para></listitem>
++      <listitem><para>tgt-port (optional): port GDB will use (defaults to <constant>6442</constant>)</para></listitem>
++      <listitem><para>tgt-ip: IP address GDB will be connecting from</para></listitem>
++      <listitem><para>tgt-macaddr (optional): ethernet MAC address for logging agent (default is broadcast)</para></listitem>
++    </itemizedlist>
++    </para>
++    <para>
++    The <symbol>CONFIG_KGDBOE</symbol> driver can be reconfigured at run
++    time, if <symbol>CONFIG_SYSFS</symbol> and
++    <symbol>CONFIG_MODULES</symbol> are enabled, by echoing a new config string to
++    <constant>/sys/module/kgdboe/parameters/kgdboe</constant>.  The
++    driver can be unconfigured with the special string
++    <constant>not_configured</constant>.
++    </para>
++  </chapter>
++  <chapter id="ConnectingGDB">
++  <title>Connecting gdb</title>
++    <para>
++    If you have used any of the methods to have KGDB stop and create
++    an initial breakpoint described in the previous chapter, kgdb prints
++    the message "Waiting for connection from remote gdb..." on the console
++    and waits for connection from gdb. At this point you connect gdb to kgdb.
++    </para>
++    <para>
++    Example (serial):
++    </para>
++    <programlisting>
++    % gdb ./vmlinux
++    (gdb) set remotebaud 115200
++    (gdb) target remote /dev/ttyS0
++    </programlisting>
++    <para>
++    Example (ethernet):
++    </para>
++    <programlisting>
++    % gdb ./vmlinux
++    (gdb) target remote udp:192.168.2.2:6443
++    </programlisting>
++    <para>
++    Once connected, you can debug a kernel the way you would debug an
++    application program.
++    </para>
++  </chapter>
++  <chapter id="ArchitectureNotes">
++    <title>Architecture specific notes</title>
++      <para>
++      SuperH: The NMI switch found on some boards can be used to trigger an
++      initial breakpoint.  Subsequent triggers do nothing.  If console
++      is enabled on the SCI(F) serial port, and that is the port being used
++      for KGDB, then you must trigger a breakpoint via sysrq, NMI, or
++      some other method prior to connecting, or echo a control-c to the
++      serial port.  Also, to use the SCI(F) port for KGDB, the
++      <symbol>CONFIG_SERIAL_SH_SCI</symbol> driver must be enabled.
++      </para>
++  </chapter>
++  <chapter id="CommonBackEndReq">
++    <title>The common backend (required)</title>
++      <para>
++      There are a few flags which must be set on every architecture in
++      their &lt;asm/kgdb.h&gt; file.  These are:
++      <itemizedlist>
++        <listitem>
++        <para>
++        NUMREGBYTES: The size in bytes of all of the registers, so
++        that we can ensure they will all fit into a packet.
++        </para>
++        <para>
++        BUFMAX: The size in bytes of the buffer GDB will read into.
++        This must be larger than NUMREGBYTES.
++        </para>
++        <para>
++        CACHE_FLUSH_IS_SAFE: Set to one if it is always safe to call
++        flush_cache_range or flush_icache_range.  On some architectures,
++        these functions may not be safe to call on SMP since we keep other
++        CPUs in a holding pattern.
++        </para>
++      </listitem>
++      </itemizedlist>
++      </para>
++      <para>
++      There are also the following functions for the common backend,
++      found in kernel/kgdb.c that must be supplied by the
++      architecture-specific backend.  No weak version of these is provided.
++      </para>
++!Iinclude/linux/kgdb.h
++  </chapter>
++  <chapter id="CommonBackEndOpt">
++    <title>The common backend (optional)</title>
++      <para>
++      These functions are part of the common backend, found in kernel/kgdb.c
++      and are optionally implemented.  Some functions (with _hw_ in the name)
++      end up being required on arches which use hardware breakpoints.
++      </para>
++!Ikernel/kgdb.c
++  </chapter>
++  <chapter id="DriverSpecificFunctions">
++    <title>Driver-Specific Functions</title>
++      <para>
++      Some of the I/O drivers have additional functions that can be
++      called, that are specific to the driver.  Calls from other places
++      to these functions must be wrapped in #ifdefs for the driver in
++      question.
++      </para>
++!Idrivers/serial/8250_kgdb.c
++   </chapter>
++</book>
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/MAINTAINERS linux-2.6.18-53.1.14.kgdb/MAINTAINERS
+--- linux-2.6.18-53.1.14/MAINTAINERS   2008-03-06 05:54:49.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/MAINTAINERS      2008-06-10 15:37:25.000000000 +0400
+@@ -1715,6 +1715,15 @@ L:      linux-kernel@vger.kernel.org
+ L:    fastboot@osdl.org
+ S:    Maintained
+ 
++KGDB
++P:    Tom Rini
++P:    Amit S. Kale
++M:    trini@kernel.crashing.org
++M:    amitkale@linsyssoft.com
++W:    http://sourceforge.net/projects/kgdb
++L:    kgdb-bugreport@lists.sourceforge.net
++S:    Maintained
++
+ KPROBES
+ P:    Prasanna S Panchamukhi
+ M:    prasanna@in.ibm.com
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/Makefile linux-2.6.18-53.1.14.kgdb/Makefile
+--- linux-2.6.18-53.1.14/Makefile      2008-03-06 05:55:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/Makefile 2008-06-10 15:39:01.000000000 +0400
+@@ -992,6 +992,7 @@ MRPROPER_DIRS  += include/config include
+ MRPROPER_FILES += .config .config.old include/asm .version .old_version \
+                   include/linux/autoconf.h include/linux/version.h      \
+                   include/linux/utsrelease.h                            \
++                include/linux/dwarf2-defs.h                           \
+                 Module.symvers tags TAGS cscope*
+ 
+ # clean - Delete most, but leave enough to build external modules
+@@ -1422,7 +1423,11 @@ clean := -f $(if $(KBUILD_SRC),$(srctree
+ endif # skip-makefile
+ 
+ PHONY += FORCE
+-FORCE:
++include/linux/dwarf2-defs.h: $(srctree)/include/linux/dwarf2.h $(srctree)/scripts/dwarfh.awk
++      mkdir -p include/linux/
++      awk -f $(srctree)/scripts/dwarfh.awk $(srctree)/include/linux/dwarf2.h > include/linux/dwarf2-defs.h
++
++FORCE: include/linux/dwarf2-defs.h
+ 
+ 
+ # Declare the contents of the .PHONY variable as phony.  We keep that
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/kernel/Makefile linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/Makefile
+--- linux-2.6.18-53.1.14/arch/arm/kernel/Makefile      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/Makefile 2008-06-10 15:38:56.000000000 +0400
+@@ -20,6 +20,7 @@ obj-$(CONFIG_ISA_DMA)                += dma-isa.o
+ obj-$(CONFIG_PCI)             += bios32.o isa.o
+ obj-$(CONFIG_SMP)             += smp.o
+ obj-$(CONFIG_OABI_COMPAT)     += sys_oabi-compat.o
++obj-$(CONFIG_KGDB)            += kgdb.o kgdb-jmp.o
+ 
+ obj-$(CONFIG_CRUNCH)          += crunch.o crunch-bits.o
+ AFLAGS_crunch-bits.o          := -Wa,-mcpu=ep9312
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/kernel/entry-armv.S linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/entry-armv.S
+--- linux-2.6.18-53.1.14/arch/arm/kernel/entry-armv.S  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/entry-armv.S     2008-06-10 15:39:01.000000000 +0400
+@@ -15,6 +15,7 @@
+  *  it to save wrong values...  Be aware!
+  */
+ 
++#include <asm/kgdb.h>
+ #include <asm/memory.h>
+ #include <asm/glue.h>
+ #include <asm/vfpmacros.h>
+@@ -232,6 +233,7 @@ svc_preempt:
+       beq     preempt_return                  @ go again
+       b       1b
+ #endif
++      CFI_END_FRAME(__irq_svc)
+ 
+       .align  5
+ __und_svc:
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/kernel/kgdb-jmp.S linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/kgdb-jmp.S
+--- linux-2.6.18-53.1.14/arch/arm/kernel/kgdb-jmp.S    1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/kgdb-jmp.S       2008-06-10 15:38:56.000000000 +0400
+@@ -0,0 +1,32 @@
++/*
++ * arch/arm/kernel/kgdb-jmp.S
++ *
++ * Trivial setjmp and longjmp procedures to support bus error recovery
++ * which may occur during kgdb memory read/write operations.
++ *
++ * Author: MontaVista Software, Inc. <source@mvista.com>
++ *         source@mvista.com
++ *
++ * 2002-2005 (c) MontaVista Software, Inc.  This file is licensed under the
++ * terms of the GNU General Public License version 2. This program is licensed
++ * "as is" without any warranty of any kind, whether express or implied.
++ */
++#include <linux/linkage.h>
++
++ENTRY (kgdb_fault_setjmp)
++      /* Save registers */
++      stmia   r0, {r0-r14}
++      str     lr,[r0, #60]
++      mrs     r1,cpsr
++      str     r1,[r0,#64]
++      ldr     r1,[r0,#4]
++      mov     r0, #0
++      mov     pc,lr
++
++ENTRY (kgdb_fault_longjmp)
++      /* Restore registers */
++      mov     r1,#1
++      str     r1,[r0]
++      ldr     r1,[r0, #64]
++      msr     spsr,r1
++      ldmia   r0,{r0-pc}^
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/kgdb.c
+--- linux-2.6.18-53.1.14/arch/arm/kernel/kgdb.c        1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/kgdb.c   2008-06-10 15:38:56.000000000 +0400
+@@ -0,0 +1,208 @@
++/*
++ * arch/arm/kernel/kgdb.c
++ *
++ * ARM KGDB support
++ *
++ * Copyright (c) 2002-2004 MontaVista Software, Inc
++ *
++ * Authors:  George Davis <davis_g@mvista.com>
++ *           Deepak Saxena <dsaxena@plexity.net>
++ */
++#include <linux/config.h>
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <linux/signal.h>
++#include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/spinlock.h>
++#include <linux/personality.h>
++#include <linux/ptrace.h>
++#include <linux/elf.h>
++#include <linux/interrupt.h>
++#include <linux/init.h>
++#include <linux/kgdb.h>
++
++#include <asm/atomic.h>
++#include <asm/io.h>
++#include <asm/pgtable.h>
++#include <asm/system.h>
++#include <asm/uaccess.h>
++#include <asm/unistd.h>
++#include <asm/ptrace.h>
++#include <asm/traps.h>
++
++/* Make a local copy of the registers passed into the handler (bletch) */
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *kernel_regs)
++{
++      int regno;
++
++      /* Initialize all to zero (??) */
++      for (regno = 0; regno < GDB_MAX_REGS; regno++)
++              gdb_regs[regno] = 0;
++
++      gdb_regs[_R0] = kernel_regs->ARM_r0;
++      gdb_regs[_R1] = kernel_regs->ARM_r1;
++      gdb_regs[_R2] = kernel_regs->ARM_r2;
++      gdb_regs[_R3] = kernel_regs->ARM_r3;
++      gdb_regs[_R4] = kernel_regs->ARM_r4;
++      gdb_regs[_R5] = kernel_regs->ARM_r5;
++      gdb_regs[_R6] = kernel_regs->ARM_r6;
++      gdb_regs[_R7] = kernel_regs->ARM_r7;
++      gdb_regs[_R8] = kernel_regs->ARM_r8;
++      gdb_regs[_R9] = kernel_regs->ARM_r9;
++      gdb_regs[_R10] = kernel_regs->ARM_r10;
++      gdb_regs[_FP] = kernel_regs->ARM_fp;
++      gdb_regs[_IP] = kernel_regs->ARM_ip;
++      gdb_regs[_SP] = kernel_regs->ARM_sp;
++      gdb_regs[_LR] = kernel_regs->ARM_lr;
++      gdb_regs[_PC] = kernel_regs->ARM_pc;
++      gdb_regs[_CPSR] = kernel_regs->ARM_cpsr;
++}
++
++/* Copy the local gdb register array back into kernel_regs (inverse of regs_to_gdb_regs) */
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *kernel_regs)
++{
++      kernel_regs->ARM_r0 = gdb_regs[_R0];
++      kernel_regs->ARM_r1 = gdb_regs[_R1];
++      kernel_regs->ARM_r2 = gdb_regs[_R2];
++      kernel_regs->ARM_r3 = gdb_regs[_R3];
++      kernel_regs->ARM_r4 = gdb_regs[_R4];
++      kernel_regs->ARM_r5 = gdb_regs[_R5];
++      kernel_regs->ARM_r6 = gdb_regs[_R6];
++      kernel_regs->ARM_r7 = gdb_regs[_R7];
++      kernel_regs->ARM_r8 = gdb_regs[_R8];
++      kernel_regs->ARM_r9 = gdb_regs[_R9];
++      kernel_regs->ARM_r10 = gdb_regs[_R10];
++      kernel_regs->ARM_fp = gdb_regs[_FP];
++      kernel_regs->ARM_ip = gdb_regs[_IP];
++      kernel_regs->ARM_sp = gdb_regs[_SP];
++      kernel_regs->ARM_lr = gdb_regs[_LR];
++      kernel_regs->ARM_pc = gdb_regs[_PC];
++      kernel_regs->ARM_cpsr = gdb_regs[_CPSR];        /* same slot regs_to_gdb_regs() fills */
++}
++
++static inline struct pt_regs *kgdb_get_user_regs(struct task_struct *task)
++{
++      return (struct pt_regs *)
++          ((unsigned long)task->thread_info + THREAD_SIZE -
++           8 - sizeof(struct pt_regs));
++}
++
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs,
++                               struct task_struct *task)
++{
++      int regno;
++      struct pt_regs *thread_regs;
++
++      /* Just making sure... */
++      if (task == NULL)
++              return;
++
++      /* Initialize to zero */
++      for (regno = 0; regno < GDB_MAX_REGS; regno++)
++              gdb_regs[regno] = 0;
++
++      /* Otherwise, we have only some registers from switch_to() */
++      thread_regs = kgdb_get_user_regs(task);
++      gdb_regs[_R0] = thread_regs->ARM_r0;    /* Not really valid? */
++      gdb_regs[_R1] = thread_regs->ARM_r1;    /* "               " */
++      gdb_regs[_R2] = thread_regs->ARM_r2;    /* "               " */
++      gdb_regs[_R3] = thread_regs->ARM_r3;    /* "               " */
++      gdb_regs[_R4] = thread_regs->ARM_r4;
++      gdb_regs[_R5] = thread_regs->ARM_r5;
++      gdb_regs[_R6] = thread_regs->ARM_r6;
++      gdb_regs[_R7] = thread_regs->ARM_r7;
++      gdb_regs[_R8] = thread_regs->ARM_r8;
++      gdb_regs[_R9] = thread_regs->ARM_r9;
++      gdb_regs[_R10] = thread_regs->ARM_r10;
++      gdb_regs[_FP] = thread_regs->ARM_fp;
++      gdb_regs[_IP] = thread_regs->ARM_ip;
++      gdb_regs[_SP] = thread_regs->ARM_sp;
++      gdb_regs[_LR] = thread_regs->ARM_lr;
++      gdb_regs[_PC] = thread_regs->ARM_pc;
++      gdb_regs[_CPSR] = thread_regs->ARM_cpsr;
++}
++
++static int compiled_break;
++
++int kgdb_arch_handle_exception(int exception_vector, int signo,
++                             int err_code, char *remcom_in_buffer,
++                             char *remcom_out_buffer,
++                             struct pt_regs *linux_regs)
++{
++      long addr;
++      char *ptr;
++
++      switch (remcom_in_buffer[0]) {
++      case 'c':
++              kgdb_contthread = NULL;
++
++              /*
++               * Try to read optional parameter, pc unchanged if no parm.
++               * If this was a compiled breakpoint, we need to move
++               * to the next instruction or we will just breakpoint
++               * over and over again.
++               */
++              ptr = &remcom_in_buffer[1];
++              if (kgdb_hex2long(&ptr, &addr)) {
++                      linux_regs->ARM_pc = addr;
++              } else if (compiled_break == 1) {
++                      linux_regs->ARM_pc += 4;
++              }
++
++              compiled_break = 0;
++
++              return 0;
++      }
++
++      return -1;
++}
++
++static int kgdb_brk_fn(struct pt_regs *regs, unsigned int instr)
++{
++      kgdb_handle_exception(1, SIGTRAP, 0, regs);
++
++      return 0;
++}
++
++static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int instr)
++{
++      compiled_break = 1;
++      kgdb_handle_exception(1, SIGTRAP, 0, regs);
++
++      return 0;
++}
++
++static struct undef_hook kgdb_brkpt_hook = {
++      .instr_mask = 0xffffffff,
++      .instr_val = KGDB_BREAKINST,
++      .fn = kgdb_brk_fn
++};
++
++static struct undef_hook kgdb_compiled_brkpt_hook = {
++      .instr_mask = 0xffffffff,
++      .instr_val = KGDB_COMPILED_BREAK,
++      .fn = kgdb_compiled_brk_fn
++};
++
++/*
++ * Register our undef instruction hooks with ARM undef core.
++ * We register a hook specifically looking for the KGDB break inst
++ * and we handle the normal undef case within the do_undefinstr
++ * handler.
++ */
++int kgdb_arch_init(void)
++{
++      register_undef_hook(&kgdb_brkpt_hook);
++      register_undef_hook(&kgdb_compiled_brkpt_hook);
++
++      return 0;
++}
++
++struct kgdb_arch arch_kgdb_ops = {
++#ifndef __ARMEB__
++      .gdb_bpt_instr = {0xfe, 0xde, 0xff, 0xe7}
++#else
++      .gdb_bpt_instr = {0xe7, 0xff, 0xde, 0xfe}
++#endif
++};
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/kernel/setup.c linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/setup.c
+--- linux-2.6.18-53.1.14/arch/arm/kernel/setup.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/setup.c  2008-06-10 15:38:56.000000000 +0400
+@@ -829,6 +829,11 @@ void __init setup_arch(char **cmdline_p)
+       conswitchp = &dummy_con;
+ #endif
+ #endif
++
++#if   defined(CONFIG_KGDB)
++      extern void __init early_trap_init(void);
++      early_trap_init();
++#endif
+ }
+ 
+ 
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/kernel/traps.c linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/traps.c
+--- linux-2.6.18-53.1.14/arch/arm/kernel/traps.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/traps.c  2008-06-10 15:38:56.000000000 +0400
+@@ -278,6 +278,7 @@ asmlinkage void do_undefinstr(struct pt_
+       unsigned int instr;
+       struct undef_hook *hook;
+       siginfo_t info;
++      mm_segment_t fs;
+       void __user *pc;
+ 
+       /*
+@@ -287,12 +288,15 @@ asmlinkage void do_undefinstr(struct pt_
+        */
+       regs->ARM_pc -= correction;
+ 
++      fs = get_fs();
++      set_fs(KERNEL_DS);
+       pc = (void __user *)instruction_pointer(regs);
+       if (thumb_mode(regs)) {
+               get_user(instr, (u16 __user *)pc);
+       } else {
+               get_user(instr, (u32 __user *)pc);
+       }
++      set_fs(fs);
+ 
+       spin_lock_irq(&undef_lock);
+       list_for_each_entry(hook, &undef_hook, node) {
+@@ -684,6 +688,13 @@ EXPORT_SYMBOL(abort);
+ 
+ void __init trap_init(void)
+ {
++#if   defined(CONFIG_KGDB)
++      return;
++}
++
++void __init early_trap_init(void)
++{
++#endif
+       unsigned long vectors = CONFIG_VECTORS_BASE;
+       extern char __stubs_start[], __stubs_end[];
+       extern char __vectors_start[], __vectors_end[];
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mach-ixp2000/core.c linux-2.6.18-53.1.14.kgdb/arch/arm/mach-ixp2000/core.c
+--- linux-2.6.18-53.1.14/arch/arm/mach-ixp2000/core.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mach-ixp2000/core.c     2008-06-10 15:38:56.000000000 +0400
+@@ -34,6 +34,7 @@
+ #include <asm/system.h>
+ #include <asm/tlbflush.h>
+ #include <asm/pgtable.h>
++#include <asm/kgdb.h>
+ 
+ #include <asm/mach/map.h>
+ #include <asm/mach/time.h>
+@@ -184,6 +185,9 @@ static struct platform_device ixp2000_se
+ void __init ixp2000_uart_init(void)
+ {
+       platform_device_register(&ixp2000_serial_device);
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &ixp2000_serial_port);
++#endif
+ }
+ 
+ 
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mach-ixp2000/ixdp2x01.c linux-2.6.18-53.1.14.kgdb/arch/arm/mach-ixp2000/ixdp2x01.c
+--- linux-2.6.18-53.1.14/arch/arm/mach-ixp2000/ixdp2x01.c      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mach-ixp2000/ixdp2x01.c 2008-06-10 15:38:56.000000000 +0400
+@@ -38,6 +38,7 @@
+ #include <asm/system.h>
+ #include <asm/hardware.h>
+ #include <asm/mach-types.h>
++#include <asm/kgdb.h>
+ 
+ #include <asm/mach/pci.h>
+ #include <asm/mach/map.h>
+@@ -413,6 +414,11 @@ static void __init ixdp2x01_init_machine
+       platform_add_devices(ixdp2x01_devices, ARRAY_SIZE(ixdp2x01_devices));
+       ixp2000_uart_init();
+       ixdp2x01_uart_init();
++
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &ixdp425_serial_ports[0]);
++      kgdb8250_add_port(1, &ixdp425_serial_ports[1]);
++#endif
+ }
+ 
+ 
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mach-ixp4xx/coyote-setup.c linux-2.6.18-53.1.14.kgdb/arch/arm/mach-ixp4xx/coyote-setup.c
+--- linux-2.6.18-53.1.14/arch/arm/mach-ixp4xx/coyote-setup.c   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mach-ixp4xx/coyote-setup.c      2008-06-10 15:38:56.000000000 +0400
+@@ -96,6 +96,10 @@ static void __init coyote_init(void)
+       }
+ 
+       platform_add_devices(coyote_devices, ARRAY_SIZE(coyote_devices));
++
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &coyote_serial_port);
++#endif
+ }
+ 
+ #ifdef CONFIG_ARCH_ADI_COYOTE
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mach-ixp4xx/ixdp425-setup.c linux-2.6.18-53.1.14.kgdb/arch/arm/mach-ixp4xx/ixdp425-setup.c
+--- linux-2.6.18-53.1.14/arch/arm/mach-ixp4xx/ixdp425-setup.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mach-ixp4xx/ixdp425-setup.c     2008-06-10 15:38:56.000000000 +0400
+@@ -24,6 +24,7 @@
+ #include <asm/irq.h>
+ #include <asm/mach/arch.h>
+ #include <asm/mach/flash.h>
++#include <asm/kgdb.h>
+ 
+ static struct flash_platform_data ixdp425_flash_data = {
+       .map_name       = "cfi_probe",
+@@ -76,7 +77,8 @@ static struct plat_serial8250_port ixdp4
+               .mapbase        = IXP4XX_UART1_BASE_PHYS,
+               .membase        = (char *)IXP4XX_UART1_BASE_VIRT + REG_OFFSET,
+               .irq            = IRQ_IXP4XX_UART1,
+-              .flags          = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST,
++              .flags          = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST |
++                                      UPF_SHARE_IRQ,
+               .iotype         = UPIO_MEM,
+               .regshift       = 2,
+               .uartclk        = IXP4XX_UART_XTAL,
+@@ -85,7 +87,8 @@ static struct plat_serial8250_port ixdp4
+               .mapbase        = IXP4XX_UART2_BASE_PHYS,
+               .membase        = (char *)IXP4XX_UART2_BASE_VIRT + REG_OFFSET,
+               .irq            = IRQ_IXP4XX_UART2,
+-              .flags          = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST,
++              .flags          = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST |
++                                      UPF_SHARE_IRQ,
+               .iotype         = UPIO_MEM,
+               .regshift       = 2,
+               .uartclk        = IXP4XX_UART_XTAL,
+@@ -116,6 +119,11 @@ static void __init ixdp425_init(void)
+               IXP4XX_EXP_BUS_BASE(0) + ixp4xx_exp_bus_size - 1;
+ 
+       platform_add_devices(ixdp425_devices, ARRAY_SIZE(ixdp425_devices));
++
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &ixdp425_serial_ports[0]);
++      kgdb8250_add_port(1, &ixdp425_serial_ports[1]);
++#endif
+ }
+ 
+ #ifdef CONFIG_ARCH_IXDP425
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mach-omap1/serial.c linux-2.6.18-53.1.14.kgdb/arch/arm/mach-omap1/serial.c
+--- linux-2.6.18-53.1.14/arch/arm/mach-omap1/serial.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mach-omap1/serial.c     2008-06-10 15:38:56.000000000 +0400
+@@ -15,6 +15,7 @@
+ #include <linux/delay.h>
+ #include <linux/serial.h>
+ #include <linux/tty.h>
++#include <linux/kgdb.h>
+ #include <linux/serial_8250.h>
+ #include <linux/serial_reg.h>
+ #include <linux/clk.h>
+@@ -199,6 +200,9 @@ void __init omap_serial_init(void)
+                       break;
+               }
+               omap_serial_reset(&serial_platform_data[i]);
++#ifdef CONFIG_KGDB_8250
++              kgdb8250_add_platform_port(i, &serial_platform_data[i]);
++#endif
+       }
+ }
+ 
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mach-pxa/Makefile linux-2.6.18-53.1.14.kgdb/arch/arm/mach-pxa/Makefile
+--- linux-2.6.18-53.1.14/arch/arm/mach-pxa/Makefile    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mach-pxa/Makefile       2008-06-10 15:38:56.000000000 +0400
+@@ -31,6 +31,7 @@ obj-$(CONFIG_LEDS) += $(led-y)
+ # Misc features
+ obj-$(CONFIG_PM) += pm.o sleep.o
+ obj-$(CONFIG_PXA_SSP) += ssp.o
++obj-$(CONFIG_KGDB_PXA_SERIAL) += kgdb-serial.o
+ 
+ ifeq ($(CONFIG_PXA27x),y)
+ obj-$(CONFIG_PM) += standby.o
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mach-pxa/kgdb-serial.c linux-2.6.18-53.1.14.kgdb/arch/arm/mach-pxa/kgdb-serial.c
+--- linux-2.6.18-53.1.14/arch/arm/mach-pxa/kgdb-serial.c       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mach-pxa/kgdb-serial.c  2008-06-10 15:38:56.000000000 +0400
+@@ -0,0 +1,98 @@
++/*
++ * linux/arch/arm/mach-pxa/kgdb-serial.c
++ *
++ * Provides low level kgdb serial support hooks for PXA2xx boards
++ *
++ * Author:    Nicolas Pitre
++ * Copyright: (C) 2002-2005 MontaVista Software Inc.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ */
++
++#include <linux/config.h>
++#include <linux/serial_reg.h>
++#include <linux/kgdb.h>
++#include <asm/processor.h>
++#include <asm/hardware.h>
++#include <asm/arch/pxa-regs.h>
++
++#if   defined(CONFIG_KGDB_PXA_FFUART)
++
++#define UART          FFUART
++#define CKEN_UART     CKEN6_FFUART
++#define GPIO_RX_MD    GPIO34_FFRXD_MD
++#define GPIO_TX_MD    GPIO39_FFTXD_MD
++
++#elif defined(CONFIG_KGDB_PXA_BTUART)
++
++#define UART          BTUART
++#define CKEN_UART     CKEN7_BTUART
++#define GPIO_RX_MD    GPIO42_BTRXD_MD
++#define GPIO_TX_MD    GPIO43_BTTXD_MD
++
++#elif defined(CONFIG_KGDB_PXA_STUART)
++
++#define UART          STUART
++#define CKEN_UART     CKEN5_STUART
++#define GPIO_RX_MD    GPIO46_STRXD_MD
++#define GPIO_TX_MD    GPIO47_STTXD_MD
++
++#endif
++
++#define UART_BAUDRATE (CONFIG_KGDB_BAUDRATE)
++
++static volatile unsigned long *port = (unsigned long *)&UART;
++
++static int kgdb_serial_init(void)
++{
++      pxa_set_cken(CKEN_UART, 1);
++      pxa_gpio_mode(GPIO_RX_MD);
++      pxa_gpio_mode(GPIO_TX_MD);
++
++      port[UART_IER] = 0;
++      port[UART_LCR] = LCR_DLAB;
++      port[UART_DLL] = ((921600 / UART_BAUDRATE) & 0xff);
++      port[UART_DLM] = ((921600 / UART_BAUDRATE) >> 8);
++      port[UART_LCR] = LCR_WLS1 | LCR_WLS0;
++      port[UART_MCR] = 0;
++      port[UART_IER] = IER_UUE;
++      port[UART_FCR] = FCR_ITL_16;
++
++      return 0;
++}
++
++static void kgdb_serial_putchar(int c)
++{
++      if (!(CKEN & CKEN_UART) || port[UART_IER] != IER_UUE)
++              kgdb_serial_init();
++      while (!(port[UART_LSR] & LSR_TDRQ))
++              cpu_relax();
++      port[UART_TX] = c;
++}
++
++static void kgdb_serial_flush(void)
++{
++      if ((CKEN & CKEN_UART) && (port[UART_IER] & IER_UUE))
++              while (!(port[UART_LSR] & LSR_TEMT))
++                      cpu_relax();
++}
++
++static int kgdb_serial_getchar(void)
++{
++      unsigned char c;
++      if (!(CKEN & CKEN_UART) || port[UART_IER] != IER_UUE)
++              kgdb_serial_init();
++      while (!(port[UART_LSR] & UART_LSR_DR))
++              cpu_relax();
++      c = port[UART_RX];
++      return c;
++}
++
++struct kgdb_io kgdb_io_ops = {
++      .init = kgdb_serial_init,
++      .write_char = kgdb_serial_putchar,
++      .flush = kgdb_serial_flush,
++      .read_char = kgdb_serial_getchar,
++};
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mach-versatile/kgdb_serial.c linux-2.6.18-53.1.14.kgdb/arch/arm/mach-versatile/kgdb_serial.c
+--- linux-2.6.18-53.1.14/arch/arm/mach-versatile/kgdb_serial.c 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mach-versatile/kgdb_serial.c    2008-06-10 15:38:56.000000000 +0400
+@@ -0,0 +1,121 @@
++/*
++ * arch/arm/mach-versatile/kgdb_serial.c
++ *
++ * Author: Manish Lachwani, mlachwani@mvista.com
++ *
++ * 2005 (c) MontaVista Software, Inc. This file is licensed under
++ * the terms of the GNU General Public License version 2. This program
++ * is licensed "as is" without any warranty of any kind, whether express
++ * or implied.
++ *
++ * Support for KGDB on ARM Versatile.
++ */
++#include <linux/config.h>
++#include <linux/serial_reg.h>
++#include <linux/kgdb.h>
++#include <asm/io.h>
++#include <asm/processor.h>
++#include <asm/hardware.h>
++#include <asm/hardware/amba_serial.h>
++#include <asm/arch-versatile/hardware.h>
++
++#define ARM_BAUD_38400                23
++/*
++ * Functions that will be used later
++ */
++#define UART_GET_INT_STATUS(p)        readb((p) + UART010_IIR)
++#define UART_GET_MIS(p)               readw((p) + UART011_MIS)
++#define UART_PUT_ICR(p, c)    writel((c), (p) + UART010_ICR)
++#define UART_GET_FR(p)                readb((p) + UART01x_FR)
++#define UART_GET_CHAR(p)      readb((p) + UART01x_DR)
++#define UART_PUT_CHAR(p, c)     writel((c), (p) + UART01x_DR)
++#define UART_GET_RSR(p)               readb((p) + UART01x_RSR)
++#define UART_GET_CR(p)                readb((p) + UART010_CR)
++#define UART_PUT_CR(p,c)        writel((c), (p) + UART010_CR)
++#define UART_GET_LCRL(p)      readb((p) + UART010_LCRL)
++#define UART_PUT_LCRL(p,c)    writel((c), (p) + UART010_LCRL)
++#define UART_GET_LCRM(p)        readb((p) + UART010_LCRM)
++#define UART_PUT_LCRM(p,c)    writel((c), (p) + UART010_LCRM)
++#define UART_GET_LCRH(p)      readb((p) + UART010_LCRH)
++#define UART_PUT_LCRH(p,c)    writel((c), (p) + UART010_LCRH)
++#define UART_RX_DATA(s)               (((s) & UART01x_FR_RXFE) == 0)
++#define UART_TX_READY(s)      (((s) & UART01x_FR_TXFF) == 0)
++#define UART_TX_EMPTY(p)      ((UART_GET_FR(p) & UART01x_FR_TMSK) == 0)
++
++/*
++ * KGDB IRQ
++ */
++static int kgdb_irq = 12;
++static volatile unsigned char *port = NULL;
++
++static int kgdb_serial_init(void)
++{
++      int rate = ARM_BAUD_38400;
++
++      port = IO_ADDRESS(0x101F1000);
++      UART_PUT_CR(port, 0);
++
++      /* Set baud rate */
++      UART_PUT_LCRM(port, ((rate & 0xf00) >> 8));
++      UART_PUT_LCRL(port, (rate & 0xff));
++      UART_PUT_LCRH(port, UART01x_LCRH_WLEN_8 | UART01x_LCRH_FEN);
++      UART_PUT_CR(port, UART01x_CR_UARTEN);
++
++      return 0;
++}
++
++static void kgdb_serial_putchar(int ch)
++{
++      unsigned int status;
++
++      do {
++              status = UART_GET_FR(port);
++      } while (!UART_TX_READY(status));
++
++      UART_PUT_CHAR(port, ch);
++}
++
++static int kgdb_serial_getchar(void)
++{
++      unsigned int status;
++      int ch;
++
++      do {
++              status = UART_GET_FR(port);
++      } while (!UART_RX_DATA(status));
++      ch = UART_GET_CHAR(port);
++      return ch;
++}
++
++static struct uart_port kgdb_amba_port = {
++      .irq = 12,
++      .iobase = 0,
++      .iotype = UPIO_MEM,
++      .membase = (unsigned char *)IO_ADDRESS(0x101F1000),
++};
++
++static irqreturn_t kgdb_interrupt(int irq, void *dev_id, struct pt_regs *regs)
++{
++      int status = UART_GET_MIS(port);
++
++      if (irq != kgdb_irq)
++              return IRQ_NONE;
++
++      if (status & 0x40)
++              breakpoint();
++
++      return IRQ_HANDLED;
++}
++
++static void __init kgdb_hookup_irq(void)
++{
++      request_irq(kgdb_irq, kgdb_interrupt, SA_SHIRQ, "GDB-stub",
++                  &kgdb_amba_port);
++}
++
++struct kgdb_io kgdb_io_ops = {
++      .init = kgdb_serial_init,
++      .write_char = kgdb_serial_putchar,
++      .read_char = kgdb_serial_getchar,
++      .late_init = kgdb_hookup_irq,
++};
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mm/extable.c linux-2.6.18-53.1.14.kgdb/arch/arm/mm/extable.c
+--- linux-2.6.18-53.1.14/arch/arm/mm/extable.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mm/extable.c    2008-06-10 15:38:56.000000000 +0400
+@@ -2,6 +2,7 @@
+  *  linux/arch/arm/mm/extable.c
+  */
+ #include <linux/module.h>
++#include <linux/kgdb.h>
+ #include <asm/uaccess.h>
+ 
+ int fixup_exception(struct pt_regs *regs)
+@@ -11,6 +12,12 @@ int fixup_exception(struct pt_regs *regs
+       fixup = search_exception_tables(instruction_pointer(regs));
+       if (fixup)
+               regs->ARM_pc = fixup->fixup;
++#ifdef CONFIG_KGDB
++      if (atomic_read(&debugger_active) && kgdb_may_fault)
++              /* Restore our previous state. */
++              kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++              /* Not reached. */
++#endif
+ 
+       return fixup != NULL;
+ }
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/kernel/Makefile linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/Makefile
+--- linux-2.6.18-53.1.14/arch/i386/kernel/Makefile     2008-03-06 05:54:14.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/Makefile        2008-06-10 15:38:03.000000000 +0400
+@@ -39,6 +39,7 @@ obj-$(CONFIG_VM86)           += vm86.o
+ obj-$(CONFIG_EARLY_PRINTK)    += early_printk.o
+ obj-$(CONFIG_HPET_TIMER)      += hpet.o
+ obj-$(CONFIG_K8_NB)           += k8.o
++obj-$(CONFIG_KGDB)            += kgdb.o kgdb-jmp.o
+ 
+ EXTRA_AFLAGS   := -traditional
+ 
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/kernel/entry.S linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/entry.S
+--- linux-2.6.18-53.1.14/arch/i386/kernel/entry.S      2008-03-06 05:55:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/entry.S 2008-06-10 15:39:01.000000000 +0400
+@@ -201,7 +201,7 @@ VM_MASK            = 0x00020000
+       CFI_OFFSET ecx, ECX-OLDESP;\
+       CFI_OFFSET ebx, EBX-OLDESP
+ 
+-ENTRY(ret_from_fork)
++KPROBE_ENTRY(ret_from_fork)
+       CFI_STARTPROC
+       pushl %eax
+       CFI_ADJUST_CFA_OFFSET 4
+@@ -659,7 +659,7 @@ ENTRY(simd_coprocessor_error)
+       jmp error_code
+       CFI_ENDPROC
+ 
+-ENTRY(device_not_available)
++KPROBE_ENTRY(device_not_available)
+       RING0_INT_FRAME
+       pushl $-1                       # mark this as an int
+       CFI_ADJUST_CFA_OFFSET 4
+@@ -916,7 +916,7 @@ ENTRY(machine_check)
+       CFI_ENDPROC
+ #endif
+ 
+-ENTRY(spurious_interrupt_bug)
++KPROBE_ENTRY(spurious_interrupt_bug)
+       RING0_INT_FRAME
+       pushl $0
+       CFI_ADJUST_CFA_OFFSET 4
+@@ -942,3 +942,108 @@ ENDPROC(kernel_thread_helper)
+ #include "syscall_table.S"
+ 
+ syscall_table_size=(.-sys_call_table)
++
++#     Here we do call frames.  We cheat a bit as we only really need
++#     correct frames at locations we can actually look at from a
++#     debugger.  Since the break instruction trap actually goes thru
++#     some of this code, we don't really need info on those areas, but
++#     only after the fact.  I.e. if we can not step or break in a
++#     location or end up with a return address pointing at the
++#     location, we don't need a correct call frame for it.
++
++#ifdef CONFIG_KGDB
++
++#include <linux/dwarf2-lang.h>
++/*
++ * The register numbers as known by gdb
++ */
++
++#define _EAX 0
++#define _ECX 1
++#define _EDX 2
++#define _EBX 3
++#define _ESP 4
++#define _EBP 5
++#define _ESI 6
++#define _EDI 7
++#define _PC  8
++#define _EIP 8
++#define _PS  9
++#define _EFLAGS  9
++#define _CS 10
++#define _SS 11
++#define _DS 12
++#define _ES 13
++#define _FS 14
++#define _GS 15
++      /*
++       * This code uses macros defined in linux/dwarf2-lang.h
++       * They attempt to follow the dwarf2 naming conventions... sort of..
++       */
++ENTRY(end_of_stack_stop_unwind_function)
++      .long   end_of_stack_stop_unwind_function+1
++
++      .text
++
++      CFI_preamble(c1,_PC,1,1)
++      CFA_define_reference(_ESP,OLDESP)       /* Stack pointer */
++      CFA_expression(_EIP)
++         CFA_exp_OP_dup                       /* copy old esp */
++         CFA_exp_OP_consts(CS-OLDESP)         /* offset to CS address */
++         CFA_exp_OP_plus                      /* should be CS address */
++         CFA_exp_OP_deref                     /* get the CS */
++         CFA_exp_OP_const4s(VM_MASK|3)        /* prepare to mask it */
++         CFA_exp_OP_and                       /* mask it, zero means kernel */
++         CFA_exp_OP_bra(eip_user_rtn)         /* branch if user */
++         CFA_exp_OP_const4s(EIP-OLDESP)       /* offset to return address */
++         CFA_exp_OP_plus                      /* add that in */
++         CFA_exp_OP_skip(eip_end)             /* done if kernel, skip out */
++eip_user_rtn:
++         CFA_exp_OP_addr(end_of_stack_stop_unwind_function)/*dummy function */
++eip_end:
++         CFA_expression_end
++      CFA_define_offset(_EBX,EBX-OLDESP)
++      CFA_define_offset(_ECX,ECX-OLDESP)
++      CFA_define_offset(_EDX,EDX-OLDESP)
++      CFA_define_offset(_ESI,ESI-OLDESP)
++      CFA_define_offset(_EDI,EDI-OLDESP)
++      CFA_define_offset(_EBP,EBP-OLDESP)
++      CFA_define_offset(_EAX,EAX-OLDESP)
++      CFA_define_offset(_EFLAGS,EFLAGS-OLDESP)
++      CFI_postamble()
++
++/*
++ * This provides an unwind for our dummy end of unwind function.
++ * Current convention is to provide an undefined return address.
++ */
++      CFI_preamble(c2,_PC,1,1)
++      CFA_define_reference(_ESP,0)    /* Stack pointer */
++      CFA_undefine_reg(_EIP)
++      CFI_postamble()
++
++      FDE_preamble(c2,end_of_stack_stop_unwind_function,      \
++                      end_of_stack_stop_unwind_function+5)
++      FDE_postamble()
++      /*
++         * This is VERY sloppy.  At this point all we want to do is get
++         * the frame right for back tracing.  It will not be good if
++         * you try to single step.  We use already defined labels.
++         * We want to cover all call outs.
++         * We could also recode this as just one FDE, but this works and
++         * I want to get it out.
++       */
++      FDE_preamble(c1,ret_from_fork,ret_from_exception)
++      CFA_define_cfa_offset(4)                /* one extra word on stack */
++      FDE_postamble()
++
++      FDE_preamble(c1,ret_from_exception,device_not_available_emulate)
++      FDE_postamble()
++
++              FDE_preamble(c1,device_not_available_emulate,debug)
++      CFA_define_cfa_offset(4)                /* one extra word on stack */
++      FDE_postamble()
++
++      FDE_preamble(c1, debug,spurious_interrupt_bug)
++      FDE_postamble()
++
++#endif
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/kernel/head.S linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/head.S
+--- linux-2.6.18-53.1.14/arch/i386/kernel/head.S       2008-03-06 05:54:34.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/head.S  2008-06-10 15:39:01.000000000 +0400
+@@ -10,6 +10,7 @@
+ .text
+ #include <linux/threads.h>
+ #include <linux/linkage.h>
++#include <asm/kgdb.h>
+ #include <asm/segment.h>
+ #include <asm/page.h>
+ #include <asm/pgtable.h>
+@@ -336,6 +337,10 @@ is386:    movl $2,%ecx            # set MP
+ #endif /* CONFIG_SMP */
+       jmp start_kernel
+ 
++      /* This dwarf code tells gdb that this is the end of the unwind */
++      /* This uses the CFA set up for pc=1 located in entry.S */
++      CFI_END_FRAME(is386)
++
+ /*
+  * We depend on ET to be correct. This checks for 287/387.
+  */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/kernel/kgdb-jmp.S linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/kgdb-jmp.S
+--- linux-2.6.18-53.1.14/arch/i386/kernel/kgdb-jmp.S   1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/kgdb-jmp.S      2008-06-10 15:38:03.000000000 +0400
+@@ -0,0 +1,74 @@
++/*
++ * arch/i386/kernel/kgdb-jmp.S
++ *
++ * Save and restore system registers so that within a limited frame we
++ * may have a fault and "jump back" to a known safe location.
++ *
++ * Author: George Anzinger <george@mvista.com>
++ *
++ * Cribbed from glibc, which carries the following:
++ * Copyright (C) 1996, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
++ * Copyright (C) 2005 by MontaVista Software.
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program as licensed "as is" without any warranty of
++ * any kind, whether express or implied.
++ */
++
++#include <linux/linkage.h>
++
++#define PCOFF         0
++#define LINKAGE               4               /* just the return address */
++#define PTR_SIZE      4
++#define PARMS         LINKAGE         /* no space for saved regs */
++#define JMPBUF                PARMS
++#define VAL           JMPBUF+PTR_SIZE
++
++#define JB_BX         0
++#define JB_SI         1
++#define JB_DI         2
++#define JB_BP         3
++#define JB_SP         4
++#define JB_PC         5
++
++/* This must be called prior to kgdb_fault_longjmp and
++ * kgdb_fault_longjmp must not be called outside of the context of the
++ * last call to kgdb_fault_setjmp.
++ * kgdb_fault_setjmp(int *jmp_buf[6])
++ */
++ENTRY(kgdb_fault_setjmp)
++      movl JMPBUF(%esp), %eax
++
++      /* Save registers.  */
++      movl    %ebx, (JB_BX*4)(%eax)
++      movl    %esi, (JB_SI*4)(%eax)
++      movl    %edi, (JB_DI*4)(%eax)
++      /* Save SP as it will be after we return.  */
++      leal    JMPBUF(%esp), %ecx
++      movl    %ecx, (JB_SP*4)(%eax)
++      movl    PCOFF(%esp), %ecx       /* Save PC we are returning to now.  */
++      movl    %ecx, (JB_PC*4)(%eax)
++      movl    %ebp, (JB_BP*4)(%eax)   /* Save caller's frame pointer.  */
++
++      /* Restore state so we can now try the access. */
++      movl    JMPBUF(%esp), %ecx      /* User's jmp_buf in %ecx.  */
++      /* Save the return address now.  */
++      movl    (JB_PC*4)(%ecx), %edx
++      /* Restore registers.  */
++      movl    $0, %eax
++      movl    (JB_SP*4)(%ecx), %esp
++      jmp     *%edx           /* Jump to saved PC. */
++
++/* kgdb_fault_longjmp(int *jmp_buf[6]) */
++ENTRY(kgdb_fault_longjmp)
++      movl    JMPBUF(%esp), %ecx      /* User's jmp_buf in %ecx.  */
++      /* Save the return address now.  */
++      movl    (JB_PC*4)(%ecx), %edx
++      /* Restore registers.  */
++      movl    (JB_BX*4)(%ecx), %ebx
++      movl    (JB_SI*4)(%ecx), %esi
++      movl    (JB_DI*4)(%ecx), %edi
++      movl    (JB_BP*4)(%ecx), %ebp
++      movl    $1, %eax
++      movl    (JB_SP*4)(%ecx), %esp
++      jmp     *%edx           /* Jump to saved PC. */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/kgdb.c
+--- linux-2.6.18-53.1.14/arch/i386/kernel/kgdb.c       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/kgdb.c  2008-06-10 15:39:27.000000000 +0400
+@@ -0,0 +1,363 @@
++/*
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License as published by the
++ * Free Software Foundation; either version 2, or (at your option) any
++ * later version.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * General Public License for more details.
++ *
++ */
++
++/*
++ * Copyright (C) 2000-2001 VERITAS Software Corporation.
++ */
++/*
++ *  Contributor:     Lake Stevens Instrument Division$
++ *  Written by:      Glenn Engel $
++ *  Updated by:            Amit Kale<akale@veritas.com>
++ *  Updated by:            Tom Rini <trini@kernel.crashing.org>
++ *  Modified for 386 by Jim Kingdon, Cygnus Support.
++ *  Original kgdb, compatibility with 2.1.xx kernel by
++ *  David Grothe <dave@gcom.com>
++ *  Additional support from Tigran Aivazian <tigran@sco.com>
++ */
++
++#include <linux/string.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/smp.h>
++#include <linux/spinlock.h>
++#include <linux/delay.h>
++#include <asm/vm86.h>
++#include <asm/system.h>
++#include <asm/ptrace.h>               /* for linux pt_regs struct */
++#include <linux/kgdb.h>
++#include <linux/init.h>
++#include <asm/apicdef.h>
++#include <asm/desc.h>
++#include <asm/kdebug.h>
++
++#include "mach_ipi.h"
++
++/* Put the error code here just in case the user cares.  */
++int gdb_i386errcode;
++/* Likewise, the vector number here (since GDB only gets the signal
++   number through the usual means, and that's not very specific).  */
++int gdb_i386vector = -1;
++
++extern atomic_t cpu_doing_single_step;
++
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      gdb_regs[_EAX] = regs->eax;
++      gdb_regs[_EBX] = regs->ebx;
++      gdb_regs[_ECX] = regs->ecx;
++      gdb_regs[_EDX] = regs->edx;
++      gdb_regs[_ESI] = regs->esi;
++      gdb_regs[_EDI] = regs->edi;
++      gdb_regs[_EBP] = regs->ebp;
++      gdb_regs[_DS] = regs->xds;
++      gdb_regs[_ES] = regs->xes;
++      gdb_regs[_PS] = regs->eflags;
++      gdb_regs[_CS] = regs->xcs;
++      gdb_regs[_PC] = regs->eip;
++      gdb_regs[_ESP] = (int)(&regs->esp);
++      gdb_regs[_SS] = __KERNEL_DS;
++      gdb_regs[_FS] = 0xFFFF;
++      gdb_regs[_GS] = 0xFFFF;
++}
++
++/*
++ * Extracts ebp, esp and eip values understandable by gdb from the values
++ * saved by switch_to.
++ * thread.esp points to ebp. flags and ebp are pushed in switch_to hence esp
++ * prior to entering switch_to is 8 greater than the value that is saved.
++ * If switch_to changes, change following code appropriately.
++ */
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
++{
++      gdb_regs[_EAX] = 0;
++      gdb_regs[_EBX] = 0;
++      gdb_regs[_ECX] = 0;
++      gdb_regs[_EDX] = 0;
++      gdb_regs[_ESI] = 0;
++      gdb_regs[_EDI] = 0;
++      gdb_regs[_EBP] = *(unsigned long *)p->thread.esp;
++      gdb_regs[_DS] = __KERNEL_DS;
++      gdb_regs[_ES] = __KERNEL_DS;
++      gdb_regs[_PS] = 0;
++      gdb_regs[_CS] = __KERNEL_CS;
++      gdb_regs[_PC] = p->thread.eip;
++      gdb_regs[_ESP] = p->thread.esp;
++      gdb_regs[_SS] = __KERNEL_DS;
++      gdb_regs[_FS] = 0xFFFF;
++      gdb_regs[_GS] = 0xFFFF;
++}
++
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      regs->eax = gdb_regs[_EAX];
++      regs->ebx = gdb_regs[_EBX];
++      regs->ecx = gdb_regs[_ECX];
++      regs->edx = gdb_regs[_EDX];
++      regs->esi = gdb_regs[_ESI];
++      regs->edi = gdb_regs[_EDI];
++      regs->ebp = gdb_regs[_EBP];
++      regs->xds = gdb_regs[_DS];
++      regs->xes = gdb_regs[_ES];
++      regs->eflags = gdb_regs[_PS];
++      regs->xcs = gdb_regs[_CS];
++      regs->eip = gdb_regs[_PC];
++}
++
++static struct hw_breakpoint {
++      unsigned enabled;
++      unsigned type;
++      unsigned len;
++      unsigned addr;
++} breakinfo[4] = {
++      { .enabled = 0 },
++      { .enabled = 0 },
++      { .enabled = 0 },
++      { .enabled = 0 },
++};
++
++void kgdb_correct_hw_break(void)
++{
++      int breakno;
++      int correctit;          /* set once DR7 needs to be written back */
++      int breakbit;
++      unsigned dr7;           /* working copy of debug control register 7 */
++
++      asm volatile ("movl %%db7, %0\n":"=r" (dr7)
++                    :);
++      do {
++              unsigned addr0, addr1, addr2, addr3;
++              asm volatile ("movl %%db0, %0\n"
++                            "movl %%db1, %1\n"
++                            "movl %%db2, %2\n"
++                            "movl %%db3, %3\n":"=r" (addr0), "=r"(addr1),
++                            "=r"(addr2), "=r"(addr3):);
++      } while (0);
++      correctit = 0;
++      for (breakno = 0; breakno < 4; breakno++) { /* slots 0-3 (DR0-DR3) */
++              breakbit = 2 << (breakno << 1); /* G<n> enable bit in DR7 */
++              if (!(dr7 & breakbit) && breakinfo[breakno].enabled) {
++                      correctit = 1;
++                      dr7 |= breakbit;
++                      dr7 &= ~(0xf0000 << (breakno << 2));
++                      dr7 |= (((breakinfo[breakno].len << 2) |
++                               breakinfo[breakno].type) << 16) <<
++                          (breakno << 2);
++                      switch (breakno) {
++                      case 0:
++                              asm volatile ("movl %0, %%dr0\n"::"r"
++                                            (breakinfo[breakno].addr));
++                              break;
++
++                      case 1:
++                              asm volatile ("movl %0, %%dr1\n"::"r"
++                                            (breakinfo[breakno].addr));
++                              break;
++
++                      case 2:
++                              asm volatile ("movl %0, %%dr2\n"::"r"
++                                            (breakinfo[breakno].addr));
++                              break;
++
++                      case 3:
++                              asm volatile ("movl %0, %%dr3\n"::"r"
++                                            (breakinfo[breakno].addr));
++                              break;
++                      }
++              } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) {
++                      correctit = 1;
++                      dr7 &= ~breakbit;
++                      dr7 &= ~(0xf0000 << (breakno << 2));
++              }
++      }
++      if (correctit)          /* only touch DR7 when something changed */
++              asm volatile ("movl %0, %%db7\n"::"r" (dr7));
++}
++
++int kgdb_remove_hw_break(unsigned long addr)
++{
++      int i, idx = -1;
++      for (i = 0; i < 4; i++) {
++              if (breakinfo[i].addr == addr && breakinfo[i].enabled) {
++                      idx = i;
++                      break;
++              }
++      }
++      if (idx == -1)
++              return -1;
++
++      breakinfo[idx].enabled = 0;
++      return 0;
++}
++
++void kgdb_remove_all_hw_break(void)
++{
++      int i;
++
++      for (i = 0; i < 4; i++) {
++              if (breakinfo[i].enabled) {
++                      /* Do what? */
++                      ;
++              }
++              memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint));
++      }
++}
++
++int kgdb_set_hw_break(unsigned long addr)
++{
++      int i, idx = -1;
++      for (i = 0; i < 4; i++) {
++              if (!breakinfo[i].enabled) {
++                      idx = i;
++                      break;
++              }
++      }
++      if (idx == -1)
++              return -1;
++
++      breakinfo[idx].enabled = 1;
++      breakinfo[idx].type = 1;
++      breakinfo[idx].len = 1;
++      breakinfo[idx].addr = addr;
++      return 0;
++}
++
++void kgdb_disable_hw_debug(struct pt_regs *regs)
++{
++      /* Disable hardware debugging while we are in kgdb */
++      asm volatile ("movl %0,%%db7": /* no output */ :"r" (0));
++}
++
++void kgdb_post_master_code(struct pt_regs *regs, int e_vector, int err_code)
++{
++      /* Master processor is completely in the debugger */
++      gdb_i386vector = e_vector;
++      gdb_i386errcode = err_code;
++}
++
++void kgdb_roundup_cpus(unsigned long flags)
++{
++      send_IPI_allbutself(APIC_DM_NMI);
++}
++
++int kgdb_arch_handle_exception(int e_vector, int signo,
++                             int err_code, char *remcom_in_buffer,
++                             char *remcom_out_buffer,
++                             struct pt_regs *linux_regs)
++{
++      long addr;
++      char *ptr;
++      int newPC, dr6;
++
++      switch (remcom_in_buffer[0]) {
++      case 'c':
++      case 's':
++              /* try to read optional parameter, pc unchanged if no parm */
++              ptr = &remcom_in_buffer[1];
++              if (kgdb_hex2long(&ptr, &addr))
++                      linux_regs->eip = addr;
++              newPC = linux_regs->eip;
++
++              /* clear the trace bit */
++              linux_regs->eflags &= ~TF_MASK;
++              atomic_set(&cpu_doing_single_step, -1);
++
++              /* set the trace bit if we're stepping */
++              if (remcom_in_buffer[0] == 's') {
++                      linux_regs->eflags |= TF_MASK;
++                      debugger_step = 1;
++                      atomic_set(&cpu_doing_single_step,smp_processor_id());
++              }
++
++              asm volatile ("movl %%db6, %0\n":"=r" (dr6));
++              if (!(dr6 & 0x4000)) {
++                      long breakno;
++                      for (breakno = 0; breakno < 4; ++breakno) {
++                              if (dr6 & (1 << breakno) &&
++                                  breakinfo[breakno].type == 0) {
++                                      /* Set restore flag */
++                                      linux_regs->eflags |= X86_EFLAGS_RF;
++                                      break;
++                              }
++                      }
++              }
++              kgdb_correct_hw_break();
++              asm volatile ("movl %0, %%db6\n"::"r" (0));
++
++              return (0);
++      }                       /* switch */
++      /* this means that we do not want to exit from the handler */
++      return -1;
++}
++
++/* Register KGDB with the i386die_chain so that we hook into all of the right
++ * spots. */
++static int kgdb_notify(struct notifier_block *self, unsigned long cmd,
++                     void *ptr)
++{
++      struct die_args *args = ptr;
++      struct pt_regs *regs = args->regs;
++
++      /* Bad memory access? */
++      if (cmd == DIE_PAGE_FAULT_NO_CONTEXT && atomic_read(&debugger_active)
++                      && kgdb_may_fault) {
++              kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++              return NOTIFY_STOP;
++      } else if (cmd == DIE_PAGE_FAULT)
++              /* A normal page fault, ignore. */
++              return NOTIFY_DONE;
++       else if ((cmd == DIE_NMI || cmd == DIE_NMI_IPI ||
++               cmd == DIE_NMIWATCHDOG) && atomic_read(&debugger_active)) {
++               /* CPU roundup */
++               kgdb_nmihook(smp_processor_id(), regs);
++               return NOTIFY_STOP;
++       } else if (cmd == DIE_NMI_IPI || cmd == DIE_NMI || user_mode(regs) ||
++                       (cmd == DIE_DEBUG && atomic_read(&debugger_active)))
++               /* Normal watchdog event or userspace debugging, or spurious
++                * debug exception, ignore. */
++               return NOTIFY_DONE;
++
++      kgdb_handle_exception(args->trapnr, args->signr, args->err, regs);
++
++      return NOTIFY_STOP;
++}
++
++static struct notifier_block kgdb_notifier = {
++      .notifier_call = kgdb_notify,
++};
++
++int kgdb_arch_init(void)
++{
++      atomic_notifier_chain_register(&i386die_chain, &kgdb_notifier);
++      return 0;
++}
++
++/*
++ * Skip an int3 exception when it occurs after a breakpoint has been
++ * removed. Backtrack eip by 1 since the int3 would have caused it to
++ * increment by 1.
++ */
++
++int kgdb_skipexception(int exception, struct pt_regs *regs)
++{
++      if (exception == 3 && kgdb_isremovedbreak(regs->eip - 1)) {
++              regs->eip -= 1;
++              return 1;
++      }
++      return 0;
++}
++
++struct kgdb_arch arch_kgdb_ops = {
++      .gdb_bpt_instr = {0xcc},
++      .flags = KGDB_HW_BREAKPOINT,
++};
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/kernel/setup.c linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/setup.c
+--- linux-2.6.18-53.1.14/arch/i386/kernel/setup.c      2008-03-06 05:54:58.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/setup.c 2008-06-10 15:38:03.000000000 +0400
+@@ -148,6 +148,7 @@ EXPORT_SYMBOL(ist_info);
+ struct e820map e820;
+ 
+ extern void early_cpu_init(void);
++extern void early_trap_init(void);
+ extern void generic_apic_probe(char *);
+ extern int root_mountflags;
+ 
+@@ -1470,6 +1471,7 @@ void __init setup_arch(char **cmdline_p)
+       memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+       pre_setup_arch_hook();
+       early_cpu_init();
++      early_trap_init();
+ 
+       /*
+        * FIXME: This isn't an official loader_type right
+@@ -1526,6 +1528,7 @@ void __init setup_arch(char **cmdline_p)
+       data_resource.end = virt_to_phys(_edata)-1;
+ 
+       parse_cmdline_early(cmdline_p);
++      parse_early_param();
+ 
+ #ifdef CONFIG_EARLY_PRINTK
+       {
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/kernel/smpboot.c linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/smpboot.c
+--- linux-2.6.18-53.1.14/arch/i386/kernel/smpboot.c    2008-03-06 05:54:34.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/smpboot.c       2008-06-10 15:39:01.000000000 +0400
+@@ -592,6 +592,9 @@ void __devinit initialize_secondary(void
+ 
+       asm volatile(
+               "movl %0,%%esp\n\t"
++#ifdef CONFIG_KGDB
++              "pushl end_of_stack_stop_unwind_function\n\t"
++#endif
+               "jmp *%1"
+               :
+               :"r" (current->thread.esp),"r" (current->thread.eip));
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/kernel/traps.c linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/traps.c
+--- linux-2.6.18-53.1.14/arch/i386/kernel/traps.c      2008-03-06 05:55:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/traps.c 2008-06-10 15:38:03.000000000 +0400
+@@ -964,6 +964,7 @@ fastcall void __kprobes do_debug(struct 
+        */
+ clear_dr7:
+       set_debugreg(0, 7);
++      notify_die(DIE_DEBUG, "debug2", regs, condition, error_code, SIGTRAP);
+       return;
+ 
+ debug_vm86:
+@@ -1268,6 +1269,12 @@ static void __init set_task_gate(unsigne
+       _set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
+ }
+ 
++/* Some traps need to be set early. */
++void __init early_trap_init(void) {
++      set_intr_gate(1,&debug);
++      set_system_intr_gate(3, &int3); /* int3 can be called from all */
++      set_intr_gate(14,&page_fault);
++}
+ 
+ void __init trap_init(void)
+ {
+@@ -1284,10 +1291,8 @@ void __init trap_init(void)
+ #endif
+ 
+       set_trap_gate(0,&divide_error);
+-      set_intr_gate(1,&debug);
+       set_intr_gate(2,&nmi);
+-      set_system_intr_gate(3, &int3); /* int3/4 can be called from all */
+-      set_system_gate(4,&overflow);
++      set_system_gate(4,&overflow); /* int4/5 can be called from all */
+       set_trap_gate(5,&bounds);
+       set_trap_gate(6,&invalid_op);
+       set_trap_gate(7,&device_not_available);
+@@ -1297,7 +1302,6 @@ void __init trap_init(void)
+       set_trap_gate(11,&segment_not_present);
+       set_trap_gate(12,&stack_segment);
+       set_trap_gate(13,&general_protection);
+-      set_intr_gate(14,&page_fault);
+       set_trap_gate(15,&spurious_interrupt_bug);
+       set_trap_gate(16,&coprocessor_error);
+       set_trap_gate(17,&alignment_check);
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/mm/fault.c linux-2.6.18-53.1.14.kgdb/arch/i386/mm/fault.c
+--- linux-2.6.18-53.1.14/arch/i386/mm/fault.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/i386/mm/fault.c     2008-06-10 15:38:03.000000000 +0400
+@@ -539,6 +539,10 @@ no_context:
+       if (is_prefetch(regs, address, error_code))
+               return;
+ 
++      if (notify_die(DIE_PAGE_FAULT_NO_CONTEXT, "no context", regs,
++                              error_code, 14, SIGSEGV) == NOTIFY_STOP)
++              return;
++
+ /*
+  * Oops. The kernel tried to access some bad page. We'll have to
+  * terminate things with extreme prejudice.
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/Makefile linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/Makefile
+--- linux-2.6.18-53.1.14/arch/ia64/kernel/Makefile     2008-03-06 05:54:11.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/Makefile        2008-06-10 15:38:32.000000000 +0400
+@@ -32,6 +32,7 @@ obj-$(CONFIG_KEXEC)          += machine_kexec.o 
+ obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o
+ obj-$(CONFIG_AUDIT)           += audit.o
+ mca_recovery-y                        += mca_drv.o mca_drv_asm.o
++obj-$(CONFIG_KGDB)            += kgdb.o kgdb-jmp.o
+ 
+ # The gate DSO image is built using a special linker script.
+ targets += gate.so gate-syms.o
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/entry.S linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/entry.S
+--- linux-2.6.18-53.1.14/arch/ia64/kernel/entry.S      2008-03-06 05:54:43.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/entry.S 2008-06-10 15:39:39.000000000 +0400
+@@ -959,9 +959,9 @@ GLOBAL_ENTRY(__ia64_leave_kernel)
+       shr.u r18=r19,16        // get byte size of existing "dirty" partition
+       ;;
+       mov r16=ar.bsp          // get existing backing store pointer
+-      addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
++(pUStk)       addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
+       ;;
+-      ld4 r17=[r17]           // r17 = cpu_data->phys_stacked_size_p8
++(pUStk)       ld4 r17=[r17]           // r17 = cpu_data->phys_stacked_size_p8
+ (pKStk)       br.cond.dpnt skip_rbs_switch
+ 
+       /*
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/ivt.S linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/ivt.S
+--- linux-2.6.18-53.1.14/arch/ia64/kernel/ivt.S        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/ivt.S   2008-06-10 15:39:39.000000000 +0400
+@@ -52,6 +52,14 @@
+ #include <asm/unistd.h>
+ #include <asm/errno.h>
+ 
++#ifdef CONFIG_KGDB
++#define KGDB_ENABLE_PSR_DB mov r31=psr;; movl r30=IA64_PSR_DB;;       \
++      or r31=r31,r30;;                                        \
++      mov psr.l=r31;; srlz.i;;
++#else
++#define KGDB_ENABLE_PSR_DB
++#endif
++
+ #if 1
+ # define PSR_DEFAULT_BITS     psr.ac
+ #else
+@@ -519,6 +527,7 @@ ENTRY(page_fault)
+       movl r14=ia64_leave_kernel
+       ;;
+       SAVE_REST
++      KGDB_ENABLE_PSR_DB
+       mov rp=r14
+       ;;
+       adds out2=16,r12                        // out2 = pointer to pt_regs
+@@ -863,6 +872,7 @@ ENTRY(interrupt)
+       srlz.i                  // ensure everybody knows psr.ic is back on
+       ;;
+       SAVE_REST
++      KGDB_ENABLE_PSR_DB
+       ;;
+       MCA_RECOVER_RANGE(interrupt)
+       alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
+@@ -1110,6 +1120,7 @@ ENTRY(non_syscall)
+       movl r15=ia64_leave_kernel
+       ;;
+       SAVE_REST
++      KGDB_ENABLE_PSR_DB
+       mov rp=r15
+       ;;
+       br.call.sptk.many b6=ia64_bad_break     // avoid WAW on CFM and ignore return addr
+@@ -1143,6 +1154,7 @@ ENTRY(dispatch_unaligned_handler)
+       adds r3=8,r2                            // set up second base pointer
+       ;;
+       SAVE_REST
++      KGDB_ENABLE_PSR_DB
+       movl r14=ia64_leave_kernel
+       ;;
+       mov rp=r14
+@@ -1185,6 +1197,10 @@ ENTRY(dispatch_to_fault_handler)
+       adds r3=8,r2                            // set up second base pointer for SAVE_REST
+       ;;
+       SAVE_REST
++      cmp.eq p6,p0=29,out0
++(p6)  br.cond.spnt 1f;;                       // debug_vector
++      KGDB_ENABLE_PSR_DB
++1:
+       movl r14=ia64_leave_kernel
+       ;;
+       mov rp=r14
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/kgdb-jmp.S linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/kgdb-jmp.S
+--- linux-2.6.18-53.1.14/arch/ia64/kernel/kgdb-jmp.S   1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/kgdb-jmp.S      2008-06-10 15:38:32.000000000 +0400
+@@ -0,0 +1,238 @@
++/* setjmp() and longjmp() assembler support for kdb on ia64.
++
++   This code was copied from glibc CVS as of 2001-06-27 and modified where
++   necessary to fit the kernel.
++   Keith Owens <kaos@melbourne.sgi.com> 2001-06-27
++ */
++
++/* Copyright (C) 1999, 2000, 2001 Free Software Foundation, Inc.
++   Contributed by David Mosberger-Tang <davidm@hpl.hp.com>.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Library General Public License as
++   published by the Free Software Foundation; either version 2 of the
++   License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Library General Public License for more details.
++
++   You should have received a copy of the GNU Library General Public
++   License along with the GNU C Library; see the file COPYING.LIB.  If
++   not, write to the Free Software Foundation, Inc.,
++   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
++*/
++
++#include <asm/asmmacro.h>
++GLOBAL_ENTRY(kgdb_fault_setjmp)
++      .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2)
++      alloc loc1=ar.pfs,2,2,2,0
++      mov r16=ar.unat
++      ;;
++      mov r17=ar.fpsr
++      mov r2=in0
++      add r3=8,in0
++      ;;
++.mem.offset 0,0;
++      st8.spill.nta [r2]=sp,16        // r12 (sp)
++.mem.offset 8,0;
++      st8.spill.nta [r3]=gp,16        // r1 (gp)
++      ;;
++      st8.nta [r2]=r16,16             // save caller's unat
++      st8.nta [r3]=r17,16             // save fpsr
++      add r8=0xa0,in0
++      ;;
++.mem.offset 160,0;
++      st8.spill.nta [r2]=r4,16        // r4
++.mem.offset 168,0;
++      st8.spill.nta [r3]=r5,16        // r5
++      add r9=0xb0,in0
++      ;;
++      stf.spill.nta [r8]=f2,32
++      stf.spill.nta [r9]=f3,32
++      mov loc0=rp
++      .body
++      ;;
++      stf.spill.nta [r8]=f4,32
++      stf.spill.nta [r9]=f5,32
++      mov r17=b1
++      ;;
++      stf.spill.nta [r8]=f16,32
++      stf.spill.nta [r9]=f17,32
++      mov r18=b2
++      ;;
++      stf.spill.nta [r8]=f18,32
++      stf.spill.nta [r9]=f19,32
++      mov r19=b3
++      ;;
++      stf.spill.nta [r8]=f20,32
++      stf.spill.nta [r9]=f21,32
++      mov r20=b4
++      ;;
++      stf.spill.nta [r8]=f22,32
++      stf.spill.nta [r9]=f23,32
++      mov r21=b5
++      ;;
++      stf.spill.nta [r8]=f24,32
++      stf.spill.nta [r9]=f25,32
++      mov r22=ar.lc
++      ;;
++      stf.spill.nta [r8]=f26,32
++      stf.spill.nta [r9]=f27,32
++      mov r24=pr
++      ;;
++      stf.spill.nta [r8]=f28,32
++      stf.spill.nta [r9]=f29,32
++      ;;
++      stf.spill.nta [r8]=f30
++      stf.spill.nta [r9]=f31
++
++.mem.offset 0,0;
++      st8.spill.nta [r2]=r6,16        // r6
++.mem.offset 8,0;
++      st8.spill.nta [r3]=r7,16        // r7
++      ;;
++      mov r23=ar.bsp
++      mov r25=ar.unat
++      st8.nta [r2]=loc0,16            // b0
++      st8.nta [r3]=r17,16             // b1
++      ;;
++      st8.nta [r2]=r18,16             // b2
++      st8.nta [r3]=r19,16             // b3
++      ;;
++      st8.nta [r2]=r20,16             // b4
++      st8.nta [r3]=r21,16             // b5
++      ;;
++      st8.nta [r2]=loc1,16            // ar.pfs
++      st8.nta [r3]=r22,16             // ar.lc
++      ;;
++      st8.nta [r2]=r24,16             // pr
++      st8.nta [r3]=r23,16             // ar.bsp
++      ;;
++      st8.nta [r2]=r25                // ar.unat
++      st8.nta [r3]=in0                // &__jmp_buf
++      mov r8=0                        // r8 = 0 on the direct call; kgdb_fault_longjmp loads a non-zero r8
++      mov rp=loc0
++      mov ar.pfs=loc1
++      br.ret.sptk.few rp
++END(kgdb_fault_setjmp)
++#define       pPos    p6      /* is rotate count positive? */
++#define       pNeg    p7      /* is rotate count negative? */
++GLOBAL_ENTRY(kgdb_fault_longjmp)
++      alloc r8=ar.pfs,2,1,0,0
++      mov r27=ar.rsc
++      add r2=0x98,in0         // r2 <- &jmpbuf.orig_jmp_buf_addr
++      ;;
++      ld8 r8=[r2],-16         // r8 <- orig_jmp_buf_addr
++      mov r10=ar.bsp
++      and r11=~0x3,r27        // clear ar.rsc.mode
++      ;;
++      flushrs                 // flush dirty regs to backing store (must be first in insn grp)
++      ld8 r23=[r2],8          // r23 <- jmpbuf.ar_bsp
++      sub r8=r8,in0           // r8 <- &orig_jmpbuf - &jmpbuf
++      ;;
++      ld8 r25=[r2]            // r25 <- jmpbuf.ar_unat
++      extr.u r8=r8,3,6        // r8 <- (&orig_jmpbuf - &jmpbuf)/8 & 0x3f
++      ;;
++      cmp.lt pNeg,pPos=r8,r0
++      mov r2=in0
++      ;;
++(pPos)        mov r16=r8
++(pNeg)        add r16=64,r8
++(pPos)        sub r17=64,r8
++(pNeg)        sub r17=r0,r8
++      ;;
++      mov ar.rsc=r11          // put RSE in enforced lazy mode
++      shr.u r8=r25,r16
++      add r3=8,in0            // r3 <- &jmpbuf.r1
++      shl r9=r25,r17
++      ;;
++      or r25=r8,r9
++      ;;
++      mov r26=ar.rnat
++      mov ar.unat=r25         // setup ar.unat (NaT bits for r1, r4-r7, and r12)
++      ;;
++      ld8.fill.nta sp=[r2],16 // r12 (sp)
++      ld8.fill.nta gp=[r3],16         // r1 (gp)
++      dep r11=-1,r23,3,6      // r11 <- ia64_rse_rnat_addr(jmpbuf.ar_bsp)
++      ;;
++      ld8.nta r16=[r2],16             // caller's unat
++      ld8.nta r17=[r3],16             // fpsr
++      ;;
++      ld8.fill.nta r4=[r2],16 // r4
++      ld8.fill.nta r5=[r3],16         // r5 (gp)
++      cmp.geu p8,p0=r10,r11   // p8 <- (ar.bsp >= jmpbuf.ar_bsp)
++      ;;
++      ld8.fill.nta r6=[r2],16 // r6
++      ld8.fill.nta r7=[r3],16         // r7
++      ;;
++      mov ar.unat=r16                 // restore caller's unat
++      mov ar.fpsr=r17                 // restore fpsr
++      ;;
++      ld8.nta r16=[r2],16             // b0
++      ld8.nta r17=[r3],16             // b1
++      ;;
++(p8)  ld8 r26=[r11]           // r26 <- *ia64_rse_rnat_addr(jmpbuf.ar_bsp)
++      mov ar.bspstore=r23     // restore ar.bspstore
++      ;;
++      ld8.nta r18=[r2],16             // b2
++      ld8.nta r19=[r3],16             // b3
++      ;;
++      ld8.nta r20=[r2],16             // b4
++      ld8.nta r21=[r3],16             // b5
++      ;;
++      ld8.nta r11=[r2],16             // ar.pfs
++      ld8.nta r22=[r3],56             // ar.lc
++      ;;
++      ld8.nta r24=[r2],32             // pr
++      mov b0=r16
++      ;;
++      ldf.fill.nta f2=[r2],32
++      ldf.fill.nta f3=[r3],32
++      mov b1=r17
++      ;;
++      ldf.fill.nta f4=[r2],32
++      ldf.fill.nta f5=[r3],32
++      mov b2=r18
++      ;;
++      ldf.fill.nta f16=[r2],32
++      ldf.fill.nta f17=[r3],32
++      mov b3=r19
++      ;;
++      ldf.fill.nta f18=[r2],32
++      ldf.fill.nta f19=[r3],32
++      mov b4=r20
++      ;;
++      ldf.fill.nta f20=[r2],32
++      ldf.fill.nta f21=[r3],32
++      mov b5=r21
++      ;;
++      ldf.fill.nta f22=[r2],32
++      ldf.fill.nta f23=[r3],32
++      mov ar.lc=r22
++      ;;
++      ldf.fill.nta f24=[r2],32
++      ldf.fill.nta f25=[r3],32
++      cmp.eq p8,p9=0,in1
++      ;;
++      ldf.fill.nta f26=[r2],32
++      ldf.fill.nta f27=[r3],32
++      mov ar.pfs=r11
++      ;;
++      ldf.fill.nta f28=[r2],32
++      ldf.fill.nta f29=[r3],32
++      ;;
++      ldf.fill.nta f30=[r2]
++      ldf.fill.nta f31=[r3]
++(p8)  mov r8=1
++
++      mov ar.rnat=r26         // restore ar.rnat
++      ;;
++      mov ar.rsc=r27          // restore ar.rsc
++(p9)  mov r8=in1
++
++      invala                  // virt. -> phys. regnum mapping may change
++      mov pr=r24,-1
++      br.ret.sptk.few rp
++END(kgdb_fault_longjmp)
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/kgdb.c
+--- linux-2.6.18-53.1.14/arch/ia64/kernel/kgdb.c       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/kgdb.c  2008-06-10 15:38:32.000000000 +0400
+@@ -0,0 +1,1131 @@
++/*
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License as published by the
++ * Free Software Foundation; either version 2, or (at your option) any
++ * later version.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * General Public License for more details.
++ *
++ */
++
++/*
++ * Copyright (C) 2000-2001 VERITAS Software Corporation.
++ * (c) Copyright 2005 Hewlett-Packard Development Company, L.P.
++ *     Bob Picco <bob.picco@hp.com>
++ */
++/*
++ *  Contributor:     Lake Stevens Instrument Division$
++ *  Written by:      Glenn Engel $
++ *  Updated by:            Amit Kale<akale@veritas.com>
++ *  Modified for 386 by Jim Kingdon, Cygnus Support.
++ *  Original kgdb, compatibility with 2.1.xx kernel by David Grothe <dave@gcom.com>
++ */
++
++#include <linux/string.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/smp.h>
++#include <linux/spinlock.h>
++#include <linux/delay.h>
++#include <asm/system.h>
++#include <asm/ptrace.h>               /* for linux pt_regs struct */
++#include <asm/unwind.h>
++#include <asm/rse.h>
++#include <linux/kgdb.h>
++#include <linux/init.h>
++#include <asm/cacheflush.h>
++#include <asm/kdebug.h>
++
++#define NUM_REGS 590
++#define REGISTER_BYTES (NUM_REGS*8+128*8)
++#define REGISTER_BYTE(N) (((N) * 8)                                    \
++      + ((N) <= IA64_FR0_REGNUM ?                                     \
++      0 : 8 * (((N) > IA64_FR127_REGNUM) ? 128 : (N) - IA64_FR0_REGNUM)))
++#define REGISTER_SIZE(N)                                               \
++      (((N) >= IA64_FR0_REGNUM && (N) <= IA64_FR127_REGNUM) ? 16 : 8)
++#define IA64_GR0_REGNUM         0
++#define IA64_FR0_REGNUM         128
++#define IA64_FR127_REGNUM       (IA64_FR0_REGNUM+127)
++#define IA64_PR0_REGNUM         256
++#define IA64_BR0_REGNUM         320
++#define IA64_VFP_REGNUM         328
++#define IA64_PR_REGNUM          330
++#define IA64_IP_REGNUM          331
++#define IA64_PSR_REGNUM         332
++#define IA64_CFM_REGNUM         333
++#define IA64_AR0_REGNUM         334
++#define IA64_NAT0_REGNUM        462
++#define IA64_NAT31_REGNUM       (IA64_NAT0_REGNUM+31)
++#define IA64_NAT32_REGNUM       (IA64_NAT0_REGNUM+32)
++#define IA64_RSC_REGNUM               (IA64_AR0_REGNUM+16)
++#define IA64_BSP_REGNUM               (IA64_AR0_REGNUM+17)
++#define IA64_BSPSTORE_REGNUM  (IA64_AR0_REGNUM+18)
++#define IA64_RNAT_REGNUM      (IA64_AR0_REGNUM+19)
++#define IA64_FCR_REGNUM               (IA64_AR0_REGNUM+21)
++#define IA64_EFLAG_REGNUM     (IA64_AR0_REGNUM+24)
++#define IA64_CSD_REGNUM               (IA64_AR0_REGNUM+25)
++#define IA64_SSD_REGNUM               (IA64_AR0_REGNUM+26)
++#define IA64_CFLG_REGNUM      (IA64_AR0_REGNUM+27)
++#define IA64_FSR_REGNUM               (IA64_AR0_REGNUM+28)
++#define IA64_FIR_REGNUM               (IA64_AR0_REGNUM+29)
++#define IA64_FDR_REGNUM               (IA64_AR0_REGNUM+30)
++#define IA64_CCV_REGNUM               (IA64_AR0_REGNUM+32)
++#define IA64_UNAT_REGNUM      (IA64_AR0_REGNUM+36)
++#define IA64_FPSR_REGNUM      (IA64_AR0_REGNUM+40)
++#define IA64_ITC_REGNUM               (IA64_AR0_REGNUM+44)
++#define IA64_PFS_REGNUM               (IA64_AR0_REGNUM+64)
++#define IA64_LC_REGNUM                (IA64_AR0_REGNUM+65)
++#define IA64_EC_REGNUM                (IA64_AR0_REGNUM+66)
++
++#define       REGISTER_INDEX(N)       (REGISTER_BYTE(N) / sizeof (unsigned long))
++#define BREAK_INSTR_ALIGN     (~0xfULL)
++
++#define       ptoff(V)        ((unsigned int) offsetof(struct pt_regs, V))
++struct reg_to_ptreg_index {
++      unsigned int reg;
++      unsigned int ptregoff;
++};
++
++static struct reg_to_ptreg_index gr_reg_to_ptreg_index[] = {
++      {IA64_GR0_REGNUM + 1, ptoff(r1)},
++      {IA64_GR0_REGNUM + 2, ptoff(r2)},
++      {IA64_GR0_REGNUM + 3, ptoff(r3)},
++      {IA64_GR0_REGNUM + 8, ptoff(r8)},
++      {IA64_GR0_REGNUM + 9, ptoff(r9)},
++      {IA64_GR0_REGNUM + 10, ptoff(r10)},
++      {IA64_GR0_REGNUM + 11, ptoff(r11)},
++      {IA64_GR0_REGNUM + 12, ptoff(r12)},
++      {IA64_GR0_REGNUM + 13, ptoff(r13)},
++      {IA64_GR0_REGNUM + 14, ptoff(r14)},
++      {IA64_GR0_REGNUM + 15, ptoff(r15)},
++      {IA64_GR0_REGNUM + 16, ptoff(r16)},
++      {IA64_GR0_REGNUM + 17, ptoff(r17)},
++      {IA64_GR0_REGNUM + 18, ptoff(r18)},
++      {IA64_GR0_REGNUM + 19, ptoff(r19)},
++      {IA64_GR0_REGNUM + 20, ptoff(r20)},
++      {IA64_GR0_REGNUM + 21, ptoff(r21)},
++      {IA64_GR0_REGNUM + 22, ptoff(r22)},
++      {IA64_GR0_REGNUM + 23, ptoff(r23)},
++      {IA64_GR0_REGNUM + 24, ptoff(r24)},
++      {IA64_GR0_REGNUM + 25, ptoff(r25)},
++      {IA64_GR0_REGNUM + 26, ptoff(r26)},
++      {IA64_GR0_REGNUM + 27, ptoff(r27)},
++      {IA64_GR0_REGNUM + 28, ptoff(r28)},
++      {IA64_GR0_REGNUM + 29, ptoff(r29)},
++      {IA64_GR0_REGNUM + 30, ptoff(r30)},
++      {IA64_GR0_REGNUM + 31, ptoff(r31)},
++};
++
++static struct reg_to_ptreg_index br_reg_to_ptreg_index[] = {
++      {IA64_BR0_REGNUM, ptoff(b0)},
++      {IA64_BR0_REGNUM + 6, ptoff(b6)},
++      {IA64_BR0_REGNUM + 7, ptoff(b7)},
++};
++
++static struct reg_to_ptreg_index ar_reg_to_ptreg_index[] = {
++      {IA64_PFS_REGNUM, ptoff(ar_pfs)},
++      {IA64_UNAT_REGNUM, ptoff(ar_unat)},
++      {IA64_RNAT_REGNUM, ptoff(ar_rnat)},
++      {IA64_BSPSTORE_REGNUM, ptoff(ar_bspstore)},
++      {IA64_RSC_REGNUM, ptoff(ar_rsc)},
++      {IA64_CSD_REGNUM, ptoff(ar_csd)},
++      {IA64_SSD_REGNUM, ptoff(ar_ssd)},
++      {IA64_FPSR_REGNUM, ptoff(ar_fpsr)},
++      {IA64_CCV_REGNUM, ptoff(ar_ccv)},
++};
++
++extern atomic_t cpu_doing_single_step;
++
++static int kgdb_gr_reg(int regnum, struct unw_frame_info *info,
++      unsigned long *reg, int rw)
++{
++      char nat;
++
++      if ((regnum >= IA64_GR0_REGNUM && regnum <= (IA64_GR0_REGNUM + 1)) ||
++              (regnum >= (IA64_GR0_REGNUM + 4) &&
++              regnum <= (IA64_GR0_REGNUM + 7)))
++              return !unw_access_gr(info, regnum - IA64_GR0_REGNUM,
++              reg, &nat, rw);
++      else
++              return 0;
++}
++static int kgdb_gr_ptreg(int regnum, struct pt_regs * ptregs,
++      struct unw_frame_info *info, unsigned long *reg, int rw)
++{
++      int i, result = 1;
++      char nat;
++
++      if (!((regnum >= (IA64_GR0_REGNUM + 2) &&
++              regnum <= (IA64_GR0_REGNUM + 3)) ||
++              (regnum >= (IA64_GR0_REGNUM + 8) &&
++              regnum <= (IA64_GR0_REGNUM + 15)) ||
++              (regnum >= (IA64_GR0_REGNUM + 16) &&
++              regnum <= (IA64_GR0_REGNUM + 31))))
++              return 0;
++      else if (rw && ptregs) {
++              for (i = 0; i < ARRAY_SIZE(gr_reg_to_ptreg_index); i++)
++                      if (gr_reg_to_ptreg_index[i].reg == regnum) {
++                              *((unsigned long *)(((void *)ptregs) +
++                              gr_reg_to_ptreg_index[i].ptregoff)) = *reg;
++                              break;
++                      }
++      } else if (!rw && ptregs) {
++              for (i = 0; i < ARRAY_SIZE(gr_reg_to_ptreg_index); i++)
++                      if (gr_reg_to_ptreg_index[i].reg == regnum) {
++                              *reg = *((unsigned long *)
++                              (((void *)ptregs) +
++                               gr_reg_to_ptreg_index[i].ptregoff));
++                              break;
++                      }
++      } else
++              result = !unw_access_gr(info, regnum - IA64_GR0_REGNUM,
++                                      reg, &nat, rw);
++      return result;
++}
++
++static int kgdb_br_reg(int regnum, struct pt_regs * ptregs,
++      struct unw_frame_info *info, unsigned long *reg, int rw)
++{
++      int i, result = 1;
++
++      if (!(regnum >= IA64_BR0_REGNUM && regnum <= (IA64_BR0_REGNUM + 7)))
++              return 0;
++
++      switch (regnum) {
++      case IA64_BR0_REGNUM:
++      case IA64_BR0_REGNUM + 6:
++      case IA64_BR0_REGNUM + 7:
++              if (rw) {
++                      for (i = 0; i < ARRAY_SIZE(br_reg_to_ptreg_index); i++)
++                              if (br_reg_to_ptreg_index[i].reg == regnum) {
++                                      *((unsigned long *)
++                                      (((void *)ptregs) +
++                                      br_reg_to_ptreg_index[i].ptregoff)) =
++                                      *reg;
++                                      break;
++                              }
++              } else
++                      for (i = 0; i < ARRAY_SIZE(br_reg_to_ptreg_index); i++)
++                              if (br_reg_to_ptreg_index[i].reg == regnum) {
++                                              *reg = *((unsigned long *)
++                                              (((void *)ptregs) +
++                                              br_reg_to_ptreg_index[i].
++                                              ptregoff));
++                                              break;
++                              }
++              break;
++      case IA64_BR0_REGNUM + 1:
++      case IA64_BR0_REGNUM + 2:
++      case IA64_BR0_REGNUM + 3:
++      case IA64_BR0_REGNUM + 4:
++      case IA64_BR0_REGNUM + 5:
++              result = !unw_access_br(info, regnum - IA64_BR0_REGNUM,
++                              reg, rw);
++              break;
++      }
++
++      return result;
++}
++
++static int kgdb_fr_reg(int regnum, char *inbuffer, struct pt_regs * ptregs,
++      struct unw_frame_info *info, unsigned long *reg,
++      struct ia64_fpreg *freg, int rw)
++{
++      int result = 1;
++
++      if (!(regnum >= IA64_FR0_REGNUM && regnum <= (IA64_FR0_REGNUM + 127)))
++              return 0;
++
++      switch (regnum) {
++      case IA64_FR0_REGNUM + 6:
++      case IA64_FR0_REGNUM + 7:
++      case IA64_FR0_REGNUM + 8:
++      case IA64_FR0_REGNUM + 9:
++      case IA64_FR0_REGNUM + 10:
++      case IA64_FR0_REGNUM + 11:
++      case IA64_FR0_REGNUM + 12:
++              if (rw) {
++                      char *ptr = inbuffer;
++
++                      freg->u.bits[0] = *reg;
++                      kgdb_hex2long(&ptr, &freg->u.bits[1]);
++                      *(&ptregs->f6 + (regnum - (IA64_FR0_REGNUM + 6))) =
++                              *freg;
++                      break;
++              } else if (!ptregs)
++                      result = !unw_access_fr(info, regnum - IA64_FR0_REGNUM,
++                              freg, rw);
++              else
++                      *freg =
++                      *(&ptregs->f6 + (regnum - (IA64_FR0_REGNUM + 6)));
++              break;
++      default:
++              if (!rw)
++                      result = !unw_access_fr(info, regnum - IA64_FR0_REGNUM,
++                              freg, rw);
++              else
++                      result = 0;
++              break;
++      }
++
++      return result;
++}
++
++static int kgdb_ar_reg(int regnum, struct pt_regs * ptregs,
++      struct unw_frame_info *info, unsigned long *reg, int rw)
++{
++      int result = 0, i;
++
++      if (!(regnum >= IA64_AR0_REGNUM && regnum <= IA64_EC_REGNUM))
++              return 0;
++      /* Returns 1 if the access was handled, 0 if not; pt_regs is tried first. */
++      if (rw && ptregs) {
++              for (i = 0; i < ARRAY_SIZE(ar_reg_to_ptreg_index); i++)
++                      if (ar_reg_to_ptreg_index[i].reg == regnum) {
++                              *((unsigned long *) (((void *)ptregs) +
++                              ar_reg_to_ptreg_index[i].ptregoff)) =
++                                      *reg;
++                              result = 1;
++                              break;
++                      }
++      } else if (ptregs) {
++              for (i = 0; i < ARRAY_SIZE(ar_reg_to_ptreg_index); i++)
++                      if (ar_reg_to_ptreg_index[i].reg == regnum) {
++                              *reg = *((unsigned long *) (((void *)ptregs) +
++                                      ar_reg_to_ptreg_index[i].ptregoff));
++                              result = 1;
++                              break;
++                      }
++      }
++
++      if (result)
++              return result;
++      /* Not found in pt_regs (or no pt_regs given): go through the unwinder. */
++      result = 1;
++
++      switch (regnum) {
++      case IA64_CSD_REGNUM:
++              result = !unw_access_ar(info, UNW_AR_CSD, reg, rw);
++              break;
++      case IA64_SSD_REGNUM:
++              result = !unw_access_ar(info, UNW_AR_SSD, reg, rw);
++              break;
++      case IA64_UNAT_REGNUM:
++              result = !unw_access_ar(info, UNW_AR_UNAT, reg, rw);
++              break;
++      case IA64_RNAT_REGNUM:
++              result = !unw_access_ar(info, UNW_AR_RNAT, reg, rw);
++              break;
++      case IA64_BSPSTORE_REGNUM:
++              result = !unw_access_ar(info, UNW_AR_BSPSTORE, reg, rw);
++              break;
++      case IA64_PFS_REGNUM:
++              result = !unw_access_ar(info, UNW_AR_PFS, reg, rw);
++              break;
++      case IA64_LC_REGNUM:
++              result = !unw_access_ar(info, UNW_AR_LC, reg, rw);
++              break;
++      case IA64_EC_REGNUM:
++              result = !unw_access_ar(info, UNW_AR_EC, reg, rw);
++              break;
++      case IA64_FPSR_REGNUM:
++              result = !unw_access_ar(info, UNW_AR_FPSR, reg, rw);
++              break;
++      case IA64_RSC_REGNUM:
++              result = !unw_access_ar(info, UNW_AR_RSC, reg, rw);
++              break;
++      case IA64_CCV_REGNUM:
++              result = !unw_access_ar(info, UNW_AR_CCV, reg, rw);
++              break;
++      default:
++              result = 0;
++      }
++
++      return result;
++}
++
++void kgdb_get_reg(char *outbuffer, int regnum, struct unw_frame_info *info,
++      struct pt_regs *ptregs)
++{
++      /* reg is zeroed so a branch that fails to fill it never dumps stack garbage. */
++      unsigned long reg = 0, size = 0, *mem = &reg;
++      struct ia64_fpreg freg;
++
++      if (kgdb_gr_reg(regnum, info, &reg, 0) ||
++              kgdb_gr_ptreg(regnum, ptregs, info, &reg, 0) ||
++              kgdb_br_reg(regnum, ptregs, info, &reg, 0) ||
++              kgdb_ar_reg(regnum, ptregs, info, &reg, 0))
++                      size = sizeof(reg);
++      else if (kgdb_fr_reg(regnum, NULL, ptregs, info, &reg, &freg, 0)) {
++              size = sizeof(freg);
++              mem = (unsigned long *)&freg;
++      } else if (regnum == IA64_IP_REGNUM) {
++              if (!ptregs) {
++                      unw_get_ip(info, &reg);
++                      size = sizeof(reg);
++              } else {
++                      reg = ptregs->cr_iip;
++                      size = sizeof(reg);
++              }
++      } else if (regnum == IA64_CFM_REGNUM) {
++              if (!ptregs)
++                      unw_get_cfm(info, &reg);
++              else
++                      reg = ptregs->cr_ifs;
++              size = sizeof(reg);
++      } else if (regnum == IA64_PSR_REGNUM) {
++              if (!ptregs && kgdb_usethread)
++                      ptregs = (struct pt_regs *)
++                      ((unsigned long)kgdb_usethread +
++                      IA64_STK_OFFSET) - 1;
++              if (ptregs)
++                      reg = ptregs->cr_ipsr;
++              size = sizeof(reg);
++      } else if (regnum == IA64_PR_REGNUM) {
++              if (ptregs)
++                      reg = ptregs->pr;
++              else
++                      unw_access_pr(info, &reg, 0);
++              size = sizeof(reg);
++      } else if (regnum == IA64_BSP_REGNUM) {
++              unw_get_bsp(info, &reg);
++              size = sizeof(reg);
++      }
++
++      if (size) {
++              kgdb_mem2hex((char *) mem, outbuffer, size);
++              outbuffer[size*2] = 0;
++      } else
++              strcpy(outbuffer, "E0");
++
++      return;
++}
++
++void kgdb_put_reg(char *inbuffer, char *outbuffer, int regnum,
++                struct unw_frame_info *info, struct pt_regs *ptregs)
++{
++      unsigned long reg;
++      struct ia64_fpreg freg;
++      char *ptr = inbuffer;
++
++      kgdb_hex2long(&ptr, &reg);
++      strcpy(outbuffer, "OK");
++      /* IP/CFM/PSR/PR are written only via pt_regs: without one, reply E01. */
++      if (kgdb_gr_reg(regnum, info, &reg, 1) ||
++              kgdb_gr_ptreg(regnum, ptregs, info, &reg, 1) ||
++              kgdb_br_reg(regnum, ptregs, info, &reg, 1) ||
++              kgdb_fr_reg(regnum, inbuffer, ptregs, info, &reg, &freg, 1) ||
++              kgdb_ar_reg(regnum, ptregs, info, &reg, 1)) ;
++      else if (ptregs && regnum == IA64_IP_REGNUM)
++              ptregs->cr_iip = reg;
++      else if (ptregs && regnum == IA64_CFM_REGNUM)
++              ptregs->cr_ifs = reg;
++      else if (ptregs && regnum == IA64_PSR_REGNUM)
++              ptregs->cr_ipsr = reg;
++      else if (ptregs && regnum == IA64_PR_REGNUM)
++              ptregs->pr = reg;
++      else
++              strcpy(outbuffer, "E01");
++      return;
++}
++
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++}
++
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
++{
++}
++
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++
++}
++
++#define       MAX_HW_BREAKPOINT       (20)
++long hw_break_total_dbr, hw_break_total_ibr;
++#define       HW_BREAKPOINT   (hw_break_total_dbr + hw_break_total_ibr)
++#define       WATCH_INSTRUCTION       0x0
++#define WATCH_WRITE           0x1
++#define       WATCH_READ              0x2
++#define       WATCH_ACCESS            0x3
++
++#define       HWCAP_DBR       ((1 << WATCH_WRITE) | (1 << WATCH_READ))
++#define       HWCAP_IBR       (1 << WATCH_INSTRUCTION)
++struct hw_breakpoint {
++      unsigned enabled;
++      unsigned long capable;
++      unsigned long type;
++      unsigned long mask;
++      unsigned long addr;
++} *breakinfo;
++
++static struct hw_breakpoint hwbreaks[MAX_HW_BREAKPOINT];
++
++enum instruction_type { A, I, M, F, B, L, X, u };     /* u: presumably "unused/reserved" — TODO confirm */
++
++static enum instruction_type bundle_encoding[32][3] = {       /* [5-bit bundle template][slot 0..2] */
++      {M, I, I},              /* 00 */
++      {M, I, I},              /* 01 */
++      {M, I, I},              /* 02 */
++      {M, I, I},              /* 03 */
++      {M, L, X},              /* 04 */
++      {M, L, X},              /* 05 */
++      {u, u, u},              /* 06 */
++      {u, u, u},              /* 07 */
++      {M, M, I},              /* 08 */
++      {M, M, I},              /* 09 */
++      {M, M, I},              /* 0A */
++      {M, M, I},              /* 0B */
++      {M, F, I},              /* 0C */
++      {M, F, I},              /* 0D */
++      {M, M, F},              /* 0E */
++      {M, M, F},              /* 0F */
++      {M, I, B},              /* 10 */
++      {M, I, B},              /* 11 */
++      {M, B, B},              /* 12 */
++      {M, B, B},              /* 13 */
++      {u, u, u},              /* 14 */
++      {u, u, u},              /* 15 */
++      {B, B, B},              /* 16 */
++      {B, B, B},              /* 17 */
++      {M, M, B},              /* 18 */
++      {M, M, B},              /* 19 */
++      {u, u, u},              /* 1A */
++      {u, u, u},              /* 1B */
++      {M, F, B},              /* 1C */
++      {M, F, B},              /* 1D */
++      {u, u, u},              /* 1E */
++      {u, u, u},              /* 1F */
++};
++
++int kgdb_validate_break_address(unsigned long addr)
++{     /* Probe-read BREAK_INSTR_SIZE bytes at the aligned address and report how the read went. */
++      int error;
++      char tmp_variable[BREAK_INSTR_SIZE];    /* scratch buffer; its contents are discarded */
++      error = kgdb_get_mem((char *)(addr & BREAK_INSTR_ALIGN), tmp_variable,
++              BREAK_INSTR_SIZE);
++      return error;   /* whatever kgdb_get_mem() returned */
++}
++
++int kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
++{
++      extern unsigned long _start[];
++      unsigned long slot = addr & 0xFULL, bundle_addr;        /* slot number is in the low 4 bits, not the aligned part */
++      unsigned long template;
++      struct bundle {
++              struct {
++                      unsigned long long template:5;
++                      unsigned long long slot0:41;
++                      unsigned long long slot1_p0:64 - 46;
++              } quad0;
++              struct {
++                      unsigned long long slot1_p1:41 - (64 - 46);
++                      unsigned long long slot2:41;
++              } quad1;
++      } bundle;
++      int ret;
++      /* Save the 16-byte bundle holding addr into saved_instr, then patch BREAKNUM into one slot. */
++      bundle_addr = addr & ~0xFULL;
++
++      if (bundle_addr == (unsigned long)_start)
++              return 0;       /* the bundle at _start is never patched; reported as success */
++
++      ret = kgdb_get_mem((char *)bundle_addr, (char *)&bundle,
++                         BREAK_INSTR_SIZE);
++      if (ret < 0)
++              return ret;     /* propagate the read failure; nothing has been modified */
++      /* A bundle only has slots 0..2; any other low-bit value falls back to slot 0. */
++      if (slot > 2)
++              slot = 0;
++
++      memcpy(saved_instr, &bundle, BREAK_INSTR_SIZE);
++      template = bundle.quad0.template;
++      /* When the template's middle slot is type L, the break is placed in slot 2 instead. */
++      if (slot == 1 && bundle_encoding[template][1] == L)
++              slot = 2;
++
++      switch (slot) {
++      case 0:
++              bundle.quad0.slot0 = BREAKNUM;
++              break;
++      case 1:
++              bundle.quad0.slot1_p0 = BREAKNUM;       /* slot 1 straddles both 64-bit halves */
++              bundle.quad1.slot1_p1 = (BREAKNUM >> (64 - 46));
++              break;
++      case 2:
++              bundle.quad1.slot2 = BREAKNUM;
++              break;
++      }
++
++      return kgdb_set_mem((char *)bundle_addr, (char *)&bundle,
++                          BREAK_INSTR_SIZE);
++}
++
++int kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
++{
++      extern unsigned long _start[];
++      /* Write the saved bundle back over the whole aligned BREAK_INSTR_SIZE region. */
++      addr = addr & BREAK_INSTR_ALIGN;
++      if (addr == (unsigned long)_start)
++              return 0;       /* same _start special case as kgdb_arch_set_breakpoint() */
++      return kgdb_set_mem((char *)addr, (char *)bundle, BREAK_INSTR_SIZE);
++}
++
++static int hw_breakpoint_init;        /* set by the first do_init_hw_break() call, even if that call fails */
++
++void do_init_hw_break(void)
++{
++      s64 status;
++      int i;
++      /* Size and label the breakinfo[] table: DBR-capable slots first, then IBR-capable ones. */
++      hw_breakpoint_init = 1;
++
++#ifdef        CONFIG_IA64_HP_SIM
++      hw_break_total_ibr = 8;
++      hw_break_total_dbr = 8;
++      status = 0;
++#else
++      status = ia64_pal_debug_info(&hw_break_total_ibr, &hw_break_total_dbr);
++#endif
++
++      if (status) {
++              printk(KERN_INFO "do_init_hw_break: pal call failed %d\n",
++                     (int)status);
++              return; /* breakinfo is not assigned on this path */
++      }
++
++      if (HW_BREAKPOINT > MAX_HW_BREAKPOINT) {
++              printk(KERN_INFO "do_init_hw_break: %d exceeds max %d\n",
++                     (int)HW_BREAKPOINT, (int)MAX_HW_BREAKPOINT);
++              /* Shrink to fit hwbreaks[]: give up IBRs first (down to one), then DBRs. */
++              while ((HW_BREAKPOINT > MAX_HW_BREAKPOINT)
++                     && hw_break_total_ibr != 1)
++                      hw_break_total_ibr--;
++              while (HW_BREAKPOINT > MAX_HW_BREAKPOINT)
++                      hw_break_total_dbr--;
++      }
++
++      breakinfo = hwbreaks;
++
++      memset(breakinfo, 0, HW_BREAKPOINT * sizeof(struct hw_breakpoint));
++      /* Slots [0, dbr) are data-breakpoint capable ... */
++      for (i = 0; i < hw_break_total_dbr; i++)
++              breakinfo[i].capable = HWCAP_DBR;
++      /* ... and slots [dbr, dbr + ibr) are instruction-breakpoint capable. */
++      for (; i < HW_BREAKPOINT; i++)
++              breakinfo[i].capable = HWCAP_IBR;
++
++      return;
++}
++
++void kgdb_correct_hw_break(void)
++{
++      int breakno;
++      /* Load every breakinfo[] slot into its ibr/dbr register pair (index 2n and 2n + 1). */
++      if (!breakinfo)
++              return; /* table never set up: nothing to program */
++
++      for (breakno = 0; breakno < HW_BREAKPOINT; breakno++) {
++              if (breakinfo[breakno].enabled) {
++                      if (breakinfo[breakno].capable & HWCAP_IBR) {
++                              int ibreakno = breakno - hw_break_total_dbr;    /* IBR slots follow the DBR slots */
++                              ia64_set_ibr(ibreakno << 1,
++                                           breakinfo[breakno].addr);
++                              ia64_set_ibr((ibreakno << 1) + 1,       /* inverted mask in bits 0-55, plus bits 56 and 63 */
++                                           (~breakinfo[breakno].mask &
++                                            ((1UL << 56UL) - 1)) |
++                                            (1UL << 56UL) | (1UL << 63UL));
++                      } else {
++                              ia64_set_dbr(breakno << 1,
++                                           breakinfo[breakno].addr);
++                              ia64_set_dbr((breakno << 1) + 1,        /* inverted mask in bits 0-55, bit 56, type in bits 62-63 */
++                                           (~breakinfo[breakno].
++                                            mask & ((1UL << 56UL) - 1)) |
++                                           (1UL << 56UL) |
++                                           (breakinfo[breakno].type << 62UL));
++                      }
++              } else {        /* disabled slot: write 0 to the odd register of the pair only */
++                      if (breakinfo[breakno].capable & HWCAP_IBR)
++                              ia64_set_ibr(((breakno -
++                                             hw_break_total_dbr) << 1) + 1,
++                                           0);
++                      else
++                              ia64_set_dbr((breakno << 1) + 1, 0);
++              }
++      }
++
++      return;
++}
++
++int hardware_breakpoint(unsigned long addr, int length, int type, int action)
++{
++      int breakno, found, watch;
++      unsigned long mask;
++      extern unsigned long _start[];
++      /* Claim (action != 0) or release (action == 0) breakinfo[] slots; returns how many were touched. */
++      if (!hw_breakpoint_init)
++              do_init_hw_break();     /* one-time table setup on first use */
++
++      if (!breakinfo)
++              return 0;
++      else if (addr == (unsigned long)_start)
++              return 1;       /* reported as handled without touching any slot */
++      /* mask selects which slots qualify: (1 << type), or the DBR set for WATCH_ACCESS. */
++      if (type == WATCH_ACCESS)
++              mask = HWCAP_DBR;
++      else
++              mask = 1UL << type;
++
++      for (watch = 0, found = 0, breakno = 0; breakno < HW_BREAKPOINT;
++           breakno++) {
++              if (action) {
++                      if (breakinfo[breakno].enabled
++                          || !(breakinfo[breakno].capable & mask))
++                              continue;       /* slot busy or of the wrong kind */
++                      breakinfo[breakno].enabled = 1;
++                      breakinfo[breakno].type = type;
++                      breakinfo[breakno].mask = length - 1;
++                      breakinfo[breakno].addr = addr;
++                      watch = breakno;        /* remembered for the rollback after the loop */
++              } else if (breakinfo[breakno].enabled &&
++                         ((length < 0 && breakinfo[breakno].addr == addr) ||  /* negative length: match on address alone */
++                          ((breakinfo[breakno].capable & mask) &&
++                           (breakinfo[breakno].mask == (length - 1)) &&
++                           (breakinfo[breakno].addr == addr)))) {
++                      breakinfo[breakno].enabled = 0;
++                      breakinfo[breakno].type = 0UL;
++              } else
++                      continue;
++              found++;
++              if (type != WATCH_ACCESS)
++                      break;  /* every other type needs exactly one slot */
++              else if (found == 2)
++                      break;
++              else
++                      mask = HWCAP_IBR;       /* WATCH_ACCESS: the second slot searched for is an IBR-capable one */
++      }
++      /* NOTE(review): on the release path watch is still 0 here, so slot 0 gets disabled — confirm intent. */
++      if (type == WATCH_ACCESS && found == 1) {
++              breakinfo[watch].enabled = 0;   /* only one of the two slots was obtained: undo and report none */
++              found = 0;
++      }
++
++      mb();
++      return found;
++}
++
++int kgdb_arch_set_hw_breakpoint(unsigned long addr, int len,
++                              enum kgdb_bptype type)
++{     /* Claim a slot; type - '1' yields the WATCH_* value (presumably type is the packet's digit character — confirm). */
++      return hardware_breakpoint(addr, len, type - '1', 1);
++}
++
++int kgdb_arch_remove_hw_breakpoint(unsigned long addr, int len,
++                                 enum kgdb_bptype type)
++{     /* Release a slot; type is converted with the same type - '1' step as the set path. */
++      return hardware_breakpoint(addr, len, type - '1', 0);
++}
++
++int kgdb_remove_hw_break(unsigned long addr)
++{
++      return hardware_breakpoint(addr, 8, WATCH_INSTRUCTION, 0);
++      /* i.e. release the instruction breakpoint of length 8 at addr */
++}
++
++void kgdb_remove_all_hw_break(void)
++{     /* Disable every hardware breakpoint slot; safe to call before the table exists. */
++      int i;
++      /* Keep .capable: it is assigned only in do_init_hw_break(), which runs once. */
++      for (i = 0; breakinfo && i < HW_BREAKPOINT; i++)
++              breakinfo[i].enabled = 0;       /* type/mask/addr are rewritten whenever a slot is claimed */
++}
++
++int kgdb_set_hw_break(unsigned long addr)
++{     /* Claim an instruction breakpoint of length 8 at addr. */
++      return hardware_breakpoint(addr, 8, WATCH_INSTRUCTION, 1);
++}
++
++void kgdb_disable_hw_debug(struct pt_regs *regs)
++{
++      unsigned long hw_breakpoint_status;
++      /* Clear the DB bit in the live PSR if it is currently set; regs is not used. */
++      hw_breakpoint_status = ia64_getreg(_IA64_REG_PSR);
++      if (hw_breakpoint_status & IA64_PSR_DB)
++              ia64_setreg(_IA64_REG_PSR_L,
++                          hw_breakpoint_status ^ IA64_PSR_DB);        /* XOR clears it: the bit is known to be set */
++}
++
++volatile static struct smp_unw {      /* one entry per CPU, written by kgdb_wait_ipi()/kgdb_init_running() */
++      struct unw_frame_info *unw;     /* NULL, the sentinel (struct unw_frame_info *)1, or a live frame */
++      struct task_struct *task;       /* the task that was current when the CPU entered kgdb_wait_ipi() */
++} smp_unw[NR_CPUS];
++
++static int inline kgdb_get_blocked_state(struct task_struct *p,
++                                       struct unw_frame_info *unw)
++{
++      unsigned long ip;
++      int count = 0;
++      /* Unwind blocked task p until ip leaves the scheduler functions or the step limit is hit. */
++      unw_init_from_blocked_task(unw, p);
++      ip = 0UL;
++      do {
++              if (unw_unwind(unw) < 0)
++                      return -1;
++              unw_get_ip(unw, &ip);
++              if (!in_sched_functions(ip))
++                      break;
++      } while (count++ < 16); /* bounds the number of unwind steps */
++
++      if (!ip)
++              return -1;      /* a zero ip is treated as failure */
++      else
++              return 0;
++}
++
++static void inline kgdb_wait(struct pt_regs *regs)
++{     /* Run kgdb_nmihook() for this CPU with the PSR DB bit cleared, then put the old PSR back. */
++      unsigned long hw_breakpoint_status = ia64_getreg(_IA64_REG_PSR);
++      if (hw_breakpoint_status & IA64_PSR_DB)
++              ia64_setreg(_IA64_REG_PSR_L,
++                          hw_breakpoint_status ^ IA64_PSR_DB);
++      kgdb_nmihook(smp_processor_id(), regs);
++      if (hw_breakpoint_status & IA64_PSR_DB)
++              ia64_setreg(_IA64_REG_PSR_L, hw_breakpoint_status);     /* restore the value read on entry */
++
++      return;
++}
++
++static void inline normalize(struct unw_frame_info *running,
++                           struct pt_regs *regs)
++{
++      unsigned long sp;
++      /* Unwind `running` until sp + 0x10 reaches the address of regs, or unwinding fails. */
++      do {
++              unw_get_sp(running, &sp);
++              if ((sp + 0x10) >= (unsigned long)regs)
++                      break;
++      } while (unw_unwind(running) >= 0);
++
++      return;
++}
++
++static void kgdb_init_running(struct unw_frame_info *unw, void *data)
++{     /* unw_init_running() callback: publish this CPU's normalized frame, then wait in kgdb. */
++      struct pt_regs *regs;
++
++      regs = data;    /* data is the pt_regs pointer passed in by kgdb_wait_ipi() */
++      normalize(unw, regs);
++      smp_unw[smp_processor_id()].unw = unw;
++      kgdb_wait(regs);
++}
++
++void kgdb_wait_ipi(struct pt_regs *regs)
++{
++      struct unw_frame_info unw;
++      /* Record this CPU's task and unwind state in smp_unw[], then wait in kgdb_wait(). */
++      smp_unw[smp_processor_id()].task = current;
++
++      if (user_mode(regs)) {
++              smp_unw[smp_processor_id()].unw = (struct unw_frame_info *)1;   /* sentinel: no frame to offer */
++              kgdb_wait(regs);
++      } else {
++              if (current->state == TASK_RUNNING)
++                      unw_init_running(kgdb_init_running, regs);      /* the callback publishes the frame and waits */
++              else {
++                      if (kgdb_get_blocked_state(current, &unw))
++                              smp_unw[smp_processor_id()].unw =
++                                  (struct unw_frame_info *)1;
++                      else
++                              smp_unw[smp_processor_id()].unw = &unw; /* points at a local; cleared below before returning */
++                      kgdb_wait(regs);
++              }
++      }
++
++      smp_unw[smp_processor_id()].unw = NULL; /* .task is left as it was set above */
++      return;
++}
++
++void kgdb_roundup_cpus(unsigned long flags)
++{     /* Signal all other CPUs when more than one is online; flags is not used. */
++      if (num_online_cpus() > 1)
++              smp_send_nmi_allbutself();
++}
++
++static volatile int kgdb_hwbreak_sstep[NR_CPUS];      /* per-CPU flag, set in do_kgdb_handle_exception() after a TRAP_HWBKPT */
++
++static int kgdb_notify(struct notifier_block *self, unsigned long cmd,
++      void *ptr)
++{
++      struct die_args *args = ptr;
++      struct pt_regs *regs = args->regs;
++      unsigned long err = args->err;
++      /* Notifier callback: filter the event, then pass what remains to kgdb_handle_exception(). */
++      switch (cmd) {
++      default:
++              return NOTIFY_DONE;     /* events not listed below are ignored */
++      case DIE_PAGE_FAULT_NO_CONTEXT:
++              if (atomic_read(&debugger_active) && kgdb_may_fault) {
++                      kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++                      return NOTIFY_STOP;
++              }
++              break;
++      case DIE_BREAK:
++              if (user_mode(regs) || err == 0x80001)
++                      return NOTIFY_DONE;
++              break;
++      case DIE_FAULT:
++              if (user_mode(regs))
++                      return NOTIFY_DONE;
++              else if (err == 36 && kgdb_hwbreak_sstep[smp_processor_id()]) { /* 36: presumably the single-step trap — confirm */
++                      kgdb_hwbreak_sstep[smp_processor_id()] = 0;
++                      regs->cr_ipsr &= ~IA64_PSR_SS;
++                      return NOTIFY_STOP;
++              }
++      case DIE_MCA_MONARCH_PROCESS:   /* DIE_FAULT falls through to here when neither branch above returned */
++      case DIE_INIT_MONARCH_PROCESS:
++              break;
++      }
++
++      kgdb_handle_exception(args->trapnr, args->signr, args->err, regs);
++      return NOTIFY_STOP;
++}
++
++static struct notifier_block kgdb_notifier = {        /* hooks kgdb_notify() into a notifier chain */
++      .notifier_call = kgdb_notify,
++};
++
++int kgdb_arch_init(void)
++{     /* Register kgdb_notifier on ia64die_chain; always returns 0. */
++      atomic_notifier_chain_register(&ia64die_chain, &kgdb_notifier);
++      return 0;
++}
++
++static void do_kgdb_handle_exception(struct unw_frame_info *, void *data);
++
++struct kgdb_state {   /* argument bundle handed to do_kgdb_handle_exception() through its void *data */
++      int e_vector;
++      int signo;
++      unsigned long err_code;
++      struct pt_regs *regs;   /* NULL when another thread is being inspected through an unwind frame */
++      struct unw_frame_info *unw;     /* overwritten by do_kgdb_handle_exception() with the frame it received */
++      char *inbuf;    /* packet being handled */
++      char *outbuf;   /* reply buffer */
++      int unwind;     /* non-zero: normalize() the frame against regs first */
++      int ret;        /* result handed back by kgdb_arch_handle_exception() */
++};
++
++static void inline kgdb_pc(struct pt_regs *regs, unsigned long pc)
++{     /* Split pc into the 16-byte-aligned cr_iip and the psr.ri field (low two bits of pc). */
++      regs->cr_iip = pc & ~0xf;
++      ia64_psr(regs)->ri = pc & 0x3;
++      return;
++}
++
++int kgdb_arch_handle_exception(int e_vector, int signo,
++                             int err_code, char *remcom_in_buffer,
++                             char *remcom_out_buffer,
++                             struct pt_regs *linux_regs)
++{
++      struct kgdb_state info;
++      /* Pick the register/unwind context for the selected thread, then run the packet handler on it. */
++      info.e_vector = e_vector;
++      info.signo = signo;
++      info.err_code = err_code;
++      info.unw = (void *)0;
++      info.inbuf = remcom_in_buffer;
++      info.outbuf = remcom_out_buffer;
++      info.unwind = 0;
++      info.ret = -1;  /* stays -1 unless the handler or an error path below changes it */
++
++      if (remcom_in_buffer[0] == 'c' || remcom_in_buffer[0] == 's') {
++              info.regs = linux_regs;
++              do_kgdb_handle_exception(NULL, &info);
++      } else if (!kgdb_usethread || kgdb_usethread == current) {      /* no thread selected: treat as current, as the 'P' handler does */
++              info.regs = linux_regs;
++              info.unwind = 1;
++              unw_init_running(do_kgdb_handle_exception, &info);
++      } else if (kgdb_usethread->state != TASK_RUNNING) {
++              struct unw_frame_info unw_info;
++
++              if (kgdb_get_blocked_state(kgdb_usethread, &unw_info)) {
++                      info.ret = 1;
++                      goto bad;
++              }
++              info.regs = NULL;
++              do_kgdb_handle_exception(&unw_info, &info);
++      } else {
++              int i;
++
++              for (i = 0; i < NR_CPUS; i++)
++                      if (smp_unw[i].task == kgdb_usethread && smp_unw[i].unw
++                          && smp_unw[i].unw != (struct unw_frame_info *)1) {
++                              info.regs = NULL;
++                              do_kgdb_handle_exception(smp_unw[i].unw, &info);
++                              break;
++                      }
++              /* Fail only once every CPU slot has been checked, not on the first mismatch. */
++              if (i == NR_CPUS)
++                      info.ret = 1;
++      }
++
++      bad:
++      if (info.ret != -1 && remcom_in_buffer[0] == 'p') {     /* a failed 'p' still gets a reply: a recognisable bad value */
++              unsigned long bad = 0xbad4badbadbadbadUL;
++
++              printk("kgdb_arch_handle_exception: p packet bad (%s)\n",
++                     remcom_in_buffer);
++              kgdb_mem2hex((char *)&bad, remcom_out_buffer, sizeof(bad));
++              remcom_out_buffer[sizeof(bad) * 2] = 0;
++              info.ret = -1;
++      }
++      return info.ret;
++}
++
++/*
++ * This is done because I evidently made an incorrect 'p' encoding
++ * when my patch for gdb was committed. It was later corrected. This
++ * check supports both my wrong encoding of the register number and
++ * the correct encoding. Eventually this should be eliminated and
++ * kgdb_hex2long should be demarshalling the regnum.
++ */
++static inline int check_packet(unsigned int regnum, char *packet)
++{
++      static int check_done, swap;    /* the encoding is probed once; the verdict is reused for later packets */
++      unsigned long reglong;
++      /* Returns the register number, re-parsed from packet with kgdb_hex2long() when swap is set. */
++      if (likely(check_done)) {
++              if (swap) {
++                      kgdb_hex2long(&packet, &reglong);
++                      regnum = (int) reglong;
++              }
++
++      } else {
++              if (regnum > NUM_REGS) {        /* the caller's decoding is out of range: switch to the other encoding */
++                      kgdb_hex2long(&packet, &reglong);
++                      regnum = (int) reglong;
++                      swap = 1;
++              }
++              check_done = 1;
++      }
++      return regnum;
++}
++
++static void do_kgdb_handle_exception(struct unw_frame_info *unw_info,
++      void *data)
++{
++      long addr;
++      char *ptr;
++      unsigned long newPC;    /* assigned in the 'c'/'s' case but never read in this function */
++      int e_vector, signo;    /* signo is copied from info but not used below */
++      unsigned long err_code;
++      struct pt_regs *linux_regs;
++      struct kgdb_state *info;
++      char *remcom_in_buffer, *remcom_out_buffer;
++      /* Handle one 'p', 'P', 'c' or 's' packet described by the kgdb_state passed in data. */
++      info = data;
++      info->unw = unw_info;
++      e_vector = info->e_vector;
++      signo = info->signo;
++      err_code = info->err_code;
++      remcom_in_buffer = info->inbuf;
++      remcom_out_buffer = info->outbuf;
++      linux_regs = info->regs;
++
++      if (info->unwind)
++              normalize(unw_info, linux_regs);
++
++      switch (remcom_in_buffer[0]) {
++      case 'p':       /* read one register */
++              {
++                      unsigned int regnum;
++
++                      kgdb_hex2mem(&remcom_in_buffer[1], (char *)&regnum,
++                                   sizeof(regnum));
++                      regnum = check_packet(regnum, &remcom_in_buffer[1]);
++                      if (regnum >= NUM_REGS) {
++                              remcom_out_buffer[0] = 'E';
++                              remcom_out_buffer[1] = 0;
++                      } else
++                              kgdb_get_reg(remcom_out_buffer, regnum,
++                                           unw_info, linux_regs);
++                      break;
++              }
++      case 'P':       /* write one register: "P<regno>=<value>" */
++              {
++                      unsigned int regno;
++                      long v;
++                      char *ptr;      /* shadows the function-level ptr */
++
++                      ptr = &remcom_in_buffer[1];
++                      if ((!kgdb_usethread || kgdb_usethread == current) &&   /* writes are accepted for the current thread only */
++                          kgdb_hex2long(&ptr, &v) &&
++                          *ptr++ == '=' && (v >= 0)) {
++                              regno = (unsigned int)v;
++                              regno = (regno >= NUM_REGS ? 0 : regno);        /* NOTE(review): out-of-range numbers are redirected to register 0, not rejected */
++                              kgdb_put_reg(ptr, remcom_out_buffer, regno,
++                                           unw_info, linux_regs);
++                      } else
++                              strcpy(remcom_out_buffer, "E01");
++                      break;
++              }
++      case 'c':
++      case 's':
++              if (e_vector == TRAP_BRKPT && err_code == KGDBBREAKNUM) {       /* step past the break: next slot, or next bundle after slot 2 */
++                      if (ia64_psr(linux_regs)->ri < 2)
++                              kgdb_pc(linux_regs, linux_regs->cr_iip +
++                                      ia64_psr(linux_regs)->ri + 1);
++                      else
++                              kgdb_pc(linux_regs, linux_regs->cr_iip + 16);
++              }
++
++              /* try to read optional parameter, pc unchanged if no parm */
++              ptr = &remcom_in_buffer[1];
++              if (kgdb_hex2long(&ptr, &addr)) {
++                      linux_regs->cr_iip = addr;
++              }
++              newPC = linux_regs->cr_iip;
++
++              /* clear the trace bit */
++              linux_regs->cr_ipsr &= ~IA64_PSR_SS;
++
++              atomic_set(&cpu_doing_single_step, -1);
++
++              /* set the trace bit if we're stepping or took a hardware break */
++              if (remcom_in_buffer[0] == 's' || e_vector == TRAP_HWBKPT) {
++                      linux_regs->cr_ipsr |= IA64_PSR_SS;
++                      debugger_step = 1;
++                      if (kgdb_contthread)
++                              atomic_set(&cpu_doing_single_step,
++                                         smp_processor_id());
++              }
++
++              kgdb_correct_hw_break();
++
++              /* if not hardware breakpoint, then reenable them */
++              if (e_vector != TRAP_HWBKPT)
++                      linux_regs->cr_ipsr |= IA64_PSR_DB;
++              else {
++                      kgdb_hwbreak_sstep[smp_processor_id()] = 1;     /* consumed by kgdb_notify() on the following DIE_FAULT */
++                      linux_regs->cr_ipsr &= ~IA64_PSR_DB;
++              }
++
++              info->ret = 0;  /* the only case here that changes info->ret */
++              break;
++      default:
++              break;
++      }
++
++      return;
++}
++
++struct kgdb_arch arch_kgdb_ops = {
++      .set_hw_breakpoint = kgdb_arch_set_hw_breakpoint,
++      .remove_hw_breakpoint = kgdb_arch_remove_hw_breakpoint,
++      .gdb_bpt_instr = {0xcc},        /* NOTE(review): software breaks here are planted with BREAKNUM by kgdb_arch_set_breakpoint(); confirm whether this value is used */
++      .flags = KGDB_HW_BREAKPOINT,
++};
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/process.c linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/process.c
+--- linux-2.6.18-53.1.14/arch/ia64/kernel/process.c    2008-03-06 05:55:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/process.c       2008-06-10 15:39:39.000000000 +0400
+@@ -463,6 +463,9 @@ copy_thread (int nr, unsigned long clone
+        */
+       child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET)
+                                & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP));
++#ifdef        CONFIG_KGDB
++      child_ptregs->cr_ipsr |= IA64_PSR_DB;
++#endif
+ 
+       /*
+        * NOTE: The calling convention considers all floating point
+@@ -691,6 +694,9 @@ kernel_thread (int (*fn)(void *), void *
+       regs.pt.r11 = (unsigned long) arg;      /* 2nd argument */
+       /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read.  */
+       regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN;
++#ifdef        CONFIG_KGDB
++      regs.pt.cr_ipsr |= IA64_PSR_DB;
++#endif
+       regs.pt.cr_ifs = 1UL << 63;             /* mark as valid, empty frame */
+       regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR);
+       regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET;
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/smp.c linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/smp.c
+--- linux-2.6.18-53.1.14/arch/ia64/kernel/smp.c        2008-03-06 05:54:27.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/smp.c   2008-06-10 15:38:32.000000000 +0400
+@@ -48,6 +48,7 @@
+ #include <asm/tlbflush.h>
+ #include <asm/unistd.h>
+ #include <asm/mca.h>
++#include <linux/kgdb.h>
+ 
+ /*
+  * Structure and data for smp_call_function(). This is designed to minimise static memory
+@@ -68,6 +69,9 @@ static volatile struct call_data_struct 
+ #define IPI_CALL_FUNC         0
+ #define IPI_CPU_STOP          1
+ #define IPI_KDUMP_CPU_STOP    3
++#ifdef        CONFIG_KGDB
++#define       IPI_KGDB_INTERRUPT      2
++#endif
+ 
+ /* This needs to be cacheline aligned because it is written to by *other* CPUs.  */
+ static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned;
+@@ -185,6 +189,11 @@ handle_IPI (int irq, void *dev_id, struc
+                             case IPI_CPU_STOP:
+                               stop_this_cpu();
+                               break;
++#ifdef        CONFIG_KGDB
++                            case IPI_KGDB_INTERRUPT:
++                              kgdb_wait_ipi(regs);
++                              break;
++#endif
+ #ifdef CONFIG_CRASH_DUMP
+                             case IPI_KDUMP_CPU_STOP:
+                               unw_init_running(kdump_cpu_freeze, NULL);
+@@ -359,6 +368,14 @@ smp_call_function_single (int cpuid, voi
+ }
+ EXPORT_SYMBOL(smp_call_function_single);
+ 
++#ifdef        CONFIG_KGDB
++void
++smp_send_nmi_allbutself(void)
++{     /* Send IPI_KGDB_INTERRUPT to every CPU except the caller; handle_IPI() answers it with kgdb_wait_ipi(). */
++      send_IPI_allbutself(IPI_KGDB_INTERRUPT);
++}
++#endif
++
+ /*
+  * this function sends a 'generic call function' IPI to all other CPUs
+  * in the system.
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/traps.c linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/traps.c
+--- linux-2.6.18-53.1.14/arch/ia64/kernel/traps.c      2008-03-06 05:54:44.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/traps.c 2008-06-10 15:38:32.000000000 +0400
+@@ -200,8 +200,12 @@ __kprobes ia64_bad_break (unsigned long 
+               break;
+ 
+             default:
+-              if (break_num < 0x40000 || break_num > 0x100000)
++              if (break_num < 0x40000 || break_num > 0x100000) {
++                      if (notify_die(DIE_BREAK, "bad break", regs,
++                              break_num, TRAP_BRKPT, SIGTRAP) == NOTIFY_STOP)
++                              return;
+                       die_if_kernel("Bad break", regs, break_num);
++              }
+ 
+               if (break_num < 0x80000) {
+                       sig = SIGILL; code = __ILL_BREAK;
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/unwind.c linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/unwind.c
+--- linux-2.6.18-53.1.14/arch/ia64/kernel/unwind.c     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/unwind.c        2008-06-10 15:39:39.000000000 +0400
+@@ -72,10 +72,68 @@
+ # define STAT(x...)
+ #endif
+ 
++#ifdef        CONFIG_KGDB
++#define       KGDB_EARLY_SIZE 100
++static struct unw_reg_state __initdata kgdb_reg_state[KGDB_EARLY_SIZE];
++static struct unw_labeled_state __initdata kgdb_labeled_state[KGDB_EARLY_SIZE];
++void __initdata *kgdb_reg_state_free, __initdata *kgdb_labeled_state_free;
++
++static void __init
++kgdb_malloc_init(void)
++{
++      int i;
++      /* Thread each static array into a free list; the first word of an entry holds the link. */
++      kgdb_reg_state_free = kgdb_reg_state;
++      for (i = 1; i < KGDB_EARLY_SIZE; i++) {
++              *((unsigned long *) &kgdb_reg_state[i]) = (unsigned long) kgdb_reg_state_free;
++              kgdb_reg_state_free = &kgdb_reg_state[i];       /* entry i becomes the new list head */
++      }
++      /* NOTE(review): entry [0] is never written, so its link is the array's initial content (presumably 0). */
++      kgdb_labeled_state_free = kgdb_labeled_state;
++      for (i = 1; i < KGDB_EARLY_SIZE; i++) {
++              *((unsigned long *) &kgdb_labeled_state[i]) =
++                      (unsigned long) kgdb_labeled_state_free;
++              kgdb_labeled_state_free = &kgdb_labeled_state[i];
++      }
++
++}
++
++static void * __init
++kgdb_malloc(void **mem)
++{     /* Pop the head of the free list *mem; returns NULL when the list is empty. */
++      void *p = *mem;
++
++      if (p)  /* empty pool: return NULL, as the kmalloc() arm of the macro can, instead of dereferencing it */
++              *mem = *((void **) p);  /* the first word of an entry is the link to the next free one */
++      return p;
++}
++
++static void __init
++kgdb_free(void **mem, void *p)
++{     /* Push p back onto the free list whose head is *mem. */
++      *((void **)p) = *mem;   /* the first word of p becomes the link to the old head */
++      *mem = p;
++}
++
++#define alloc_reg_state()     (!malloc_sizes[0].cs_cachep ?   /* cs_cachep unset: use the static pool, else kmalloc */        \
++              kgdb_malloc(&kgdb_reg_state_free) :                     \
++              kmalloc(sizeof(struct unw_reg_state), GFP_ATOMIC))
++#define free_reg_state(usr)   (!malloc_sizes[0].cs_cachep ?           \
++              kgdb_free(&kgdb_reg_state_free, usr) :                  \
++              kfree(usr))
++#define alloc_labeled_state() (!malloc_sizes[0].cs_cachep ?           \
++              kgdb_malloc(&kgdb_labeled_state_free) :                 \
++              kmalloc(sizeof(struct unw_labeled_state), GFP_ATOMIC))
++#define free_labeled_state(usr)       (!malloc_sizes[0].cs_cachep ?           \
++              kgdb_free(&kgdb_labeled_state_free, usr) :              \
++              kfree(usr))
++
++#else
+ #define alloc_reg_state()     kmalloc(sizeof(struct unw_reg_state), GFP_ATOMIC)
+ #define free_reg_state(usr)   kfree(usr)
+ #define alloc_labeled_state() kmalloc(sizeof(struct unw_labeled_state), GFP_ATOMIC)
+ #define free_labeled_state(usr)       kfree(usr)
++#endif
+ 
+ typedef unsigned long unw_word;
+ typedef unsigned char unw_hash_index_t;
+@@ -238,6 +296,24 @@ static struct {
+ #endif
+ };
+ 
++#ifdef        CONFIG_KGDB
++/*
++ * This makes it safe to call breakpoint() very early
++ * in setup_arch providing:
++ *    1) breakpoint isn't called between lines in cpu_init
++ *       where init_mm.mm_count is incremented and ia64_mmu_init
++ *       is called.  Otherwise the test below is invalid.
++ *    2) the memory examined doesn't result in tlbmiss.
++ */
++static unsigned long inline kgdb_unimpl_va_mask(void)
++{     /* Returns local_cpu_data->unimpl_va_mask once init_mm.mm_count exceeds 1, and 0 before that. */
++      if (atomic_read(&init_mm.mm_count) > 1)
++              return local_cpu_data->unimpl_va_mask;
++      else
++              return 0UL;     /* a zero mask leaves only the caller's alignment bits in its address check */
++}
++#endif
++
+ static inline int
+ read_only (void *addr)
+ {
+@@ -1786,7 +1862,11 @@ run_script (struct unw_script *script, s
+ 
+                     case UNW_INSN_LOAD:
+ #ifdef UNW_DEBUG
++#ifdef        CONFIG_KGDB
++                      if ((s[val] & (kgdb_unimpl_va_mask() | 0x7)) != 0
++#else
+                       if ((s[val] & (local_cpu_data->unimpl_va_mask | 0x7)) != 0
++#endif
+                           || s[val] < TASK_SIZE)
+                       {
+                               UNW_DPRINT(0, "unwind.%s: rejecting bad psp=0x%lx\n",
+@@ -1821,7 +1901,11 @@ find_save_locs (struct unw_frame_info *i
+       struct unw_script *scr;
+       unsigned long flags = 0;
+ 
++#ifdef        CONFIG_KGDB
++      if ((info->ip & (kgdb_unimpl_va_mask() | 0xf)) || info->ip < TASK_SIZE) {
++#else
+       if ((info->ip & (local_cpu_data->unimpl_va_mask | 0xf)) || info->ip < TASK_SIZE) {
++#endif
+               /* don't let obviously bad addresses pollute the cache */
+               /* FIXME: should really be level 0 but it occurs too often. KAO */
+               UNW_DPRINT(1, "unwind.%s: rejecting bad ip=0x%lx\n", __FUNCTION__, info->ip);
+@@ -2249,6 +2333,9 @@ unw_init (void)
+ 
+       init_unwind_table(&unw.kernel_table, "kernel", KERNEL_START, (unsigned long) __gp,
+                         __start_unwind, __end_unwind);
++#ifdef        CONFIG_KGDB
++      kgdb_malloc_init();
++#endif
+ }
+ 
+ /*
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/mm/extable.c linux-2.6.18-53.1.14.kgdb/arch/ia64/mm/extable.c
+--- linux-2.6.18-53.1.14/arch/ia64/mm/extable.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/mm/extable.c   2008-06-10 15:38:32.000000000 +0400
+@@ -6,6 +6,7 @@
+  */
+ 
+ #include <linux/sort.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/uaccess.h>
+ #include <asm/module.h>
+@@ -73,6 +74,11 @@ search_extable (const struct exception_t
+                 else
+                         last = mid - 1;
+         }
++#ifdef CONFIG_KGDB
++      if (atomic_read(&debugger_active) && kgdb_may_fault)
++              kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++              /* Not reached. */
++#endif
+         return NULL;
+ }
+ 
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/mm/fault.c linux-2.6.18-53.1.14.kgdb/arch/ia64/mm/fault.c
+--- linux-2.6.18-53.1.14/arch/ia64/mm/fault.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/mm/fault.c     2008-06-10 15:38:32.000000000 +0400
+@@ -266,6 +266,10 @@ ia64_do_page_fault (unsigned long addres
+        */
+       bust_spinlocks(1);
+ 
++      if (notify_die(DIE_PAGE_FAULT_NO_CONTEXT, "no context", regs,
++                      isr, 14, SIGSEGV) == NOTIFY_STOP)
++              return;
++
+       if (address < PAGE_SIZE)
+               printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference (address %016lx)\n", address);
+       else
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/Kconfig.debug linux-2.6.18-53.1.14.kgdb/arch/mips/Kconfig.debug
+--- linux-2.6.18-53.1.14/arch/mips/Kconfig.debug       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/Kconfig.debug  2008-06-10 15:38:24.000000000 +0400
+@@ -37,25 +37,6 @@ config DEBUG_STACK_USAGE
+ 
+         This option will slow down process creation somewhat.
+ 
+-config KGDB
+-      bool "Remote GDB kernel debugging"
+-      depends on DEBUG_KERNEL
+-      select DEBUG_INFO
+-      help
+-        If you say Y here, it will be possible to remotely debug the MIPS
+-        kernel using gdb. This enlarges your kernel image disk size by
+-        several megabytes and requires a machine with more than 16 MB,
+-        better 32 MB RAM to avoid excessive linking time. This is only
+-        useful for kernel hackers. If unsure, say N.
+-
+-config GDB_CONSOLE
+-      bool "Console output to GDB"
+-      depends on KGDB
+-      help
+-        If you are using GDB for remote debugging over a serial port and
+-        would like kernel messages to be formatted into GDB $O packets so
+-        that GDB prints them as program output, say 'Y'.
+-
+ config SB1XXX_CORELIS
+       bool "Corelis Debugger"
+       depends on SIBYTE_SB1xxx_SOC
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/Makefile linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/Makefile
+--- linux-2.6.18-53.1.14/arch/mips/kernel/Makefile     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/Makefile        2008-06-10 15:38:24.000000000 +0400
+@@ -59,7 +59,8 @@ obj-$(CONFIG_MIPS32_COMPAT)  += linux32.o
+ obj-$(CONFIG_MIPS32_N32)      += binfmt_elfn32.o scall64-n32.o signal_n32.o
+ obj-$(CONFIG_MIPS32_O32)      += binfmt_elfo32.o scall64-o32.o ptrace32.o
+ 
+-obj-$(CONFIG_KGDB)            += gdb-low.o gdb-stub.o
++obj-$(CONFIG_KGDB)            += kgdb_handler.o kgdb.o kgdb-jmp.o     \
++                                      kgdb-setjmp.o
+ obj-$(CONFIG_PROC_FS)         += proc.o
+ 
+ obj-$(CONFIG_64BIT)           += cpu-bugs64.o
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/gdb-low.S linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/gdb-low.S
+--- linux-2.6.18-53.1.14/arch/mips/kernel/gdb-low.S    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/gdb-low.S       1970-01-01 03:00:00.000000000 +0300
+@@ -1,394 +0,0 @@
+-/*
+- * gdb-low.S contains the low-level trap handler for the GDB stub.
+- *
+- * Copyright (C) 1995 Andreas Busse
+- */
+-#include <linux/sys.h>
+-
+-#include <asm/asm.h>
+-#include <asm/errno.h>
+-#include <asm/irqflags.h>
+-#include <asm/mipsregs.h>
+-#include <asm/regdef.h>
+-#include <asm/stackframe.h>
+-#include <asm/gdb-stub.h>
+-
+-#ifdef CONFIG_32BIT
+-#define DMFC0 mfc0
+-#define DMTC0 mtc0
+-#define LDC1  lwc1
+-#define SDC1  lwc1
+-#endif
+-#ifdef CONFIG_64BIT
+-#define DMFC0 dmfc0
+-#define DMTC0 dmtc0
+-#define LDC1  ldc1
+-#define SDC1  ldc1
+-#endif
+-
+-/*
+- * [jsun] We reserves about 2x GDB_FR_SIZE in stack.  The lower (addressed)
+- * part is used to store registers and passed to exception handler.
+- * The upper part is reserved for "call func" feature where gdb client
+- * saves some of the regs, setups call frame and passes args.
+- *
+- * A trace shows about 200 bytes are used to store about half of all regs.
+- * The rest should be big enough for frame setup and passing args.
+- */
+-
+-/*
+- * The low level trap handler
+- */
+-              .align  5
+-              NESTED(trap_low, GDB_FR_SIZE, sp)
+-              .set    noat
+-              .set    noreorder
+-
+-              mfc0    k0, CP0_STATUS
+-              sll     k0, 3                   /* extract cu0 bit */
+-              bltz    k0, 1f
+-              move    k1, sp
+-
+-              /*
+-               * Called from user mode, go somewhere else.
+-               */
+-              mfc0    k0, CP0_CAUSE
+-              andi    k0, k0, 0x7c
+-#ifdef CONFIG_64BIT
+-              dsll    k0, k0, 1
+-#endif
+-              PTR_L   k1, saved_vectors(k0)
+-              jr      k1
+-              nop
+-1:
+-              move    k0, sp
+-              PTR_SUBU sp, k1, GDB_FR_SIZE*2  # see comment above
+-              LONG_S  k0, GDB_FR_REG29(sp)
+-              LONG_S  $2, GDB_FR_REG2(sp)
+-
+-/*
+- * First save the CP0 and special registers
+- */
+-
+-              mfc0    v0, CP0_STATUS
+-              LONG_S  v0, GDB_FR_STATUS(sp)
+-              mfc0    v0, CP0_CAUSE
+-              LONG_S  v0, GDB_FR_CAUSE(sp)
+-              DMFC0   v0, CP0_EPC
+-              LONG_S  v0, GDB_FR_EPC(sp)
+-              DMFC0   v0, CP0_BADVADDR
+-              LONG_S  v0, GDB_FR_BADVADDR(sp)
+-              mfhi    v0
+-              LONG_S  v0, GDB_FR_HI(sp)
+-              mflo    v0
+-              LONG_S  v0, GDB_FR_LO(sp)
+-
+-/*
+- * Now the integer registers
+- */
+-
+-              LONG_S  zero, GDB_FR_REG0(sp)           /* I know... */
+-              LONG_S  $1, GDB_FR_REG1(sp)
+-              /* v0 already saved */
+-              LONG_S  $3, GDB_FR_REG3(sp)
+-              LONG_S  $4, GDB_FR_REG4(sp)
+-              LONG_S  $5, GDB_FR_REG5(sp)
+-              LONG_S  $6, GDB_FR_REG6(sp)
+-              LONG_S  $7, GDB_FR_REG7(sp)
+-              LONG_S  $8, GDB_FR_REG8(sp)
+-              LONG_S  $9, GDB_FR_REG9(sp)
+-              LONG_S  $10, GDB_FR_REG10(sp)
+-              LONG_S  $11, GDB_FR_REG11(sp)
+-              LONG_S  $12, GDB_FR_REG12(sp)
+-              LONG_S  $13, GDB_FR_REG13(sp)
+-              LONG_S  $14, GDB_FR_REG14(sp)
+-              LONG_S  $15, GDB_FR_REG15(sp)
+-              LONG_S  $16, GDB_FR_REG16(sp)
+-              LONG_S  $17, GDB_FR_REG17(sp)
+-              LONG_S  $18, GDB_FR_REG18(sp)
+-              LONG_S  $19, GDB_FR_REG19(sp)
+-              LONG_S  $20, GDB_FR_REG20(sp)
+-              LONG_S  $21, GDB_FR_REG21(sp)
+-              LONG_S  $22, GDB_FR_REG22(sp)
+-              LONG_S  $23, GDB_FR_REG23(sp)
+-              LONG_S  $24, GDB_FR_REG24(sp)
+-              LONG_S  $25, GDB_FR_REG25(sp)
+-              LONG_S  $26, GDB_FR_REG26(sp)
+-              LONG_S  $27, GDB_FR_REG27(sp)
+-              LONG_S  $28, GDB_FR_REG28(sp)
+-              /* sp already saved */
+-              LONG_S  $30, GDB_FR_REG30(sp)
+-              LONG_S  $31, GDB_FR_REG31(sp)
+-
+-              CLI                             /* disable interrupts */
+-              TRACE_IRQS_OFF
+-
+-/*
+- * Followed by the floating point registers
+- */
+-              mfc0    v0, CP0_STATUS          /* FPU enabled? */
+-              srl     v0, v0, 16
+-              andi    v0, v0, (ST0_CU1 >> 16)
+-
+-              beqz    v0,2f                   /* disabled, skip */
+-               nop
+-
+-              SDC1    $0, GDB_FR_FPR0(sp)
+-              SDC1    $1, GDB_FR_FPR1(sp)
+-              SDC1    $2, GDB_FR_FPR2(sp)
+-              SDC1    $3, GDB_FR_FPR3(sp)
+-              SDC1    $4, GDB_FR_FPR4(sp)
+-              SDC1    $5, GDB_FR_FPR5(sp)
+-              SDC1    $6, GDB_FR_FPR6(sp)
+-              SDC1    $7, GDB_FR_FPR7(sp)
+-              SDC1    $8, GDB_FR_FPR8(sp)
+-              SDC1    $9, GDB_FR_FPR9(sp)
+-              SDC1    $10, GDB_FR_FPR10(sp)
+-              SDC1    $11, GDB_FR_FPR11(sp)
+-              SDC1    $12, GDB_FR_FPR12(sp)
+-              SDC1    $13, GDB_FR_FPR13(sp)
+-              SDC1    $14, GDB_FR_FPR14(sp)
+-              SDC1    $15, GDB_FR_FPR15(sp)
+-              SDC1    $16, GDB_FR_FPR16(sp)
+-              SDC1    $17, GDB_FR_FPR17(sp)
+-              SDC1    $18, GDB_FR_FPR18(sp)
+-              SDC1    $19, GDB_FR_FPR19(sp)
+-              SDC1    $20, GDB_FR_FPR20(sp)
+-              SDC1    $21, GDB_FR_FPR21(sp)
+-              SDC1    $22, GDB_FR_FPR22(sp)
+-              SDC1    $23, GDB_FR_FPR23(sp)
+-              SDC1    $24, GDB_FR_FPR24(sp)
+-              SDC1    $25, GDB_FR_FPR25(sp)
+-              SDC1    $26, GDB_FR_FPR26(sp)
+-              SDC1    $27, GDB_FR_FPR27(sp)
+-              SDC1    $28, GDB_FR_FPR28(sp)
+-              SDC1    $29, GDB_FR_FPR29(sp)
+-              SDC1    $30, GDB_FR_FPR30(sp)
+-              SDC1    $31, GDB_FR_FPR31(sp)
+-
+-/*
+- * FPU control registers
+- */
+-
+-              cfc1    v0, CP1_STATUS
+-              LONG_S  v0, GDB_FR_FSR(sp)
+-              cfc1    v0, CP1_REVISION
+-              LONG_S  v0, GDB_FR_FIR(sp)
+-
+-/*
+- * Current stack frame ptr
+- */
+-
+-2:
+-              LONG_S  sp, GDB_FR_FRP(sp)
+-
+-/*
+- * CP0 registers (R4000/R4400 unused registers skipped)
+- */
+-
+-              mfc0    v0, CP0_INDEX
+-              LONG_S  v0, GDB_FR_CP0_INDEX(sp)
+-              mfc0    v0, CP0_RANDOM
+-              LONG_S  v0, GDB_FR_CP0_RANDOM(sp)
+-              DMFC0   v0, CP0_ENTRYLO0
+-              LONG_S  v0, GDB_FR_CP0_ENTRYLO0(sp)
+-              DMFC0   v0, CP0_ENTRYLO1
+-              LONG_S  v0, GDB_FR_CP0_ENTRYLO1(sp)
+-              DMFC0   v0, CP0_CONTEXT
+-              LONG_S  v0, GDB_FR_CP0_CONTEXT(sp)
+-              mfc0    v0, CP0_PAGEMASK
+-              LONG_S  v0, GDB_FR_CP0_PAGEMASK(sp)
+-              mfc0    v0, CP0_WIRED
+-              LONG_S  v0, GDB_FR_CP0_WIRED(sp)
+-              DMFC0   v0, CP0_ENTRYHI
+-              LONG_S  v0, GDB_FR_CP0_ENTRYHI(sp)
+-              mfc0    v0, CP0_PRID
+-              LONG_S  v0, GDB_FR_CP0_PRID(sp)
+-
+-              .set    at
+-
+-/*
+- * Continue with the higher level handler
+- */
+-
+-              move    a0,sp
+-
+-              jal     handle_exception
+-               nop
+-
+-/*
+- * Restore all writable registers, in reverse order
+- */
+-
+-              .set    noat
+-
+-              LONG_L  v0, GDB_FR_CP0_ENTRYHI(sp)
+-              LONG_L  v1, GDB_FR_CP0_WIRED(sp)
+-              DMTC0   v0, CP0_ENTRYHI
+-              mtc0    v1, CP0_WIRED
+-              LONG_L  v0, GDB_FR_CP0_PAGEMASK(sp)
+-              LONG_L  v1, GDB_FR_CP0_ENTRYLO1(sp)
+-              mtc0    v0, CP0_PAGEMASK
+-              DMTC0   v1, CP0_ENTRYLO1
+-              LONG_L  v0, GDB_FR_CP0_ENTRYLO0(sp)
+-              LONG_L  v1, GDB_FR_CP0_INDEX(sp)
+-              DMTC0   v0, CP0_ENTRYLO0
+-              LONG_L  v0, GDB_FR_CP0_CONTEXT(sp)
+-              mtc0    v1, CP0_INDEX
+-              DMTC0   v0, CP0_CONTEXT
+-
+-
+-/*
+- * Next, the floating point registers
+- */
+-              mfc0    v0, CP0_STATUS          /* check if the FPU is enabled */
+-              srl     v0, v0, 16
+-              andi    v0, v0, (ST0_CU1 >> 16)
+-
+-              beqz    v0, 3f                  /* disabled, skip */
+-               nop
+-
+-              LDC1    $31, GDB_FR_FPR31(sp)
+-              LDC1    $30, GDB_FR_FPR30(sp)
+-              LDC1    $29, GDB_FR_FPR29(sp)
+-              LDC1    $28, GDB_FR_FPR28(sp)
+-              LDC1    $27, GDB_FR_FPR27(sp)
+-              LDC1    $26, GDB_FR_FPR26(sp)
+-              LDC1    $25, GDB_FR_FPR25(sp)
+-              LDC1    $24, GDB_FR_FPR24(sp)
+-              LDC1    $23, GDB_FR_FPR23(sp)
+-              LDC1    $22, GDB_FR_FPR22(sp)
+-              LDC1    $21, GDB_FR_FPR21(sp)
+-              LDC1    $20, GDB_FR_FPR20(sp)
+-              LDC1    $19, GDB_FR_FPR19(sp)
+-              LDC1    $18, GDB_FR_FPR18(sp)
+-              LDC1    $17, GDB_FR_FPR17(sp)
+-              LDC1    $16, GDB_FR_FPR16(sp)
+-              LDC1    $15, GDB_FR_FPR15(sp)
+-              LDC1    $14, GDB_FR_FPR14(sp)
+-              LDC1    $13, GDB_FR_FPR13(sp)
+-              LDC1    $12, GDB_FR_FPR12(sp)
+-              LDC1    $11, GDB_FR_FPR11(sp)
+-              LDC1    $10, GDB_FR_FPR10(sp)
+-              LDC1    $9, GDB_FR_FPR9(sp)
+-              LDC1    $8, GDB_FR_FPR8(sp)
+-              LDC1    $7, GDB_FR_FPR7(sp)
+-              LDC1    $6, GDB_FR_FPR6(sp)
+-              LDC1    $5, GDB_FR_FPR5(sp)
+-              LDC1    $4, GDB_FR_FPR4(sp)
+-              LDC1    $3, GDB_FR_FPR3(sp)
+-              LDC1    $2, GDB_FR_FPR2(sp)
+-              LDC1    $1, GDB_FR_FPR1(sp)
+-              LDC1    $0, GDB_FR_FPR0(sp)
+-
+-/*
+- * Now the CP0 and integer registers
+- */
+-
+-3:
+-#ifdef CONFIG_MIPS_MT_SMTC
+-              /* Read-modify write of Status must be atomic */
+-              mfc0    t2, CP0_TCSTATUS
+-              ori     t1, t2, TCSTATUS_IXMT
+-              mtc0    t1, CP0_TCSTATUS
+-              andi    t2, t2, TCSTATUS_IXMT
+-              _ehb
+-              DMT     9                               # dmt   t1
+-              jal     mips_ihb
+-              nop
+-#endif /* CONFIG_MIPS_MT_SMTC */
+-              mfc0    t0, CP0_STATUS
+-              ori     t0, 0x1f
+-              xori    t0, 0x1f
+-              mtc0    t0, CP0_STATUS
+-#ifdef CONFIG_MIPS_MT_SMTC
+-              andi    t1, t1, VPECONTROL_TE
+-              beqz    t1, 9f
+-              nop
+-              EMT                                     # emt
+-9:
+-              mfc0    t1, CP0_TCSTATUS
+-              xori    t1, t1, TCSTATUS_IXMT
+-              or      t1, t1, t2
+-              mtc0    t1, CP0_TCSTATUS
+-              _ehb
+-#endif /* CONFIG_MIPS_MT_SMTC */
+-              LONG_L  v0, GDB_FR_STATUS(sp)
+-              LONG_L  v1, GDB_FR_EPC(sp)
+-              mtc0    v0, CP0_STATUS
+-              DMTC0   v1, CP0_EPC
+-              LONG_L  v0, GDB_FR_HI(sp)
+-              LONG_L  v1, GDB_FR_LO(sp)
+-              mthi    v0
+-              mtlo    v1
+-              LONG_L  $31, GDB_FR_REG31(sp)
+-              LONG_L  $30, GDB_FR_REG30(sp)
+-              LONG_L  $28, GDB_FR_REG28(sp)
+-              LONG_L  $27, GDB_FR_REG27(sp)
+-              LONG_L  $26, GDB_FR_REG26(sp)
+-              LONG_L  $25, GDB_FR_REG25(sp)
+-              LONG_L  $24, GDB_FR_REG24(sp)
+-              LONG_L  $23, GDB_FR_REG23(sp)
+-              LONG_L  $22, GDB_FR_REG22(sp)
+-              LONG_L  $21, GDB_FR_REG21(sp)
+-              LONG_L  $20, GDB_FR_REG20(sp)
+-              LONG_L  $19, GDB_FR_REG19(sp)
+-              LONG_L  $18, GDB_FR_REG18(sp)
+-              LONG_L  $17, GDB_FR_REG17(sp)
+-              LONG_L  $16, GDB_FR_REG16(sp)
+-              LONG_L  $15, GDB_FR_REG15(sp)
+-              LONG_L  $14, GDB_FR_REG14(sp)
+-              LONG_L  $13, GDB_FR_REG13(sp)
+-              LONG_L  $12, GDB_FR_REG12(sp)
+-              LONG_L  $11, GDB_FR_REG11(sp)
+-              LONG_L  $10, GDB_FR_REG10(sp)
+-              LONG_L  $9, GDB_FR_REG9(sp)
+-              LONG_L  $8, GDB_FR_REG8(sp)
+-              LONG_L  $7, GDB_FR_REG7(sp)
+-              LONG_L  $6, GDB_FR_REG6(sp)
+-              LONG_L  $5, GDB_FR_REG5(sp)
+-              LONG_L  $4, GDB_FR_REG4(sp)
+-              LONG_L  $3, GDB_FR_REG3(sp)
+-              LONG_L  $2, GDB_FR_REG2(sp)
+-              LONG_L  $1, GDB_FR_REG1(sp)
+-#if defined(CONFIG_CPU_R3000) || defined(CONFIG_CPU_TX39XX)
+-              LONG_L  k0, GDB_FR_EPC(sp)
+-              LONG_L  $29, GDB_FR_REG29(sp)           /* Deallocate stack */
+-              jr      k0
+-              rfe
+-#else
+-              LONG_L  sp, GDB_FR_REG29(sp)            /* Deallocate stack */
+-
+-              .set    mips3
+-              eret
+-              .set    mips0
+-#endif
+-              .set    at
+-              .set    reorder
+-              END(trap_low)
+-
+-LEAF(kgdb_read_byte)
+-4:            lb      t0, (a0)
+-              sb      t0, (a1)
+-              li      v0, 0
+-              jr      ra
+-              .section __ex_table,"a"
+-              PTR     4b, kgdbfault
+-              .previous
+-              END(kgdb_read_byte)
+-
+-LEAF(kgdb_write_byte)
+-5:            sb      a0, (a1)
+-              li      v0, 0
+-              jr      ra
+-              .section __ex_table,"a"
+-              PTR     5b, kgdbfault
+-              .previous
+-              END(kgdb_write_byte)
+-
+-              .type   kgdbfault@function
+-              .ent    kgdbfault
+-
+-kgdbfault:    li      v0, -EFAULT
+-              jr      ra
+-              .end    kgdbfault
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/gdb-stub.c linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/gdb-stub.c
+--- linux-2.6.18-53.1.14/arch/mips/kernel/gdb-stub.c   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/gdb-stub.c      1970-01-01 03:00:00.000000000 +0300
+@@ -1,1154 +0,0 @@
+-/*
+- *  arch/mips/kernel/gdb-stub.c
+- *
+- *  Originally written by Glenn Engel, Lake Stevens Instrument Division
+- *
+- *  Contributed by HP Systems
+- *
+- *  Modified for SPARC by Stu Grossman, Cygnus Support.
+- *
+- *  Modified for Linux/MIPS (and MIPS in general) by Andreas Busse
+- *  Send complaints, suggestions etc. to <andy@waldorf-gmbh.de>
+- *
+- *  Copyright (C) 1995 Andreas Busse
+- *
+- *  Copyright (C) 2003 MontaVista Software Inc.
+- *  Author: Jun Sun, jsun@mvista.com or jsun@junsun.net
+- */
+-
+-/*
+- *  To enable debugger support, two things need to happen.  One, a
+- *  call to set_debug_traps() is necessary in order to allow any breakpoints
+- *  or error conditions to be properly intercepted and reported to gdb.
+- *  Two, a breakpoint needs to be generated to begin communication.  This
+- *  is most easily accomplished by a call to breakpoint().  Breakpoint()
+- *  simulates a breakpoint by executing a BREAK instruction.
+- *
+- *
+- *    The following gdb commands are supported:
+- *
+- * command          function                               Return value
+- *
+- *    g             return the value of the CPU registers  hex data or ENN
+- *    G             set the value of the CPU registers     OK or ENN
+- *
+- *    mAA..AA,LLLL  Read LLLL bytes at address AA..AA      hex data or ENN
+- *    MAA..AA,LLLL: Write LLLL bytes at address AA.AA      OK or ENN
+- *
+- *    c             Resume at current address              SNN   ( signal NN)
+- *    cAA..AA       Continue at address AA..AA             SNN
+- *
+- *    s             Step one instruction                   SNN
+- *    sAA..AA       Step one instruction from AA..AA       SNN
+- *
+- *    k             kill
+- *
+- *    ?             What was the last sigval ?             SNN   (signal NN)
+- *
+- *    bBB..BB     Set baud rate to BB..BB                OK or BNN, then sets
+- *                                                       baud rate
+- *
+- * All commands and responses are sent with a packet which includes a
+- * checksum.  A packet consists of
+- *
+- * $<packet info>#<checksum>.
+- *
+- * where
+- * <packet info> :: <characters representing the command or response>
+- * <checksum>    :: < two hex digits computed as modulo 256 sum of <packetinfo>>
+- *
+- * When a packet is received, it is first acknowledged with either '+' or '-'.
+- * '+' indicates a successful transfer.  '-' indicates a failed transfer.
+- *
+- * Example:
+- *
+- * Host:                  Reply:
+- * $m0,10#2a               +$00010203040506070809101112131415#42
+- *
+- *
+- *  ==============
+- *  MORE EXAMPLES:
+- *  ==============
+- *
+- *  For reference -- the following are the steps that one
+- *  company took (RidgeRun Inc) to get remote gdb debugging
+- *  going. In this scenario the host machine was a PC and the
+- *  target platform was a Galileo EVB64120A MIPS evaluation
+- *  board.
+- *
+- *  Step 1:
+- *  First download gdb-5.0.tar.gz from the internet.
+- *  and then build/install the package.
+- *
+- *  Example:
+- *    $ tar zxf gdb-5.0.tar.gz
+- *    $ cd gdb-5.0
+- *    $ ./configure --target=mips-linux-elf
+- *    $ make
+- *    $ install
+- *    $ which mips-linux-elf-gdb
+- *    /usr/local/bin/mips-linux-elf-gdb
+- *
+- *  Step 2:
+- *  Configure linux for remote debugging and build it.
+- *
+- *  Example:
+- *    $ cd ~/linux
+- *    $ make menuconfig <go to "Kernel Hacking" and turn on remote debugging>
+- *    $ make
+- *
+- *  Step 3:
+- *  Download the kernel to the remote target and start
+- *  the kernel running. It will promptly halt and wait
+- *  for the host gdb session to connect. It does this
+- *  since the "Kernel Hacking" option has defined
+- *  CONFIG_KGDB which in turn enables your calls
+- *  to:
+- *     set_debug_traps();
+- *     breakpoint();
+- *
+- *  Step 4:
+- *  Start the gdb session on the host.
+- *
+- *  Example:
+- *    $ mips-linux-elf-gdb vmlinux
+- *    (gdb) set remotebaud 115200
+- *    (gdb) target remote /dev/ttyS1
+- *    ...at this point you are connected to
+- *       the remote target and can use gdb
+- *       in the normal fasion. Setting
+- *       breakpoints, single stepping,
+- *       printing variables, etc.
+- */
+-#include <linux/string.h>
+-#include <linux/kernel.h>
+-#include <linux/signal.h>
+-#include <linux/sched.h>
+-#include <linux/mm.h>
+-#include <linux/console.h>
+-#include <linux/init.h>
+-#include <linux/smp.h>
+-#include <linux/spinlock.h>
+-#include <linux/slab.h>
+-#include <linux/reboot.h>
+-
+-#include <asm/asm.h>
+-#include <asm/cacheflush.h>
+-#include <asm/mipsregs.h>
+-#include <asm/pgtable.h>
+-#include <asm/system.h>
+-#include <asm/gdb-stub.h>
+-#include <asm/inst.h>
+-#include <asm/smp.h>
+-
+-/*
+- * external low-level support routines
+- */
+-
+-extern int putDebugChar(char c);    /* write a single character      */
+-extern char getDebugChar(void);     /* read and return a single char */
+-extern void trap_low(void);
+-
+-/*
+- * breakpoint and test functions
+- */
+-extern void breakpoint(void);
+-extern void breakinst(void);
+-extern void async_breakpoint(void);
+-extern void async_breakinst(void);
+-extern void adel(void);
+-
+-/*
+- * local prototypes
+- */
+-
+-static void getpacket(char *buffer);
+-static void putpacket(char *buffer);
+-static int computeSignal(int tt);
+-static int hex(unsigned char ch);
+-static int hexToInt(char **ptr, int *intValue);
+-static int hexToLong(char **ptr, long *longValue);
+-static unsigned char *mem2hex(char *mem, char *buf, int count, int may_fault);
+-void handle_exception(struct gdb_regs *regs);
+-
+-int kgdb_enabled;
+-
+-/*
+- * spin locks for smp case
+- */
+-static DEFINE_SPINLOCK(kgdb_lock);
+-static raw_spinlock_t kgdb_cpulock[NR_CPUS] = {
+-      [0 ... NR_CPUS-1] = __RAW_SPIN_LOCK_UNLOCKED,
+-};
+-
+-/*
+- * BUFMAX defines the maximum number of characters in inbound/outbound buffers
+- * at least NUMREGBYTES*2 are needed for register packets
+- */
+-#define BUFMAX 2048
+-
+-static char input_buffer[BUFMAX];
+-static char output_buffer[BUFMAX];
+-static int initialized;       /* !0 means we've been initialized */
+-static int kgdb_started;
+-static const char hexchars[]="0123456789abcdef";
+-
+-/* Used to prevent crashes in memory access.  Note that they'll crash anyway if
+-   we haven't set up fault handlers yet... */
+-int kgdb_read_byte(unsigned char *address, unsigned char *dest);
+-int kgdb_write_byte(unsigned char val, unsigned char *dest);
+-
+-/*
+- * Convert ch from a hex digit to an int
+- */
+-static int hex(unsigned char ch)
+-{
+-      if (ch >= 'a' && ch <= 'f')
+-              return ch-'a'+10;
+-      if (ch >= '0' && ch <= '9')
+-              return ch-'0';
+-      if (ch >= 'A' && ch <= 'F')
+-              return ch-'A'+10;
+-      return -1;
+-}
+-
+-/*
+- * scan for the sequence $<data>#<checksum>
+- */
+-static void getpacket(char *buffer)
+-{
+-      unsigned char checksum;
+-      unsigned char xmitcsum;
+-      int i;
+-      int count;
+-      unsigned char ch;
+-
+-      do {
+-              /*
+-               * wait around for the start character,
+-               * ignore all other characters
+-               */
+-              while ((ch = (getDebugChar() & 0x7f)) != '$') ;
+-
+-              checksum = 0;
+-              xmitcsum = -1;
+-              count = 0;
+-
+-              /*
+-               * now, read until a # or end of buffer is found
+-               */
+-              while (count < BUFMAX) {
+-                      ch = getDebugChar();
+-                      if (ch == '#')
+-                              break;
+-                      checksum = checksum + ch;
+-                      buffer[count] = ch;
+-                      count = count + 1;
+-              }
+-
+-              if (count >= BUFMAX)
+-                      continue;
+-
+-              buffer[count] = 0;
+-
+-              if (ch == '#') {
+-                      xmitcsum = hex(getDebugChar() & 0x7f) << 4;
+-                      xmitcsum |= hex(getDebugChar() & 0x7f);
+-
+-                      if (checksum != xmitcsum)
+-                              putDebugChar('-');      /* failed checksum */
+-                      else {
+-                              putDebugChar('+'); /* successful transfer */
+-
+-                              /*
+-                               * if a sequence char is present,
+-                               * reply the sequence ID
+-                               */
+-                              if (buffer[2] == ':') {
+-                                      putDebugChar(buffer[0]);
+-                                      putDebugChar(buffer[1]);
+-
+-                                      /*
+-                                       * remove sequence chars from buffer
+-                                       */
+-                                      count = strlen(buffer);
+-                                      for (i=3; i <= count; i++)
+-                                              buffer[i-3] = buffer[i];
+-                              }
+-                      }
+-              }
+-      }
+-      while (checksum != xmitcsum);
+-}
+-
+-/*
+- * send the packet in buffer.
+- */
+-static void putpacket(char *buffer)
+-{
+-      unsigned char checksum;
+-      int count;
+-      unsigned char ch;
+-
+-      /*
+-       * $<packet info>#<checksum>.
+-       */
+-
+-      do {
+-              putDebugChar('$');
+-              checksum = 0;
+-              count = 0;
+-
+-              while ((ch = buffer[count]) != 0) {
+-                      if (!(putDebugChar(ch)))
+-                              return;
+-                      checksum += ch;
+-                      count += 1;
+-              }
+-
+-              putDebugChar('#');
+-              putDebugChar(hexchars[checksum >> 4]);
+-              putDebugChar(hexchars[checksum & 0xf]);
+-
+-      }
+-      while ((getDebugChar() & 0x7f) != '+');
+-}
+-
+-
+-/*
+- * Convert the memory pointed to by mem into hex, placing result in buf.
+- * Return a pointer to the last char put in buf (null), in case of mem fault,
+- * return 0.
+- * may_fault is non-zero if we are reading from arbitrary memory, but is currently
+- * not used.
+- */
+-static unsigned char *mem2hex(char *mem, char *buf, int count, int may_fault)
+-{
+-      unsigned char ch;
+-
+-      while (count-- > 0) {
+-              if (kgdb_read_byte(mem++, &ch) != 0)
+-                      return 0;
+-              *buf++ = hexchars[ch >> 4];
+-              *buf++ = hexchars[ch & 0xf];
+-      }
+-
+-      *buf = 0;
+-
+-      return buf;
+-}
+-
+-/*
+- * convert the hex array pointed to by buf into binary to be placed in mem
+- * return a pointer to the character AFTER the last byte written
+- * may_fault is non-zero if we are reading from arbitrary memory, but is currently
+- * not used.
+- */
+-static char *hex2mem(char *buf, char *mem, int count, int binary, int may_fault)
+-{
+-      int i;
+-      unsigned char ch;
+-
+-      for (i=0; i<count; i++)
+-      {
+-              if (binary) {
+-                      ch = *buf++;
+-                      if (ch == 0x7d)
+-                              ch = 0x20 ^ *buf++;
+-              }
+-              else {
+-                      ch = hex(*buf++) << 4;
+-                      ch |= hex(*buf++);
+-              }
+-              if (kgdb_write_byte(ch, mem++) != 0)
+-                      return 0;
+-      }
+-
+-      return mem;
+-}
+-
+-/*
+- * This table contains the mapping between SPARC hardware trap types, and
+- * signals, which are primarily what GDB understands.  It also indicates
+- * which hardware traps we need to commandeer when initializing the stub.
+- */
+-static struct hard_trap_info {
+-      unsigned char tt;               /* Trap type code for MIPS R3xxx and R4xxx */
+-      unsigned char signo;            /* Signal that we map this trap into */
+-} hard_trap_info[] = {
+-      { 6, SIGBUS },                  /* instruction bus error */
+-      { 7, SIGBUS },                  /* data bus error */
+-      { 9, SIGTRAP },                 /* break */
+-      { 10, SIGILL },                 /* reserved instruction */
+-/*    { 11, SIGILL },         */      /* CPU unusable */
+-      { 12, SIGFPE },                 /* overflow */
+-      { 13, SIGTRAP },                /* trap */
+-      { 14, SIGSEGV },                /* virtual instruction cache coherency */
+-      { 15, SIGFPE },                 /* floating point exception */
+-      { 23, SIGSEGV },                /* watch */
+-      { 31, SIGSEGV },                /* virtual data cache coherency */
+-      { 0, 0}                         /* Must be last */
+-};
+-
+-/* Save the normal trap handlers for user-mode traps. */
+-void *saved_vectors[32];
+-
+-/*
+- * Set up exception handlers for tracing and breakpoints
+- */
+-void set_debug_traps(void)
+-{
+-      struct hard_trap_info *ht;
+-      unsigned long flags;
+-      unsigned char c;
+-
+-      local_irq_save(flags);
+-      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
+-              saved_vectors[ht->tt] = set_except_vector(ht->tt, trap_low);
+-
+-      putDebugChar('+'); /* 'hello world' */
+-      /*
+-       * In case GDB is started before us, ack any packets
+-       * (presumably "$?#xx") sitting there.
+-       */
+-      while((c = getDebugChar()) != '$');
+-      while((c = getDebugChar()) != '#');
+-      c = getDebugChar(); /* eat first csum byte */
+-      c = getDebugChar(); /* eat second csum byte */
+-      putDebugChar('+'); /* ack it */
+-
+-      initialized = 1;
+-      local_irq_restore(flags);
+-}
+-
+-void restore_debug_traps(void)
+-{
+-      struct hard_trap_info *ht;
+-      unsigned long flags;
+-
+-      local_irq_save(flags);
+-      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
+-              set_except_vector(ht->tt, saved_vectors[ht->tt]);
+-      local_irq_restore(flags);
+-}
+-
+-/*
+- * Convert the MIPS hardware trap type code to a Unix signal number.
+- */
+-static int computeSignal(int tt)
+-{
+-      struct hard_trap_info *ht;
+-
+-      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
+-              if (ht->tt == tt)
+-                      return ht->signo;
+-
+-      return SIGHUP;          /* default for things we don't know about */
+-}
+-
+-/*
+- * While we find nice hex chars, build an int.
+- * Return number of chars processed.
+- */
+-static int hexToInt(char **ptr, int *intValue)
+-{
+-      int numChars = 0;
+-      int hexValue;
+-
+-      *intValue = 0;
+-
+-      while (**ptr) {
+-              hexValue = hex(**ptr);
+-              if (hexValue < 0)
+-                      break;
+-
+-              *intValue = (*intValue << 4) | hexValue;
+-              numChars ++;
+-
+-              (*ptr)++;
+-      }
+-
+-      return (numChars);
+-}
+-
+-static int hexToLong(char **ptr, long *longValue)
+-{
+-      int numChars = 0;
+-      int hexValue;
+-
+-      *longValue = 0;
+-
+-      while (**ptr) {
+-              hexValue = hex(**ptr);
+-              if (hexValue < 0)
+-                      break;
+-
+-              *longValue = (*longValue << 4) | hexValue;
+-              numChars ++;
+-
+-              (*ptr)++;
+-      }
+-
+-      return numChars;
+-}
+-
+-
+-#if 0
+-/*
+- * Print registers (on target console)
+- * Used only to debug the stub...
+- */
+-void show_gdbregs(struct gdb_regs * regs)
+-{
+-      /*
+-       * Saved main processor registers
+-       */
+-      printk("$0 : %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n",
+-             regs->reg0, regs->reg1, regs->reg2, regs->reg3,
+-               regs->reg4, regs->reg5, regs->reg6, regs->reg7);
+-      printk("$8 : %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n",
+-             regs->reg8, regs->reg9, regs->reg10, regs->reg11,
+-               regs->reg12, regs->reg13, regs->reg14, regs->reg15);
+-      printk("$16: %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n",
+-             regs->reg16, regs->reg17, regs->reg18, regs->reg19,
+-               regs->reg20, regs->reg21, regs->reg22, regs->reg23);
+-      printk("$24: %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n",
+-             regs->reg24, regs->reg25, regs->reg26, regs->reg27,
+-             regs->reg28, regs->reg29, regs->reg30, regs->reg31);
+-
+-      /*
+-       * Saved cp0 registers
+-       */
+-      printk("epc  : %08lx\nStatus: %08lx\nCause : %08lx\n",
+-             regs->cp0_epc, regs->cp0_status, regs->cp0_cause);
+-}
+-#endif /* dead code */
+-
+-/*
+- * We single-step by setting breakpoints. When an exception
+- * is handled, we need to restore the instructions hoisted
+- * when the breakpoints were set.
+- *
+- * This is where we save the original instructions.
+- */
+-static struct gdb_bp_save {
+-      unsigned long addr;
+-      unsigned int val;
+-} step_bp[2];
+-
+-#define BP 0x0000000d  /* break opcode */
+-
+-/*
+- * Set breakpoint instructions for single stepping.
+- */
+-static void single_step(struct gdb_regs *regs)
+-{
+-      union mips_instruction insn;
+-      unsigned long targ;
+-      int is_branch, is_cond, i;
+-
+-      targ = regs->cp0_epc;
+-      insn.word = *(unsigned int *)targ;
+-      is_branch = is_cond = 0;
+-
+-      switch (insn.i_format.opcode) {
+-      /*
+-       * jr and jalr are in r_format format.
+-       */
+-      case spec_op:
+-              switch (insn.r_format.func) {
+-              case jalr_op:
+-              case jr_op:
+-                      targ = *(&regs->reg0 + insn.r_format.rs);
+-                      is_branch = 1;
+-                      break;
+-              }
+-              break;
+-
+-      /*
+-       * This group contains:
+-       * bltz_op, bgez_op, bltzl_op, bgezl_op,
+-       * bltzal_op, bgezal_op, bltzall_op, bgezall_op.
+-       */
+-      case bcond_op:
+-              is_branch = is_cond = 1;
+-              targ += 4 + (insn.i_format.simmediate << 2);
+-              break;
+-
+-      /*
+-       * These are unconditional and in j_format.
+-       */
+-      case jal_op:
+-      case j_op:
+-              is_branch = 1;
+-              targ += 4;
+-              targ >>= 28;
+-              targ <<= 28;
+-              targ |= (insn.j_format.target << 2);
+-              break;
+-
+-      /*
+-       * These are conditional.
+-       */
+-      case beq_op:
+-      case beql_op:
+-      case bne_op:
+-      case bnel_op:
+-      case blez_op:
+-      case blezl_op:
+-      case bgtz_op:
+-      case bgtzl_op:
+-      case cop0_op:
+-      case cop1_op:
+-      case cop2_op:
+-      case cop1x_op:
+-              is_branch = is_cond = 1;
+-              targ += 4 + (insn.i_format.simmediate << 2);
+-              break;
+-      }
+-
+-      if (is_branch) {
+-              i = 0;
+-              if (is_cond && targ != (regs->cp0_epc + 8)) {
+-                      step_bp[i].addr = regs->cp0_epc + 8;
+-                      step_bp[i++].val = *(unsigned *)(regs->cp0_epc + 8);
+-                      *(unsigned *)(regs->cp0_epc + 8) = BP;
+-              }
+-              step_bp[i].addr = targ;
+-              step_bp[i].val  = *(unsigned *)targ;
+-              *(unsigned *)targ = BP;
+-      } else {
+-              step_bp[0].addr = regs->cp0_epc + 4;
+-              step_bp[0].val  = *(unsigned *)(regs->cp0_epc + 4);
+-              *(unsigned *)(regs->cp0_epc + 4) = BP;
+-      }
+-}
+-
+-/*
+- *  If asynchronously interrupted by gdb, then we need to set a breakpoint
+- *  at the interrupted instruction so that we wind up stopped with a
+- *  reasonable stack frame.
+- */
+-static struct gdb_bp_save async_bp;
+-
+-/*
+- * Swap the interrupted EPC with our asynchronous breakpoint routine.
+- * This is safer than stuffing the breakpoint in-place, since no cache
+- * flushes (or resulting smp_call_functions) are required.  The
+- * assumption is that only one CPU will be handling asynchronous bp's,
+- * and only one can be active at a time.
+- */
+-extern spinlock_t smp_call_lock;
+-
+-void set_async_breakpoint(unsigned long *epc)
+-{
+-      /* skip breaking into userland */
+-      if ((*epc & 0x80000000) == 0)
+-              return;
+-
+-#ifdef CONFIG_SMP
+-      /* avoid deadlock if someone is make IPC */
+-      if (spin_is_locked(&smp_call_lock))
+-              return;
+-#endif
+-
+-      async_bp.addr = *epc;
+-      *epc = (unsigned long)async_breakpoint;
+-}
+-
+-static void kgdb_wait(void *arg)
+-{
+-      unsigned flags;
+-      int cpu = smp_processor_id();
+-
+-      local_irq_save(flags);
+-
+-      __raw_spin_lock(&kgdb_cpulock[cpu]);
+-      __raw_spin_unlock(&kgdb_cpulock[cpu]);
+-
+-      local_irq_restore(flags);
+-}
+-
+-/*
+- * GDB stub needs to call kgdb_wait on all processor with interrupts
+- * disabled, so it uses it's own special variant.
+- */
+-static int kgdb_smp_call_kgdb_wait(void)
+-{
+-#ifdef CONFIG_SMP
+-      struct call_data_struct data;
+-      int i, cpus = num_online_cpus() - 1;
+-      int cpu = smp_processor_id();
+-
+-      /*
+-       * Can die spectacularly if this CPU isn't yet marked online
+-       */
+-      BUG_ON(!cpu_online(cpu));
+-
+-      if (!cpus)
+-              return 0;
+-
+-      if (spin_is_locked(&smp_call_lock)) {
+-              /*
+-               * Some other processor is trying to make us do something
+-               * but we're not going to respond... give up
+-               */
+-              return -1;
+-              }
+-
+-      /*
+-       * We will continue here, accepting the fact that
+-       * the kernel may deadlock if another CPU attempts
+-       * to call smp_call_function now...
+-       */
+-
+-      data.func = kgdb_wait;
+-      data.info = NULL;
+-      atomic_set(&data.started, 0);
+-      data.wait = 0;
+-
+-      spin_lock(&smp_call_lock);
+-      call_data = &data;
+-      mb();
+-
+-      /* Send a message to all other CPUs and wait for them to respond */
+-      for (i = 0; i < NR_CPUS; i++)
+-              if (cpu_online(i) && i != cpu)
+-                      core_send_ipi(i, SMP_CALL_FUNCTION);
+-
+-      /* Wait for response */
+-      /* FIXME: lock-up detection, backtrace on lock-up */
+-      while (atomic_read(&data.started) != cpus)
+-              barrier();
+-
+-      call_data = NULL;
+-      spin_unlock(&smp_call_lock);
+-#endif
+-
+-      return 0;
+-}
+-
+-/*
+- * This function does all command processing for interfacing to gdb.  It
+- * returns 1 if you should skip the instruction at the trap address, 0
+- * otherwise.
+- */
+-void handle_exception (struct gdb_regs *regs)
+-{
+-      int trap;                       /* Trap type */
+-      int sigval;
+-      long addr;
+-      int length;
+-      char *ptr;
+-      unsigned long *stack;
+-      int i;
+-      int bflag = 0;
+-
+-      kgdb_started = 1;
+-
+-      /*
+-       * acquire the big kgdb spinlock
+-       */
+-      if (!spin_trylock(&kgdb_lock)) {
+-              /*
+-               * some other CPU has the lock, we should go back to
+-               * receive the gdb_wait IPC
+-               */
+-              return;
+-      }
+-
+-      /*
+-       * If we're in async_breakpoint(), restore the real EPC from
+-       * the breakpoint.
+-       */
+-      if (regs->cp0_epc == (unsigned long)async_breakinst) {
+-              regs->cp0_epc = async_bp.addr;
+-              async_bp.addr = 0;
+-      }
+-
+-      /*
+-       * acquire the CPU spinlocks
+-       */
+-      for (i = num_online_cpus()-1; i >= 0; i--)
+-              if (__raw_spin_trylock(&kgdb_cpulock[i]) == 0)
+-                      panic("kgdb: couldn't get cpulock %d\n", i);
+-
+-      /*
+-       * force other cpus to enter kgdb
+-       */
+-      kgdb_smp_call_kgdb_wait();
+-
+-      /*
+-       * If we're in breakpoint() increment the PC
+-       */
+-      trap = (regs->cp0_cause & 0x7c) >> 2;
+-      if (trap == 9 && regs->cp0_epc == (unsigned long)breakinst)
+-              regs->cp0_epc += 4;
+-
+-      /*
+-       * If we were single_stepping, restore the opcodes hoisted
+-       * for the breakpoint[s].
+-       */
+-      if (step_bp[0].addr) {
+-              *(unsigned *)step_bp[0].addr = step_bp[0].val;
+-              step_bp[0].addr = 0;
+-
+-              if (step_bp[1].addr) {
+-                      *(unsigned *)step_bp[1].addr = step_bp[1].val;
+-                      step_bp[1].addr = 0;
+-              }
+-      }
+-
+-      stack = (long *)regs->reg29;                    /* stack ptr */
+-      sigval = computeSignal(trap);
+-
+-      /*
+-       * reply to host that an exception has occurred
+-       */
+-      ptr = output_buffer;
+-
+-      /*
+-       * Send trap type (converted to signal)
+-       */
+-      *ptr++ = 'T';
+-      *ptr++ = hexchars[sigval >> 4];
+-      *ptr++ = hexchars[sigval & 0xf];
+-
+-      /*
+-       * Send Error PC
+-       */
+-      *ptr++ = hexchars[REG_EPC >> 4];
+-      *ptr++ = hexchars[REG_EPC & 0xf];
+-      *ptr++ = ':';
+-      ptr = mem2hex((char *)&regs->cp0_epc, ptr, sizeof(long), 0);
+-      *ptr++ = ';';
+-
+-      /*
+-       * Send frame pointer
+-       */
+-      *ptr++ = hexchars[REG_FP >> 4];
+-      *ptr++ = hexchars[REG_FP & 0xf];
+-      *ptr++ = ':';
+-      ptr = mem2hex((char *)&regs->reg30, ptr, sizeof(long), 0);
+-      *ptr++ = ';';
+-
+-      /*
+-       * Send stack pointer
+-       */
+-      *ptr++ = hexchars[REG_SP >> 4];
+-      *ptr++ = hexchars[REG_SP & 0xf];
+-      *ptr++ = ':';
+-      ptr = mem2hex((char *)&regs->reg29, ptr, sizeof(long), 0);
+-      *ptr++ = ';';
+-
+-      *ptr++ = 0;
+-      putpacket(output_buffer);       /* send it off... */
+-
+-      /*
+-       * Wait for input from remote GDB
+-       */
+-      while (1) {
+-              output_buffer[0] = 0;
+-              getpacket(input_buffer);
+-
+-              switch (input_buffer[0])
+-              {
+-              case '?':
+-                      output_buffer[0] = 'S';
+-                      output_buffer[1] = hexchars[sigval >> 4];
+-                      output_buffer[2] = hexchars[sigval & 0xf];
+-                      output_buffer[3] = 0;
+-                      break;
+-
+-              /*
+-               * Detach debugger; let CPU run
+-               */
+-              case 'D':
+-                      putpacket(output_buffer);
+-                      goto finish_kgdb;
+-                      break;
+-
+-              case 'd':
+-                      /* toggle debug flag */
+-                      break;
+-
+-              /*
+-               * Return the value of the CPU registers
+-               */
+-              case 'g':
+-                      ptr = output_buffer;
+-                      ptr = mem2hex((char *)&regs->reg0, ptr, 32*sizeof(long), 0); /* r0...r31 */
+-                      ptr = mem2hex((char *)&regs->cp0_status, ptr, 6*sizeof(long), 0); /* cp0 */
+-                      ptr = mem2hex((char *)&regs->fpr0, ptr, 32*sizeof(long), 0); /* f0...31 */
+-                      ptr = mem2hex((char *)&regs->cp1_fsr, ptr, 2*sizeof(long), 0); /* cp1 */
+-                      ptr = mem2hex((char *)&regs->frame_ptr, ptr, 2*sizeof(long), 0); /* frp */
+-                      ptr = mem2hex((char *)&regs->cp0_index, ptr, 16*sizeof(long), 0); /* cp0 */
+-                      break;
+-
+-              /*
+-               * set the value of the CPU registers - return OK
+-               */
+-              case 'G':
+-              {
+-                      ptr = &input_buffer[1];
+-                      hex2mem(ptr, (char *)&regs->reg0, 32*sizeof(long), 0, 0);
+-                      ptr += 32*(2*sizeof(long));
+-                      hex2mem(ptr, (char *)&regs->cp0_status, 6*sizeof(long), 0, 0);
+-                      ptr += 6*(2*sizeof(long));
+-                      hex2mem(ptr, (char *)&regs->fpr0, 32*sizeof(long), 0, 0);
+-                      ptr += 32*(2*sizeof(long));
+-                      hex2mem(ptr, (char *)&regs->cp1_fsr, 2*sizeof(long), 0, 0);
+-                      ptr += 2*(2*sizeof(long));
+-                      hex2mem(ptr, (char *)&regs->frame_ptr, 2*sizeof(long), 0, 0);
+-                      ptr += 2*(2*sizeof(long));
+-                      hex2mem(ptr, (char *)&regs->cp0_index, 16*sizeof(long), 0, 0);
+-                      strcpy(output_buffer,"OK");
+-               }
+-              break;
+-
+-              /*
+-               * mAA..AA,LLLL  Read LLLL bytes at address AA..AA
+-               */
+-              case 'm':
+-                      ptr = &input_buffer[1];
+-
+-                      if (hexToLong(&ptr, &addr)
+-                              && *ptr++ == ','
+-                              && hexToInt(&ptr, &length)) {
+-                              if (mem2hex((char *)addr, output_buffer, length, 1))
+-                                      break;
+-                              strcpy (output_buffer, "E03");
+-                      } else
+-                              strcpy(output_buffer,"E01");
+-                      break;
+-
+-              /*
+-               * XAA..AA,LLLL: Write LLLL escaped binary bytes at address AA.AA
+-               */
+-              case 'X':
+-                      bflag = 1;
+-                      /* fall through */
+-
+-              /*
+-               * MAA..AA,LLLL: Write LLLL bytes at address AA.AA return OK
+-               */
+-              case 'M':
+-                      ptr = &input_buffer[1];
+-
+-                      if (hexToLong(&ptr, &addr)
+-                              && *ptr++ == ','
+-                              && hexToInt(&ptr, &length)
+-                              && *ptr++ == ':') {
+-                              if (hex2mem(ptr, (char *)addr, length, bflag, 1))
+-                                      strcpy(output_buffer, "OK");
+-                              else
+-                                      strcpy(output_buffer, "E03");
+-                      }
+-                      else
+-                              strcpy(output_buffer, "E02");
+-                      break;
+-
+-              /*
+-               * cAA..AA    Continue at address AA..AA(optional)
+-               */
+-              case 'c':
+-                      /* try to read optional parameter, pc unchanged if no parm */
+-
+-                      ptr = &input_buffer[1];
+-                      if (hexToLong(&ptr, &addr))
+-                              regs->cp0_epc = addr;
+-
+-                      goto exit_kgdb_exception;
+-                      break;
+-
+-              /*
+-               * kill the program; let us try to restart the machine
+-               * Reset the whole machine.
+-               */
+-              case 'k':
+-              case 'r':
+-                      machine_restart("kgdb restarts machine");
+-                      break;
+-
+-              /*
+-               * Step to next instruction
+-               */
+-              case 's':
+-                      /*
+-                       * There is no single step insn in the MIPS ISA, so we
+-                       * use breakpoints and continue, instead.
+-                       */
+-                      single_step(regs);
+-                      goto exit_kgdb_exception;
+-                      /* NOTREACHED */
+-                      break;
+-
+-              /*
+-               * Set baud rate (bBB)
+-               * FIXME: Needs to be written
+-               */
+-              case 'b':
+-              {
+-#if 0
+-                      int baudrate;
+-                      extern void set_timer_3();
+-
+-                      ptr = &input_buffer[1];
+-                      if (!hexToInt(&ptr, &baudrate))
+-                      {
+-                              strcpy(output_buffer,"B01");
+-                              break;
+-                      }
+-
+-                      /* Convert baud rate to uart clock divider */
+-
+-                      switch (baudrate)
+-                      {
+-                              case 38400:
+-                                      baudrate = 16;
+-                                      break;
+-                              case 19200:
+-                                      baudrate = 33;
+-                                      break;
+-                              case 9600:
+-                                      baudrate = 65;
+-                                      break;
+-                              default:
+-                                      baudrate = 0;
+-                                      strcpy(output_buffer,"B02");
+-                                      goto x1;
+-                      }
+-
+-                      if (baudrate) {
+-                              putpacket("OK");        /* Ack before changing speed */
+-                              set_timer_3(baudrate); /* Set it */
+-                      }
+-#endif
+-              }
+-              break;
+-
+-              }                       /* switch */
+-
+-              /*
+-               * reply to the request
+-               */
+-
+-              putpacket(output_buffer);
+-
+-      } /* while */
+-
+-      return;
+-
+-finish_kgdb:
+-      restore_debug_traps();
+-
+-exit_kgdb_exception:
+-      /* release locks so other CPUs can go */
+-      for (i = num_online_cpus()-1; i >= 0; i--)
+-              __raw_spin_unlock(&kgdb_cpulock[i]);
+-      spin_unlock(&kgdb_lock);
+-
+-      __flush_cache_all();
+-      return;
+-}
+-
+-/*
+- * This function will generate a breakpoint exception.  It is used at the
+- * beginning of a program to sync up with a debugger and can be used
+- * otherwise as a quick means to stop program execution and "break" into
+- * the debugger.
+- */
+-void breakpoint(void)
+-{
+-      if (!initialized)
+-              return;
+-
+-      __asm__ __volatile__(
+-                      ".globl breakinst\n\t"
+-                      ".set\tnoreorder\n\t"
+-                      "nop\n"
+-                      "breakinst:\tbreak\n\t"
+-                      "nop\n\t"
+-                      ".set\treorder"
+-                      );
+-}
+-
+-/* Nothing but the break; don't pollute any registers */
+-void async_breakpoint(void)
+-{
+-      __asm__ __volatile__(
+-                      ".globl async_breakinst\n\t"
+-                      ".set\tnoreorder\n\t"
+-                      "nop\n"
+-                      "async_breakinst:\tbreak\n\t"
+-                      "nop\n\t"
+-                      ".set\treorder"
+-                      );
+-}
+-
+-void adel(void)
+-{
+-      __asm__ __volatile__(
+-                      ".globl\tadel\n\t"
+-                      "lui\t$8,0x8000\n\t"
+-                      "lw\t$9,1($8)\n\t"
+-                      );
+-}
+-
+-/*
+- * malloc is needed by gdb client in "call func()", even a private one
+- * will make gdb happy
+- */
+-static void * __attribute_used__ malloc(size_t size)
+-{
+-      return kmalloc(size, GFP_ATOMIC);
+-}
+-
+-static void __attribute_used__ free (void *where)
+-{
+-      kfree(where);
+-}
+-
+-#ifdef CONFIG_GDB_CONSOLE
+-
+-void gdb_putsn(const char *str, int l)
+-{
+-      char outbuf[18];
+-
+-      if (!kgdb_started)
+-              return;
+-
+-      outbuf[0]='O';
+-
+-      while(l) {
+-              int i = (l>8)?8:l;
+-              mem2hex((char *)str, &outbuf[1], i, 0);
+-              outbuf[(i*2)+1]=0;
+-              putpacket(outbuf);
+-              str += i;
+-              l -= i;
+-      }
+-}
+-
+-static void gdb_console_write(struct console *con, const char *s, unsigned n)
+-{
+-      gdb_putsn(s, n);
+-}
+-
+-static struct console gdb_console = {
+-      .name   = "gdb",
+-      .write  = gdb_console_write,
+-      .flags  = CON_PRINTBUFFER,
+-      .index  = -1
+-};
+-
+-static int __init register_gdb_console(void)
+-{
+-      register_console(&gdb_console);
+-
+-      return 0;
+-}
+-
+-console_initcall(register_gdb_console);
+-
+-#endif
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/irq.c linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/irq.c
+--- linux-2.6.18-53.1.14/arch/mips/kernel/irq.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/irq.c   2008-06-10 15:38:24.000000000 +0400
+@@ -25,6 +25,10 @@
+ #include <asm/atomic.h>
+ #include <asm/system.h>
+ #include <asm/uaccess.h>
++#include <asm/kgdb.h>
++
++/* Keep track of if we've done certain initialization already or not. */
++int kgdb_early_setup;
+ 
+ /*
+  * 'what should we do if we get a hw irq event on an illegal vector'.
+@@ -115,23 +119,13 @@ asmlinkage void spurious_interrupt(struc
+       atomic_inc(&irq_err_count);
+ }
+ 
+-#ifdef CONFIG_KGDB
+-extern void breakpoint(void);
+-extern void set_debug_traps(void);
+-
+-static int kgdb_flag = 1;
+-static int __init nokgdb(char *str)
+-{
+-      kgdb_flag = 0;
+-      return 1;
+-}
+-__setup("nokgdb", nokgdb);
+-#endif
+-
+ void __init init_IRQ(void)
+ {
+       int i;
+ 
++      if (kgdb_early_setup)
++              return;
++
+       for (i = 0; i < NR_IRQS; i++) {
+               irq_desc[i].status  = IRQ_DISABLED;
+               irq_desc[i].action  = NULL;
+@@ -144,12 +138,12 @@ void __init init_IRQ(void)
+       }
+ 
+       arch_init_irq();
+-
+ #ifdef CONFIG_KGDB
+-      if (kgdb_flag) {
+-              printk("Wait for gdb client connection ...\n");
+-              set_debug_traps();
+-              breakpoint();
+-      }
++      /*
++       * We have been called before kgdb_arch_init(). Hence,
++       * we don't want the traps to be reinitialized
++       */
++      if (kgdb_early_setup == 0)
++              kgdb_early_setup = 1;
+ #endif
+ }
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/kgdb-jmp.c linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/kgdb-jmp.c
+--- linux-2.6.18-53.1.14/arch/mips/kernel/kgdb-jmp.c   1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/kgdb-jmp.c      2008-06-10 15:38:24.000000000 +0400
+@@ -0,0 +1,116 @@
++/*
++ * arch/mips/kernel/kgdb-jmp.c
++ *
++ * Save and restore system registers so that within a limited frame we
++ * may have a fault and "jump back" to a known safe location.
++ *
++ * Author: Tom Rini <trini@kernel.crashing.org>
++ * Author: Manish Lachwani <mlachwani@mvista.com>
++ *
++ * Cribbed from glibc, which carries the following:
++ * Copyright (C) 1996, 1997, 2000, 2002, 2003 Free Software Foundation, Inc.
++ * Copyright (C) 2005 by MontaVista Software.
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of
++ * any kind, whether express or implied.
++ */
++
++#include <linux/kgdb.h>
++#include <asm/interrupt.h>
++
++#ifdef CONFIG_MIPS64
++/*
++ * MIPS 64-bit
++ */
++
++int kgdb_fault_setjmp_aux(unsigned long *curr_context, int sp, int fp)
++{
++      __asm__ __volatile__ ("sd $gp, %0" : : "m" (curr_context[0]));
++      __asm__ __volatile__ ("sd $16, %0" : : "m" (curr_context[1]));
++      __asm__ __volatile__ ("sd $17, %0" : : "m" (curr_context[2]));
++      __asm__ __volatile__ ("sd $18, %0" : : "m" (curr_context[3]));
++      __asm__ __volatile__ ("sd $19, %0" : : "m" (curr_context[4]));
++      __asm__ __volatile__ ("sd $20, %0" : : "m" (curr_context[5]));
++      __asm__ __volatile__ ("sd $21, %0" : : "m" (curr_context[6]));
++      __asm__ __volatile__ ("sd $22, %0" : : "m" (curr_context[7]));
++      __asm__ __volatile__ ("sd $23, %0" : : "m" (curr_context[8]));
++      __asm__ __volatile__ ("sd $31, %0" : : "m" (curr_context[9]));
++      curr_context[10] = (long *)sp;
++      curr_context[11] = (long *)fp;
++
++      return 0;
++}
++
++void kgdb_fault_longjmp(unsigned long *curr_context)
++{
++      unsigned long sp_val, fp_val;
++
++      __asm__ __volatile__ ("ld $gp, %0" : : "m" (curr_context[0]));
++      __asm__ __volatile__ ("ld $16, %0" : : "m" (curr_context[1]));
++      __asm__ __volatile__ ("ld $17, %0" : : "m" (curr_context[2]));
++      __asm__ __volatile__ ("ld $18, %0" : : "m" (curr_context[3]));
++      __asm__ __volatile__ ("ld $19, %0" : : "m" (curr_context[4]));
++      __asm__ __volatile__ ("ld $20, %0" : : "m" (curr_context[5]));
++      __asm__ __volatile__ ("ld $21, %0" : : "m" (curr_context[6]));
++      __asm__ __volatile__ ("ld $22, %0" : : "m" (curr_context[7]));
++      __asm__ __volatile__ ("ld $23, %0" : : "m" (curr_context[8]));
++      __asm__ __volatile__ ("ld $25, %0" : : "m" (curr_context[9]));
++      sp_val = curr_context[10];
++      fp_val = curr_context[11];
++      __asm__ __volatile__ ("ld $29, %0\n\t"
++                            "ld $30, %1\n\t" : : "m" (sp_val), "m" (fp_val));
++
++      __asm__ __volatile__ ("dli $2, 1");
++      __asm__ __volatile__ ("j $25");
++
++      for (;;);
++}
++#else
++/*
++ * MIPS 32-bit
++ */
++
++int kgdb_fault_setjmp_aux(unsigned long *curr_context, int sp, int fp)
++{
++      __asm__ __volatile__("sw $gp, %0" : : "m" (curr_context[0]));
++      __asm__ __volatile__("sw $16, %0" : : "m" (curr_context[1]));
++      __asm__ __volatile__("sw $17, %0" : : "m" (curr_context[2]));
++      __asm__ __volatile__("sw $18, %0" : : "m" (curr_context[3]));
++      __asm__ __volatile__("sw $19, %0" : : "m" (curr_context[4]));
++      __asm__ __volatile__("sw $20, %0" : : "m" (curr_context[5]));
++      __asm__ __volatile__("sw $21, %0" : : "m" (curr_context[6]));
++      __asm__ __volatile__("sw $22, %0" : : "m" (curr_context[7]));
++      __asm__ __volatile__("sw $23, %0" : : "m" (curr_context[8]));
++      __asm__ __volatile__("sw $31, %0" : : "m" (curr_context[9]));
++      curr_context[10] = (long *)sp;
++      curr_context[11] = (long *)fp;
++
++      return 0;
++}
++
++void kgdb_fault_longjmp(unsigned long *curr_context)
++{
++      unsigned long sp_val, fp_val;
++
++      __asm__ __volatile__("lw $gp, %0" : : "m" (curr_context[0]));
++      __asm__ __volatile__("lw $16, %0" : : "m" (curr_context[1]));
++      __asm__ __volatile__("lw $17, %0" : : "m" (curr_context[2]));
++      __asm__ __volatile__("lw $18, %0" : : "m" (curr_context[3]));
++      __asm__ __volatile__("lw $19, %0" : : "m" (curr_context[4]));
++      __asm__ __volatile__("lw $20, %0" : : "m" (curr_context[5]));
++      __asm__ __volatile__("lw $21, %0" : : "m" (curr_context[6]));
++      __asm__ __volatile__("lw $22, %0" : : "m" (curr_context[7]));
++      __asm__ __volatile__("lw $23, %0" : : "m" (curr_context[8]));
++      __asm__ __volatile__("lw $25, %0" : : "m" (curr_context[9]));
++      sp_val = curr_context[10];
++      fp_val = curr_context[11];
++      __asm__ __volatile__("lw $29, %0\n\t"
++                            "lw $30, %1\n\t" : : "m" (sp_val), "m" (fp_val));
++
++      __asm__ __volatile__("li $2, 1");
++      __asm__ __volatile__("jr $25");
++
++      for (;;);
++}
++#endif
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/kgdb-setjmp.S linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/kgdb-setjmp.S
+--- linux-2.6.18-53.1.14/arch/mips/kernel/kgdb-setjmp.S        1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/kgdb-setjmp.S   2008-06-10 15:38:24.000000000 +0400
+@@ -0,0 +1,28 @@
++/*
++ * arch/mips/kernel/kgdb-setjmp.S
++ *
++ * Save and restore system registers so that within a limited frame we
++ * may have a fault and "jump back" to a known safe location.
++ *
++ * Copyright (C) 2005 by MontaVista Software.
++ * Author: Manish Lachwani (mlachwani@mvista.com)
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of
++ * any kind, whether express or implied.
++ */
++
++#include <asm/asm.h>
++#include <asm/mipsregs.h>
++#include <asm/regdef.h>
++#include <asm/stackframe.h>
++
++      .ent    kgdb_fault_setjmp,0
++ENTRY (kgdb_fault_setjmp)
++      move    a1, sp
++      move    a2, fp
++#ifdef CONFIG_MIPS64
++      nop
++#endif
++      j       kgdb_fault_setjmp_aux
++      .end    kgdb_fault_setjmp
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/kgdb.c
+--- linux-2.6.18-53.1.14/arch/mips/kernel/kgdb.c       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/kgdb.c  2008-06-10 15:38:24.000000000 +0400
+@@ -0,0 +1,297 @@
++/*
++ * arch/mips/kernel/kgdb.c
++ *
++ *  Originally written by Glenn Engel, Lake Stevens Instrument Division
++ *
++ *  Contributed by HP Systems
++ *
++ *  Modified for SPARC by Stu Grossman, Cygnus Support.
++ *
++ *  Modified for Linux/MIPS (and MIPS in general) by Andreas Busse
++ *  Send complaints, suggestions etc. to <andy@waldorf-gmbh.de>
++ *
++ *  Copyright (C) 1995 Andreas Busse
++ *
++ *  Copyright (C) 2003 MontaVista Software Inc.
++ *  Author: Jun Sun, jsun@mvista.com or jsun@junsun.net
++ *
++ *  Copyright (C) 2004-2005 MontaVista Software Inc.
++ *  Author: Manish Lachwani, mlachwani@mvista.com or manish@koffee-break.com
++ *
++ *  This file is licensed under the terms of the GNU General Public License
++ *  version 2. This program is licensed "as is" without any warranty of any
++ *  kind, whether express or implied.
++ */
++
++#include <linux/string.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/smp.h>
++#include <linux/spinlock.h>
++#include <linux/delay.h>
++#include <asm/system.h>
++#include <asm/ptrace.h>               /* for linux pt_regs struct */
++#include <linux/kgdb.h>
++#include <linux/init.h>
++#include <asm/inst.h>
++#include <asm/gdb-stub.h>
++#include <asm/cacheflush.h>
++#include <asm/kdebug.h>
++
++static struct hard_trap_info {
++      unsigned char tt;       /* Trap type code for MIPS R3xxx and R4xxx */
++      unsigned char signo;    /* Signal that we map this trap into */
++} hard_trap_info[] = {
++      { 6, SIGBUS },          /* instruction bus error */
++      { 7, SIGBUS },          /* data bus error */
++      { 9, SIGTRAP },         /* break */
++/*    { 11, SIGILL }, */      /* CPU unusable */
++      { 12, SIGFPE },         /* overflow */
++      { 13, SIGTRAP },        /* trap */
++      { 14, SIGSEGV },        /* virtual instruction cache coherency */
++      { 15, SIGFPE },         /* floating point exception */
++      { 23, SIGSEGV },        /* watch */
++      { 31, SIGSEGV },        /* virtual data cache coherency */
++      { 0, 0}                 /* Must be last */
++};
++
++/* Save the normal trap handlers for user-mode traps. */
++void *saved_vectors[32];
++
++extern void trap_low(void);
++extern void breakinst(void);
++extern void init_IRQ(void);
++
++void kgdb_call_nmi_hook(void *ignored)
++{
++      kgdb_nmihook(smp_processor_id(), (void *)0);    /* current CPU id, null second argument */
++}
++
++void kgdb_roundup_cpus(unsigned long flags)
++{
++      local_irq_restore(flags);       /* put back the IRQ state passed in by the caller */
++      smp_call_function(kgdb_call_nmi_hook, 0, 0, 0); /* have the other CPUs run kgdb_call_nmi_hook() */
++      local_irq_save(flags);          /* flags is a by-value copy; the caller's variable is untouched */
++}
++
++static int compute_signal(int tt)
++{
++      struct hard_trap_info *ht;
++
++      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)    /* stops at the { 0, 0 } entry */
++              if (ht->tt == tt)
++                      return ht->signo;       /* signal mapped to trap type tt */
++
++      return SIGHUP;          /* default for things we don't know about */
++}
++
++/*
++ * C-level handler for the traps hooked by set_debug_traps(); called from trap_low.
++ */
++void handle_exception(struct pt_regs *regs)
++{
++      int trap = (regs->cp0_cause & 0x7c) >> 2;       /* bits 6..2 of cp0_cause: trap type */
++
++      if (fixup_exception(regs)) {    /* non-zero: fixup_exception() dealt with it */
++              return;
++      }
++
++      if (atomic_read(&debugger_active))
++              kgdb_nmihook(smp_processor_id(), regs);
++
++      if (atomic_read(&kgdb_setting_breakpoint))
++              if ((trap == 9) && (regs->cp0_epc == (unsigned long)breakinst))
++                      regs->cp0_epc += 4;     /* break (trap 9) at breakinst: advance epc by 4 */
++
++      kgdb_handle_exception(0, compute_signal(trap), 0, regs);
++
++      /* In SMP mode, __flush_cache_all does IPI */
++      __flush_cache_all();
++}
++
++void set_debug_traps(void)
++{
++      struct hard_trap_info *ht;
++      unsigned long flags;
++
++      local_irq_save(flags);          /* vectors are swapped with local IRQs off */
++
++      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
++              saved_vectors[ht->tt] = set_except_vector(ht->tt, trap_low);    /* keep the returned value; trap_low jumps through saved_vectors */
++
++      local_irq_restore(flags);
++}
++
++#if 0
++/* This should be called before we exit kgdb_handle_exception() I believe.
++ * -- Tom
++ */
++void restore_debug_traps(void)
++{
++      struct hard_trap_info *ht;
++      unsigned long flags;
++
++      local_irq_save(flags);
++      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
++              set_except_vector(ht->tt, saved_vectors[ht->tt]);       /* reinstall what set_debug_traps() saved */
++      local_irq_restore(flags);
++}
++#endif
++
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      int reg;
++      gdb_reg_t *ptr = (gdb_reg_t*)gdb_regs;  /* output is written in gdb_reg_t units */
++
++      for (reg = 0; reg < 32; reg++)  /* slots 0..31: regs->regs[0..31] */
++              *(ptr++) = regs->regs[reg];
++
++      *(ptr++) = regs->cp0_status;    /* slots 32..37: status, lo, hi, badvaddr, cause, epc */
++      *(ptr++) = regs->lo;
++      *(ptr++) = regs->hi;
++      *(ptr++) = regs->cp0_badvaddr;
++      *(ptr++) = regs->cp0_cause;
++      *(ptr++) = regs->cp0_epc;
++
++      return;
++}
++
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++
++      int reg;
++      const gdb_reg_t *ptr = (gdb_reg_t*)gdb_regs;    /* input is read in gdb_reg_t units */
++
++      for (reg = 0; reg < 32; reg++)  /* slots 0..31 -> regs->regs[0..31] */
++              regs->regs[reg] = *(ptr++);
++
++      regs->cp0_status = *(ptr++);    /* same slot order as regs_to_gdb_regs() writes */
++      regs->lo = *(ptr++);
++      regs->hi = *(ptr++);
++      regs->cp0_badvaddr = *(ptr++);
++      regs->cp0_cause = *(ptr++);
++      regs->cp0_epc = *(ptr++);
++
++      return;
++}
++
++/*
++ * Similar to regs_to_gdb_regs() except that process is sleeping and so we may not be
++ * able to get all the info: values come from a pt_regs frame, regs 24..27 read as 0.
++ */
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
++{
++      int reg;
++      struct thread_info *ti = p->thread_info;
++      unsigned long ksp = (unsigned long)ti + THREAD_SIZE - 32;       /* 32 bytes below thread_info + THREAD_SIZE */
++      struct pt_regs *regs = (struct pt_regs *)ksp - 1;       /* the pt_regs-sized frame that ends at ksp */
++      gdb_reg_t *ptr = (gdb_reg_t*)gdb_regs;
++
++      for (reg = 0; reg < 16; reg++)  /* regs 0..15 */
++              *(ptr++) = regs->regs[reg];
++
++      /* S0 - S7 */
++      for (reg = 16; reg < 24; reg++)
++              *(ptr++) = regs->regs[reg];
++
++      for (reg = 24; reg < 28; reg++) /* regs 24..27 are not read: report 0 */
++              *(ptr++) = 0;
++
++      /* GP, SP, FP, RA */
++      for (reg = 28; reg < 32; reg++)
++              *(ptr++) = regs->regs[reg];
++
++      *(ptr++) = regs->cp0_status;    /* same trailing slot order as regs_to_gdb_regs() */
++      *(ptr++) = regs->lo;
++      *(ptr++) = regs->hi;
++      *(ptr++) = regs->cp0_badvaddr;
++      *(ptr++) = regs->cp0_cause;
++      *(ptr++) = regs->cp0_epc;
++
++      return;
++}
++
++/*
++ * Notifier callback (see kgdb_notifier): hands kernel-mode events to the debugger
++ * through kgdb_handle_exception(); user-mode events are left alone (NOTIFY_DONE).
++ */
++static int kgdb_mips_notify(struct notifier_block *self, unsigned long cmd,
++                          void *ptr)
++{
++      struct die_args *args = (struct die_args *)ptr;
++      struct pt_regs *regs = args->regs;
++      int trap = (regs->cp0_cause & 0x7c) >> 2;       /* bits 6..2 of cp0_cause: trap type */
++
++      /* See if KGDB is interested. */
++      if (user_mode(regs))
++              /* Userspace events, ignore. */
++              return NOTIFY_DONE;
++
++      kgdb_handle_exception(trap, compute_signal(trap), 0, regs);
++      return NOTIFY_OK;
++}
++
++static struct notifier_block kgdb_notifier = {
++      .notifier_call = kgdb_mips_notify,      /* put on mips_die_chain by kgdb_arch_init() */
++};
++
++/*
++ * Handle the 's' and 'c' commands; returns 0 if handled, -1 for any other command.
++ */
++int kgdb_arch_handle_exception(int vector, int signo, int err_code,
++                             char *remcom_in_buffer, char *remcom_out_buffer,
++                             struct pt_regs *regs)
++{
++      char *ptr;
++      unsigned long address;
++      int cpu = smp_processor_id();
++
++      switch (remcom_in_buffer[0]) {
++      case 's':
++      case 'c':
++              /* handle the optional parameter */
++              ptr = &remcom_in_buffer[1];
++              if (kgdb_hex2long(&ptr, &address))
++                      regs->cp0_epc = address;        /* an address was parsed: resume there */
++
++              atomic_set(&cpu_doing_single_step, -1);
++              if (remcom_in_buffer[0] == 's')
++                      if (kgdb_contthread)
++                              atomic_set(&cpu_doing_single_step, cpu);        /* 's' with kgdb_contthread set: record this CPU */
++
++              return 0;
++      }
++
++      return -1;      /* first byte was neither 's' nor 'c' */
++}
++
++struct kgdb_arch arch_kgdb_ops = {
++#ifdef CONFIG_CPU_LITTLE_ENDIAN
++      .gdb_bpt_instr = {0xd},         /* 0x0d first; presumably the same 32-bit word as below -- confirm */
++#else
++      .gdb_bpt_instr = {0x00, 0x00, 0x00, 0x0d},      /* 0x0d as the fourth byte */
++#endif
++};
++
++/*
++ * Arch init for KGDB: run trap_init()/init_IRQ() once (guarded by kgdb_early_setup),
++ * hook the debug traps and put kgdb_notifier on the MIPS die chain.  Returns 0.
++ */
++int kgdb_arch_init(void)
++{
++      /* Board-specifics. */
++      /* Force some calls to happen earlier; the flag keeps later calls from redoing them. */
++      if (kgdb_early_setup == 0) {
++              trap_init();
++              init_IRQ();
++              kgdb_early_setup = 1;
++      }
++
++      /* Set our traps. */
++      /* This needs to be done more finely grained again, paired in
++       * a before/after in kgdb_handle_exception(...) -- Tom */
++      set_debug_traps();
++      register_die_notifier(&kgdb_notifier);  /* locked helper from traps.c, not a raw chain insert */
++
++      return 0;
++}
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/kgdb_handler.S linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/kgdb_handler.S
+--- linux-2.6.18-53.1.14/arch/mips/kernel/kgdb_handler.S       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/kgdb_handler.S  2008-06-10 15:38:24.000000000 +0400
+@@ -0,0 +1,57 @@
++/*
++ * arch/mips/kernel/kgdb_handler.S
++ *
++ * Copyright (C) 2004-2005 MontaVista Software Inc.
++ * Author: Manish Lachwani, mlachwani@mvista.com or manish@koffee-break.com
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of any
++ * kind, whether express or implied.
++ */
++
++/*
++ * Trap Handler for the new KGDB framework. The main KGDB handler is
++ * handle_exception that will be called from here
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/sys.h>
++
++#include <asm/asm.h>
++#include <asm/errno.h>
++#include <asm/mipsregs.h>
++#include <asm/regdef.h>
++#include <asm/stackframe.h>
++
++      .align  5
++      NESTED(trap_low, PT_SIZE, sp)
++              .set    noat
++              .set    noreorder
++
++              /*
++               * Check for privileged instructions in user mode. For
++               * this, check the cu0 bit in the CPU status register.
++               */
++              mfc0    k0, CP0_STATUS
++              sll     k0, 3                   /* move Status bit 28 into the sign bit */
++              bltz    k0, 1f                  /* bit set: take the KGDB path at 1: */
++              move    k1, sp                  /* branch delay slot (noreorder) */
++
++              /*
++               * GDB userland from within KGDB. If a user mode address
++               * then jump to the saved exception handler
++               */
++              mfc0    k1, CP0_CAUSE
++              andi    k1, k1, 0x7c            /* Cause bits 6..2 kept in place: trap type * 4 */
++              PTR_L   k0, saved_vectors(k1)   /* NOTE(review): offset is trap * 4 -- confirm this indexes void *saved_vectors[] correctly on 64-bit builds */
++              jr      k0
++              nop                             /* delay slot of jr */
++1:
++              SAVE_ALL
++              .set    at
++              .set    reorder
++              move    a0, sp                  /* argument for handle_exception(): sp after SAVE_ALL */
++              jal     handle_exception
++              j       ret_from_exception
++      END(trap_low)
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/traps.c linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/traps.c
+--- linux-2.6.18-53.1.14/arch/mips/kernel/traps.c      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/traps.c 2008-06-10 15:38:24.000000000 +0400
+@@ -10,6 +10,8 @@
+  * Kevin D. Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com
+  * Copyright (C) 2000, 01 MIPS Technologies, Inc.
+  * Copyright (C) 2002, 2003, 2004, 2005  Maciej W. Rozycki
++ *
++ * KGDB specific changes - Manish Lachwani (mlachwani@mvista.com)
+  */
+ #include <linux/init.h>
+ #include <linux/mm.h>
+@@ -20,6 +22,7 @@
+ #include <linux/spinlock.h>
+ #include <linux/kallsyms.h>
+ #include <linux/bootmem.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/bootinfo.h>
+ #include <asm/branch.h>
+@@ -40,6 +43,7 @@
+ #include <asm/mmu_context.h>
+ #include <asm/watch.h>
+ #include <asm/types.h>
++#include <asm/kdebug.h>
+ 
+ extern asmlinkage void handle_int(void);
+ extern asmlinkage void handle_tlbm(void);
+@@ -78,6 +82,21 @@ void (*board_bind_eic_interrupt)(int irq
+  */
+ #define MODULE_RANGE (8*1024*1024)
+ 
++struct notifier_block *mips_die_chain;
++static spinlock_t die_notifier_lock = SPIN_LOCK_UNLOCKED;
++
++int register_die_notifier(struct notifier_block *nb)
++{
++      int err = 0;
++      unsigned long flags;
++
++      spin_lock_irqsave(&die_notifier_lock, flags);   /* insertions into mips_die_chain are made under this lock */
++      err = notifier_chain_register(&mips_die_chain, nb);
++      spin_unlock_irqrestore(&die_notifier_lock, flags);
++
++      return err;     /* result of notifier_chain_register() */
++}
++
+ /*
+  * This routine abuses get_user()/put_user() to reference pointers
+  * with at least a bit of error checking ...
+@@ -1387,6 +1406,11 @@ void __init trap_init(void)
+       extern char except_vec4;
+       unsigned long i;
+ 
++#if defined(CONFIG_KGDB)
++      if (kgdb_early_setup)
++              return; /* Already done */
++#endif
++
+       if (cpu_has_veic || cpu_has_vint)
+               ebase = (unsigned long) alloc_bootmem_low_pages (0x200 + VECTORSPACING*64);
+       else
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/mips-boards/generic/Makefile linux-2.6.18-53.1.14.kgdb/arch/mips/mips-boards/generic/Makefile
+--- linux-2.6.18-53.1.14/arch/mips/mips-boards/generic/Makefile        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/mips-boards/generic/Makefile   2008-06-10 15:38:24.000000000 +0400
+@@ -21,6 +21,5 @@
+ obj-y                         := reset.o display.o init.o memory.o printf.o \
+                                  cmdline.o time.o
+ obj-$(CONFIG_PCI)             += pci.o
+-obj-$(CONFIG_KGDB)            += gdb_hook.o
+ 
+ EXTRA_AFLAGS := $(CFLAGS)
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/mips-boards/generic/init.c linux-2.6.18-53.1.14.kgdb/arch/mips/mips-boards/generic/init.c
+--- linux-2.6.18-53.1.14/arch/mips/mips-boards/generic/init.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/mips-boards/generic/init.c     2008-06-10 15:38:24.000000000 +0400
+@@ -37,15 +37,6 @@
+ 
+ #include <asm/mips-boards/malta.h>
+ 
+-#ifdef CONFIG_KGDB
+-extern int rs_kgdb_hook(int, int);
+-extern int rs_putDebugChar(char);
+-extern char rs_getDebugChar(void);
+-extern int saa9730_kgdb_hook(int);
+-extern int saa9730_putDebugChar(char);
+-extern char saa9730_getDebugChar(void);
+-#endif
+-
+ int prom_argc;
+ int *_prom_argv, *_prom_envp;
+ 
+@@ -172,58 +163,6 @@ static void __init console_config(void)
+ }
+ #endif
+ 
+-#ifdef CONFIG_KGDB
+-void __init kgdb_config (void)
+-{
+-      extern int (*generic_putDebugChar)(char);
+-      extern char (*generic_getDebugChar)(void);
+-      char *argptr;
+-      int line, speed;
+-
+-      argptr = prom_getcmdline();
+-      if ((argptr = strstr(argptr, "kgdb=ttyS")) != NULL) {
+-              argptr += strlen("kgdb=ttyS");
+-              if (*argptr != '0' && *argptr != '1')
+-                      printk("KGDB: Unknown serial line /dev/ttyS%c, "
+-                             "falling back to /dev/ttyS1\n", *argptr);
+-              line = *argptr == '0' ? 0 : 1;
+-              printk("KGDB: Using serial line /dev/ttyS%d for session\n", line);
+-
+-              speed = 0;
+-              if (*++argptr == ',')
+-              {
+-                      int c;
+-                      while ((c = *++argptr) && ('0' <= c && c <= '9'))
+-                              speed = speed * 10 + c - '0';
+-              }
+-#ifdef CONFIG_MIPS_ATLAS
+-              if (line == 1) {
+-                      speed = saa9730_kgdb_hook(speed);
+-                      generic_putDebugChar = saa9730_putDebugChar;
+-                      generic_getDebugChar = saa9730_getDebugChar;
+-              }
+-              else
+-#endif
+-              {
+-                      speed = rs_kgdb_hook(line, speed);
+-                      generic_putDebugChar = rs_putDebugChar;
+-                      generic_getDebugChar = rs_getDebugChar;
+-              }
+-
+-              prom_printf("KGDB: Using serial line /dev/ttyS%d at %d for session, "
+-                          "please connect your debugger\n", line ? 1 : 0, speed);
+-
+-              {
+-                      char *s;
+-                      for (s = "Please connect GDB to this port\r\n"; *s; )
+-                              generic_putDebugChar (*s++);
+-              }
+-
+-              /* Breakpoint is invoked after interrupts are initialised */
+-      }
+-}
+-#endif
+-
+ void __init mips_nmi_setup (void)
+ {
+       void *base;
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/mips-boards/malta/malta_setup.c linux-2.6.18-53.1.14.kgdb/arch/mips/mips-boards/malta/malta_setup.c
+--- linux-2.6.18-53.1.14/arch/mips/mips-boards/malta/malta_setup.c     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/mips-boards/malta/malta_setup.c        2008-06-10 15:38:24.000000000 +0400
+@@ -46,10 +46,6 @@ extern void mips_reboot_setup(void);
+ extern void mips_time_init(void);
+ extern unsigned long mips_rtc_get_time(void);
+ 
+-#ifdef CONFIG_KGDB
+-extern void kgdb_config(void);
+-#endif
+-
+ struct resource standard_io_resources[] = {
+       { .name = "dma1", .start = 0x00, .end = 0x1f, .flags = IORESOURCE_BUSY },
+       { .name = "timer", .start = 0x40, .end = 0x5f, .flags = IORESOURCE_BUSY },
+@@ -124,10 +120,6 @@ void __init plat_mem_setup(void)
+        */
+       enable_dma(4);
+ 
+-#ifdef CONFIG_KGDB
+-      kgdb_config ();
+-#endif
+-
+       if ((mips_revision_corid == MIPS_REVISION_CORID_BONITO64) ||
+           (mips_revision_corid == MIPS_REVISION_CORID_CORE_20K) ||
+           (mips_revision_corid == MIPS_REVISION_CORID_CORE_EMUL_BON)) {
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/mm/extable.c linux-2.6.18-53.1.14.kgdb/arch/mips/mm/extable.c
+--- linux-2.6.18-53.1.14/arch/mips/mm/extable.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/mm/extable.c   2008-06-10 15:38:24.000000000 +0400
+@@ -3,6 +3,7 @@
+  */
+ #include <linux/module.h>
+ #include <linux/spinlock.h>
++#include <linux/kgdb.h>
+ #include <asm/branch.h>
+ #include <asm/uaccess.h>
+ 
+@@ -16,6 +17,12 @@ int fixup_exception(struct pt_regs *regs
+ 
+               return 1;
+       }
++#ifdef CONFIG_KGDB
++      if (atomic_read(&debugger_active) && kgdb_may_fault)
++              /* Restore our previous state. */
++              kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++              /* Not reached. */
++#endif
+ 
+       return 0;
+ }
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/sibyte/cfe/setup.c linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/cfe/setup.c
+--- linux-2.6.18-53.1.14/arch/mips/sibyte/cfe/setup.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/cfe/setup.c     2008-06-10 15:38:24.000000000 +0400
+@@ -58,10 +58,6 @@ int cfe_cons_handle;
+ extern unsigned long initrd_start, initrd_end;
+ #endif
+ 
+-#ifdef CONFIG_KGDB
+-extern int kgdb_port;
+-#endif
+-
+ static void ATTRIB_NORET cfe_linux_exit(void *arg)
+ {
+       int warm = *(int *)arg;
+@@ -242,9 +238,6 @@ void __init prom_init(void)
+       int argc = fw_arg0;
+       char **envp = (char **) fw_arg2;
+       int *prom_vec = (int *) fw_arg3;
+-#ifdef CONFIG_KGDB
+-      char *arg;
+-#endif
+ 
+       _machine_restart   = cfe_linux_restart;
+       _machine_halt      = cfe_linux_halt;
+@@ -308,13 +301,6 @@ void __init prom_init(void)
+               }
+       }
+ 
+-#ifdef CONFIG_KGDB
+-      if ((arg = strstr(arcs_cmdline,"kgdb=duart")) != NULL)
+-              kgdb_port = (arg[10] == '0') ? 0 : 1;
+-      else
+-              kgdb_port = 1;
+-#endif
+-
+ #ifdef CONFIG_BLK_DEV_INITRD
+       {
+               char *ptr;
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/sibyte/sb1250/Makefile linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/sb1250/Makefile
+--- linux-2.6.18-53.1.14/arch/mips/sibyte/sb1250/Makefile      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/sb1250/Makefile 2008-06-10 15:38:24.000000000 +0400
+@@ -4,5 +4,6 @@ obj-$(CONFIG_SMP)                      += smp.o
+ obj-$(CONFIG_SIBYTE_TBPROF)           += bcm1250_tbprof.o
+ obj-$(CONFIG_SIBYTE_STANDALONE)               += prom.o
+ obj-$(CONFIG_SIBYTE_BUS_WATCHER)      += bus_watcher.o
++obj-$(CONFIG_KGDB_SIBYTE)             += kgdb_sibyte.o
+ 
+ EXTRA_AFLAGS := $(CFLAGS)
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/sibyte/sb1250/irq.c linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/sb1250/irq.c
+--- linux-2.6.18-53.1.14/arch/mips/sibyte/sb1250/irq.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/sb1250/irq.c    2008-06-10 15:38:24.000000000 +0400
+@@ -30,6 +30,7 @@
+ #include <asm/system.h>
+ #include <asm/ptrace.h>
+ #include <asm/io.h>
++#include <asm/kgdb.h>
+ 
+ #include <asm/sibyte/sb1250_regs.h>
+ #include <asm/sibyte/sb1250_int.h>
+@@ -59,16 +60,6 @@ static void sb1250_set_affinity(unsigned
+ extern unsigned long ldt_eoi_space;
+ #endif
+ 
+-#ifdef CONFIG_KGDB
+-static int kgdb_irq;
+-
+-/* Default to UART1 */
+-int kgdb_port = 1;
+-#ifdef CONFIG_SIBYTE_SB1250_DUART
+-extern char sb1250_duart_present[];
+-#endif
+-#endif
+-
+ static struct irq_chip sb1250_irq_type = {
+       .typename = "SB1250-IMR",
+       .startup = startup_sb1250_irq,
+@@ -324,6 +315,11 @@ void __init arch_init_irq(void)
+       unsigned int imask = STATUSF_IP4 | STATUSF_IP3 | STATUSF_IP2 |
+               STATUSF_IP1 | STATUSF_IP0;
+ 
++#ifdef CONFIG_KGDB
++      if (kgdb_early_setup)
++              return;
++#endif
++
+       /* Default everything to IP2 */
+       for (i = 0; i < SB1250_NR_IRQS; i++) {  /* was I0 */
+               __raw_writeq(IMR_IP2_VAL,
+@@ -375,50 +371,6 @@ void __init arch_init_irq(void)
+       /* Enable necessary IPs, disable the rest */
+       change_c0_status(ST0_IM, imask);
+ 
+-#ifdef CONFIG_KGDB
+-      if (kgdb_flag) {
+-              kgdb_irq = K_INT_UART_0 + kgdb_port;
+-
+-#ifdef CONFIG_SIBYTE_SB1250_DUART
+-              sb1250_duart_present[kgdb_port] = 0;
+-#endif
+-              /* Setup uart 1 settings, mapper */
+-              __raw_writeq(M_DUART_IMR_BRK,
+-                           IOADDR(A_DUART_IMRREG(kgdb_port)));
+-
+-              sb1250_steal_irq(kgdb_irq);
+-              __raw_writeq(IMR_IP6_VAL,
+-                           IOADDR(A_IMR_REGISTER(0,
+-                                                 R_IMR_INTERRUPT_MAP_BASE) +
+-                                  (kgdb_irq << 3)));
+-              sb1250_unmask_irq(0, kgdb_irq);
+-      }
+-#endif
+-}
+-
+-#ifdef CONFIG_KGDB
+-
+-#include <linux/delay.h>
+-
+-#define duart_out(reg, val)     csr_out32(val, IOADDR(A_DUART_CHANREG(kgdb_port,reg)))
+-#define duart_in(reg)           csr_in32(IOADDR(A_DUART_CHANREG(kgdb_port,reg)))
+-
+-static void sb1250_kgdb_interrupt(struct pt_regs *regs)
+-{
+-      /*
+-       * Clear break-change status (allow some time for the remote
+-       * host to stop the break, since we would see another
+-       * interrupt on the end-of-break too)
+-       */
+-      kstat_this_cpu.irqs[kgdb_irq]++;
+-      mdelay(500);
+-      duart_out(R_DUART_CMD, V_DUART_MISC_CMD_RESET_BREAK_INT |
+-                              M_DUART_RX_EN | M_DUART_TX_EN);
+-      set_async_breakpoint(&regs->cp0_epc);
+-}
+-
+-#endif        /* CONFIG_KGDB */
+-
+ static inline int dclz(unsigned long long x)
+ {
+       int lz;
+@@ -473,7 +425,7 @@ asmlinkage void plat_irq_dispatch(struct
+               sb1250_mailbox_interrupt(regs);
+ #endif
+ 
+-#ifdef CONFIG_KGDB
++#ifdef CONFIG_KGDB_SIBYTE
+       else if (pending & CAUSEF_IP6)                  /* KGDB (uart 1) */
+               sb1250_kgdb_interrupt(regs);
+ #endif
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/sibyte/sb1250/kgdb_sibyte.c linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/sb1250/kgdb_sibyte.c
+--- linux-2.6.18-53.1.14/arch/mips/sibyte/sb1250/kgdb_sibyte.c 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/sb1250/kgdb_sibyte.c    2008-06-10 15:38:24.000000000 +0400
+@@ -0,0 +1,164 @@
++/*
++ * arch/mips/sibyte/sb1250/kgdb_sibyte.c
++ *
++ * Author: Manish Lachwani, mlachwani@mvista.com or manish@koffee-break.com
++ *
++ * 2004 (c) MontaVista Software, Inc. This file is licensed under
++ * the terms of the GNU General Public License version 2. This program
++ * is licensed "as is" without any warranty of any kind, whether express
++ * or implied.
++ */
++
++/*
++ * Support for KGDB on the Broadcom Sibyte. The SWARM board
++ * for example does not have a 8250/16550 compatible serial
++ * port. Hence, we need to have a driver for the serial
++ * ports to handle KGDB.  This board needs nothing in addition
++ * to what is normally provided by the gdb portion of the stub.
++ */
++
++#include <linux/delay.h>
++#include <linux/kernel_stat.h>
++#include <linux/init.h>
++#include <linux/kgdb.h>
++
++#include <asm/io.h>
++#include <asm/sibyte/sb1250.h>
++#include <asm/sibyte/sb1250_regs.h>
++#include <asm/sibyte/sb1250_uart.h>
++#include <asm/sibyte/sb1250_int.h>
++#include <asm/addrspace.h>
++
++int kgdb_port = 1;
++static int kgdb_irq;
++
++extern char sb1250_duart_present[];
++extern int sb1250_steal_irq(int irq);
++
++/* Forward declarations. */
++static void kgdbsibyte_init_duart(void);
++static int kgdb_init_io(void);
++
++#define IMR_IP6_VAL   K_INT_MAP_I4
++#define       duart_out(reg, val)     csr_out32(val, IOADDR(A_DUART_CHANREG(kgdb_port,reg)))
++#define duart_in(reg)         csr_in32(IOADDR(A_DUART_CHANREG(kgdb_port,reg)))
++
++static void kgdb_swarm_write_char(int c)
++{
++      while ((duart_in(R_DUART_STATUS) & M_DUART_TX_RDY) == 0) ;      /* busy-wait until M_DUART_TX_RDY is set */
++      duart_out(R_DUART_TX_HOLD, c);  /* then write c to the TX hold register */
++}
++
++static int kgdb_swarm_read_char(void)
++{
++      int ret_char;
++      unsigned int status;
++
++      status = duart_in(R_DUART_STATUS);
++      while ((status & M_DUART_RX_RDY) == 0) {        /* busy-wait until M_DUART_RX_RDY is set */
++              status = duart_in(R_DUART_STATUS);
++      }
++
++      /*
++       * Framing error: reprogram the DUART, send '-' and return '-' (RX_HOLD is not read)
++       */
++      if (status & M_DUART_FRM_ERR) {
++              kgdbsibyte_init_duart();
++              kgdb_swarm_write_char('-');
++              return '-';
++      }
++
++      ret_char = duart_in(R_DUART_RX_HOLD);
++
++      return ret_char;
++}
++
++void sb1250_kgdb_interrupt(struct pt_regs *regs)
++{
++      int kgdb_irq = K_INT_UART_0 + kgdb_port;        /* local; shadows the file-scope static kgdb_irq */
++      /*
++       * Clear break-change status (allow some time for the remote
++       * host to stop the break, since we would see another
++       * interrupt on the end-of-break too)
++       */
++      kstat_this_cpu.irqs[kgdb_irq]++;
++      mdelay(500);
++      duart_out(R_DUART_CMD, V_DUART_MISC_CMD_RESET_BREAK_INT |
++                M_DUART_RX_EN | M_DUART_TX_EN);
++      if (kgdb_io_ops.init != kgdb_init_io) { /* kgdb_io_ops.init is not this driver's init */
++              /* Throw away the data if another I/O routine is
++               * active.
++               */
++              unsigned int status;
++
++              status = duart_in(R_DUART_STATUS);
++              while ((status & M_DUART_RX_RDY) == 0) {        /* busy-wait until M_DUART_RX_RDY is set */
++                      status = duart_in(R_DUART_STATUS);
++              }
++              /*
++               * Check for framing error
++               */
++              if (status & M_DUART_FRM_ERR) {
++                      kgdbsibyte_init_duart();
++              }
++              duart_in(R_DUART_RX_HOLD);      /* read and discard */
++      } else
++              breakpoint();
++
++}
++
++/*
++ * Set the DUART channel selected by kgdb_port (1 by default) to 115200 BAUD, 8n1.
++ */
++static void kgdbsibyte_init_duart(void)
++{
++      /* Set 8n1. */
++      duart_out(R_DUART_MODE_REG_1,
++                V_DUART_BITS_PER_CHAR_8 | V_DUART_PARITY_MODE_NONE);
++      duart_out(R_DUART_MODE_REG_2, M_DUART_STOP_BIT_LEN_1);
++      /* Set baud rate of 115200. */
++      duart_out(R_DUART_CLK_SEL, V_DUART_BAUD_RATE(115200));
++      /* Enable rx and tx */
++      duart_out(R_DUART_CMD, M_DUART_RX_EN | M_DUART_TX_EN);
++}
++
++static int kgdb_init_io(void)
++{
++#ifdef CONFIG_SIBYTE_SB1250_DUART
++      sb1250_duart_present[kgdb_port] = 0;    /* NOTE(review): presumably keeps the regular DUART driver off this port -- confirm */
++#endif
++
++      kgdbsibyte_init_duart();
++
++      return 0;       /* always returns 0 */
++}
++
++/*
++ * Hookup our IRQ line.  We will already have been initialized at
++ * this point.  Installed as kgdb_io_ops.late_init.
++ */
++static void __init kgdbsibyte_hookup_irq(void)
++{
++      /* Steal the IRQ. */
++      kgdb_irq = K_INT_UART_0 + kgdb_port;
++
++      /* Setup uart 1 settings, mapper */
++      __raw_writeq(M_DUART_IMR_BRK, IOADDR(A_DUART_IMRREG(kgdb_port)));
++
++      sb1250_steal_irq(kgdb_irq);
++
++      __raw_writeq(IMR_IP6_VAL,
++                   IOADDR(A_IMR_REGISTER(0, R_IMR_INTERRUPT_MAP_BASE) +
++                          (kgdb_irq << 3)));  /* map entries are 8 bytes apart (irq << 3) */
++
++      sb1250_unmask_irq(0, kgdb_irq);
++}
++
++struct kgdb_io kgdb_io_ops = {
++      .read_char = kgdb_swarm_read_char,
++      .write_char = kgdb_swarm_write_char,
++      .init = kgdb_init_io,           /* sb1250_kgdb_interrupt() compares against this pointer */
++      .late_init = kgdbsibyte_hookup_irq,
++      .pre_exception = NULL,
++      .post_exception = NULL
++};
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/sibyte/swarm/Makefile linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/swarm/Makefile
+--- linux-2.6.18-53.1.14/arch/mips/sibyte/swarm/Makefile       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/swarm/Makefile  2008-06-10 15:38:24.000000000 +0400
+@@ -1,3 +1 @@
+ lib-y                         = setup.o rtc_xicor1241.o rtc_m41t81.o
+-
+-lib-$(CONFIG_KGDB)            += dbg_io.o
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/sibyte/swarm/dbg_io.c linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/swarm/dbg_io.c
+--- linux-2.6.18-53.1.14/arch/mips/sibyte/swarm/dbg_io.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/swarm/dbg_io.c  1970-01-01 03:00:00.000000000 +0300
+@@ -1,76 +0,0 @@
+-/*
+- * kgdb debug routines for SiByte boards.
+- *
+- * Copyright (C) 2001 MontaVista Software Inc.
+- * Author: Jun Sun, jsun@mvista.com or jsun@junsun.net
+- *
+- * This program is free software; you can redistribute  it and/or modify it
+- * under  the terms of  the GNU General  Public License as published by the
+- * Free Software Foundation;  either version 2 of the  License, or (at your
+- * option) any later version.
+- *
+- */
+-
+-/* -------------------- BEGINNING OF CONFIG --------------------- */
+-
+-#include <linux/delay.h>
+-#include <asm/io.h>
+-#include <asm/sibyte/sb1250.h>
+-#include <asm/sibyte/sb1250_regs.h>
+-#include <asm/sibyte/sb1250_uart.h>
+-#include <asm/sibyte/sb1250_int.h>
+-#include <asm/addrspace.h>
+-
+-/*
+- * We use the second serial port for kgdb traffic.
+- *    115200, 8, N, 1.
+- */
+-
+-#define       BAUD_RATE               115200
+-#define       CLK_DIVISOR             V_DUART_BAUD_RATE(BAUD_RATE)
+-#define       DATA_BITS               V_DUART_BITS_PER_CHAR_8         /* or 7    */
+-#define       PARITY                  V_DUART_PARITY_MODE_NONE        /* or even */
+-#define       STOP_BITS               M_DUART_STOP_BIT_LEN_1          /* or 2    */
+-
+-static int duart_initialized = 0;     /* 0: need to be init'ed by kgdb */
+-
+-/* -------------------- END OF CONFIG --------------------- */
+-extern int kgdb_port;
+-
+-#define       duart_out(reg, val)     csr_out32(val, IOADDR(A_DUART_CHANREG(kgdb_port,reg)))
+-#define duart_in(reg)         csr_in32(IOADDR(A_DUART_CHANREG(kgdb_port,reg)))
+-
+-void putDebugChar(unsigned char c);
+-unsigned char getDebugChar(void);
+-static void
+-duart_init(int clk_divisor, int data, int parity, int stop)
+-{
+-      duart_out(R_DUART_MODE_REG_1, data | parity);
+-      duart_out(R_DUART_MODE_REG_2, stop);
+-      duart_out(R_DUART_CLK_SEL, clk_divisor);
+-
+-      duart_out(R_DUART_CMD, M_DUART_RX_EN | M_DUART_TX_EN);  /* enable rx and tx */
+-}
+-
+-void
+-putDebugChar(unsigned char c)
+-{
+-      if (!duart_initialized) {
+-              duart_initialized = 1;
+-              duart_init(CLK_DIVISOR, DATA_BITS, PARITY, STOP_BITS);
+-      }
+-      while ((duart_in(R_DUART_STATUS) & M_DUART_TX_RDY) == 0);
+-      duart_out(R_DUART_TX_HOLD, c);
+-}
+-
+-unsigned char
+-getDebugChar(void)
+-{
+-      if (!duart_initialized) {
+-              duart_initialized = 1;
+-              duart_init(CLK_DIVISOR, DATA_BITS, PARITY, STOP_BITS);
+-      }
+-      while ((duart_in(R_DUART_STATUS) & M_DUART_RX_RDY) == 0) ;
+-      return duart_in(R_DUART_RX_HOLD);
+-}
+-
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/tx4938/common/Makefile linux-2.6.18-53.1.14.kgdb/arch/mips/tx4938/common/Makefile
+--- linux-2.6.18-53.1.14/arch/mips/tx4938/common/Makefile      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/mips/tx4938/common/Makefile 2008-06-10 15:38:24.000000000 +0400
+@@ -7,5 +7,5 @@
+ #
+ 
+ obj-y += prom.o setup.o irq.o rtc_rx5c348.o
+-obj-$(CONFIG_KGDB) += dbgio.o
++obj-$(CONFIG_KGDB_8250) += dbgio.o
+ 
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/powerpc/Kconfig.debug linux-2.6.18-53.1.14.kgdb/arch/powerpc/Kconfig.debug
+--- linux-2.6.18-53.1.14/arch/powerpc/Kconfig.debug    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/powerpc/Kconfig.debug       2008-06-10 15:38:14.000000000 +0400
+@@ -18,52 +18,9 @@ config DEBUG_STACK_USAGE
+ 
+         This option will slow down process creation somewhat.
+ 
+-config DEBUGGER
+-      bool "Enable debugger hooks"
+-      depends on DEBUG_KERNEL
+-      help
+-        Include in-kernel hooks for kernel debuggers. Unless you are
+-        intending to debug the kernel, say N here.
+-
+-config KGDB
+-      bool "Include kgdb kernel debugger"
+-      depends on DEBUGGER && (BROKEN || PPC_GEN550 || 4xx)
+-      select DEBUG_INFO
+-      help
+-        Include in-kernel hooks for kgdb, the Linux kernel source level
+-        debugger.  See <http://kgdb.sourceforge.net/> for more information.
+-        Unless you are intending to debug the kernel, say N here.
+-
+-choice
+-      prompt "Serial Port"
+-      depends on KGDB
+-      default KGDB_TTYS1
+-
+-config KGDB_TTYS0
+-      bool "ttyS0"
+-
+-config KGDB_TTYS1
+-      bool "ttyS1"
+-
+-config KGDB_TTYS2
+-      bool "ttyS2"
+-
+-config KGDB_TTYS3
+-      bool "ttyS3"
+-
+-endchoice
+-
+-config KGDB_CONSOLE
+-      bool "Enable serial console thru kgdb port"
+-      depends on KGDB && 8xx || CPM2
+-      help
+-        If you enable this, all serial console messages will be sent
+-        over the gdb stub.
+-        If unsure, say N.
+-
+ config XMON
+       bool "Include xmon kernel debugger"
+-      depends on DEBUGGER && !PPC_ISERIES
++      depends on DEBUG_KERNEL && !PPC_ISERIES
+       help
+         Include in-kernel hooks for the xmon kernel monitor/debugger.
+         Unless you are intending to debug the kernel, say N here.
+@@ -82,6 +39,11 @@ config XMON_DEFAULT
+         xmon is normally disabled unless booted with 'xmon=on'.
+         Use 'xmon=off' to disable xmon init during runtime.
+ 
++config DEBUGGER
++      bool
++      depends on KGDB || XMON
++      default y
++
+ config IRQSTACKS
+       bool "Use separate kernel stacks when processing interrupts"
+       depends on PPC64
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/powerpc/kernel/Makefile linux-2.6.18-53.1.14.kgdb/arch/powerpc/kernel/Makefile
+--- linux-2.6.18-53.1.14/arch/powerpc/kernel/Makefile  2008-03-06 05:54:47.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/powerpc/kernel/Makefile     2008-06-10 15:38:14.000000000 +0400
+@@ -59,6 +59,7 @@ obj-$(CONFIG_PPC64)          += misc_64.o dma_64
+ obj-$(CONFIG_PPC_MULTIPLATFORM)       += prom_init.o
+ obj-$(CONFIG_MODULES)         += ppc_ksyms.o
+ obj-$(CONFIG_BOOTX_TEXT)      += btext.o
++obj-$(CONFIG_KGDB)            += kgdb.o
+ obj-$(CONFIG_SMP)             += smp.o
+ obj-$(CONFIG_KPROBES)         += kprobes.o
+ obj-$(CONFIG_PPC_UDBG_16550)  += legacy_serial.o udbg_16550.o
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/powerpc/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/arch/powerpc/kernel/kgdb.c
+--- linux-2.6.18-53.1.14/arch/powerpc/kernel/kgdb.c    1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/powerpc/kernel/kgdb.c       2008-06-10 15:38:14.000000000 +0400
+@@ -0,0 +1,568 @@
++/*
++ * arch/powerpc/kernel/kgdb.c
++ *
++ * PowerPC backend to the KGDB stub.
++ *
++ * Maintainer: Tom Rini <trini@kernel.crashing.org>
++ *
++ * Copied from arch/ppc/kernel/kgdb.c, updated for ppc64
++ *
++ * Copyright (C) 1996 Paul Mackerras (setjmp/longjmp)
++ * 1998 (c) Michael AK Tesch (tesch@cs.wisc.edu)
++ * Copyright (C) 2003 Timesys Corporation.
++ * Copyright (C) 2004-2006 MontaVista Software, Inc.
++ * PPC64 Mods (C) 2005 Frank Rowand (frowand@mvista.com)
++ * PPC32 support restored by Vitaly Wool <vwool@ru.mvista.com> and
++ * Sergei Shtylyov <sshtylyov@ru.mvista.com>
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program as licensed "as is" without any warranty of any
++ * kind, whether express or implied.
++ */
++
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/kgdb.h>
++#include <linux/smp.h>
++#include <linux/signal.h>
++#include <linux/ptrace.h>
++#include <asm/current.h>
++#include <asm/ptrace.h>
++#include <asm/processor.h>
++#include <asm/machdep.h>
++
++/*
++ * This table contains the mapping between PowerPC hardware trap types, and
++ * signals, which are primarily what GDB understands.  GDB and the kernel
++ * don't always agree on values, so we use constants taken from gdb-6.2.
++ */
++static struct hard_trap_info
++{
++      unsigned int tt;                /* Trap type code for powerpc */
++      unsigned char signo;            /* Signal that we map this trap into */
++} hard_trap_info[] = {
++      { 0x0100, 0x02 /* SIGINT */  },         /* system reset */
++      { 0x0200, 0x0b /* SIGSEGV */ },         /* machine check */
++      { 0x0300, 0x0b /* SIGSEGV */ },         /* data access */
++      { 0x0400, 0x0b /* SIGSEGV */ },  /* instruction access */
++      { 0x0500, 0x02 /* SIGINT */  },  /* external interrupt */
++      { 0x0600, 0x0a /* SIGBUS */  },         /* alignment */
++      { 0x0700, 0x05 /* SIGTRAP */ },  /* program check */
++      { 0x0800, 0x08 /* SIGFPE */  },  /* fp unavailable */
++      { 0x0900, 0x0e /* SIGALRM */ },  /* decrementer */
++      { 0x0c00, 0x14 /* SIGCHLD */ },  /* system call */
++#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
++      { 0x2002, 0x05 /* SIGTRAP */ },  /* debug */
++#if defined(CONFIG_FSL_BOOKE)
++      { 0x2010, 0x08 /* SIGFPE */  },  /* spe unavailable */
++      { 0x2020, 0x08 /* SIGFPE */  },  /* spe unavailable */
++      { 0x2030, 0x08 /* SIGFPE */  },  /* spe fp data */
++      { 0x2040, 0x08 /* SIGFPE */  },  /* spe fp data */
++      { 0x2050, 0x08 /* SIGFPE */  },  /* spe fp round */
++      { 0x2060, 0x04 /* SIGILL */  },  /* performance monitor */
++      { 0x2900, 0x08 /* SIGFPE */  },  /* apu unavailable */
++      { 0x3100, 0x0e /* SIGALRM */ },  /* fixed interval timer */
++      { 0x3200, 0x02 /* SIGINT */  },  /* watchdog */
++#else
++      { 0x1000, 0x0e /* SIGALRM */ },  /* programmable interval timer */
++      { 0x1010, 0x0e /* SIGALRM */ },  /* fixed interval timer */
++      { 0x1020, 0x02 /* SIGINT */  },  /* watchdog */
++      { 0x2010, 0x08 /* SIGFPE */  },  /* fp unavailable */
++      { 0x2020, 0x08 /* SIGFPE */  },  /* ap unavailable */
++#endif
++#else
++      { 0x0d00, 0x05 /* SIGTRAP */ },  /* single-step */
++#if defined(CONFIG_8xx)
++      { 0x1000, 0x04 /* SIGILL */  },  /* software emulation */
++#else
++      { 0x0f00, 0x04 /* SIGILL */  },         /* performance monitor */
++      { 0x0f20, 0x08 /* SIGFPE */  },         /* altivec unavailable */
++      { 0x1300, 0x05 /* SIGTRAP */ },         /* instruction address break */
++#if defined(CONFIG_PPC64)
++      { 0x1200, 0x04 /* SIGILL */  },         /* system error */
++      { 0x1500, 0x04 /* SIGILL */  },         /* soft patch */
++      { 0x1600, 0x04 /* SIGILL */  },         /* maintenance */
++      { 0x1700, 0x08 /* SIGFPE */  },  /* altivec assist */
++      { 0x1800, 0x04 /* SIGILL */  },         /* thermal */
++#else
++      { 0x1400, 0x02 /* SIGINT */  },  /* SMI */
++      { 0x1600, 0x08 /* SIGFPE */  },  /* altivec assist */
++      { 0x1700, 0x04 /* SIGILL */  },  /* TAU */
++      { 0x2000, 0x05 /* SIGTRAP */ },  /* run mode */
++#endif
++#endif
++#endif
++      { 0x0000, 0x00 }                        /* Must be last */
++};
++
++extern atomic_t cpu_doing_single_step;
++
++static int computeSignal(unsigned int tt)
++{
++      struct hard_trap_info *ht;
++
++      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
++              if (ht->tt == tt)
++                      return ht->signo;
++
++      return SIGHUP;          /* default for things we don't know about */
++}
++
++static int kgdb_call_nmi_hook(struct pt_regs *regs)
++{
++      kgdb_nmihook(smp_processor_id(), regs);
++      return 0;
++}
++
++#ifdef CONFIG_SMP
++void kgdb_roundup_cpus(unsigned long flags)
++{
++      smp_send_debugger_break(MSG_ALL_BUT_SELF);
++}
++#endif
++
++/* KGDB functions to use existing PowerPC64 hooks. */
++static int kgdb_debugger(struct pt_regs *regs)
++{
++      return kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
++}
++
++static int kgdb_breakpoint(struct pt_regs *regs)
++{
++      if (user_mode(regs))
++              return 0;
++
++      kgdb_handle_exception(0, SIGTRAP, 0, regs);
++
++      if (*(u32 *) (regs->nip) == *(u32 *) (&arch_kgdb_ops.gdb_bpt_instr))
++              regs->nip += 4;
++
++      return 1;
++}
++
++static int kgdb_singlestep(struct pt_regs *regs)
++{
++      struct thread_info *thread_info, *exception_thread_info;
++      if (user_mode(regs))
++              return 0;
++      /*
++      * On Book E and perhaps other processors, singlestep is handled on
++      * the critical exception stack.  This causes current_thread_info()
++      * to fail, since it locates the thread_info by masking off
++      * the low bits of the current stack pointer.  We work around
++      * this issue by copying the thread_info from the kernel stack
++      * before calling kgdb_handle_exception, and copying it back
++      * afterwards.  On most processors the copy is avoided since
++      * exception_thread_info == thread_info.
++      */
++      thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1));
++      exception_thread_info = current_thread_info();
++
++      if (thread_info != exception_thread_info)
++              memcpy(exception_thread_info, thread_info, sizeof *thread_info);
++
++      kgdb_handle_exception(0, SIGTRAP, 0, regs);
++
++      if (thread_info != exception_thread_info)
++              memcpy(thread_info, exception_thread_info, sizeof *thread_info);
++
++      return 1;
++}
++
++int kgdb_iabr_match(struct pt_regs *regs)
++{
++      if (user_mode(regs))
++              return 0;
++
++      kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
++      return 1;
++}
++
++int kgdb_dabr_match(struct pt_regs *regs)
++{
++      if (user_mode(regs))
++              return 0;
++
++      kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
++      return 1;
++}
++
++#define PACK64(ptr,src) do { *(ptr++) = (src); } while(0)
++
++#define PACK32(ptr,src) do {          \
++      u32 *ptr32;                   \
++      ptr32 = (u32 *)ptr;           \
++      *(ptr32++) = (src);           \
++      ptr = (unsigned long *)ptr32; \
++      } while(0)
++
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      unsigned long *ptr = gdb_regs;
++      int reg;
++
++      memset(gdb_regs, 0, NUMREGBYTES);
++
++      for (reg = 0; reg < 32; reg++)
++              PACK64(ptr, regs->gpr[reg]);
++
++#ifdef CONFIG_FSL_BOOKE
++#ifdef CONFIG_SPE
++      for (reg = 0; reg < 32; reg++)
++              PACK64(ptr, current->thread.evr[reg]);
++#else
++      ptr += 32;
++#endif
++#else
++      /* fp registers not used by kernel, leave zero */
++      ptr += 32 * 8 / sizeof(long);
++#endif
++
++      PACK64(ptr, regs->nip);
++      PACK64(ptr, regs->msr);
++      PACK32(ptr, regs->ccr);
++      PACK64(ptr, regs->link);
++      PACK64(ptr, regs->ctr);
++      PACK32(ptr, regs->xer);
++
++#if 0
++      Following are in struct thread_struct, not struct pt_regs,
++      ignoring for now since kernel does not use them.  Would it
++      make sense to get them from the thread that kgdb is set to?
++
++      If this code is enabled, update the definition of NUMREGBYTES to
++      include the vector registers and vector state registers.
++
++      PACK32(ptr, current->thread->fpscr);
++
++      /* vr registers not used by kernel, leave zero */
++      ptr += 32 * 16 / sizeof(long);
++
++#ifdef CONFIG_ALTIVEC
++      PACK32(ptr, current->thread->vscr);
++      PACK32(ptr, current->thread->vrsave);
++#else
++      ptr += 2 * 4 / sizeof(long);
++#endif
++#else
++#ifdef CONFIG_FSL_BOOKE
++#ifdef CONFIG_SPE
++      /* u64 acc */
++      PACK32(ptr, current->thread.acc >> 32);
++      PACK32(ptr, current->thread.acc & 0xffffffff);
++      PACK64(ptr, current->thread.spefscr);
++#else
++      ptr += 2 + 1;
++#endif
++#else
++      /* fpscr not used by kernel, leave zero */
++      PACK32(ptr, 0);
++#endif
++#endif
++
++      BUG_ON((unsigned long)ptr >
++             (unsigned long)(((void *)gdb_regs) + NUMREGBYTES));
++}
++
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
++{
++      struct pt_regs *regs = (struct pt_regs *)(p->thread.ksp +
++                                                STACK_FRAME_OVERHEAD);
++      unsigned long *ptr = gdb_regs;
++      int reg;
++
++      memset(gdb_regs, 0, NUMREGBYTES);
++
++      /* Regs GPR0-2 */
++      for (reg = 0; reg < 3; reg++)
++              PACK64(ptr, regs->gpr[reg]);
++
++      /* Regs GPR3-13 are caller saved, not in regs->gpr[] */
++      ptr += 11;
++
++      /* Regs GPR14-31 */
++      for (reg = 14; reg < 32; reg++)
++              PACK64(ptr, regs->gpr[reg]);
++
++#ifdef CONFIG_FSL_BOOKE
++#ifdef CONFIG_SPE
++      for (reg = 0; reg < 32; reg++)
++              PACK64(ptr, p->thread.evr[reg]);
++#else
++      ptr += 32;
++#endif
++#else
++      /* fp registers not used by kernel, leave zero */
++      ptr += 32 * 8 / sizeof(long);
++#endif
++      PACK64(ptr, regs->nip);
++      PACK64(ptr, regs->msr);
++      PACK32(ptr, regs->ccr);
++      PACK64(ptr, regs->link);
++      PACK64(ptr, regs->ctr);
++      PACK32(ptr, regs->xer);
++
++#if 0
++      Following are in struct thread_struct, not struct pt_regs,
++      ignoring for now since kernel does not use them.  Would it
++      make sense to get them from the thread that kgdb is set to?
++
++      If this code is enabled, update the definition of NUMREGBYTES to
++      include the vector registers and vector state registers.
++
++      PACK32(ptr, p->thread->fpscr);
++
++      /* vr registers not used by kernel, leave zero */
++      ptr += 32 * 16 / sizeof(long);
++
++#ifdef CONFIG_ALTIVEC
++      PACK32(ptr, p->thread->vscr);
++      PACK32(ptr, p->thread->vrsave);
++#else
++      ptr += 2 * 4 / sizeof(long);
++#endif
++#else
++#ifdef CONFIG_FSL_BOOKE
++#ifdef CONFIG_SPE
++      /* u64 acc */
++      PACK32(ptr, p->thread.acc >> 32);
++      PACK32(ptr, p->thread.acc & 0xffffffff);
++      PACK64(ptr, p->thread.spefscr);
++#else
++      ptr += 2 + 1;
++#endif
++#else
++      /* fpscr not used by kernel, leave zero */
++      PACK32(ptr, 0);
++#endif
++#endif
++
++      BUG_ON((unsigned long)ptr >
++             (unsigned long)(((void *)gdb_regs) + NUMREGBYTES));
++}
++
++#define UNPACK64(dest,ptr) do { dest = *(ptr++); } while(0)
++
++#define UNPACK32(dest,ptr) do {       \
++      u32 *ptr32;                   \
++      ptr32 = (u32 *)ptr;           \
++      dest = *(ptr32++);            \
++      ptr = (unsigned long *)ptr32; \
++      } while(0)
++
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      unsigned long *ptr = gdb_regs;
++      int reg;
++
++#ifdef CONFIG_SPE
++      union {
++              u32 v32[2];
++              u64 v64;
++      } acc;
++#endif
++      for (reg = 0; reg < 32; reg++)
++              UNPACK64(regs->gpr[reg], ptr);
++
++#ifdef CONFIG_FSL_BOOKE
++#ifdef CONFIG_SPE
++      for (reg = 0; reg < 32; reg++)
++              UNPACK64(current->thread.evr[reg], ptr);
++#else
++      ptr += 32;
++#endif
++#else
++      /* skip the 32 8-byte fp slots; ptr counts longs, as in regs_to_gdb_regs */
++      ptr += 32 * 8 / sizeof(long);
++#endif
++      UNPACK64(regs->nip, ptr);
++      UNPACK64(regs->msr, ptr);
++      UNPACK32(regs->ccr, ptr);
++      UNPACK64(regs->link, ptr);
++      UNPACK64(regs->ctr, ptr);
++      UNPACK32(regs->xer, ptr);
++
++#if 0
++      Following are in struct thread_struct, not struct pt_regs,
++      ignoring for now since kernel does not use them.  Would it
++      make sense to get them from the thread that kgdb is set to?
++
++      If this code is enabled, update the definition of NUMREGBYTES to
++      include the vector registers and vector state registers.
++
++      /* fpscr, vscr, vrsave not used by kernel, leave unchanged */
++
++      UNPACK32(current->thread->fpscr, ptr);
++
++      /* vr registers not used by kernel, leave zero */
++      ptr += 32 * 16 / sizeof(long);
++
++      #ifdef CONFIG_ALTIVEC
++      UNPACK32(current->thread->vscr, ptr);
++      UNPACK32(current->thread->vrsave, ptr);
++#else
++      ptr += 2 * 4 / sizeof(long);
++#endif
++#else
++#ifdef CONFIG_FSL_BOOKE
++#ifdef CONFIG_SPE
++      /* u64 acc */
++      UNPACK32(acc.v32[0], ptr);
++      UNPACK32(acc.v32[1], ptr);
++      current->thread.acc = acc.v64;
++      UNPACK64(current->thread.spefscr, ptr);
++#else
++      ptr += 2 + 1;
++#endif
++#endif
++#endif
++
++      BUG_ON((unsigned long)ptr >
++             (unsigned long)(((void *)gdb_regs) + NUMREGBYTES));
++}
++
++/*
++ * This function does PowerPC specific processing for interfacing to gdb.
++ */
++int kgdb_arch_handle_exception(int vector, int signo, int err_code,
++                             char *remcom_in_buffer, char *remcom_out_buffer,
++                             struct pt_regs *linux_regs)
++{
++      char *ptr = &remcom_in_buffer[1];
++      unsigned long addr;
++
++      switch (remcom_in_buffer[0]) {
++              /*
++               * sAA..AA   Step one instruction from AA..AA
++               * This will return an error to gdb ..
++               */
++      case 's':
++      case 'c':
++              /* handle the optional parameter */
++              if (kgdb_hex2long(&ptr, &addr))
++                      linux_regs->nip = addr;
++
++              atomic_set(&cpu_doing_single_step, -1);
++              /* set the trace bit if we're stepping */
++              if (remcom_in_buffer[0] == 's') {
++#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
++                      mtspr(SPRN_DBCR0,
++                              mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM);
++                      linux_regs->msr |= MSR_DE;
++#else
++                      linux_regs->msr |= MSR_SE;
++#endif
++                      debugger_step = 1;
++                      if (kgdb_contthread)
++                              atomic_set(&cpu_doing_single_step,
++                                         smp_processor_id());
++              }
++              return 0;
++      }
++
++      return -1;
++}
++
++int kgdb_fault_setjmp(unsigned long *curr_context)
++{
++#ifdef CONFIG_PPC32
++      __asm__ __volatile__("mflr 0; stw 0,0(%0);\n\
++                              stw 1,4(%0); stw 2,8(%0);\n\
++                              mfcr 0; stw 0,12(%0);\n\
++                              stmw 13,16(%0)\n" : : "r" (curr_context));
++#else
++      __asm__ __volatile__("mflr 0; std 0,0(%0)\n\
++                            std       1,8(%0)\n\
++                            std       2,16(%0)\n\
++                            mfcr 0; std 0,24(%0)\n\
++                            std       13,32(%0)\n\
++                            std       14,40(%0)\n\
++                            std       15,48(%0)\n\
++                            std       16,56(%0)\n\
++                            std       17,64(%0)\n\
++                            std       18,72(%0)\n\
++                            std       19,80(%0)\n\
++                            std       20,88(%0)\n\
++                            std       21,96(%0)\n\
++                            std       22,104(%0)\n\
++                            std       23,112(%0)\n\
++                            std       24,120(%0)\n\
++                            std       25,128(%0)\n\
++                            std       26,136(%0)\n\
++                            std       27,144(%0)\n\
++                            std       28,152(%0)\n\
++                            std       29,160(%0)\n\
++                            std       30,168(%0)\n\
++                            std       31,176(%0)\n" : : "r" (curr_context));
++#endif
++      return 0;
++}
++
++void kgdb_fault_longjmp(unsigned long *curr_context)
++{
++#ifdef CONFIG_PPC32
++      __asm__ __volatile__("lmw 13,16(%0);\n\
++                            lwz 0,12(%0); mtcrf 0x38,0;\n\
++                            lwz 0,0(%0); lwz 1,4(%0); lwz 2,8(%0);\n\
++                            mtlr 0; mr 3,1\n" : : "r" (curr_context));
++#else
++      __asm__ __volatile__("ld        13,32(%0)\n\
++                            ld        14,40(%0)\n\
++                            ld        15,48(%0)\n\
++                            ld        16,56(%0)\n\
++                            ld        17,64(%0)\n\
++                            ld        18,72(%0)\n\
++                            ld        19,80(%0)\n\
++                            ld        20,88(%0)\n\
++                            ld        21,96(%0)\n\
++                            ld        22,104(%0)\n\
++                            ld        23,112(%0)\n\
++                            ld        24,120(%0)\n\
++                            ld        25,128(%0)\n\
++                            ld        26,136(%0)\n\
++                            ld        27,144(%0)\n\
++                            ld        28,152(%0)\n\
++                            ld        29,160(%0)\n\
++                            ld        30,168(%0)\n\
++                            ld        31,176(%0)\n\
++                            ld        0,24(%0)\n\
++                            mtcrf     0x38,0\n\
++                            ld        0,0(%0)\n\
++                            ld        1,8(%0)\n\
++                            ld        2,16(%0)\n\
++                            mtlr      0\n\
++                            mr        3,1\n" : : "r" (curr_context));
++#endif
++}
++
++/*
++ * Global data
++ */
++struct kgdb_arch arch_kgdb_ops = {
++      .gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08},
++};
++
++int kgdb_not_implemented(struct pt_regs *regs)
++{
++      return 0;
++}
++
++int kgdb_arch_init(void)
++{
++#ifdef CONFIG_XMON
++#error Both XMON and KGDB selected in .config.  Unselect one of them.
++#endif
++
++      __debugger_ipi = kgdb_call_nmi_hook;
++      __debugger = kgdb_debugger;
++      __debugger_bpt = kgdb_breakpoint;
++      __debugger_sstep = kgdb_singlestep;
++      __debugger_iabr_match = kgdb_iabr_match;
++      __debugger_dabr_match = kgdb_dabr_match;
++      __debugger_fault_handler = kgdb_not_implemented;
++
++      return 0;
++}
++
++arch_initcall(kgdb_arch_init);
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/powerpc/kernel/legacy_serial.c linux-2.6.18-53.1.14.kgdb/arch/powerpc/kernel/legacy_serial.c
+--- linux-2.6.18-53.1.14/arch/powerpc/kernel/legacy_serial.c   2008-03-06 05:54:47.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/powerpc/kernel/legacy_serial.c      2008-06-10 15:38:14.000000000 +0400
+@@ -11,6 +11,9 @@
+ #include <asm/udbg.h>
+ #include <asm/pci-bridge.h>
+ #include <asm/ppc-pci.h>
++#ifdef CONFIG_KGDB_8250
++#include <linux/kgdb.h>
++#endif
+ 
+ #undef DEBUG
+ 
+@@ -485,6 +488,9 @@ static int __init serial_dev_init(void)
+                       fixup_port_pio(i, np, port);
+               if ((port->iotype == UPIO_MEM) || (port->iotype == UPIO_TSI))
+                       fixup_port_mmio(i, np, port);
++#ifdef CONFIG_KGDB_8250
++              kgdb8250_add_platform_port(i, port);
++#endif
+       }
+ 
+       DBG("Registering platform serial ports\n");
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/powerpc/kernel/setup_32.c linux-2.6.18-53.1.14.kgdb/arch/powerpc/kernel/setup_32.c
+--- linux-2.6.18-53.1.14/arch/powerpc/kernel/setup_32.c        2008-03-06 05:54:45.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/powerpc/kernel/setup_32.c   2008-06-10 15:38:14.000000000 +0400
+@@ -45,10 +45,6 @@
+ 
+ #define DBG(fmt...)
+ 
+-#if defined CONFIG_KGDB
+-#include <asm/kgdb.h>
+-#endif
+-
+ extern void bootx_init(unsigned long r4, unsigned long phys);
+ 
+ struct ide_machdep_calls ppc_ide_md;
+@@ -251,18 +247,6 @@ void __init setup_arch(char **cmdline_p)
+ 
+       xmon_setup();
+ 
+-#if defined(CONFIG_KGDB)
+-      if (ppc_md.kgdb_map_scc)
+-              ppc_md.kgdb_map_scc();
+-      set_debug_traps();
+-      if (strstr(cmd_line, "gdb")) {
+-              if (ppc_md.progress)
+-                      ppc_md.progress("setup_arch: kgdb breakpoint", 0x4000);
+-              printk("kgdb breakpoint activated\n");
+-              breakpoint();
+-      }
+-#endif
+-
+       /*
+        * Set cache line size based on type of cpu as a default.
+        * Systems with OF can look in the properties on the cpu node(s)
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/powerpc/mm/fault.c linux-2.6.18-53.1.14.kgdb/arch/powerpc/mm/fault.c
+--- linux-2.6.18-53.1.14/arch/powerpc/mm/fault.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/powerpc/mm/fault.c  2008-06-10 15:38:14.000000000 +0400
+@@ -28,6 +28,7 @@
+ #include <linux/highmem.h>
+ #include <linux/module.h>
+ #include <linux/kprobes.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/page.h>
+ #include <asm/pgtable.h>
+@@ -424,6 +425,13 @@ void bad_page_fault(struct pt_regs *regs
+               return;
+       }
+ 
++#ifdef CONFIG_KGDB
++      if (atomic_read(&debugger_active) && kgdb_may_fault)
++              /* Restore our previous state. */
++              kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++              /* Not reached. */
++#endif
++
+       /* kernel has accessed a bad area */
+ 
+       printk(KERN_ALERT "Unable to handle kernel paging request for ");
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/powerpc/platforms/powermac/setup.c linux-2.6.18-53.1.14.kgdb/arch/powerpc/platforms/powermac/setup.c
+--- linux-2.6.18-53.1.14/arch/powerpc/platforms/powermac/setup.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/powerpc/platforms/powermac/setup.c  2008-06-10 15:38:14.000000000 +0400
+@@ -98,8 +98,6 @@ extern struct machdep_calls pmac_md;
+ int sccdbg;
+ #endif
+ 
+-extern void zs_kgdb_hook(int tty_num);
+-
+ sys_ctrler_t sys_ctrler = SYS_CTRLER_UNKNOWN;
+ EXPORT_SYMBOL(sys_ctrler);
+ 
+@@ -319,10 +317,6 @@ static void __init pmac_setup_arch(void)
+       l2cr_init();
+ #endif /* CONFIG_PPC32 */
+ 
+-#ifdef CONFIG_KGDB
+-      zs_kgdb_hook(0);
+-#endif
+-
+       find_via_cuda();
+       find_via_pmu();
+       smu_init();
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/Kconfig.debug linux-2.6.18-53.1.14.kgdb/arch/ppc/Kconfig.debug
+--- linux-2.6.18-53.1.14/arch/ppc/Kconfig.debug        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/Kconfig.debug   2008-06-10 15:38:14.000000000 +0400
+@@ -2,42 +2,6 @@ menu "Kernel hacking"
+ 
+ source "lib/Kconfig.debug"
+ 
+-config KGDB
+-      bool "Include kgdb kernel debugger"
+-      depends on DEBUG_KERNEL && (BROKEN || PPC_GEN550 || 4xx)
+-      select DEBUG_INFO
+-      help
+-        Include in-kernel hooks for kgdb, the Linux kernel source level
+-        debugger.  See <http://kgdb.sourceforge.net/> for more information.
+-        Unless you are intending to debug the kernel, say N here.
+-
+-choice
+-      prompt "Serial Port"
+-      depends on KGDB
+-      default KGDB_TTYS1
+-
+-config KGDB_TTYS0
+-      bool "ttyS0"
+-
+-config KGDB_TTYS1
+-      bool "ttyS1"
+-
+-config KGDB_TTYS2
+-      bool "ttyS2"
+-
+-config KGDB_TTYS3
+-      bool "ttyS3"
+-
+-endchoice
+-
+-config KGDB_CONSOLE
+-      bool "Enable serial console thru kgdb port"
+-      depends on KGDB && 8xx || CPM2
+-      help
+-        If you enable this, all serial console messages will be sent
+-        over the gdb stub.
+-        If unsure, say N.
+-
+ config XMON
+       bool "Include xmon kernel debugger"
+       depends on DEBUG_KERNEL
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/arch/ppc/kernel/kgdb.c
+--- linux-2.6.18-53.1.14/arch/ppc/kernel/kgdb.c        1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/kernel/kgdb.c   2008-06-10 15:39:34.000000000 +0400
+@@ -0,0 +1,350 @@
++/*
++ * arch/ppc/kernel/kgdb.c
++ *
++ * PowerPC backend to the KGDB stub.
++ *
++ * Maintainer: Tom Rini <trini@kernel.crashing.org>
++ *
++ * 1998 (c) Michael AK Tesch (tesch@cs.wisc.edu)
++ * Copyright (C) 2003 Timesys Corporation.
++ * 2004 (c) MontaVista Software, Inc.
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program as licensed "as is" without any warranty of any
++ * kind, whether express or implied.
++ */
++
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/kgdb.h>
++#include <linux/smp.h>
++#include <linux/signal.h>
++#include <linux/ptrace.h>
++#include <asm/current.h>
++#include <asm/ptrace.h>
++#include <asm/processor.h>
++#include <asm/machdep.h>
++
++/*
++ * This table contains the mapping between PowerPC hardware trap types, and
++ * signals, which are primarily what GDB understands.  GDB and the kernel
++ * don't always agree on values, so we use constants taken from gdb-6.2.
++ */
++static struct hard_trap_info
++{
++      unsigned int tt;                /* Trap type code for powerpc */
++      unsigned char signo;            /* Signal that we map this trap into */
++} hard_trap_info[] = {
++#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
++      { 0x0100, 0x02 /* SIGINT */  },         /* critical input interrupt */
++      { 0x0200, 0x0b /* SIGSEGV */ },         /* machine check */
++      { 0x0300, 0x0b /* SIGSEGV */ },         /* data storage */
++      { 0x0400, 0x0a /* SIGBUS */  },         /* instruction storage */
++      { 0x0500, 0x02 /* SIGINT */  },         /* interrupt */
++      { 0x0600, 0x0a /* SIGBUS */  },         /* alignment */
++      { 0x0700, 0x04 /* SIGILL */  },         /* program */
++      { 0x0800, 0x04 /* SIGILL */  },         /* reserved */
++      { 0x0900, 0x04 /* SIGILL */  },         /* reserved */
++      { 0x0a00, 0x04 /* SIGILL */  },         /* reserved */
++      { 0x0b00, 0x04 /* SIGILL */  },         /* reserved */
++      { 0x0c00, 0x14 /* SIGCHLD */ },         /* syscall */
++      { 0x0d00, 0x04 /* SIGILL */  },         /* reserved */
++      { 0x0e00, 0x04 /* SIGILL */  },         /* reserved */
++      { 0x0f00, 0x04 /* SIGILL */  },         /* reserved */
++      { 0x2002, 0x05 /* SIGTRAP */},          /* debug */
++#else
++      { 0x0200, 0x0b /* SIGSEGV */ },         /* machine check */
++      { 0x0300, 0x0b /* SIGSEGV */ },         /* address error (store) */
++      { 0x0400, 0x0a /* SIGBUS */ },          /* instruction bus error */
++      { 0x0500, 0x02 /* SIGINT */ },          /* interrupt */
++      { 0x0600, 0x0a /* SIGBUS */ },          /* alignment */
++      { 0x0700, 0x05 /* SIGTRAP */ },         /* breakpoint trap */
++      { 0x0800, 0x08 /* SIGFPE */},           /* fpu unavail */
++      { 0x0900, 0x0e /* SIGALRM */ },         /* decrementer */
++      { 0x0a00, 0x04 /* SIGILL */ },          /* reserved */
++      { 0x0b00, 0x04 /* SIGILL */ },          /* reserved */
++      { 0x0c00, 0x14 /* SIGCHLD */ },         /* syscall */
++      { 0x0d00, 0x05 /* SIGTRAP */ },         /* single-step/watch */
++      { 0x0e00, 0x08 /* SIGFPE */ },          /* fp assist */
++#endif
++      { 0x0000, 0x000 }                       /* Must be last */
++};
++
++extern atomic_t cpu_doing_single_step;
++
++static int computeSignal(unsigned int tt)
++{
++      struct hard_trap_info *ht;
++
++      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
++              if (ht->tt == tt)
++                      return ht->signo;
++
++      return SIGHUP;          /* default for things we don't know about */
++}
++
++/* KGDB functions to use existing PowerPC hooks. */
++static void kgdb_debugger(struct pt_regs *regs)
++{
++      kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
++}
++
++static int kgdb_breakpoint(struct pt_regs *regs)
++{
++      if (user_mode(regs))
++              return 0;
++
++      kgdb_handle_exception(0, SIGTRAP, 0, regs);
++
++      if (*(u32 *) (regs->nip) == *(u32 *) (&arch_kgdb_ops.gdb_bpt_instr))
++              regs->nip += 4;
++
++      return 1;
++}
++
++static int kgdb_singlestep(struct pt_regs *regs)
++{
++      struct thread_info *thread_info, *exception_thread_info;
++
++      if (user_mode(regs))
++              return 0;
++      /*
++      * On Book E and perhaps other processors, singlestep is handled on
++      * the critical exception stack.  This causes current_thread_info()
++      * to fail, since it locates the thread_info by masking off
++      * the low bits of the current stack pointer.  We work around
++      * this issue by copying the thread_info from the kernel stack
++      * before calling kgdb_handle_exception, and copying it back
++      * afterwards.  On most processors the copy is avoided since
++      * exception_thread_info == thread_info.
++      */
++      thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1));
++      exception_thread_info = current_thread_info();
++
++      if (thread_info != exception_thread_info)
++              memcpy(exception_thread_info, thread_info, sizeof *thread_info);
++
++      kgdb_handle_exception(0, SIGTRAP, 0, regs);
++
++      if (thread_info != exception_thread_info)
++              memcpy(thread_info, exception_thread_info, sizeof *thread_info);
++
++      return 1;
++}
++
++int kgdb_iabr_match(struct pt_regs *regs)
++{
++      if (user_mode(regs))
++              return 0;
++
++      kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
++      return 1;
++}
++
++int kgdb_dabr_match(struct pt_regs *regs)
++{
++      if (user_mode(regs))
++              return 0;
++
++      kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
++      return 1;
++}
++
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      int reg;
++      unsigned long *ptr = gdb_regs;
++
++      memset(gdb_regs, 0, MAXREG * 4);
++
++      for (reg = 0; reg < 32; reg++)
++              *(ptr++) = regs->gpr[reg];
++
++#ifndef CONFIG_E500
++      for (reg = 0; reg < 64; reg++)
++              *(ptr++) = 0;
++#else
++      for (reg = 0; reg < 32; reg++)
++              *(ptr++) = current->thread.evr[reg];
++#endif
++
++      *(ptr++) = regs->nip;
++      *(ptr++) = regs->msr;
++      *(ptr++) = regs->ccr;
++      *(ptr++) = regs->link;
++      *(ptr++) = regs->ctr;
++      *(ptr++) = regs->xer;
++
++#ifdef CONFIG_SPE
++      /* u64 acc */
++      *(ptr++) = (current->thread.acc >> 32);
++      *(ptr++) = (current->thread.acc & 0xffffffff);
++      *(ptr++) = current->thread.spefscr;
++#endif
++}
++
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
++{
++      struct pt_regs *regs = (struct pt_regs *)(p->thread.ksp +
++                                                STACK_FRAME_OVERHEAD);
++      int reg;
++      unsigned long *ptr = gdb_regs;
++
++      memset(gdb_regs, 0, MAXREG * 4);
++
++      /* Regs GPR0-2 */
++      for (reg = 0; reg < 3; reg++)
++              *(ptr++) = regs->gpr[reg];
++
++      /* Regs GPR3-13 are not saved */
++      for (reg = 3; reg < 14; reg++)
++              *(ptr++) = 0;
++
++      /* Regs GPR14-31 */
++      for (reg = 14; reg < 32; reg++)
++              *(ptr++) = regs->gpr[reg];
++
++#ifndef CONFIG_E500
++      for (reg = 0; reg < 64; reg++)
++              *(ptr++) = 0;
++#else /* evr[] must come from the sleeping task p, not from current */
++      for (reg = 0; reg < 32; reg++)
++              *(ptr++) = p->thread.evr[reg];
++#endif
++
++      *(ptr++) = regs->nip;
++      *(ptr++) = regs->msr;
++      *(ptr++) = regs->ccr;
++      *(ptr++) = regs->link;
++      *(ptr++) = regs->ctr;
++      *(ptr++) = regs->xer;
++
++#ifdef CONFIG_SPE
++      /* u64 acc of task p (not current): high word first, then low word */
++      *(ptr++) = (p->thread.acc >> 32);
++      *(ptr++) = (p->thread.acc & 0xffffffff);
++      *(ptr++) = p->thread.spefscr;
++#endif
++}
++
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      int reg;
++      unsigned long *ptr = gdb_regs;
++#ifdef CONFIG_SPE
++      union {
++              u32 v32[2];
++              u64 v64;
++      } u;
++#endif
++
++      for (reg = 0; reg < 32; reg++)
++              regs->gpr[reg] = *(ptr++);
++
++#ifndef CONFIG_E500
++      for (reg = 0; reg < 64; reg++)
++              ptr++;
++#else
++      for (reg = 0; reg < 32; reg++)
++              current->thread.evr[reg] = *(ptr++);
++#endif
++
++      regs->nip = *(ptr++);
++      regs->msr = *(ptr++);
++      regs->ccr = *(ptr++);
++      regs->link = *(ptr++);
++      regs->ctr = *(ptr++);
++      regs->xer = *(ptr++);
++
++#ifdef CONFIG_SPE
++      /* u64 acc */
++      u.v32[0] = *(ptr++);
++      u.v32[1] = *(ptr++);
++      current->thread.acc = u.v64;
++      current->thread.spefscr = *(ptr++);
++#endif
++}
++
++/*
++ * Save/restore state in case a memory access causes a fault.
++ */
++int kgdb_fault_setjmp(unsigned long *curr_context)
++{
++      __asm__ __volatile__("mflr 0; stw 0,0(%0);"
++                           "stw 1,4(%0); stw 2,8(%0);"
++                           "mfcr 0; stw 0,12(%0);"
++                           "stmw 13,16(%0)"::"r"(curr_context));
++      return 0;
++}
++
++void kgdb_fault_longjmp(unsigned long *curr_context)
++{
++      /* Reload the kgdb_fault_setjmp context; r3 = 1 is the nonzero result. */
++      __asm__ __volatile__("lmw 13,16(%0); lwz 0,12(%0); mtcrf 0x38,0;"
++                           "lwz 0,0(%0); lwz 1,4(%0); lwz 2,8(%0);"
++                           "mtlr 0; li 3,1"::"r"(curr_context));
++}
++
++/*
++ * This function does PowerPC specific processing for interfacing to gdb.
++ */
++int kgdb_arch_handle_exception(int vector, int signo, int err_code,
++                             char *remcom_in_buffer, char *remcom_out_buffer,
++                             struct pt_regs *linux_regs)
++{
++      char *ptr = &remcom_in_buffer[1];
++      unsigned long addr;
++
++      switch (remcom_in_buffer[0])
++              {
++              /*
++               * sAA..AA   Step one instruction from AA..AA
++               * This will return an error to gdb ..
++               */
++              case 's':
++              case 'c':
++                      /* handle the optional parameter */
++                      if (kgdb_hex2long (&ptr, &addr))
++                              linux_regs->nip = addr;
++
++                      atomic_set(&cpu_doing_single_step, -1);
++                      /* set the trace bit if we're stepping */
++                      if (remcom_in_buffer[0] == 's') {
++#if defined (CONFIG_40x) || defined(CONFIG_BOOKE)
++                              mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) |
++                                              DBCR0_IC | DBCR0_IDM);
++                              linux_regs->msr |= MSR_DE;
++#else
++                              linux_regs->msr |= MSR_SE;
++#endif
++                              debugger_step = 1;
++                              if (kgdb_contthread)
++                                      atomic_set(&cpu_doing_single_step,
++                                                      smp_processor_id());
++                      }
++                      return 0;
++      }
++
++      return -1;
++}
++
++/*
++ * Global data
++ */
++struct kgdb_arch arch_kgdb_ops = {
++      .gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08},
++};
++
++int kgdb_arch_init(void)
++{
++      debugger = kgdb_debugger;
++      debugger_bpt = kgdb_breakpoint;
++      debugger_sstep = kgdb_singlestep;
++      debugger_iabr_match = kgdb_iabr_match;
++      debugger_dabr_match = kgdb_dabr_match;
++
++      return 0;
++}
++
++arch_initcall(kgdb_arch_init);
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/kernel/ppc-stub.c linux-2.6.18-53.1.14.kgdb/arch/ppc/kernel/ppc-stub.c
+--- linux-2.6.18-53.1.14/arch/ppc/kernel/ppc-stub.c    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/kernel/ppc-stub.c       1970-01-01 03:00:00.000000000 +0300
+@@ -1,866 +0,0 @@
+-/*
+- * ppc-stub.c:  KGDB support for the Linux kernel.
+- *
+- * adapted from arch/sparc/kernel/sparc-stub.c for the PowerPC
+- * some stuff borrowed from Paul Mackerras' xmon
+- * Copyright (C) 1998 Michael AK Tesch (tesch@cs.wisc.edu)
+- *
+- * Modifications to run under Linux
+- * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
+- *
+- * This file originally came from the gdb sources, and the
+- * copyright notices have been retained below.
+- */
+-
+-/****************************************************************************
+-
+-              THIS SOFTWARE IS NOT COPYRIGHTED
+-
+-   HP offers the following for use in the public domain.  HP makes no
+-   warranty with regard to the software or its performance and the
+-   user accepts the software "AS IS" with all faults.
+-
+-   HP DISCLAIMS ANY WARRANTIES, EXPRESS OR IMPLIED, WITH REGARD
+-   TO THIS SOFTWARE INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+-   OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+-
+-****************************************************************************/
+-
+-/****************************************************************************
+- *  Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $
+- *
+- *  Module name: remcom.c $
+- *  Revision: 1.34 $
+- *  Date: 91/03/09 12:29:49 $
+- *  Contributor:     Lake Stevens Instrument Division$
+- *
+- *  Description:     low level support for gdb debugger. $
+- *
+- *  Considerations:  only works on target hardware $
+- *
+- *  Written by:      Glenn Engel $
+- *  ModuleState:     Experimental $
+- *
+- *  NOTES:           See Below $
+- *
+- *  Modified for SPARC by Stu Grossman, Cygnus Support.
+- *
+- *  This code has been extensively tested on the Fujitsu SPARClite demo board.
+- *
+- *  To enable debugger support, two things need to happen.  One, a
+- *  call to set_debug_traps() is necessary in order to allow any breakpoints
+- *  or error conditions to be properly intercepted and reported to gdb.
+- *  Two, a breakpoint needs to be generated to begin communication.  This
+- *  is most easily accomplished by a call to breakpoint().  Breakpoint()
+- *  simulates a breakpoint by executing a trap #1.
+- *
+- *************
+- *
+- *    The following gdb commands are supported:
+- *
+- * command          function                    Return value
+- *
+- *    g             return the value of the CPU registers  hex data or ENN
+- *    G             set the value of the CPU registers     OK or ENN
+- *    qOffsets      Get section offsets.  Reply is Text=xxx;Data=yyy;Bss=zzz
+- *
+- *    mAA..AA,LLLL  Read LLLL bytes at address AA..AA      hex data or ENN
+- *    MAA..AA,LLLL: Write LLLL bytes at address AA.AA      OK or ENN
+- *
+- *    c             Resume at current address              SNN   ( signal NN)
+- *    cAA..AA       Continue at address AA..AA             SNN
+- *
+- *    s             Step one instruction                   SNN
+- *    sAA..AA       Step one instruction from AA..AA       SNN
+- *
+- *    k             kill
+- *
+- *    ?             What was the last sigval ?             SNN   (signal NN)
+- *
+- *    bBB..BB     Set baud rate to BB..BB                OK or BNN, then sets
+- *                                                       baud rate
+- *
+- * All commands and responses are sent with a packet which includes a
+- * checksum.  A packet consists of
+- *
+- * $<packet info>#<checksum>.
+- *
+- * where
+- * <packet info> :: <characters representing the command or response>
+- * <checksum>    :: <two hex digits computed as modulo 256 sum of <packetinfo>>
+- *
+- * When a packet is received, it is first acknowledged with either '+' or '-'.
+- * '+' indicates a successful transfer.  '-' indicates a failed transfer.
+- *
+- * Example:
+- *
+- * Host:                  Reply:
+- * $m0,10#2a               +$00010203040506070809101112131415#42
+- *
+- ****************************************************************************/
+-
+-#include <linux/kernel.h>
+-#include <linux/string.h>
+-#include <linux/mm.h>
+-#include <linux/smp.h>
+-#include <linux/smp_lock.h>
+-#include <linux/init.h>
+-#include <linux/sysrq.h>
+-
+-#include <asm/cacheflush.h>
+-#include <asm/system.h>
+-#include <asm/signal.h>
+-#include <asm/kgdb.h>
+-#include <asm/pgtable.h>
+-#include <asm/ptrace.h>
+-
+-void breakinst(void);
+-
+-/*
+- * BUFMAX defines the maximum number of characters in inbound/outbound buffers
+- * at least NUMREGBYTES*2 are needed for register packets
+- */
+-#define BUFMAX 2048
+-static char remcomInBuffer[BUFMAX];
+-static char remcomOutBuffer[BUFMAX];
+-
+-static int initialized;
+-static int kgdb_active;
+-static int kgdb_started;
+-static u_int fault_jmp_buf[100];
+-static int kdebug;
+-
+-
+-static const char hexchars[]="0123456789abcdef";
+-
+-/* Place where we save old trap entries for restoration - sparc*/
+-/* struct tt_entry kgdb_savettable[256]; */
+-/* typedef void (*trapfunc_t)(void); */
+-
+-static void kgdb_fault_handler(struct pt_regs *regs);
+-static int handle_exception (struct pt_regs *regs);
+-
+-#if 0
+-/* Install an exception handler for kgdb */
+-static void exceptionHandler(int tnum, unsigned int *tfunc)
+-{
+-      /* We are dorking with a live trap table, all irqs off */
+-}
+-#endif
+-
+-int
+-kgdb_setjmp(long *buf)
+-{
+-      asm ("mflr 0; stw 0,0(%0);"
+-           "stw 1,4(%0); stw 2,8(%0);"
+-           "mfcr 0; stw 0,12(%0);"
+-           "stmw 13,16(%0)"
+-           : : "r" (buf));
+-      /* XXX should save fp regs as well */
+-      return 0;
+-}
+-void
+-kgdb_longjmp(long *buf, int val)
+-{
+-      if (val == 0)
+-              val = 1;
+-      asm ("lmw 13,16(%0);"
+-           "lwz 0,12(%0); mtcrf 0x38,0;"
+-           "lwz 0,0(%0); lwz 1,4(%0); lwz 2,8(%0);"
+-           "mtlr 0; mr 3,%1"
+-           : : "r" (buf), "r" (val));
+-}
+-/* Convert ch from a hex digit to an int */
+-static int
+-hex(unsigned char ch)
+-{
+-      if (ch >= 'a' && ch <= 'f')
+-              return ch-'a'+10;
+-      if (ch >= '0' && ch <= '9')
+-              return ch-'0';
+-      if (ch >= 'A' && ch <= 'F')
+-              return ch-'A'+10;
+-      return -1;
+-}
+-
+-/* Convert the memory pointed to by mem into hex, placing result in buf.
+- * Return a pointer to the last char put in buf (null), in case of mem fault,
+- * return 0.
+- */
+-static unsigned char *
+-mem2hex(const char *mem, char *buf, int count)
+-{
+-      unsigned char ch;
+-      unsigned short tmp_s;
+-      unsigned long tmp_l;
+-
+-      if (kgdb_setjmp((long*)fault_jmp_buf) == 0) {
+-              debugger_fault_handler = kgdb_fault_handler;
+-
+-              /* Accessing 16 bit and 32 bit objects in a single
+-              ** load instruction is required to avoid bad side
+-              ** effects for some IO registers.
+-              */
+-
+-              if ((count == 2) && (((long)mem & 1) == 0)) {
+-                      tmp_s = *(unsigned short *)mem;
+-                      mem += 2;
+-                      *buf++ = hexchars[(tmp_s >> 12) & 0xf];
+-                      *buf++ = hexchars[(tmp_s >> 8) & 0xf];
+-                      *buf++ = hexchars[(tmp_s >> 4) & 0xf];
+-                      *buf++ = hexchars[tmp_s & 0xf];
+-
+-              } else if ((count == 4) && (((long)mem & 3) == 0)) {
+-                      tmp_l = *(unsigned int *)mem;
+-                      mem += 4;
+-                      *buf++ = hexchars[(tmp_l >> 28) & 0xf];
+-                      *buf++ = hexchars[(tmp_l >> 24) & 0xf];
+-                      *buf++ = hexchars[(tmp_l >> 20) & 0xf];
+-                      *buf++ = hexchars[(tmp_l >> 16) & 0xf];
+-                      *buf++ = hexchars[(tmp_l >> 12) & 0xf];
+-                      *buf++ = hexchars[(tmp_l >> 8) & 0xf];
+-                      *buf++ = hexchars[(tmp_l >> 4) & 0xf];
+-                      *buf++ = hexchars[tmp_l & 0xf];
+-
+-              } else {
+-                      while (count-- > 0) {
+-                              ch = *mem++;
+-                              *buf++ = hexchars[ch >> 4];
+-                              *buf++ = hexchars[ch & 0xf];
+-                      }
+-              }
+-
+-      } else {
+-              /* error condition */
+-      }
+-      debugger_fault_handler = NULL;
+-      *buf = 0;
+-      return buf;
+-}
+-
+-/* convert the hex array pointed to by buf into binary to be placed in mem
+- * return a pointer to the character AFTER the last byte written.
+-*/
+-static char *
+-hex2mem(char *buf, char *mem, int count)
+-{
+-      unsigned char ch;
+-      int i;
+-      char *orig_mem;
+-      unsigned short tmp_s;
+-      unsigned long tmp_l;
+-
+-      orig_mem = mem;
+-
+-      if (kgdb_setjmp((long*)fault_jmp_buf) == 0) {
+-              debugger_fault_handler = kgdb_fault_handler;
+-
+-              /* Accessing 16 bit and 32 bit objects in a single
+-              ** store instruction is required to avoid bad side
+-              ** effects for some IO registers.
+-              */
+-
+-              if ((count == 2) && (((long)mem & 1) == 0)) {
+-                      tmp_s = hex(*buf++) << 12;
+-                      tmp_s |= hex(*buf++) << 8;
+-                      tmp_s |= hex(*buf++) << 4;
+-                      tmp_s |= hex(*buf++);
+-
+-                      *(unsigned short *)mem = tmp_s;
+-                      mem += 2;
+-
+-              } else if ((count == 4) && (((long)mem & 3) == 0)) {
+-                      tmp_l = hex(*buf++) << 28;
+-                      tmp_l |= hex(*buf++) << 24;
+-                      tmp_l |= hex(*buf++) << 20;
+-                      tmp_l |= hex(*buf++) << 16;
+-                      tmp_l |= hex(*buf++) << 12;
+-                      tmp_l |= hex(*buf++) << 8;
+-                      tmp_l |= hex(*buf++) << 4;
+-                      tmp_l |= hex(*buf++);
+-
+-                      *(unsigned long *)mem = tmp_l;
+-                      mem += 4;
+-
+-              } else {
+-                      for (i=0; i<count; i++) {
+-                              ch = hex(*buf++) << 4;
+-                              ch |= hex(*buf++);
+-                              *mem++ = ch;
+-                      }
+-              }
+-
+-
+-              /*
+-              ** Flush the data cache, invalidate the instruction cache.
+-              */
+-              flush_icache_range((int)orig_mem, (int)orig_mem + count - 1);
+-
+-      } else {
+-              /* error condition */
+-      }
+-      debugger_fault_handler = NULL;
+-      return mem;
+-}
+-
+-/*
+- * While we find nice hex chars, build an int.
+- * Return number of chars processed.
+- */
+-static int
+-hexToInt(char **ptr, int *intValue)
+-{
+-      int numChars = 0;
+-      int hexValue;
+-
+-      *intValue = 0;
+-
+-      if (kgdb_setjmp((long*)fault_jmp_buf) == 0) {
+-              debugger_fault_handler = kgdb_fault_handler;
+-              while (**ptr) {
+-                      hexValue = hex(**ptr);
+-                      if (hexValue < 0)
+-                              break;
+-
+-                      *intValue = (*intValue << 4) | hexValue;
+-                      numChars ++;
+-
+-                      (*ptr)++;
+-              }
+-      } else {
+-              /* error condition */
+-      }
+-      debugger_fault_handler = NULL;
+-
+-      return (numChars);
+-}
+-
+-/* scan for the sequence $<data>#<checksum> */
+-static void
+-getpacket(char *buffer)
+-{
+-      unsigned char checksum;
+-      unsigned char xmitcsum;
+-      int i;
+-      int count;
+-      unsigned char ch;
+-
+-      do {
+-              /* wait around for the start character, ignore all other
+-               * characters */
+-              while ((ch = (getDebugChar() & 0x7f)) != '$') ;
+-
+-              checksum = 0;
+-              xmitcsum = -1;
+-
+-              count = 0;
+-
+-              /* now, read until a # or end of buffer is found */
+-              while (count < BUFMAX) {
+-                      ch = getDebugChar() & 0x7f;
+-                      if (ch == '#')
+-                              break;
+-                      checksum = checksum + ch;
+-                      buffer[count] = ch;
+-                      count = count + 1;
+-              }
+-
+-              if (count >= BUFMAX)
+-                      continue;
+-
+-              buffer[count] = 0;
+-
+-              if (ch == '#') {
+-                      xmitcsum = hex(getDebugChar() & 0x7f) << 4;
+-                      xmitcsum |= hex(getDebugChar() & 0x7f);
+-                      if (checksum != xmitcsum)
+-                              putDebugChar('-');      /* failed checksum */
+-                      else {
+-                              putDebugChar('+'); /* successful transfer */
+-                              /* if a sequence char is present, reply the ID */
+-                              if (buffer[2] == ':') {
+-                                      putDebugChar(buffer[0]);
+-                                      putDebugChar(buffer[1]);
+-                                      /* remove sequence chars from buffer */
+-                                      count = strlen(buffer);
+-                                      for (i=3; i <= count; i++)
+-                                              buffer[i-3] = buffer[i];
+-                              }
+-                      }
+-              }
+-      } while (checksum != xmitcsum);
+-}
+-
+-/* send the packet in buffer. */
+-static void putpacket(unsigned char *buffer)
+-{
+-      unsigned char checksum;
+-      int count;
+-      unsigned char ch, recv;
+-
+-      /* $<packet info>#<checksum>. */
+-      do {
+-              putDebugChar('$');
+-              checksum = 0;
+-              count = 0;
+-
+-              while ((ch = buffer[count])) {
+-                      putDebugChar(ch);
+-                      checksum += ch;
+-                      count += 1;
+-              }
+-
+-              putDebugChar('#');
+-              putDebugChar(hexchars[checksum >> 4]);
+-              putDebugChar(hexchars[checksum & 0xf]);
+-              recv = getDebugChar();
+-      } while ((recv & 0x7f) != '+');
+-}
+-
+-static void kgdb_flush_cache_all(void)
+-{
+-      flush_instruction_cache();
+-}
+-
+-/* Set up exception handlers for tracing and breakpoints
+- * [could be called kgdb_init()]
+- */
+-void set_debug_traps(void)
+-{
+-#if 0
+-      unsigned char c;
+-
+-      save_and_cli(flags);
+-
+-      /* In case GDB is started before us, ack any packets (presumably
+-       * "$?#xx") sitting there.
+-       *
+-       * I've found this code causes more problems than it solves,
+-       * so that's why it's commented out.  GDB seems to work fine
+-       * now starting either before or after the kernel   -bwb
+-       */
+-
+-      while((c = getDebugChar()) != '$');
+-      while((c = getDebugChar()) != '#');
+-      c = getDebugChar(); /* eat first csum byte */
+-      c = getDebugChar(); /* eat second csum byte */
+-      putDebugChar('+'); /* ack it */
+-#endif
+-      debugger = kgdb;
+-      debugger_bpt = kgdb_bpt;
+-      debugger_sstep = kgdb_sstep;
+-      debugger_iabr_match = kgdb_iabr_match;
+-      debugger_dabr_match = kgdb_dabr_match;
+-
+-      initialized = 1;
+-}
+-
+-static void kgdb_fault_handler(struct pt_regs *regs)
+-{
+-      kgdb_longjmp((long*)fault_jmp_buf, 1);
+-}
+-
+-int kgdb_bpt(struct pt_regs *regs)
+-{
+-      return handle_exception(regs);
+-}
+-
+-int kgdb_sstep(struct pt_regs *regs)
+-{
+-      return handle_exception(regs);
+-}
+-
+-void kgdb(struct pt_regs *regs)
+-{
+-      handle_exception(regs);
+-}
+-
+-int kgdb_iabr_match(struct pt_regs *regs)
+-{
+-      printk(KERN_ERR "kgdb doesn't support iabr, what?!?\n");
+-      return handle_exception(regs);
+-}
+-
+-int kgdb_dabr_match(struct pt_regs *regs)
+-{
+-      printk(KERN_ERR "kgdb doesn't support dabr, what?!?\n");
+-      return handle_exception(regs);
+-}
+-
+-/* Convert the hardware trap type code to a unix signal number. */
+-/*
+- * This table contains the mapping between PowerPC hardware trap types, and
+- * signals, which are primarily what GDB understands.
+- */
+-static struct hard_trap_info
+-{
+-      unsigned int tt;                /* Trap type code for powerpc */
+-      unsigned char signo;            /* Signal that we map this trap into */
+-} hard_trap_info[] = {
+-#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
+-      { 0x100, SIGINT  },             /* critical input interrupt */
+-      { 0x200, SIGSEGV },             /* machine check */
+-      { 0x300, SIGSEGV },             /* data storage */
+-      { 0x400, SIGBUS  },             /* instruction storage */
+-      { 0x500, SIGINT  },             /* interrupt */
+-      { 0x600, SIGBUS  },             /* alignment */
+-      { 0x700, SIGILL  },             /* program */
+-      { 0x800, SIGILL  },             /* reserved */
+-      { 0x900, SIGILL  },             /* reserved */
+-      { 0xa00, SIGILL  },             /* reserved */
+-      { 0xb00, SIGILL  },             /* reserved */
+-      { 0xc00, SIGCHLD },             /* syscall */
+-      { 0xd00, SIGILL  },             /* reserved */
+-      { 0xe00, SIGILL  },             /* reserved */
+-      { 0xf00, SIGILL  },             /* reserved */
+-      /*
+-      ** 0x1000  PIT
+-      ** 0x1010  FIT
+-      ** 0x1020  watchdog
+-      ** 0x1100  data TLB miss
+-      ** 0x1200  instruction TLB miss
+-      */
+-      { 0x2002, SIGTRAP},             /* debug */
+-#else
+-      { 0x200, SIGSEGV },             /* machine check */
+-      { 0x300, SIGSEGV },             /* address error (store) */
+-      { 0x400, SIGBUS },              /* instruction bus error */
+-      { 0x500, SIGINT },              /* interrupt */
+-      { 0x600, SIGBUS },              /* alingment */
+-      { 0x700, SIGTRAP },             /* breakpoint trap */
+-      { 0x800, SIGFPE },              /* fpu unavail */
+-      { 0x900, SIGALRM },             /* decrementer */
+-      { 0xa00, SIGILL },              /* reserved */
+-      { 0xb00, SIGILL },              /* reserved */
+-      { 0xc00, SIGCHLD },             /* syscall */
+-      { 0xd00, SIGTRAP },             /* single-step/watch */
+-      { 0xe00, SIGFPE },              /* fp assist */
+-#endif
+-      { 0, 0}                         /* Must be last */
+-
+-};
+-
+-static int computeSignal(unsigned int tt)
+-{
+-      struct hard_trap_info *ht;
+-
+-      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
+-              if (ht->tt == tt)
+-                      return ht->signo;
+-
+-      return SIGHUP; /* default for things we don't know about */
+-}
+-
+-#define PC_REGNUM 64
+-#define SP_REGNUM 1
+-
+-/*
+- * This function does all command processing for interfacing to gdb.
+- */
+-static int
+-handle_exception (struct pt_regs *regs)
+-{
+-      int sigval;
+-      int addr;
+-      int length;
+-      char *ptr;
+-      unsigned int msr;
+-
+-      /* We don't handle user-mode breakpoints. */
+-      if (user_mode(regs))
+-              return 0;
+-
+-      if (debugger_fault_handler) {
+-              debugger_fault_handler(regs);
+-              panic("kgdb longjump failed!\n");
+-      }
+-      if (kgdb_active) {
+-              printk(KERN_ERR "interrupt while in kgdb, returning\n");
+-              return 0;
+-      }
+-
+-      kgdb_active = 1;
+-      kgdb_started = 1;
+-
+-#ifdef KGDB_DEBUG
+-      printk("kgdb: entering handle_exception; trap [0x%x]\n",
+-                      (unsigned int)regs->trap);
+-#endif
+-
+-      kgdb_interruptible(0);
+-      lock_kernel();
+-      msr = mfmsr();
+-      mtmsr(msr & ~MSR_EE);   /* disable interrupts */
+-
+-      if (regs->nip == (unsigned long)breakinst) {
+-              /* Skip over breakpoint trap insn */
+-              regs->nip += 4;
+-      }
+-
+-      /* reply to host that an exception has occurred */
+-      sigval = computeSignal(regs->trap);
+-      ptr = remcomOutBuffer;
+-
+-      *ptr++ = 'T';
+-      *ptr++ = hexchars[sigval >> 4];
+-      *ptr++ = hexchars[sigval & 0xf];
+-      *ptr++ = hexchars[PC_REGNUM >> 4];
+-      *ptr++ = hexchars[PC_REGNUM & 0xf];
+-      *ptr++ = ':';
+-      ptr = mem2hex((char *)&regs->nip, ptr, 4);
+-      *ptr++ = ';';
+-      *ptr++ = hexchars[SP_REGNUM >> 4];
+-      *ptr++ = hexchars[SP_REGNUM & 0xf];
+-      *ptr++ = ':';
+-      ptr = mem2hex(((char *)regs) + SP_REGNUM*4, ptr, 4);
+-      *ptr++ = ';';
+-      *ptr++ = 0;
+-
+-      putpacket(remcomOutBuffer);
+-      if (kdebug)
+-              printk("remcomOutBuffer: %s\n", remcomOutBuffer);
+-
+-      /* XXX We may want to add some features dealing with poking the
+-       * XXX page tables, ... (look at sparc-stub.c for more info)
+-       * XXX also required hacking to the gdb sources directly...
+-       */
+-
+-      while (1) {
+-              remcomOutBuffer[0] = 0;
+-
+-              getpacket(remcomInBuffer);
+-              switch (remcomInBuffer[0]) {
+-              case '?': /* report most recent signal */
+-                      remcomOutBuffer[0] = 'S';
+-                      remcomOutBuffer[1] = hexchars[sigval >> 4];
+-                      remcomOutBuffer[2] = hexchars[sigval & 0xf];
+-                      remcomOutBuffer[3] = 0;
+-                      break;
+-#if 0
+-              case 'q': /* this screws up gdb for some reason...*/
+-              {
+-                      extern long _start, sdata, __bss_start;
+-
+-                      ptr = &remcomInBuffer[1];
+-                      if (strncmp(ptr, "Offsets", 7) != 0)
+-                              break;
+-
+-                      ptr = remcomOutBuffer;
+-                      sprintf(ptr, "Text=%8.8x;Data=%8.8x;Bss=%8.8x",
+-                              &_start, &sdata, &__bss_start);
+-                      break;
+-              }
+-#endif
+-              case 'd':
+-                      /* toggle debug flag */
+-                      kdebug ^= 1;
+-                      break;
+-
+-              case 'g':       /* return the value of the CPU registers.
+-                               * some of them are non-PowerPC names :(
+-                               * they are stored in gdb like:
+-                               * struct {
+-                               *     u32 gpr[32];
+-                               *     f64 fpr[32];
+-                               *     u32 pc, ps, cnd, lr; (ps=msr)
+-                               *     u32 cnt, xer, mq;
+-                               * }
+-                               */
+-              {
+-                      int i;
+-                      ptr = remcomOutBuffer;
+-                      /* General Purpose Regs */
+-                      ptr = mem2hex((char *)regs, ptr, 32 * 4);
+-                      /* Floating Point Regs - FIXME */
+-                      /*ptr = mem2hex((char *), ptr, 32 * 8);*/
+-                      for(i=0; i<(32*8*2); i++) { /* 2chars/byte */
+-                              ptr[i] = '0';
+-                      }
+-                      ptr += 32*8*2;
+-                      /* pc, msr, cr, lr, ctr, xer, (mq is unused) */
+-                      ptr = mem2hex((char *)&regs->nip, ptr, 4);
+-                      ptr = mem2hex((char *)&regs->msr, ptr, 4);
+-                      ptr = mem2hex((char *)&regs->ccr, ptr, 4);
+-                      ptr = mem2hex((char *)&regs->link, ptr, 4);
+-                      ptr = mem2hex((char *)&regs->ctr, ptr, 4);
+-                      ptr = mem2hex((char *)&regs->xer, ptr, 4);
+-              }
+-                      break;
+-
+-              case 'G': /* set the value of the CPU registers */
+-              {
+-                      ptr = &remcomInBuffer[1];
+-
+-                      /*
+-                       * If the stack pointer has moved, you should pray.
+-                       * (cause only god can help you).
+-                       */
+-
+-                      /* General Purpose Regs */
+-                      hex2mem(ptr, (char *)regs, 32 * 4);
+-
+-                      /* Floating Point Regs - FIXME?? */
+-                      /*ptr = hex2mem(ptr, ??, 32 * 8);*/
+-                      ptr += 32*8*2;
+-
+-                      /* pc, msr, cr, lr, ctr, xer, (mq is unused) */
+-                      ptr = hex2mem(ptr, (char *)&regs->nip, 4);
+-                      ptr = hex2mem(ptr, (char *)&regs->msr, 4);
+-                      ptr = hex2mem(ptr, (char *)&regs->ccr, 4);
+-                      ptr = hex2mem(ptr, (char *)&regs->link, 4);
+-                      ptr = hex2mem(ptr, (char *)&regs->ctr, 4);
+-                      ptr = hex2mem(ptr, (char *)&regs->xer, 4);
+-
+-                      strcpy(remcomOutBuffer,"OK");
+-              }
+-                      break;
+-              case 'H':
+-                      /* don't do anything, yet, just acknowledge */
+-                      hexToInt(&ptr, &addr);
+-                      strcpy(remcomOutBuffer,"OK");
+-                      break;
+-
+-              case 'm':       /* mAA..AA,LLLL  Read LLLL bytes at address AA..AA */
+-                              /* Try to read %x,%x.  */
+-
+-                      ptr = &remcomInBuffer[1];
+-
+-                      if (hexToInt(&ptr, &addr) && *ptr++ == ','
+-                                      && hexToInt(&ptr, &length)) {
+-                              if (mem2hex((char *)addr, remcomOutBuffer,
+-                                                      length))
+-                                      break;
+-                              strcpy(remcomOutBuffer, "E03");
+-                      } else
+-                              strcpy(remcomOutBuffer, "E01");
+-                      break;
+-
+-              case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA.AA return OK */
+-                      /* Try to read '%x,%x:'.  */
+-
+-                      ptr = &remcomInBuffer[1];
+-
+-                      if (hexToInt(&ptr, &addr) && *ptr++ == ','
+-                                      && hexToInt(&ptr, &length)
+-                                      && *ptr++ == ':') {
+-                              if (hex2mem(ptr, (char *)addr, length))
+-                                      strcpy(remcomOutBuffer, "OK");
+-                              else
+-                                      strcpy(remcomOutBuffer, "E03");
+-                              flush_icache_range(addr, addr+length);
+-                      } else
+-                              strcpy(remcomOutBuffer, "E02");
+-                      break;
+-
+-
+-              case 'k': /* kill the program, actually just continue */
+-              case 'c': /* cAA..AA  Continue; address AA..AA optional */
+-                      /* try to read optional parameter, pc unchanged if no parm */
+-
+-                      ptr = &remcomInBuffer[1];
+-                      if (hexToInt(&ptr, &addr))
+-                              regs->nip = addr;
+-
+-/* Need to flush the instruction cache here, as we may have deposited a
+- * breakpoint, and the icache probably has no way of knowing that a data ref to
+- * some location may have changed something that is in the instruction cache.
+- */
+-                      kgdb_flush_cache_all();
+-                      mtmsr(msr);
+-
+-                      kgdb_interruptible(1);
+-                      unlock_kernel();
+-                      kgdb_active = 0;
+-                      if (kdebug) {
+-                              printk("remcomInBuffer: %s\n", remcomInBuffer);
+-                              printk("remcomOutBuffer: %s\n", remcomOutBuffer);
+-                      }
+-                      return 1;
+-
+-              case 's':
+-                      kgdb_flush_cache_all();
+-#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
+-                      mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC);
+-                      regs->msr |= MSR_DE;
+-#else
+-                      regs->msr |= MSR_SE;
+-#endif
+-                      unlock_kernel();
+-                      kgdb_active = 0;
+-                      if (kdebug) {
+-                              printk("remcomInBuffer: %s\n", remcomInBuffer);
+-                              printk("remcomOutBuffer: %s\n", remcomOutBuffer);
+-                      }
+-                      return 1;
+-
+-              case 'r':               /* Reset (if user process..exit ???)*/
+-                      panic("kgdb reset.");
+-                      break;
+-              }                       /* switch */
+-              if (remcomOutBuffer[0] && kdebug) {
+-                      printk("remcomInBuffer: %s\n", remcomInBuffer);
+-                      printk("remcomOutBuffer: %s\n", remcomOutBuffer);
+-              }
+-              /* reply to the request */
+-              putpacket(remcomOutBuffer);
+-      } /* while(1) */
+-}
+-
+-/* This function will generate a breakpoint exception.  It is used at the
+-   beginning of a program to sync up with a debugger and can be used
+-   otherwise as a quick means to stop program execution and "break" into
+-   the debugger. */
+-
+-void
+-breakpoint(void)
+-{
+-      if (!initialized) {
+-              printk("breakpoint() called b4 kgdb init\n");
+-              return;
+-      }
+-
+-      asm("   .globl breakinst        \n\
+-           breakinst: .long 0x7d821008");
+-}
+-
+-#ifdef CONFIG_KGDB_CONSOLE
+-/* Output string in GDB O-packet format if GDB has connected. If nothing
+-   output, returns 0 (caller must then handle output). */
+-int
+-kgdb_output_string (const char* s, unsigned int count)
+-{
+-      char buffer[512];
+-
+-      if (!kgdb_started)
+-              return 0;
+-
+-      count = (count <= (sizeof(buffer) / 2 - 2))
+-              ? count : (sizeof(buffer) / 2 - 2);
+-
+-      buffer[0] = 'O';
+-      mem2hex (s, &buffer[1], count);
+-      putpacket(buffer);
+-
+-      return 1;
+-}
+-#endif
+-
+-static void sysrq_handle_gdb(int key, struct pt_regs *pt_regs,
+-                           struct tty_struct *tty)
+-{
+-      printk("Entering GDB stub\n");
+-      breakpoint();
+-}
+-static struct sysrq_key_op sysrq_gdb_op = {
+-        .handler        = sysrq_handle_gdb,
+-        .help_msg       = "Gdb",
+-        .action_msg     = "GDB",
+-};
+-
+-static int gdb_register_sysrq(void)
+-{
+-      printk("Registering GDB sysrq handler\n");
+-      register_sysrq_key('g', &sysrq_gdb_op);
+-      return 0;
+-}
+-module_init(gdb_register_sysrq);
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/kernel/setup.c linux-2.6.18-53.1.14.kgdb/arch/ppc/kernel/setup.c
+--- linux-2.6.18-53.1.14/arch/ppc/kernel/setup.c       2008-03-06 05:54:43.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/kernel/setup.c  2008-06-10 15:38:14.000000000 +0400
+@@ -48,10 +48,6 @@
+ #include <asm/ppc_sys.h>
+ #endif
+ 
+-#if defined CONFIG_KGDB
+-#include <asm/kgdb.h>
+-#endif
+-
+ extern void platform_init(unsigned long r3, unsigned long r4,
+               unsigned long r5, unsigned long r6, unsigned long r7);
+ extern void reloc_got2(unsigned long offset);
+@@ -506,18 +502,6 @@ void __init setup_arch(char **cmdline_p)
+ #endif /* CONFIG_XMON */
+       if ( ppc_md.progress ) ppc_md.progress("setup_arch: enter", 0x3eab);
+ 
+-#if defined(CONFIG_KGDB)
+-      if (ppc_md.kgdb_map_scc)
+-              ppc_md.kgdb_map_scc();
+-      set_debug_traps();
+-      if (strstr(cmd_line, "gdb")) {
+-              if (ppc_md.progress)
+-                      ppc_md.progress("setup_arch: kgdb breakpoint", 0x4000);
+-              printk("kgdb breakpoint activated\n");
+-              breakpoint();
+-      }
+-#endif
+-
+       /*
+        * Set cache line size based on type of cpu as a default.
+        * Systems with OF can look in the properties on the cpu node(s)
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/mm/fault.c linux-2.6.18-53.1.14.kgdb/arch/ppc/mm/fault.c
+--- linux-2.6.18-53.1.14/arch/ppc/mm/fault.c   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/mm/fault.c      2008-06-10 15:38:14.000000000 +0400
+@@ -25,6 +25,7 @@
+ #include <linux/interrupt.h>
+ #include <linux/highmem.h>
+ #include <linux/module.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/page.h>
+ #include <asm/pgtable.h>
+@@ -329,6 +330,14 @@ bad_page_fault(struct pt_regs *regs, uns
+               return;
+       }
+ 
++#ifdef CONFIG_KGDB
++      if (atomic_read(&debugger_active) && kgdb_may_fault) {
++              /* Restore our previous state. */
++              kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++              /* Not reached. */
++      }
++#endif
++
+       /* kernel has accessed a bad area */
+ #if defined(CONFIG_XMON) || defined(CONFIG_KGDB)
+       if (debugger_kernel_faults)
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/4xx/bubinga.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/4xx/bubinga.c
+--- linux-2.6.18-53.1.14/arch/ppc/platforms/4xx/bubinga.c      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/4xx/bubinga.c 2008-06-10 15:38:14.000000000 +0400
+@@ -4,7 +4,7 @@
+  * Author: SAW (IBM), derived from walnut.c.
+  *         Maintained by MontaVista Software <source@mvista.com>
+  *
+- * 2003 (c) MontaVista Softare Inc.  This file is licensed under the
++ * 2003-2004 (c) MontaVista Softare Inc.  This file is licensed under the
+  * terms of the GNU General Public License version 2. This program is
+  * licensed "as is" without any warranty of any kind, whether express
+  * or implied.
+@@ -100,17 +100,26 @@ bubinga_early_serial_map(void)
+       port.flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST;
+       port.line = 0;
+ 
+-      if (early_serial_setup(&port) != 0) {
++#ifdef CONFIG_SERIAL_8250
++      if (early_serial_setup(&port) != 0)
+               printk("Early serial init of port 0 failed\n");
+-      }
++#endif
++
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &port);
++#endif
+ 
+       port.membase = (void*)ACTING_UART1_IO_BASE;
+       port.irq = ACTING_UART1_INT;
+       port.line = 1;
+ 
+-      if (early_serial_setup(&port) != 0) {
++#ifdef CONFIG_SERIAL_8250
++      if (early_serial_setup(&port) != 0)
+               printk("Early serial init of port 1 failed\n");
+-      }
++#endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(1, &port);
++#endif
+ }
+ 
+ void __init
+@@ -255,8 +264,4 @@ platform_init(unsigned long r3, unsigned
+       ppc_md.nvram_read_val = todc_direct_read_val;
+       ppc_md.nvram_write_val = todc_direct_write_val;
+ #endif
+-#ifdef CONFIG_KGDB
+-      ppc_md.early_serial_map = bubinga_early_serial_map;
+-#endif
+ }
+-
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/4xx/ebony.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/4xx/ebony.c
+--- linux-2.6.18-53.1.14/arch/ppc/platforms/4xx/ebony.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/4xx/ebony.c   2008-06-10 15:38:14.000000000 +0400
+@@ -32,6 +32,7 @@
+ #include <linux/tty.h>
+ #include <linux/serial.h>
+ #include <linux/serial_core.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/system.h>
+ #include <asm/pgtable.h>
+@@ -226,14 +227,20 @@ ebony_early_serial_map(void)
+       port.flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST;
+       port.line = 0;
+ 
+-      if (early_serial_setup(&port) != 0) {
++#ifdef CONFIG_SERIAL_8250
++      if (early_serial_setup(&port) != 0)
+               printk("Early serial init of port 0 failed\n");
+-      }
++#endif
+ 
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
++#ifdef CONFIG_SERIAL_TEXT_DEBUG
+       /* Configure debug serial access */
+       gen550_init(0, &port);
++#endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &port);
++#endif
+ 
++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250)
+       /* Purge TLB entry added in head_44x.S for early serial access */
+       _tlbie(UART0_IO_BASE);
+ #endif
+@@ -243,14 +250,18 @@ ebony_early_serial_map(void)
+       port.uartclk = clocks.uart1;
+       port.line = 1;
+ 
+-      if (early_serial_setup(&port) != 0) {
++#ifdef CONFIG_SERIAL_8250
++      if (early_serial_setup(&port) != 0)
+               printk("Early serial init of port 1 failed\n");
+-      }
++#endif
+ 
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
++#ifdef CONFIG_SERIAL_TEXT_DEBUG
+       /* Configure debug serial access */
+       gen550_init(1, &port);
+ #endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(1, &port);
++#endif
+ }
+ 
+ static void __init
+@@ -327,8 +338,4 @@ void __init platform_init(unsigned long 
+ 
+       ppc_md.nvram_read_val = todc_direct_read_val;
+       ppc_md.nvram_write_val = todc_direct_write_val;
+-#ifdef CONFIG_KGDB
+-      ppc_md.early_serial_map = ebony_early_serial_map;
+-#endif
+ }
+-
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/4xx/ocotea.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/4xx/ocotea.c
+--- linux-2.6.18-53.1.14/arch/ppc/platforms/4xx/ocotea.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/4xx/ocotea.c  2008-06-10 15:38:14.000000000 +0400
+@@ -30,6 +30,7 @@
+ #include <linux/tty.h>
+ #include <linux/serial.h>
+ #include <linux/serial_core.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/system.h>
+ #include <asm/pgtable.h>
+@@ -249,14 +250,20 @@ ocotea_early_serial_map(void)
+       port.flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST;
+       port.line = 0;
+ 
+-      if (early_serial_setup(&port) != 0) {
++#ifdef CONFIG_SERIAL_8250
++      if (early_serial_setup(&port) != 0)
+               printk("Early serial init of port 0 failed\n");
+-      }
++#endif
+ 
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
++#ifdef CONFIG_SERIAL_TEXT_DEBUG
+       /* Configure debug serial access */
+       gen550_init(0, &port);
++#endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &port);
++#endif
+ 
++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250)
+       /* Purge TLB entry added in head_44x.S for early serial access */
+       _tlbie(UART0_IO_BASE);
+ #endif
+@@ -266,14 +273,18 @@ ocotea_early_serial_map(void)
+       port.uartclk = clocks.uart1;
+       port.line = 1;
+ 
+-      if (early_serial_setup(&port) != 0) {
++#ifdef CONFIG_SERIAL_8250
++      if (early_serial_setup(&port) != 0)
+               printk("Early serial init of port 1 failed\n");
+-      }
++#endif
+ 
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
++#ifdef CONFIG_SERIAL_TEXT_DEBUG
+       /* Configure debug serial access */
+       gen550_init(1, &port);
+ #endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(1, &port);
++#endif
+ }
+ 
+ static void __init
+@@ -343,8 +354,5 @@ void __init platform_init(unsigned long 
+ 
+       ppc_md.nvram_read_val = todc_direct_read_val;
+       ppc_md.nvram_write_val = todc_direct_write_val;
+-#ifdef CONFIG_KGDB
+-      ppc_md.early_serial_map = ocotea_early_serial_map;
+-#endif
+       ppc_md.init = ocotea_init;
+ }
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/4xx/xilinx_ml300.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/4xx/xilinx_ml300.c
+--- linux-2.6.18-53.1.14/arch/ppc/platforms/4xx/xilinx_ml300.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/4xx/xilinx_ml300.c    2008-06-10 15:38:14.000000000 +0400
+@@ -41,9 +41,6 @@
+  *      ppc4xx_map_io                         arch/ppc/syslib/ppc4xx_setup.c
+  *  start_kernel                              init/main.c
+  *    setup_arch                              arch/ppc/kernel/setup.c
+- * #if defined(CONFIG_KGDB)
+- *      *ppc_md.kgdb_map_scc() == gen550_kgdb_map_scc
+- * #endif
+  *      *ppc_md.setup_arch == ml300_setup_arch        this file
+  *        ppc4xx_setup_arch                   arch/ppc/syslib/ppc4xx_setup.c
+  *          ppc4xx_find_bridges                       arch/ppc/syslib/ppc405_pci.c
+@@ -117,7 +114,6 @@ ml300_early_serial_init(int num, struct 
+ void __init
+ ml300_early_serial_map(void)
+ {
+-#ifdef CONFIG_SERIAL_8250
+       struct plat_serial8250_port *pdata;
+       int i = 0;
+ 
+@@ -129,7 +125,14 @@ ml300_early_serial_map(void)
+               pdata++;
+               i++;
+       }
+-#endif /* CONFIG_SERIAL_8250 */
++#ifdef CONFIG_SERIAL_8250
++                if (early_serial_setup(&port) != 0)
++                        printk("Early serial init of port %d failed\n", i);
++#endif
++
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(i, &port);
++#endif
+ }
+ 
+ void __init
+@@ -165,9 +168,4 @@ platform_init(unsigned long r3, unsigned
+ #if defined(XPAR_POWER_0_POWERDOWN_BASEADDR)
+       ppc_md.power_off = xilinx_power_off;
+ #endif
+-
+-#ifdef CONFIG_KGDB
+-      ppc_md.early_serial_map = ml300_early_serial_map;
+-#endif
+ }
+-
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/85xx/sbc8560.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/85xx/sbc8560.c
+--- linux-2.6.18-53.1.14/arch/ppc/platforms/85xx/sbc8560.c     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/85xx/sbc8560.c        2008-06-10 15:38:14.000000000 +0400
+@@ -50,7 +50,6 @@
+ #include <syslib/ppc85xx_common.h>
+ #include <syslib/ppc85xx_setup.h>
+ 
+-#ifdef CONFIG_SERIAL_8250
+ static void __init
+ sbc8560_early_serial_map(void)
+ {
+@@ -66,12 +65,16 @@ sbc8560_early_serial_map(void)
+         uart_req.membase = ioremap(uart_req.mapbase, MPC85xx_UART0_SIZE);
+       uart_req.type = PORT_16650;
+ 
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
+-        gen550_init(0, &uart_req);
+-#endif
+- 
++#ifdef CONFIG_SERIAL_8250
+         if (early_serial_setup(&uart_req) != 0)
+                 printk("Early serial init of port 0 failed\n");
++#endif
++#ifdef CONFIG_SERIAL_TEXT_DEBUG
++      gen550_init(0, &uart_req);
++#endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &uart_req);
++#endif
+  
+         /* Assume early_serial_setup() doesn't modify uart_req */
+       uart_req.line = 1;
+@@ -79,14 +82,17 @@ sbc8560_early_serial_map(void)
+         uart_req.membase = ioremap(uart_req.mapbase, MPC85xx_UART1_SIZE);
+       uart_req.irq = MPC85xx_IRQ_EXT10;
+  
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
+-        gen550_init(1, &uart_req);
+-#endif
+- 
++#ifdef CONFIG_SERIAL_8250
+         if (early_serial_setup(&uart_req) != 0)
+-                printk("Early serial init of port 1 failed\n");
+-}
++              printk("Early serial init of port 1 failed\n");
+ #endif
++#ifdef CONFIG_SERIAL_TEXT_DEBUG
++      gen550_init(1, &uart_req);
++#endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(1, &uart_req);
++#endif
++}
+ 
+ /* ************************************************************************
+  *
+@@ -115,9 +121,7 @@ sbc8560_setup_arch(void)
+       /* setup PCI host bridges */
+       mpc85xx_setup_hose();
+ #endif
+-#ifdef CONFIG_SERIAL_8250
+       sbc8560_early_serial_map();
+-#endif
+ #ifdef CONFIG_SERIAL_TEXT_DEBUG
+       /* Invalidate the entry we stole earlier the serial ports
+        * should be properly mapped */ 
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/chestnut.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/chestnut.c
+--- linux-2.6.18-53.1.14/arch/ppc/platforms/chestnut.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/chestnut.c    2008-06-10 15:38:14.000000000 +0400
+@@ -492,7 +492,7 @@ chestnut_power_off(void)
+ static void __init
+ chestnut_map_io(void)
+ {
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250)
+       io_block_mapping(CHESTNUT_UART_BASE, CHESTNUT_UART_BASE, 0x100000,
+               _PAGE_IO);
+ #endif
+@@ -566,9 +566,6 @@ platform_init(unsigned long r3, unsigned
+ #if defined(CONFIG_SERIAL_TEXT_DEBUG)
+       ppc_md.progress = gen550_progress;
+ #endif
+-#if defined(CONFIG_KGDB)
+-      ppc_md.kgdb_map_scc = gen550_kgdb_map_scc;
+-#endif
+ 
+       if (ppc_md.progress)
+                 ppc_md.progress("chestnut_init(): exit", 0);
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/pplus.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/pplus.c
+--- linux-2.6.18-53.1.14/arch/ppc/platforms/pplus.c    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/pplus.c       2008-06-10 15:38:14.000000000 +0400
+@@ -893,9 +893,6 @@ platform_init(unsigned long r3, unsigned
+ #ifdef CONFIG_SERIAL_TEXT_DEBUG
+       ppc_md.progress = gen550_progress;
+ #endif                                /* CONFIG_SERIAL_TEXT_DEBUG */
+-#ifdef CONFIG_KGDB
+-      ppc_md.kgdb_map_scc = gen550_kgdb_map_scc;
+-#endif
+ #ifdef CONFIG_SMP
+       smp_ops = &pplus_smp_ops;
+ #endif                                /* CONFIG_SMP */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/sandpoint.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/sandpoint.c
+--- linux-2.6.18-53.1.14/arch/ppc/platforms/sandpoint.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/sandpoint.c   2008-06-10 15:38:14.000000000 +0400
+@@ -730,9 +730,6 @@ platform_init(unsigned long r3, unsigned
+       ppc_md.nvram_read_val = todc_mc146818_read_val;
+       ppc_md.nvram_write_val = todc_mc146818_write_val;
+ 
+-#ifdef CONFIG_KGDB
+-      ppc_md.kgdb_map_scc = gen550_kgdb_map_scc;
+-#endif
+ #ifdef CONFIG_SERIAL_TEXT_DEBUG
+       ppc_md.progress = gen550_progress;
+ #endif
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/spruce.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/spruce.c
+--- linux-2.6.18-53.1.14/arch/ppc/platforms/spruce.c   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/spruce.c      2008-06-10 15:38:14.000000000 +0400
+@@ -178,26 +178,32 @@ spruce_early_serial_map(void)
+       serial_req.membase = (u_char *)UART0_IO_BASE;
+       serial_req.regshift = 0;
+ 
+-#if defined(CONFIG_KGDB) || defined(CONFIG_SERIAL_TEXT_DEBUG)
+-      gen550_init(0, &serial_req);
+-#endif
+ #ifdef CONFIG_SERIAL_8250
+       if (early_serial_setup(&serial_req) != 0)
+               printk("Early serial init of port 0 failed\n");
+ #endif
++#ifdef CONFIG_SERIAL_TEXT_DEBUG
++      gen550_init(0, &serial_req);
++#endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &serial_req);
++#endif
+ 
+       /* Assume early_serial_setup() doesn't modify serial_req */
+       serial_req.line = 1;
+       serial_req.irq = UART1_INT;
+       serial_req.membase = (u_char *)UART1_IO_BASE;
+ 
+-#if defined(CONFIG_KGDB) || defined(CONFIG_SERIAL_TEXT_DEBUG)
+-      gen550_init(1, &serial_req);
+-#endif
+ #ifdef CONFIG_SERIAL_8250
+       if (early_serial_setup(&serial_req) != 0)
+               printk("Early serial init of port 1 failed\n");
+ #endif
++#ifdef CONFIG_SERIAL_TEXT_DEBUG
++      gen550_init(1, &serial_req);
++#endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(1, &serial_req);
++#endif
+ }
+ 
+ TODC_ALLOC();
+@@ -316,7 +322,4 @@ platform_init(unsigned long r3, unsigned
+ #ifdef CONFIG_SERIAL_TEXT_DEBUG
+       ppc_md.progress = gen550_progress;
+ #endif /* CONFIG_SERIAL_TEXT_DEBUG */
+-#ifdef CONFIG_KGDB
+-      ppc_md.kgdb_map_scc = gen550_kgdb_map_scc;
+-#endif
+ }
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/syslib/Makefile linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/Makefile
+--- linux-2.6.18-53.1.14/arch/ppc/syslib/Makefile      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/Makefile 2008-06-10 15:38:14.000000000 +0400
+@@ -76,7 +76,6 @@ obj-$(CONFIG_PCI_8260)               += m82xx_pci.o p
+ obj-$(CONFIG_8260_PCI9)               += m8260_pci_erratum9.o
+ obj-$(CONFIG_CPM2)            += cpm2_common.o cpm2_pic.o
+ ifeq ($(CONFIG_PPC_GEN550),y)
+-obj-$(CONFIG_KGDB)            += gen550_kgdb.o gen550_dbg.o
+ obj-$(CONFIG_SERIAL_TEXT_DEBUG)       += gen550_dbg.o
+ endif
+ ifeq ($(CONFIG_SERIAL_MPSC_CONSOLE),y)
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/syslib/gen550.h linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/gen550.h
+--- linux-2.6.18-53.1.14/arch/ppc/syslib/gen550.h      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/gen550.h 2008-06-10 15:38:14.000000000 +0400
+@@ -11,4 +11,3 @@
+ 
+ extern void gen550_progress(char *, unsigned short);
+ extern void gen550_init(int, struct uart_port *);
+-extern void gen550_kgdb_map_scc(void);
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/syslib/ibm44x_common.c linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/ibm44x_common.c
+--- linux-2.6.18-53.1.14/arch/ppc/syslib/ibm44x_common.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/ibm44x_common.c  2008-06-10 15:38:14.000000000 +0400
+@@ -192,9 +192,6 @@ void __init ibm44x_platform_init(unsigne
+ #ifdef CONFIG_SERIAL_TEXT_DEBUG
+       ppc_md.progress = gen550_progress;
+ #endif /* CONFIG_SERIAL_TEXT_DEBUG */
+-#ifdef CONFIG_KGDB
+-      ppc_md.kgdb_map_scc = gen550_kgdb_map_scc;
+-#endif
+ 
+       /*
+        * The Abatron BDI JTAG debugger does not tolerate others
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/syslib/mv64x60.c linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/mv64x60.c
+--- linux-2.6.18-53.1.14/arch/ppc/syslib/mv64x60.c     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/mv64x60.c        2008-06-10 15:38:14.000000000 +0400
+@@ -241,6 +241,12 @@ static struct resource mv64x60_mpsc0_res
+               .end    = MV64x60_IRQ_SDMA_0,
+               .flags  = IORESOURCE_IRQ,
+       },
++      [4] = {
++              .name   = "mpsc 0 irq",
++              .start  = MV64x60_IRQ_MPSC_0,
++              .end    = MV64x60_IRQ_MPSC_0,
++              .flags  = IORESOURCE_IRQ,
++      },
+ };
+ 
+ static struct platform_device mpsc0_device = {
+@@ -298,6 +304,12 @@ static struct resource mv64x60_mpsc1_res
+               .end    = MV64360_IRQ_SDMA_1,
+               .flags  = IORESOURCE_IRQ,
+       },
++      [4] = {
++              .name   = "mpsc 1 irq",
++              .start  = MV64360_IRQ_MPSC_1,
++              .end    = MV64360_IRQ_MPSC_1,
++              .flags  = IORESOURCE_IRQ,
++      },
+ };
+ 
+ static struct platform_device mpsc1_device = {
+@@ -1426,12 +1438,46 @@ mv64x60_pd_fixup(struct mv64x60_handle *
+ static int __init
+ mv64x60_add_pds(void)
+ {
+-      return platform_add_devices(mv64x60_pd_devs,
+-              ARRAY_SIZE(mv64x60_pd_devs));
++      int i, ret = 0;
++
++      for (i = 0; i < ARRAY_SIZE(mv64x60_pd_devs); i++) {
++              if (mv64x60_pd_devs[i]) {
++                      ret = platform_device_register(mv64x60_pd_devs[i]);
++              }
++              if (ret) {
++                      while (--i >= 0)
++                              platform_device_unregister(mv64x60_pd_devs[i]);
++                      break;
++              }
++      }
++      return ret;
+ }
+ arch_initcall(mv64x60_add_pds);
+ 
+ /*
++ * mv64x60_early_get_pdev_data()
++ *
++ * Find a device in mv64x60_pd_devs[] by name and id; optionally unlist it.
++ */
++struct platform_device * __init
++mv64x60_early_get_pdev_data(const char *name, int id, int remove)
++{
++      int i;
++      struct platform_device *pdev;
++
++      for (i = 0; i <ARRAY_SIZE(mv64x60_pd_devs); i++) {
++              if ((pdev = mv64x60_pd_devs[i]) &&
++                      pdev->id == id &&
++                      !strcmp(pdev->name, name)) {
++                      if (remove)
++                              mv64x60_pd_devs[i] = NULL;
++                      return pdev;
++              }
++      }
++      return NULL;
++}
++
++/*
+  *****************************************************************************
+  *
+  *    GT64260-Specific Routines
+@@ -1764,6 +1810,11 @@ gt64260a_chip_specific_init(struct mv64x
+               r->start = MV64x60_IRQ_SDMA_0;
+               r->end = MV64x60_IRQ_SDMA_0;
+       }
++      if ((r = platform_get_resource(&mpsc1_device, IORESOURCE_IRQ, 1))
++                      != NULL) {
++              r->start = GT64260_IRQ_MPSC_1;
++              r->end = GT64260_IRQ_MPSC_1;
++      }
+ #endif
+ }
+ 
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/syslib/mv64x60_dbg.c linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/mv64x60_dbg.c
+--- linux-2.6.18-53.1.14/arch/ppc/syslib/mv64x60_dbg.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/mv64x60_dbg.c    2008-06-10 15:38:14.000000000 +0400
+@@ -34,7 +34,7 @@ static struct mv64x60_handle mv64x60_dbg
+ void
+ mv64x60_progress_init(u32 base)
+ {
+-      mv64x60_dbg_bh.v_base = base;
++      mv64x60_dbg_bh.v_base = (void*)base;
+       return;
+ }
+ 
+@@ -69,53 +69,3 @@ mv64x60_mpsc_progress(char *s, unsigned 
+       return;
+ }
+ #endif        /* CONFIG_SERIAL_TEXT_DEBUG */
+-
+-
+-#if defined(CONFIG_KGDB)
+-
+-#if defined(CONFIG_KGDB_TTYS0)
+-#define KGDB_PORT 0
+-#elif defined(CONFIG_KGDB_TTYS1)
+-#define KGDB_PORT 1
+-#else
+-#error "Invalid kgdb_tty port"
+-#endif
+-
+-void
+-putDebugChar(unsigned char c)
+-{
+-      mv64x60_polled_putc(KGDB_PORT, (char)c);
+-}
+-
+-int
+-getDebugChar(void)
+-{
+-      unsigned char   c;
+-
+-      while (!mv64x60_polled_getc(KGDB_PORT, &c));
+-      return (int)c;
+-}
+-
+-void
+-putDebugString(char* str)
+-{
+-      while (*str != '\0') {
+-              putDebugChar(*str);
+-              str++;
+-      }
+-      putDebugChar('\r');
+-      return;
+-}
+-
+-void
+-kgdb_interruptible(int enable)
+-{
+-}
+-
+-void
+-kgdb_map_scc(void)
+-{
+-      if (ppc_md.early_serial_map)
+-              ppc_md.early_serial_map();
+-}
+-#endif        /* CONFIG_KGDB */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/syslib/ppc85xx_setup.c linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/ppc85xx_setup.c
+--- linux-2.6.18-53.1.14/arch/ppc/syslib/ppc85xx_setup.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/ppc85xx_setup.c  2008-06-10 15:38:14.000000000 +0400
+@@ -69,7 +69,6 @@ mpc85xx_calibrate_decr(void)
+       mtspr(SPRN_TCR, TCR_DIE);
+ }
+ 
+-#ifdef CONFIG_SERIAL_8250
+ void __init
+ mpc85xx_early_serial_map(void)
+ {
+@@ -85,7 +84,7 @@ mpc85xx_early_serial_map(void)
+       pdata[0].mapbase += binfo->bi_immr_base;
+       pdata[0].membase = ioremap(pdata[0].mapbase, MPC85xx_UART0_SIZE);
+ 
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250)
+       memset(&serial_req, 0, sizeof (serial_req));
+       serial_req.iotype = UPIO_MEM;
+       serial_req.mapbase = pdata[0].mapbase;
+@@ -93,18 +92,23 @@ mpc85xx_early_serial_map(void)
+       serial_req.regshift = 0;
+ 
+       gen550_init(0, &serial_req);
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &serial_req);
++#endif
+ #endif
+ 
+       pdata[1].uartclk = binfo->bi_busfreq;
+       pdata[1].mapbase += binfo->bi_immr_base;
+       pdata[1].membase = ioremap(pdata[1].mapbase, MPC85xx_UART0_SIZE);
+ 
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250)
+       /* Assume gen550_init() doesn't modify serial_req */
+       serial_req.mapbase = pdata[1].mapbase;
+       serial_req.membase = pdata[1].membase;
+ 
+       gen550_init(1, &serial_req);
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(1, &serial_req);
++#endif
+ #endif
+ }
+-#endif
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/Kconfig.debug linux-2.6.18-53.1.14.kgdb/arch/sh/Kconfig.debug
+--- linux-2.6.18-53.1.14/arch/sh/Kconfig.debug 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/Kconfig.debug    2008-06-10 15:38:50.000000000 +0400
+@@ -29,96 +29,4 @@ config EARLY_PRINTK
+         This option is only useful porting the kernel to a new machine,
+         when the kernel may crash or hang before the serial console is
+         initialised. If unsure, say N.
+-
+-config KGDB
+-      bool "Include KGDB kernel debugger"
+-      help
+-        Include in-kernel hooks for kgdb, the Linux kernel source level
+-        debugger.  See <http://kgdb.sourceforge.net/> for more information.
+-        Unless you are intending to debug the kernel, say N here.
+-
+-menu "KGDB configuration options"
+-      depends on KGDB
+-
+-config MORE_COMPILE_OPTIONS
+-      bool "Add any additional compile options"
+-      help
+-        If you want to add additional CFLAGS to the kernel build, enable this
+-        option and then enter what you would like to add in the next question.
+-        Note however that -g is already appended with the selection of KGDB.
+-
+-config COMPILE_OPTIONS
+-      string "Additional compile arguments"
+-      depends on MORE_COMPILE_OPTIONS
+-
+-config KGDB_NMI
+-      bool "Enter KGDB on NMI"
+-      default n
+-
+-config KGDB_THREAD
+-      bool "Include KGDB thread support"
+-      default y
+-
+-config SH_KGDB_CONSOLE
+-      bool "Console messages through GDB"
+-      default n
+-
+-config KGDB_SYSRQ
+-      bool "Allow SysRq 'G' to enter KGDB"
+-      default y
+-
+-config KGDB_KERNEL_ASSERTS
+-      bool "Include KGDB kernel assertions"
+-      default n
+-
+-comment "Serial port setup"
+-
+-config KGDB_DEFPORT
+-      int "Port number (ttySCn)"
+-      default "1"
+-
+-config KGDB_DEFBAUD
+-      int "Baud rate"
+-      default "115200"
+-
+-choice
+-      prompt "Parity"
+-      depends on KGDB
+-      default KGDB_DEFPARITY_N
+-
+-config KGDB_DEFPARITY_N
+-      bool "None"
+-
+-config KGDB_DEFPARITY_E
+-      bool "Even"
+-
+-config KGDB_DEFPARITY_O
+-      bool "Odd"
+-
+-endchoice
+-
+-choice
+-      prompt "Data bits"
+-      depends on KGDB
+-      default KGDB_DEFBITS_8
+-
+-config KGDB_DEFBITS_8
+-      bool "8"
+-
+-config KGDB_DEFBITS_7
+-      bool "7"
+-
+-endchoice
+-
+-endmenu
+-
+-config FRAME_POINTER
+-      bool "Compile the kernel with frame pointers"
+-      default y if KGDB
+-      help
+-        If you say Y here the resulting kernel image will be slightly larger
+-        and slower, but it will give very useful debugging information.
+-        If you don't debug the kernel, you can say N, but we may not be able
+-        to solve problems without frame pointers.
+-
+ endmenu
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/Makefile linux-2.6.18-53.1.14.kgdb/arch/sh/Makefile
+--- linux-2.6.18-53.1.14/arch/sh/Makefile      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/Makefile 2008-06-10 15:38:50.000000000 +0400
+@@ -43,7 +43,6 @@ cflags-$(CONFIG_CPU_SH4)             += -m4 \
+ cflags-$(CONFIG_CPU_SH4A)             += $(call cc-option,-m4a-nofpu,)
+ 
+ cflags-$(CONFIG_SH_DSP)                       += -Wa,-dsp
+-cflags-$(CONFIG_SH_KGDB)              += -g
+ 
+ cflags-$(CONFIG_MORE_COMPILE_OPTIONS) += \
+       $(shell echo $(CONFIG_COMPILE_OPTIONS) | sed -e 's/"//g')
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/boards/se/7751/setup.c linux-2.6.18-53.1.14.kgdb/arch/sh/boards/se/7751/setup.c
+--- linux-2.6.18-53.1.14/arch/sh/boards/se/7751/setup.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/boards/se/7751/setup.c   2008-06-10 15:38:50.000000000 +0400
+@@ -17,10 +17,6 @@
+ #include <asm/io.h>
+ #include <asm/se7751/se7751.h>
+ 
+-#ifdef CONFIG_SH_KGDB
+-#include <asm/kgdb.h>
+-#endif
+-
+ /*
+  * Configure the Super I/O chip
+  */
+@@ -82,12 +78,6 @@ const char *get_system_type(void)
+       return "7751 SolutionEngine";
+ }
+ 
+-#ifdef CONFIG_SH_KGDB
+-static int kgdb_uart_setup(void);
+-static struct kgdb_sermap kgdb_uart_sermap = 
+-{ "ttyS", 0, kgdb_uart_setup, NULL };
+-#endif
+- 
+ /*
+  * Initialize the board
+  */
+@@ -95,133 +85,4 @@ void __init platform_setup(void)
+ {
+       /* Call init_smsc() replacement to set up SuperIO. */
+       /* XXX: RTC setting comes here */
+-#ifdef CONFIG_SH_KGDB
+-      kgdb_register_sermap(&kgdb_uart_sermap);
+-#endif
+-}
+-
+-/*********************************************************************
+- * Currently a hack (e.g. does not interact well w/serial.c, lots of *
+- * hardcoded stuff) but may be useful if SCI/F needs debugging.      *
+- * Mostly copied from x86 code (see files asm-i386/kgdb_local.h and  *
+- * arch/i386/lib/kgdb_serial.c).                                     *
+- *********************************************************************/
+-
+-#ifdef CONFIG_SH_KGDB
+-#include <linux/types.h>
+-#include <linux/serial.h>
+-#include <linux/serialP.h>
+-#include <linux/serial_reg.h>
+-
+-#define COM1_PORT 0x3f8  /* Base I/O address */
+-#define COM1_IRQ  4      /* IRQ not used yet */
+-#define COM2_PORT 0x2f8  /* Base I/O address */
+-#define COM2_IRQ  3      /* IRQ not used yet */
+-
+-#define SB_CLOCK 1843200 /* Serial baud clock */
+-#define SB_BASE (SB_CLOCK/16)
+-#define SB_MCR UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS
+-
+-struct uart_port {
+-      int base;
+-};
+-#define UART_NPORTS 2
+-struct uart_port uart_ports[] = {
+-      { COM1_PORT },
+-      { COM2_PORT },
+-};
+-struct uart_port *kgdb_uart_port;
+-
+-#define UART_IN(reg)  inb_p(kgdb_uart_port->base + reg)
+-#define UART_OUT(reg,v)       outb_p((v), kgdb_uart_port->base + reg)
+-
+-/* Basic read/write functions for the UART */
+-#define UART_LSR_RXCERR    (UART_LSR_BI | UART_LSR_FE | UART_LSR_PE)
+-static int kgdb_uart_getchar(void)
+-{
+-      int lsr;
+-      int c = -1;
+-
+-      while (c == -1) {
+-              lsr = UART_IN(UART_LSR);
+-              if (lsr & UART_LSR_DR) 
+-                      c = UART_IN(UART_RX);
+-              if ((lsr & UART_LSR_RXCERR))
+-                      c = -1;
+-      }
+-      return c;
+-}
+-
+-static void kgdb_uart_putchar(int c)
+-{
+-      while ((UART_IN(UART_LSR) & UART_LSR_THRE) == 0)
+-              ;
+-      UART_OUT(UART_TX, c);
+-}
+-
+-/*
+- * Initialize UART to configured/requested values.
+- * (But we don't interrupts yet, or interact w/serial.c)
+- */
+-static int kgdb_uart_setup(void)
+-{
+-      int port;
+-      int lcr = 0;
+-      int bdiv = 0;
+-
+-      if (kgdb_portnum >= UART_NPORTS) {
+-              KGDB_PRINTK("uart port %d invalid.\n", kgdb_portnum);
+-              return -1;
+-      }
+-
+-      kgdb_uart_port = &uart_ports[kgdb_portnum];
+-
+-      /* Init sequence from gdb_hook_interrupt */
+-      UART_IN(UART_RX);
+-      UART_OUT(UART_IER, 0);
+-
+-      UART_IN(UART_RX);       /* Serial driver comments say */
+-      UART_IN(UART_IIR);      /* this clears interrupt regs */
+-      UART_IN(UART_MSR);
+-
+-      /* Figure basic LCR values */
+-      switch (kgdb_bits) {
+-      case '7':
+-              lcr |= UART_LCR_WLEN7;
+-              break;
+-      default: case '8': 
+-              lcr |= UART_LCR_WLEN8;
+-              break;
+-      }
+-      switch (kgdb_parity) {
+-      case 'O':
+-              lcr |= UART_LCR_PARITY;
+-              break;
+-      case 'E':
+-              lcr |= (UART_LCR_PARITY | UART_LCR_EPAR);
+-              break;
+-      default: break;
+-      }
+-
+-      /* Figure the baud rate divisor */
+-      bdiv = (SB_BASE/kgdb_baud);
+-      
+-      /* Set the baud rate and LCR values */
+-      UART_OUT(UART_LCR, (lcr | UART_LCR_DLAB));
+-      UART_OUT(UART_DLL, (bdiv & 0xff));
+-      UART_OUT(UART_DLM, ((bdiv >> 8) & 0xff));
+-      UART_OUT(UART_LCR, lcr);
+-
+-      /* Set the MCR */
+-      UART_OUT(UART_MCR, SB_MCR);
+-
+-      /* Turn off FIFOs for now */
+-      UART_OUT(UART_FCR, 0);
+-
+-      /* Setup complete: initialize function pointers */
+-      kgdb_getchar = kgdb_uart_getchar;
+-      kgdb_putchar = kgdb_uart_putchar;
+-
+-      return 0;
+ }
+-#endif /* CONFIG_SH_KGDB */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/Makefile linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/Makefile
+--- linux-2.6.18-53.1.14/arch/sh/kernel/Makefile       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/Makefile  2008-06-10 15:38:50.000000000 +0400
+@@ -13,7 +13,7 @@ obj-y                                += cpu/ timers/
+ obj-$(CONFIG_SMP)             += smp.o
+ obj-$(CONFIG_CF_ENABLER)      += cf-enabler.o
+ obj-$(CONFIG_SH_STANDARD_BIOS)        += sh_bios.o
+-obj-$(CONFIG_SH_KGDB)         += kgdb_stub.o kgdb_jmp.o
++obj-$(CONFIG_KGDB)            += kgdb.o kgdb-jmp.o
+ obj-$(CONFIG_SH_CPU_FREQ)     += cpufreq.o
+ obj-$(CONFIG_MODULES)         += module.o
+ obj-$(CONFIG_EARLY_PRINTK)    += early_printk.o
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/cpu/sh3/ex.S linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/cpu/sh3/ex.S
+--- linux-2.6.18-53.1.14/arch/sh/kernel/cpu/sh3/ex.S   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/cpu/sh3/ex.S      2008-06-10 15:38:50.000000000 +0400
+@@ -42,7 +42,7 @@ ENTRY(exception_handling_table)
+       .long   exception_error ! reserved_instruction (filled by trap_init) /* 180 */
+       .long   exception_error ! illegal_slot_instruction (filled by trap_init) /*1A0*/
+ ENTRY(nmi_slot)
+-#if defined (CONFIG_KGDB_NMI)
++#if defined (CONFIG_KGDB)
+       .long   debug_enter     /* 1C0 */       ! Allow trap to debugger
+ #else
+       .long   exception_none  /* 1C0 */       ! Not implemented yet
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/cpu/sh4/ex.S linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/cpu/sh4/ex.S
+--- linux-2.6.18-53.1.14/arch/sh/kernel/cpu/sh4/ex.S   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/cpu/sh4/ex.S      2008-06-10 15:38:50.000000000 +0400
+@@ -46,7 +46,7 @@ ENTRY(exception_handling_table)
+       .long   exception_error ! reserved_instruction (filled by trap_init) /* 180 */
+       .long   exception_error ! illegal_slot_instruction (filled by trap_init) /*1A0*/
+ ENTRY(nmi_slot)
+-#if defined (CONFIG_KGDB_NMI)
++#if defined (CONFIG_KGDB)
+       .long   debug_enter     /* 1C0 */       ! Allow trap to debugger
+ #else
+       .long   exception_none  /* 1C0 */       ! Not implemented yet
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/entry.S linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/entry.S
+--- linux-2.6.18-53.1.14/arch/sh/kernel/entry.S        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/entry.S   2008-06-10 15:38:50.000000000 +0400
+@@ -75,7 +75,7 @@
+ ENOSYS = 38
+ EINVAL = 22
+ 
+-#if defined(CONFIG_KGDB_NMI)
++#if defined(CONFIG_KGDB)
+ NMI_VEC = 0x1c0                       ! Must catch early for debounce
+ #endif
+ 
+@@ -227,31 +227,33 @@ call_dae:
+ 2:    .long   do_address_error
+ #endif /* CONFIG_MMU */
+ 
+-#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_SH_KGDB)
++#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_KGDB)
+ ! Handle kernel debug if either kgdb (SW) or gdb-stub (FW) is present.
+ ! If both are configured, handle the debug traps (breakpoints) in SW,
+ ! but still allow BIOS traps to FW.
+ 
+       .align  2
+ debug_kernel:
+-#if defined(CONFIG_SH_STANDARD_BIOS) && defined(CONFIG_SH_KGDB)
++#if defined(CONFIG_SH_STANDARD_BIOS) && defined(CONFIG_KGDB)
+       /* Force BIOS call to FW (debug_trap put TRA in r8) */
+       mov     r8,r0
+       shlr2   r0
+       cmp/eq  #0x3f,r0
+       bt      debug_kernel_fw
+-#endif /* CONFIG_SH_STANDARD_BIOS && CONFIG_SH_KGDB */
++#endif /* CONFIG_SH_STANDARD_BIOS && CONFIG_KGDB */
+ 
++      .align 2
++      .globl debug_enter
+ debug_enter:          
+-#if defined(CONFIG_SH_KGDB)
++#if defined(CONFIG_KGDB)
+       /* Jump to kgdb, pass stacked regs as arg */
+ debug_kernel_sw:
+       mov.l   3f, r0
+       jmp     @r0
+        mov    r15, r4
+       .align  2
+-3:    .long   kgdb_handle_exception
+-#endif /* CONFIG_SH_KGDB */
++3:    .long   kgdb_exception_handler
++#endif /* CONFIG_KGDB */
+ 
+ #if defined(CONFIG_SH_STANDARD_BIOS)
+       /* Unwind the stack and jmp to the debug entry */
+@@ -293,12 +295,12 @@ debug_kernel_fw:
+ 2:    .long   gdb_vbr_vector
+ #endif /* CONFIG_SH_STANDARD_BIOS */
+ 
+-#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_SH_KGDB */
++#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_KGDB */
+ 
+ 
+       .align  2
+ debug_trap:   
+-#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_SH_KGDB)
++#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_KGDB)
+       mov     #OFF_SR, r0
+       mov.l   @(r0,r15), r0           ! get status register
+       shll    r0
+@@ -642,7 +644,7 @@ skip_restore:
+ 6:    or      k0, k2                  ! Set the IMASK-bits
+       ldc     k2, ssr
+       !
+-#if defined(CONFIG_KGDB_NMI)
++#if defined(CONFIG_KGDB)
+       ! Clear in_nmi
+       mov.l   4f, k0
+       mov     #0, k1
+@@ -694,7 +696,7 @@ tlb_miss:
+ interrupt:
+       mov.l   2f, k2
+       mov.l   3f, k3
+-#if defined(CONFIG_KGDB_NMI)
++#if defined(CONFIG_KGDB)
+       ! Debounce (filter nested NMI)
+       mov.l   @k2, k0
+       mov.l   5f, k1
+@@ -709,7 +711,7 @@ interrupt:
+ 5:    .long   NMI_VEC
+ 6:    .long   in_nmi
+ 0:
+-#endif /* defined(CONFIG_KGDB_NMI) */
++#endif /* defined(CONFIG_KGDB) */
+       bra     handle_exception
+        mov.l  @k2, k2
+ 
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/kgdb-jmp.S linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/kgdb-jmp.S
+--- linux-2.6.18-53.1.14/arch/sh/kernel/kgdb-jmp.S     1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/kgdb-jmp.S        2008-06-10 15:38:50.000000000 +0400
+@@ -0,0 +1,32 @@
++#include <linux/linkage.h>
++
++ENTRY(kgdb_fault_setjmp)
++      add     #(9*4), r4
++      sts.l   pr, @-r4
++      mov.l   r15, @-r4
++      mov.l   r14, @-r4
++      mov.l   r13, @-r4
++      mov.l   r12, @-r4
++      mov.l   r11, @-r4
++      mov.l   r10, @-r4
++      mov.l   r9, @-r4
++      mov.l   r8, @-r4
++      rts
++       mov    #0, r0
++
++ENTRY(kgdb_fault_longjmp)
++      mov.l   @r4+, r8
++      mov.l   @r4+, r9
++      mov.l   @r4+, r10
++      mov.l   @r4+, r11
++      mov.l   @r4+, r12
++      mov.l   @r4+, r13
++      mov.l   @r4+, r14
++      mov.l   @r4+, r15
++      lds.l   @r4+, pr
++      mov     r5, r0
++      tst     r0, r0
++      bf      1f
++      mov     #1, r0
++1:    rts
++       nop
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/kgdb.c
+--- linux-2.6.18-53.1.14/arch/sh/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/kgdb.c    2008-06-10 15:38:50.000000000 +0400
+@@ -0,0 +1,363 @@
++/*
++ * arch/sh/kernel/kgdb.c
++ *
++ * Contains SH-specific low-level support for KGDB.
++ *
++ * Contains extracts from code by Glenn Engel, Jim Kingdon,
++ * David Grothe <dave@gcom.com>, Tigran Aivazian <tigran@sco.com>,
++ * Amit S. Kale <akale@veritas.com>,  William Gatliff <bgat@open-widgets.com>,
++ * Ben Lee, Steve Chamberlain and Benoit Miller <fulg@iname.com>,
++ * Henry Bell <henry.bell@st.com> and Jeremy Siegel <jsiegel@mvista.com>
++ *
++ * Maintainer: Tom Rini <trini@kernel.crashing.org>
++ *
++ * 2004 (c) MontaVista Software, Inc. This file is licensed under
++ * the terms of the GNU General Public License version 2. This program
++ * is licensed "as is" without any warranty of any kind, whether express
++ * or implied.
++ */
++
++#include <linux/string.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/smp.h>
++#include <linux/spinlock.h>
++#include <linux/delay.h>
++#include <linux/linkage.h>
++#include <linux/init.h>
++#include <linux/kgdb.h>
++
++#include <asm/system.h>
++#include <asm/current.h>
++#include <asm/signal.h>
++#include <asm/pgtable.h>
++#include <asm/ptrace.h>
++
++extern void per_cpu_trap_init(void);
++extern atomic_t cpu_doing_single_step;
++
++/* Register snapshot taken at exception entry and exchanged with GDB */
++static struct kgdb_regs trap_registers;
++
++/* Globals. */
++char in_nmi;                  /* Set during NMI to prevent reentry */
++
++/* TRA differs sh3/4 */
++#if defined(CONFIG_CPU_SH3)
++#define TRA 0xffffffd0
++#elif defined(CONFIG_CPU_SH4)
++#define TRA 0xff000020
++#endif
++
++/* Macros for single step instruction identification */
++#define OPCODE_BT(op)         (((op) & 0xff00) == 0x8900)
++#define OPCODE_BF(op)         (((op) & 0xff00) == 0x8b00)
++#define OPCODE_BTF_DISP(op)   (((op) & 0x80) ? (((op) | 0xffffff80) << 1) : \
++                            (((op) & 0x7f ) << 1))
++#define OPCODE_BFS(op)        (((op) & 0xff00) == 0x8f00)
++#define OPCODE_BTS(op)        (((op) & 0xff00) == 0x8d00)
++#define OPCODE_BRA(op)        (((op) & 0xf000) == 0xa000)
++#define OPCODE_BRA_DISP(op)   (((op) & 0x800) ? (((op) | 0xfffff800) << 1) : \
++                            (((op) & 0x7ff) << 1))
++#define OPCODE_BRAF(op)       (((op) & 0xf0ff) == 0x0023)
++#define OPCODE_BRAF_REG(op)   (((op) & 0x0f00) >> 8)
++#define OPCODE_BSR(op)        (((op) & 0xf000) == 0xb000)
++#define OPCODE_BSR_DISP(op)   (((op) & 0x800) ? (((op) | 0xfffff800) << 1) : \
++                            (((op) & 0x7ff) << 1))
++#define OPCODE_BSRF(op)       (((op) & 0xf0ff) == 0x0003)
++#define OPCODE_BSRF_REG(op)   (((op) >> 8) & 0xf)
++#define OPCODE_JMP(op)        (((op) & 0xf0ff) == 0x402b)
++#define OPCODE_JMP_REG(op)    (((op) >> 8) & 0xf)
++#define OPCODE_JSR(op)        (((op) & 0xf0ff) == 0x400b)
++#define OPCODE_JSR_REG(op)    (((op) >> 8) & 0xf)
++#define OPCODE_RTS(op)        ((op) == 0xb)
++#define OPCODE_RTE(op)        ((op) == 0x2b)
++
++#define SR_T_BIT_MASK           0x1
++#define STEP_OPCODE             0xc320
++#define BIOS_CALL_TRAP          0x3f
++
++/* Exception codes as per SH-4 core manual */
++#define ADDRESS_ERROR_LOAD_VEC   7
++#define ADDRESS_ERROR_STORE_VEC  8
++#define TRAP_VEC                 11
++#define INVALID_INSN_VEC         12
++#define INVALID_SLOT_VEC         13
++#define NMI_VEC                  14
++#define SERIAL_BREAK_VEC         58
++
++/* Misc static */
++static int stepped_address;
++static short stepped_opcode;
++
++/* Translate SH-3/4 exception numbers to unix-like signal values */
++static int compute_signal(const int excep_code)
++{
++      switch (excep_code) {
++      case INVALID_INSN_VEC:
++      case INVALID_SLOT_VEC:
++              return SIGILL;
++      case ADDRESS_ERROR_LOAD_VEC:
++      case ADDRESS_ERROR_STORE_VEC:
++              return SIGSEGV;
++      case SERIAL_BREAK_VEC:
++      case NMI_VEC:
++              return SIGINT;
++      default:
++              /* Act like it was a break/trap. */
++              return SIGTRAP;
++      }
++}
++
++/*
++ * Translate the registers of the system into the format that GDB wants.  Since
++ * we use a local structure to store things, instead of getting them out
++ * of pt_regs, we can just do a memcpy.
++ */
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *ign)
++{
++      memcpy(gdb_regs, &trap_registers, sizeof(trap_registers));
++}
++
++/*
++ * On SH we save: r1 (prev->thread.sp) r2 (prev->thread.pc) r4 (prev) r5 (next)
++ * r6 (next->thread.sp) r7 (next->thread.pc)
++ */
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
++{
++      int count;
++
++      for (count = 0; count < 16; count++)
++              *(gdb_regs++) = 0;
++      *(gdb_regs++) = p->thread.pc;
++      *(gdb_regs++) = 0;
++      *(gdb_regs++) = 0;
++      *(gdb_regs++) = 0;
++      *(gdb_regs++) = 0;
++      *(gdb_regs++) = 0;
++      *(gdb_regs++) = 0;
++}
++
++/*
++ * Translate the registers values that GDB has given us back into the
++ * format of the system.  See the comment above about memcpy.
++ */
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *ign)
++{
++      memcpy(&trap_registers, gdb_regs, sizeof(trap_registers));
++}
++
++/* Calculate the new address for after a step */
++static short *get_step_address(void)
++{
++      short op = *(short *)trap_registers.pc;
++      long addr;
++
++      /* BT */
++      if (OPCODE_BT(op)) {
++              if (trap_registers.sr & SR_T_BIT_MASK)
++                      addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op);
++              else
++                      addr = trap_registers.pc + 2;
++      }
++
++      /* BTS */
++      else if (OPCODE_BTS(op)) {
++              if (trap_registers.sr & SR_T_BIT_MASK)
++                      addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op);
++              else
++                      addr = trap_registers.pc + 4;   /* Not in delay slot */
++      }
++
++      /* BF */
++      else if (OPCODE_BF(op)) {
++              if (!(trap_registers.sr & SR_T_BIT_MASK))
++                      addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op);
++              else
++                      addr = trap_registers.pc + 2;
++      }
++
++      /* BFS */
++      else if (OPCODE_BFS(op)) {
++              if (!(trap_registers.sr & SR_T_BIT_MASK))
++                      addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op);
++              else
++                      addr = trap_registers.pc + 4;   /* Not in delay slot */
++      }
++
++      /* BRA */
++      else if (OPCODE_BRA(op))
++              addr = trap_registers.pc + 4 + OPCODE_BRA_DISP(op);
++
++      /* BRAF */
++      else if (OPCODE_BRAF(op))
++              addr = trap_registers.pc + 4
++                  + trap_registers.regs[OPCODE_BRAF_REG(op)];
++
++      /* BSR */
++      else if (OPCODE_BSR(op))
++              addr = trap_registers.pc + 4 + OPCODE_BSR_DISP(op);
++
++      /* BSRF */
++      else if (OPCODE_BSRF(op))
++              addr = trap_registers.pc + 4
++                  + trap_registers.regs[OPCODE_BSRF_REG(op)];
++
++      /* JMP */
++      else if (OPCODE_JMP(op))
++              addr = trap_registers.regs[OPCODE_JMP_REG(op)];
++
++      /* JSR */
++      else if (OPCODE_JSR(op))
++              addr = trap_registers.regs[OPCODE_JSR_REG(op)];
++
++      /* RTS */
++      else if (OPCODE_RTS(op))
++              addr = trap_registers.pr;
++
++      /* RTE */
++      else if (OPCODE_RTE(op))
++              addr = trap_registers.regs[15];
++
++      /* Other */
++      else
++              addr = trap_registers.pc + 2;
++
++      kgdb_flush_icache_range(addr, addr + 2);
++      return (short *)addr;
++}
++
++/* The command loop, read and act on requests */
++int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
++                             char *remcom_in_buffer, char *remcom_out_buffer,
++                             struct pt_regs *ign)
++{
++      unsigned long addr;
++      char *ptr = &remcom_in_buffer[1];
++
++      /* Examine first char of buffer to see what we need to do */
++      switch (remcom_in_buffer[0]) {
++      case 'c':               /* Continue at address AA..AA (optional) */
++      case 's':               /* Step one instruction from AA..AA */
++              /* Try to read optional parameter, PC unchanged if none */
++              if (kgdb_hex2long(&ptr, &addr))
++                      trap_registers.pc = addr;
++
++              atomic_set(&cpu_doing_single_step, -1);
++              if (remcom_in_buffer[0] == 's') {
++                      /* Replace the instruction immediately after the
++                       * current instruction (i.e. next in the expected
++                       * flow of control) with a trap instruction, so that
++                       * returning will cause only a single instruction to
++                       * be executed. Note that this model is slightly
++                       * broken for instructions with delay slots
++                       * (e.g. B[TF]S, BSR, BRA etc), where both the branch
++                       * and the instruction in the delay slot will be
++                       * executed.
++                       */
++                      /* Determine where the target instruction will send
++                       * us to */
++                      unsigned short *next_addr = get_step_address();
++                      stepped_address = (int)next_addr;
++
++                      /* Replace it */
++                      stepped_opcode = *(short *)next_addr;
++                      *next_addr = STEP_OPCODE;
++
++                      /* Flush and return */
++                      kgdb_flush_icache_range((long)next_addr,
++                                              (long)next_addr + 2);
++                      if (kgdb_contthread)
++                              atomic_set(&cpu_doing_single_step,
++                                         smp_processor_id());
++              }
++              return 0;
++      }
++      return -1;
++}
++
++/*
++ * When an exception has occurred, we are called.  We need to set things
++ * up so that we can call kgdb_handle_exception to handle requests from
++ * the remote GDB.
++ */
++void kgdb_exception_handler(struct pt_regs *regs)
++{
++      int excep_code, vbr_val;
++      int count;
++
++      /* Copy kernel regs (from stack) */
++      for (count = 0; count < 16; count++)
++              trap_registers.regs[count] = regs->regs[count];
++      trap_registers.pc = regs->pc;
++      trap_registers.pr = regs->pr;
++      trap_registers.sr = regs->sr;
++      trap_registers.gbr = regs->gbr;
++      trap_registers.mach = regs->mach;
++      trap_registers.macl = regs->macl;
++
++      __asm__ __volatile__("stc vbr, %0":"=r"(vbr_val));
++      trap_registers.vbr = vbr_val;
++
++      /* Get the exception code. */
++      __asm__ __volatile__("stc r2_bank, %0":"=r"(excep_code));
++
++      excep_code >>= 5;
++
++      /* If we got an NMI, and KGDB is not yet initialized, call
++       * breakpoint() to try and initialize everything for us. */
++      if (excep_code == NMI_VEC && !kgdb_initialized) {
++              breakpoint();
++              return;
++      }
++
++      /* TRAP_VEC exception indicates a software trap inserted in place of
++       * code by GDB so back up PC by one instruction, as this instruction
++       * will later be replaced by its original one.  Do NOT do this for
++       * trap 0xff, since that indicates a compiled-in breakpoint which
++       * will not be replaced (and we would retake the trap forever) */
++      if (excep_code == TRAP_VEC &&
++          (*(volatile unsigned long *)TRA != (0xff << 2)))
++              trap_registers.pc -= 2;
++
++      /* If we have been single-stepping, put back the old instruction.
++       * We use stepped_address in case we have stopped more than one
++       * instruction away. */
++      if (stepped_opcode != 0) {
++              *(short *)stepped_address = stepped_opcode;
++              kgdb_flush_icache_range(stepped_address, stepped_address + 2);
++      }
++      stepped_opcode = 0;
++
++      /* Call the stub to do the processing.  Note that not everything we
++       * need to send back and forth lives in pt_regs. */
++      kgdb_handle_exception(excep_code, compute_signal(excep_code), 0, regs);
++
++      /* Copy back the (maybe modified) registers */
++      for (count = 0; count < 16; count++)
++              regs->regs[count] = trap_registers.regs[count];
++      regs->pc = trap_registers.pc;
++      regs->pr = trap_registers.pr;
++      regs->sr = trap_registers.sr;
++      regs->gbr = trap_registers.gbr;
++      regs->mach = trap_registers.mach;
++      regs->macl = trap_registers.macl;
++
++      vbr_val = trap_registers.vbr;
++      __asm__ __volatile__("ldc %0, vbr": :"r"(vbr_val));
++}
++
++int __init kgdb_arch_init(void)
++{
++      per_cpu_trap_init();
++
++      return 0;
++}
++
++struct kgdb_arch arch_kgdb_ops = {
++#ifdef CONFIG_CPU_LITTLE_ENDIAN
++      .gdb_bpt_instr = {0xff, 0xc3},
++#else
++      .gdb_bpt_instr = {0xc3, 0xff},
++#endif
++};
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/kgdb_jmp.S linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/kgdb_jmp.S
+--- linux-2.6.18-53.1.14/arch/sh/kernel/kgdb_jmp.S     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/kgdb_jmp.S        1970-01-01 03:00:00.000000000 +0300
+@@ -1,33 +0,0 @@
+-#include <linux/linkage.h>
+-
+-ENTRY(setjmp)
+-      add     #(9*4), r4
+-      sts.l   pr, @-r4
+-      mov.l   r15, @-r4
+-      mov.l   r14, @-r4
+-      mov.l   r13, @-r4
+-      mov.l   r12, @-r4
+-      mov.l   r11, @-r4
+-      mov.l   r10, @-r4
+-      mov.l   r9, @-r4
+-      mov.l   r8, @-r4
+-      rts
+-       mov    #0, r0
+-
+-ENTRY(longjmp)
+-      mov.l   @r4+, r8
+-      mov.l   @r4+, r9
+-      mov.l   @r4+, r10
+-      mov.l   @r4+, r11
+-      mov.l   @r4+, r12
+-      mov.l   @r4+, r13
+-      mov.l   @r4+, r14
+-      mov.l   @r4+, r15
+-      lds.l   @r4+, pr
+-      mov     r5, r0
+-      tst     r0, r0
+-      bf      1f
+-      mov     #1, r0  ! in case val==0
+-1:    rts
+-       nop
+-
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/kgdb_stub.c linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/kgdb_stub.c
+--- linux-2.6.18-53.1.14/arch/sh/kernel/kgdb_stub.c    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/kgdb_stub.c       1970-01-01 03:00:00.000000000 +0300
+@@ -1,1491 +0,0 @@
+-/*
+- * May be copied or modified under the terms of the GNU General Public
+- * License.  See linux/COPYING for more information.
+- *
+- * Containes extracts from code by Glenn Engel, Jim Kingdon,
+- * David Grothe <dave@gcom.com>, Tigran Aivazian <tigran@sco.com>,
+- * Amit S. Kale <akale@veritas.com>,  William Gatliff <bgat@open-widgets.com>,
+- * Ben Lee, Steve Chamberlain and Benoit Miller <fulg@iname.com>.
+- * 
+- * This version by Henry Bell <henry.bell@st.com>
+- * Minor modifications by Jeremy Siegel <jsiegel@mvista.com>
+- * 
+- * Contains low-level support for remote debug using GDB. 
+- *
+- * To enable debugger support, two things need to happen. A call to
+- * set_debug_traps() is necessary in order to allow any breakpoints
+- * or error conditions to be properly intercepted and reported to gdb.
+- * A breakpoint also needs to be generated to begin communication.  This
+- * is most easily accomplished by a call to breakpoint() which does
+- * a trapa if the initialisation phase has been successfully completed.
+- *
+- * In this case, set_debug_traps() is not used to "take over" exceptions;
+- * other kernel code is modified instead to enter the kgdb functions here
+- * when appropriate (see entry.S for breakpoint traps and NMI interrupts,
+- * see traps.c for kernel error exceptions).
+- *
+- * The following gdb commands are supported:
+- *
+- *    Command       Function                               Return value
+- *
+- *    g             return the value of the CPU registers  hex data or ENN
+- *    G             set the value of the CPU registers     OK or ENN
+- *
+- *    mAA..AA,LLLL  Read LLLL bytes at address AA..AA      hex data or ENN
+- *    MAA..AA,LLLL: Write LLLL bytes at address AA.AA      OK or ENN
+- *    XAA..AA,LLLL: Same, but data is binary (not hex)     OK or ENN
+- *
+- *    c             Resume at current address              SNN   ( signal NN)
+- *    cAA..AA       Continue at address AA..AA             SNN
+- *    CNN;          Resume at current address with signal  SNN
+- *    CNN;AA..AA    Resume at address AA..AA with signal   SNN
+- *
+- *    s             Step one instruction                   SNN
+- *    sAA..AA       Step one instruction from AA..AA       SNN
+- *    SNN;          Step one instruction with signal       SNN
+- *    SNNAA..AA     Step one instruction from AA..AA w/NN  SNN
+- *
+- *    k             kill (Detach GDB)
+- *
+- *    d             Toggle debug flag
+- *    D             Detach GDB 
+- *
+- *    Hct           Set thread t for operations,           OK or ENN
+- *                  c = 'c' (step, cont), c = 'g' (other
+- *                  operations)
+- *
+- *    qC            Query current thread ID                QCpid
+- *    qfThreadInfo  Get list of current threads (first)    m<id>
+- *    qsThreadInfo   "    "  "     "      "   (subsequent)
+- *    qOffsets      Get section offsets                  Text=x;Data=y;Bss=z
+- * 
+- *    TXX           Find if thread XX is alive             OK or ENN
+- *    ?             What was the last sigval ?             SNN   (signal NN)
+- *    O             Output to GDB console
+- *
+- * Remote communication protocol.
+- *
+- *    A debug packet whose contents are <data> is encapsulated for
+- *    transmission in the form:
+- *
+- *       $ <data> # CSUM1 CSUM2
+- *
+- *       <data> must be ASCII alphanumeric and cannot include characters
+- *       '$' or '#'.  If <data> starts with two characters followed by
+- *       ':', then the existing stubs interpret this as a sequence number.
+- *
+- *       CSUM1 and CSUM2 are ascii hex representation of an 8-bit 
+- *       checksum of <data>; the most significant nibble is sent first.
+- *       The hex digits 0-9,a-f are used.
+- *
+- *    Receiver responds with:
+- *
+- *       +       - if CSUM is correct and ready for next packet
+- *       -       - if CSUM is incorrect
+- *
+- * Responses can be run-length encoded to save space.  A '*' means that
+- * the next character is an ASCII encoding giving a repeat count which
+- * stands for that many repetitions of the character preceding the '*'.
+- * The encoding is n+29, yielding a printable character where n >=3 
+- * (which is where RLE starts to win).  The encoded character must not exceed 126.
+- *
+- * So "0* " means the same as "0000".
+- */
+-
+-#include <linux/string.h>
+-#include <linux/kernel.h>
+-#include <linux/sched.h>
+-#include <linux/smp.h>
+-#include <linux/spinlock.h>
+-#include <linux/delay.h>
+-#include <linux/linkage.h>
+-#include <linux/init.h>
+-
+-#include <asm/system.h>
+-#include <asm/current.h>
+-#include <asm/signal.h>
+-#include <asm/pgtable.h>
+-#include <asm/ptrace.h>
+-#include <asm/kgdb.h>
+-
+-#ifdef CONFIG_SH_KGDB_CONSOLE
+-#include <linux/console.h>
+-#endif
+-
+-/* Function pointers for linkage */
+-kgdb_debug_hook_t *kgdb_debug_hook;
+-kgdb_bus_error_hook_t *kgdb_bus_err_hook;
+-
+-int (*kgdb_getchar)(void);
+-void (*kgdb_putchar)(int);
+-
+-static void put_debug_char(int c)
+-{
+-      if (!kgdb_putchar)
+-              return;
+-      (*kgdb_putchar)(c);
+-}
+-static int get_debug_char(void)
+-{
+-      if (!kgdb_getchar)
+-              return -1;
+-      return (*kgdb_getchar)();
+-}
+-
+-/* Num chars in in/out bound buffers, register packets need NUMREGBYTES * 2 */
+-#define BUFMAX 1024
+-#define NUMREGBYTES (MAXREG*4)
+-#define OUTBUFMAX (NUMREGBYTES*2+512)
+-
+-enum regs {
+-      R0 = 0, R1,  R2,  R3,   R4,   R5,  R6, R7,
+-      R8, R9, R10, R11, R12,  R13,  R14, R15,
+-      PC, PR, GBR, VBR, MACH, MACL, SR,
+-      /*  */
+-      MAXREG
+-};
+-
+-static unsigned int registers[MAXREG];
+-struct kgdb_regs trap_registers;
+-
+-char kgdb_in_gdb_mode;
+-char in_nmi;                  /* Set during NMI to prevent reentry */
+-int kgdb_nofault;             /* Boolean to ignore bus errs (i.e. in GDB) */
+-int kgdb_enabled = 1;         /* Default to enabled, cmdline can disable */
+-int kgdb_halt;
+-
+-/* Exposed for user access */
+-struct task_struct *kgdb_current;
+-unsigned int kgdb_g_imask;
+-int kgdb_trapa_val;
+-int kgdb_excode;
+-
+-/* Default values for SCI (can override via kernel args in setup.c) */
+-#ifndef CONFIG_KGDB_DEFPORT
+-#define CONFIG_KGDB_DEFPORT 1
+-#endif
+-
+-#ifndef CONFIG_KGDB_DEFBAUD
+-#define CONFIG_KGDB_DEFBAUD 115200
+-#endif
+-
+-#if defined(CONFIG_KGDB_DEFPARITY_E)
+-#define CONFIG_KGDB_DEFPARITY 'E'
+-#elif defined(CONFIG_KGDB_DEFPARITY_O)
+-#define CONFIG_KGDB_DEFPARITY 'O'
+-#else /* CONFIG_KGDB_DEFPARITY_N */
+-#define CONFIG_KGDB_DEFPARITY 'N'
+-#endif
+-
+-#ifdef CONFIG_KGDB_DEFBITS_7
+-#define CONFIG_KGDB_DEFBITS '7'
+-#else /* CONFIG_KGDB_DEFBITS_8 */
+-#define CONFIG_KGDB_DEFBITS '8'
+-#endif
+-
+-/* SCI/UART settings, used in kgdb_console_setup() */
+-int  kgdb_portnum = CONFIG_KGDB_DEFPORT;
+-int  kgdb_baud = CONFIG_KGDB_DEFBAUD;
+-char kgdb_parity = CONFIG_KGDB_DEFPARITY;
+-char kgdb_bits = CONFIG_KGDB_DEFBITS;
+-
+-/* Jump buffer for setjmp/longjmp */
+-static jmp_buf rem_com_env;
+-
+-/* TRA differs sh3/4 */
+-#if defined(CONFIG_CPU_SH3)
+-#define TRA 0xffffffd0
+-#elif defined(CONFIG_CPU_SH4)
+-#define TRA 0xff000020
+-#endif
+-
+-/* Macros for single step instruction identification */
+-#define OPCODE_BT(op)         (((op) & 0xff00) == 0x8900)
+-#define OPCODE_BF(op)         (((op) & 0xff00) == 0x8b00)
+-#define OPCODE_BTF_DISP(op)   (((op) & 0x80) ? (((op) | 0xffffff80) << 1) : \
+-                            (((op) & 0x7f ) << 1))
+-#define OPCODE_BFS(op)        (((op) & 0xff00) == 0x8f00)
+-#define OPCODE_BTS(op)        (((op) & 0xff00) == 0x8d00)
+-#define OPCODE_BRA(op)        (((op) & 0xf000) == 0xa000)
+-#define OPCODE_BRA_DISP(op)   (((op) & 0x800) ? (((op) | 0xfffff800) << 1) : \
+-                            (((op) & 0x7ff) << 1))
+-#define OPCODE_BRAF(op)       (((op) & 0xf0ff) == 0x0023)
+-#define OPCODE_BRAF_REG(op)   (((op) & 0x0f00) >> 8)
+-#define OPCODE_BSR(op)        (((op) & 0xf000) == 0xb000)
+-#define OPCODE_BSR_DISP(op)   (((op) & 0x800) ? (((op) | 0xfffff800) << 1) : \
+-                            (((op) & 0x7ff) << 1))
+-#define OPCODE_BSRF(op)       (((op) & 0xf0ff) == 0x0003)
+-#define OPCODE_BSRF_REG(op)   (((op) >> 8) & 0xf)
+-#define OPCODE_JMP(op)        (((op) & 0xf0ff) == 0x402b)
+-#define OPCODE_JMP_REG(op)    (((op) >> 8) & 0xf)
+-#define OPCODE_JSR(op)        (((op) & 0xf0ff) == 0x400b)
+-#define OPCODE_JSR_REG(op)    (((op) >> 8) & 0xf)
+-#define OPCODE_RTS(op)        ((op) == 0xb)
+-#define OPCODE_RTE(op)        ((op) == 0x2b)
+-
+-#define SR_T_BIT_MASK           0x1
+-#define STEP_OPCODE             0xc320
+-#define BIOS_CALL_TRAP          0x3f
+-
+-/* Exception codes as per SH-4 core manual */
+-#define ADDRESS_ERROR_LOAD_VEC   7
+-#define ADDRESS_ERROR_STORE_VEC  8
+-#define TRAP_VEC                 11
+-#define INVALID_INSN_VEC         12
+-#define INVALID_SLOT_VEC         13
+-#define NMI_VEC                  14
+-#define USER_BREAK_VEC           15
+-#define SERIAL_BREAK_VEC         58
+-
+-/* Misc static */
+-static int stepped_address;
+-static short stepped_opcode;
+-static const char hexchars[] = "0123456789abcdef";
+-static char in_buffer[BUFMAX];
+-static char out_buffer[OUTBUFMAX];
+-
+-static void kgdb_to_gdb(const char *s);
+-
+-#ifdef CONFIG_KGDB_THREAD
+-static struct task_struct *trapped_thread;
+-static struct task_struct *current_thread;
+-typedef unsigned char threadref[8];
+-#define BUF_THREAD_ID_SIZE 16
+-#endif
+-
+-/* Read the value stored at addr through a volatile pointer */
+-static inline unsigned int ctrl_inl(const unsigned long addr)
+-{
+-      return *(volatile unsigned long *) addr;
+-}
+-
+-/* Correctly set *addr using volatile */
+-static inline void ctrl_outl(const unsigned int b, unsigned long addr)
+-{
+-      *(volatile unsigned long *) addr = b;
+-}
+-
+-/* Return the hex digit for bits 4-7 of x (high nibble of the low byte) */
+-static char highhex(const int x)
+-{
+-      return hexchars[(x >> 4) & 0xf];
+-}
+-
+-/* Return the hex digit for bits 0-3 of x (low nibble) */
+-static char lowhex(const int x)
+-{
+-      return hexchars[x & 0xf];
+-}
+-
+-/* Convert hex digit ch to its numeric value; return -1 if ch is not a hex digit */
+-static int hex(const char ch)
+-{
+-      if ((ch >= 'a') && (ch <= 'f'))
+-              return (ch - 'a' + 10);
+-      if ((ch >= '0') && (ch <= '9'))
+-              return (ch - '0');
+-      if ((ch >= 'A') && (ch <= 'F'))
+-              return (ch - 'A' + 10);
+-      return (-1);
+-}
+-
+-/* Convert the memory pointed to by mem into hex, placing result in buf.
+-   Returns a pointer to the last char put in buf (null) */
+-static char *mem_to_hex(const char *mem, char *buf, const int count)
+-{
+-      int i;
+-      int ch;
+-      unsigned short s_val;
+-      unsigned long l_val;
+-
+-      /* Check for 16 or 32 */
+-      if (count == 2 && ((long) mem & 1) == 0) {
+-              s_val = *(unsigned short *) mem;
+-              mem = (char *) &s_val;
+-      } else if (count == 4 && ((long) mem & 3) == 0) {
+-              l_val = *(unsigned long *) mem;
+-              mem = (char *) &l_val;
+-      }
+-      for (i = 0; i < count; i++) {
+-              ch = *mem++;
+-              *buf++ = highhex(ch);
+-              *buf++ = lowhex(ch);
+-      }
+-      *buf = 0;
+-      return (buf);
+-}
+-
+-/* Convert the hex array pointed to by buf into binary, to be placed in mem.
+-   Return a pointer to the character after the last byte written */
+-static char *hex_to_mem(const char *buf, char *mem, const int count)
+-{
+-      int i;
+-      unsigned char ch;
+-
+-      for (i = 0; i < count; i++) {
+-              ch = hex(*buf++) << 4;
+-              ch = ch + hex(*buf++);
+-              *mem++ = ch;
+-      }
+-      return (mem);
+-}
+-
+-/* Parse hex chars at *ptr into *int_value, advancing *ptr; return the number of chars parsed */
+-static int hex_to_int(char **ptr, int *int_value)
+-{
+-      int num_chars = 0;
+-      int hex_value;
+-
+-      *int_value = 0;
+-
+-      while (**ptr) {
+-              hex_value = hex(**ptr);
+-              if (hex_value >= 0) {
+-                      *int_value = (*int_value << 4) | hex_value;
+-                      num_chars++;
+-              } else
+-                      break;
+-              (*ptr)++;
+-      }
+-      return num_chars;
+-}
+-
+-/*  Copy the binary array pointed to by buf into mem.  Fix $, #,
+-    and 0x7d escaped with 0x7d.  Return a pointer to the character 
+-    after the last byte written. */
+-static char *ebin_to_mem(const char *buf, char *mem, int count)
+-{
+-      for (; count > 0; count--, buf++) {
+-              if (*buf == 0x7d)
+-                      *mem++ = *(++buf) ^ 0x20;
+-              else
+-                      *mem++ = *buf;
+-      }
+-      return mem;
+-}
+-
+-/* Pack a hex byte */
+-static char *pack_hex_byte(char *pkt, int byte)
+-{
+-      *pkt++ = hexchars[(byte >> 4) & 0xf];
+-      *pkt++ = hexchars[(byte & 0xf)];
+-      return pkt;
+-}
+-
+-#ifdef CONFIG_KGDB_THREAD
+-
+-/* Pack a thread ID */
+-static char *pack_threadid(char *pkt, threadref * id)
+-{
+-      char *limit;
+-      unsigned char *altid;
+-
+-      altid = (unsigned char *) id;
+-
+-      limit = pkt + BUF_THREAD_ID_SIZE;
+-      while (pkt < limit)
+-              pkt = pack_hex_byte(pkt, *altid++);
+-      return pkt;
+-}
+-
+-/* Convert an integer into our threadref */
+-static void int_to_threadref(threadref * id, const int value)
+-{
+-      unsigned char *scan = (unsigned char *) id;
+-      int i = 4;
+-
+-      while (i--)
+-              *scan++ = 0;
+-
+-      *scan++ = (value >> 24) & 0xff;
+-      *scan++ = (value >> 16) & 0xff;
+-      *scan++ = (value >> 8) & 0xff;
+-      *scan++ = (value & 0xff);
+-}
+-
+-/* Return a task structure ptr for a particular pid */
+-static struct task_struct *get_thread(int pid)
+-{
+-      struct task_struct *thread;
+-
+-      /* Use PID_MAX w/gdb for pid 0 */
+-      if (pid == PID_MAX) pid = 0;
+-
+-      /* First check via PID */
+-      thread = find_task_by_pid(pid);
+-
+-      if (thread)
+-              return thread;
+-
+-      /* Start at the start */
+-      thread = init_tasks[0];
+-
+-      /* Walk along the linked list of tasks */
+-      do {
+-              if (thread->pid == pid)
+-                      return thread;
+-              thread = thread->next_task;
+-      } while (thread != init_tasks[0]);
+-
+-      return NULL;
+-}
+-
+-#endif /* CONFIG_KGDB_THREAD */
+-
+-/* Scan for the start char '$', read the packet and check the checksum */
+-static void get_packet(char *buffer, int buflen)
+-{
+-      unsigned char checksum;
+-      unsigned char xmitcsum;
+-      int i;
+-      int count;
+-      char ch;
+-
+-      do {
+-              /* Ignore everything until the start character */
+-              while ((ch = get_debug_char()) != '$');
+-
+-              checksum = 0;
+-              xmitcsum = -1;
+-              count = 0;
+-
+-              /* Now, read until a # or end of buffer is found */
+-              while (count < (buflen - 1)) {
+-                      ch = get_debug_char();
+-
+-                      if (ch == '#')
+-                              break;
+-
+-                      checksum = checksum + ch;
+-                      buffer[count] = ch;
+-                      count = count + 1;
+-              }
+-
+-              buffer[count] = 0;
+-
+-              /* Continue to read checksum following # */
+-              if (ch == '#') {
+-                      xmitcsum = hex(get_debug_char()) << 4;
+-                      xmitcsum += hex(get_debug_char());
+-
+-                      /* Checksum */
+-                      if (checksum != xmitcsum)
+-                              put_debug_char('-');    /* Failed checksum */
+-                      else {
+-                              /* Ack successful transfer */
+-                              put_debug_char('+');
+-
+-                              /* If a sequence char is present, reply 
+-                                 the sequence ID */
+-                              if (buffer[2] == ':') {
+-                                      put_debug_char(buffer[0]);
+-                                      put_debug_char(buffer[1]);
+-
+-                                      /* Remove sequence chars from buffer */
+-                                      count = strlen(buffer);
+-                                      for (i = 3; i <= count; i++)
+-                                              buffer[i - 3] = buffer[i];
+-                              }
+-                      }
+-              }
+-      }
+-      while (checksum != xmitcsum);   /* Keep trying while we fail */
+-}
+-
+-/* Send the packet in the buffer with run-length encoding */
+-static void put_packet(char *buffer)
+-{
+-      int checksum;
+-      char *src;
+-      int runlen;
+-      int encode;
+-
+-      do {
+-              src = buffer;
+-              put_debug_char('$');
+-              checksum = 0;
+-
+-              /* Continue while we still have chars left */
+-              while (*src) {
+-                      /* Check for runs up to 99 chars long */
+-                      for (runlen = 1; runlen < 99; runlen++) {
+-                              if (src[0] != src[runlen])
+-                                      break;
+-                      }
+-
+-                      if (runlen > 3) {
+-                              /* Got a useful amount, send encoding */
+-                              encode = runlen + ' ' - 4;
+-                              put_debug_char(*src);   checksum += *src;
+-                              put_debug_char('*');    checksum += '*';
+-                              put_debug_char(encode); checksum += encode;
+-                              src += runlen;
+-                      } else {
+-                              /* Otherwise just send the current char */
+-                              put_debug_char(*src);   checksum += *src;
+-                              src += 1;
+-                      }
+-              }
+-
+-              /* '#' Separator, put high and low components of checksum */
+-              put_debug_char('#');
+-              put_debug_char(highhex(checksum));
+-              put_debug_char(lowhex(checksum));
+-      }
+-      while ((get_debug_char()) != '+');      /* While no ack */
+-}
+-
+-/* A bus error has occurred - perform a longjmp to return execution and
+-   allow handling of the error */
+-static void kgdb_handle_bus_error(void)
+-{
+-      longjmp(rem_com_env, 1);
+-}
+-
+-/* Translate SH-3/4 exception numbers to unix-like signal values */
+-static int compute_signal(const int excep_code)
+-{
+-      int sigval;
+-
+-      switch (excep_code) {
+-
+-      case INVALID_INSN_VEC:
+-      case INVALID_SLOT_VEC:
+-              sigval = SIGILL;
+-              break;
+-      case ADDRESS_ERROR_LOAD_VEC:
+-      case ADDRESS_ERROR_STORE_VEC:
+-              sigval = SIGSEGV;
+-              break;
+-
+-      case SERIAL_BREAK_VEC:
+-      case NMI_VEC:
+-              sigval = SIGINT;
+-              break;
+-
+-      case USER_BREAK_VEC:
+-      case TRAP_VEC:
+-              sigval = SIGTRAP;
+-              break;
+-
+-      default:
+-              sigval = SIGBUS;        /* "software generated" */
+-              break;
+-      }
+-
+-      return (sigval);
+-}
+-
+-/* Make a local copy of the registers passed into the handler (bletch) */
+-static void kgdb_regs_to_gdb_regs(const struct kgdb_regs *regs,
+-                                int *gdb_regs)
+-{
+-      gdb_regs[R0] = regs->regs[R0];
+-      gdb_regs[R1] = regs->regs[R1];
+-      gdb_regs[R2] = regs->regs[R2];
+-      gdb_regs[R3] = regs->regs[R3];
+-      gdb_regs[R4] = regs->regs[R4];
+-      gdb_regs[R5] = regs->regs[R5];
+-      gdb_regs[R6] = regs->regs[R6];
+-      gdb_regs[R7] = regs->regs[R7];
+-      gdb_regs[R8] = regs->regs[R8];
+-      gdb_regs[R9] = regs->regs[R9];
+-      gdb_regs[R10] = regs->regs[R10];
+-      gdb_regs[R11] = regs->regs[R11];
+-      gdb_regs[R12] = regs->regs[R12];
+-      gdb_regs[R13] = regs->regs[R13];
+-      gdb_regs[R14] = regs->regs[R14];
+-      gdb_regs[R15] = regs->regs[R15];
+-      gdb_regs[PC] = regs->pc;
+-      gdb_regs[PR] = regs->pr;
+-      gdb_regs[GBR] = regs->gbr;
+-      gdb_regs[MACH] = regs->mach;
+-      gdb_regs[MACL] = regs->macl;
+-      gdb_regs[SR] = regs->sr;
+-      gdb_regs[VBR] = regs->vbr;
+-}
+-
+-/* Copy local gdb registers back to kgdb regs, for later copy to kernel */
+-static void gdb_regs_to_kgdb_regs(const int *gdb_regs,
+-                                struct kgdb_regs *regs)
+-{
+-      regs->regs[R0] = gdb_regs[R0];
+-      regs->regs[R1] = gdb_regs[R1];
+-      regs->regs[R2] = gdb_regs[R2];
+-      regs->regs[R3] = gdb_regs[R3];
+-      regs->regs[R4] = gdb_regs[R4];
+-      regs->regs[R5] = gdb_regs[R5];
+-      regs->regs[R6] = gdb_regs[R6];
+-      regs->regs[R7] = gdb_regs[R7];
+-      regs->regs[R8] = gdb_regs[R8];
+-      regs->regs[R9] = gdb_regs[R9];
+-      regs->regs[R10] = gdb_regs[R10];
+-      regs->regs[R11] = gdb_regs[R11];
+-      regs->regs[R12] = gdb_regs[R12];
+-      regs->regs[R13] = gdb_regs[R13];
+-      regs->regs[R14] = gdb_regs[R14];
+-      regs->regs[R15] = gdb_regs[R15];
+-      regs->pc = gdb_regs[PC];
+-      regs->pr = gdb_regs[PR];
+-      regs->gbr = gdb_regs[GBR];
+-      regs->mach = gdb_regs[MACH];
+-      regs->macl = gdb_regs[MACL];
+-      regs->sr = gdb_regs[SR];
+-      regs->vbr = gdb_regs[VBR];
+-}
+-
+-#ifdef CONFIG_KGDB_THREAD
+-/* Make a local copy of registers from the specified thread */
+-asmlinkage void ret_from_fork(void);
+-static void thread_regs_to_gdb_regs(const struct task_struct *thread,
+-                                  int *gdb_regs)
+-{
+-      int regno;
+-      int *tregs;
+-
+-      /* Initialize to zero */
+-      for (regno = 0; regno < MAXREG; regno++)
+-              gdb_regs[regno] = 0;
+-
+-      /* Just making sure... */
+-      if (thread == NULL)
+-              return;
+-
+-      /* A new fork has pt_regs on the stack from a fork() call */
+-      if (thread->thread.pc == (unsigned long)ret_from_fork) {
+-
+-              int vbr_val;
+-              struct pt_regs *kregs;
+-              kregs = (struct pt_regs*)thread->thread.sp;
+-
+-              gdb_regs[R0] = kregs->regs[R0];
+-              gdb_regs[R1] = kregs->regs[R1];
+-              gdb_regs[R2] = kregs->regs[R2];
+-              gdb_regs[R3] = kregs->regs[R3];
+-              gdb_regs[R4] = kregs->regs[R4];
+-              gdb_regs[R5] = kregs->regs[R5];
+-              gdb_regs[R6] = kregs->regs[R6];
+-              gdb_regs[R7] = kregs->regs[R7];
+-              gdb_regs[R8] = kregs->regs[R8];
+-              gdb_regs[R9] = kregs->regs[R9];
+-              gdb_regs[R10] = kregs->regs[R10];
+-              gdb_regs[R11] = kregs->regs[R11];
+-              gdb_regs[R12] = kregs->regs[R12];
+-              gdb_regs[R13] = kregs->regs[R13];
+-              gdb_regs[R14] = kregs->regs[R14];
+-              gdb_regs[R15] = kregs->regs[R15];
+-              gdb_regs[PC] = kregs->pc;
+-              gdb_regs[PR] = kregs->pr;
+-              gdb_regs[GBR] = kregs->gbr;
+-              gdb_regs[MACH] = kregs->mach;
+-              gdb_regs[MACL] = kregs->macl;
+-              gdb_regs[SR] = kregs->sr;
+-
+-              asm("stc vbr, %0":"=r"(vbr_val));
+-              gdb_regs[VBR] = vbr_val;
+-              return;
+-      }
+-
+-      /* Otherwise, we have only some registers from switch_to() */
+-      tregs = (int *)thread->thread.sp;
+-      gdb_regs[R15] = (int)tregs;
+-      gdb_regs[R14] = *tregs++;
+-      gdb_regs[R13] = *tregs++;
+-      gdb_regs[R12] = *tregs++;
+-      gdb_regs[R11] = *tregs++;
+-      gdb_regs[R10] = *tregs++;
+-      gdb_regs[R9] = *tregs++;
+-      gdb_regs[R8] = *tregs++;
+-      gdb_regs[PR] = *tregs++;
+-      gdb_regs[GBR] = *tregs++;
+-      gdb_regs[PC] = thread->thread.pc;
+-}
+-#endif /* CONFIG_KGDB_THREAD */
+-
+-/* Calculate the new address for after a step */
+-static short *get_step_address(void)
+-{
+-      short op = *(short *) trap_registers.pc;
+-      long addr;
+-
+-      /* BT */
+-      if (OPCODE_BT(op)) {
+-              if (trap_registers.sr & SR_T_BIT_MASK)
+-                      addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op);
+-              else
+-                      addr = trap_registers.pc + 2;
+-      }
+-
+-      /* BTS */
+-      else if (OPCODE_BTS(op)) {
+-              if (trap_registers.sr & SR_T_BIT_MASK)
+-                      addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op);
+-              else
+-                      addr = trap_registers.pc + 4;   /* Not in delay slot */
+-      }
+-
+-      /* BF */
+-      else if (OPCODE_BF(op)) {
+-              if (!(trap_registers.sr & SR_T_BIT_MASK))
+-                      addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op);
+-              else
+-                      addr = trap_registers.pc + 2;
+-      }
+-
+-      /* BFS */
+-      else if (OPCODE_BFS(op)) {
+-              if (!(trap_registers.sr & SR_T_BIT_MASK))
+-                      addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op);
+-              else
+-                      addr = trap_registers.pc + 4;   /* Not in delay slot */
+-      }
+-
+-      /* BRA */
+-      else if (OPCODE_BRA(op))
+-              addr = trap_registers.pc + 4 + OPCODE_BRA_DISP(op);
+-
+-      /* BRAF */
+-      else if (OPCODE_BRAF(op))
+-              addr = trap_registers.pc + 4
+-                  + trap_registers.regs[OPCODE_BRAF_REG(op)];
+-
+-      /* BSR */
+-      else if (OPCODE_BSR(op))
+-              addr = trap_registers.pc + 4 + OPCODE_BSR_DISP(op);
+-
+-      /* BSRF */
+-      else if (OPCODE_BSRF(op))
+-              addr = trap_registers.pc + 4
+-                  + trap_registers.regs[OPCODE_BSRF_REG(op)];
+-
+-      /* JMP */
+-      else if (OPCODE_JMP(op))
+-              addr = trap_registers.regs[OPCODE_JMP_REG(op)];
+-
+-      /* JSR */
+-      else if (OPCODE_JSR(op))
+-              addr = trap_registers.regs[OPCODE_JSR_REG(op)];
+-
+-      /* RTS */
+-      else if (OPCODE_RTS(op))
+-              addr = trap_registers.pr;
+-
+-      /* RTE */
+-      else if (OPCODE_RTE(op))
+-              addr = trap_registers.regs[15];
+-
+-      /* Other */
+-      else
+-              addr = trap_registers.pc + 2;
+-
+-      kgdb_flush_icache_range(addr, addr + 2);
+-      return (short *) addr;
+-}
+-
+-/* Set up a single-step.  Replace the instruction immediately after the 
+-   current instruction (i.e. next in the expected flow of control) with a
+-   trap instruction, so that returning will cause only a single instruction
+-   to be executed. Note that this model is slightly broken for instructions
+-   with delay slots (e.g. B[TF]S, BSR, BRA etc), where both the branch
+-   and the instruction in the delay slot will be executed. */
+-static void do_single_step(void)
+-{
+-      unsigned short *addr = 0;
+-
+-      /* Determine where the target instruction will send us to */
+-      addr = get_step_address();
+-      stepped_address = (int)addr;
+-
+-      /* Replace it */
+-      stepped_opcode = *(short *)addr;
+-      *addr = STEP_OPCODE;
+-
+-      /* Flush and return */
+-      kgdb_flush_icache_range((long) addr, (long) addr + 2);
+-      return;
+-}
+-
+-/* Undo a single step */
+-static void undo_single_step(void)
+-{
+-      /* If we have stepped, put back the old instruction */
+-      /* Use stepped_address in case we stopped elsewhere */
+-      if (stepped_opcode != 0) {
+-              *(short*)stepped_address = stepped_opcode;
+-              kgdb_flush_icache_range(stepped_address, stepped_address + 2);
+-      }
+-      stepped_opcode = 0;
+-}
+-
+-/* Send a signal message */
+-static void send_signal_msg(const int signum)
+-{
+-#ifndef CONFIG_KGDB_THREAD
+-      out_buffer[0] = 'S';
+-      out_buffer[1] = highhex(signum);
+-      out_buffer[2] = lowhex(signum);
+-      out_buffer[3] = 0;
+-      put_packet(out_buffer);
+-#else /* CONFIG_KGDB_THREAD */
+-      int threadid;
+-      threadref thref;
+-      char *out = out_buffer;
+-      const char *tstring = "thread";
+-
+-      *out++ = 'T';
+-      *out++ = highhex(signum);
+-      *out++ = lowhex(signum);
+-
+-      while (*tstring) {
+-              *out++ = *tstring++;
+-      }
+-      *out++ = ':';
+-
+-      threadid = trapped_thread->pid;
+-      if (threadid == 0) threadid = PID_MAX;
+-      int_to_threadref(&thref, threadid);
+-      pack_threadid(out, &thref);
+-      out += BUF_THREAD_ID_SIZE;
+-      *out++ = ';';
+-
+-      *out = 0;
+-      put_packet(out_buffer);
+-#endif /* CONFIG_KGDB_THREAD */
+-}
+-
+-/* Reply that all was well */
+-static void send_ok_msg(void)
+-{
+-      strcpy(out_buffer, "OK");
+-      put_packet(out_buffer);
+-}
+-
+-/* Reply that an error occurred */
+-static void send_err_msg(void)
+-{
+-      strcpy(out_buffer, "E01");
+-      put_packet(out_buffer);
+-}
+-
+-/* Empty message indicates unrecognised command */
+-static void send_empty_msg(void)
+-{
+-      put_packet("");
+-}
+-
+-/* Read memory due to 'm' message */
+-static void read_mem_msg(void)
+-{
+-      char *ptr;
+-      int addr;
+-      int length;
+-
+-      /* Jmp, disable bus error handler */
+-      if (setjmp(rem_com_env) == 0) {
+-
+-              kgdb_nofault = 1;
+-
+-              /* Walk through, have m<addr>,<length> */
+-              ptr = &in_buffer[1];
+-              if (hex_to_int(&ptr, &addr) && (*ptr++ == ','))
+-                      if (hex_to_int(&ptr, &length)) {
+-                              ptr = 0;
+-                              if (length * 2 > OUTBUFMAX)
+-                                      length = OUTBUFMAX / 2;
+-                              mem_to_hex((char *) addr, out_buffer, length);
+-                      }
+-              if (ptr)
+-                      send_err_msg();
+-              else
+-                      put_packet(out_buffer);
+-      } else
+-              send_err_msg();
+-
+-      /* Restore bus error handler */
+-      kgdb_nofault = 0;
+-}
+-
+-/* Write memory due to 'M' or 'X' message */
+-static void write_mem_msg(int binary)
+-{
+-      char *ptr;
+-      int addr;
+-      int length;
+-
+-      if (setjmp(rem_com_env) == 0) {
+-
+-              kgdb_nofault = 1;
+-
+-              /* Walk through, have M<addr>,<length>:<data> */
+-              ptr = &in_buffer[1];
+-              if (hex_to_int(&ptr, &addr) && (*ptr++ == ','))
+-                      if (hex_to_int(&ptr, &length) && (*ptr++ == ':')) {
+-                              if (binary)
+-                                      ebin_to_mem(ptr, (char*)addr, length);
+-                              else
+-                                      hex_to_mem(ptr, (char*)addr, length);
+-                              kgdb_flush_icache_range(addr, addr + length);
+-                              ptr = 0;
+-                              send_ok_msg();
+-                      }
+-              if (ptr)
+-                      send_err_msg();
+-      } else
+-              send_err_msg();
+-
+-      /* Restore bus error handler */
+-      kgdb_nofault = 0;
+-}
+-
+-/* Continue message  */
+-static void continue_msg(void)
+-{
+-      /* Try to read optional parameter, PC unchanged if none */
+-      char *ptr = &in_buffer[1];
+-      int addr;
+-
+-      if (hex_to_int(&ptr, &addr))
+-              trap_registers.pc = addr;
+-}
+-
+-/* Continue message with signal */
+-static void continue_with_sig_msg(void)
+-{
+-      int signal;
+-      char *ptr = &in_buffer[1];
+-      int addr;
+-
+-      /* Report limitation */
+-      kgdb_to_gdb("Cannot force signal in kgdb, continuing anyway.\n");
+-
+-      /* Signal */
+-      hex_to_int(&ptr, &signal);
+-      if (*ptr == ';')
+-              ptr++;
+-
+-      /* Optional address */
+-      if (hex_to_int(&ptr, &addr))
+-              trap_registers.pc = addr;
+-}
+-
+-/* Step message */
+-static void step_msg(void)
+-{
+-      continue_msg();
+-      do_single_step();
+-}
+-
+-/* Step message with signal */
+-static void step_with_sig_msg(void)
+-{
+-      continue_with_sig_msg();
+-      do_single_step();
+-}
+-
+-/* Send register contents */
+-static void send_regs_msg(void)
+-{
+-#ifdef CONFIG_KGDB_THREAD
+-      if (!current_thread)
+-              kgdb_regs_to_gdb_regs(&trap_registers, registers);
+-      else
+-              thread_regs_to_gdb_regs(current_thread, registers);
+-#else
+-      kgdb_regs_to_gdb_regs(&trap_registers, registers);
+-#endif
+-
+-      mem_to_hex((char *) registers, out_buffer, NUMREGBYTES);
+-      put_packet(out_buffer);
+-}
+-
+-/* Set register contents - currently can't set other thread's registers */
+-static void set_regs_msg(void)
+-{
+-#ifdef CONFIG_KGDB_THREAD
+-      if (!current_thread) {
+-#endif
+-              kgdb_regs_to_gdb_regs(&trap_registers, registers);
+-              hex_to_mem(&in_buffer[1], (char *) registers, NUMREGBYTES);
+-              gdb_regs_to_kgdb_regs(registers, &trap_registers);
+-              send_ok_msg();
+-#ifdef CONFIG_KGDB_THREAD
+-      } else
+-              send_err_msg();
+-#endif
+-}
+-
+-
+-#ifdef CONFIG_KGDB_THREAD
+-
+-/* Select the thread for later operations (in_buffer[1] is 'g' or 'c') */
+-void set_thread_msg(void)
+-{
+-      int threadid;
+-      struct task_struct *thread = NULL;
+-      char *ptr;
+-
+-      switch (in_buffer[1]) {
+-
+-              /* To select which thread for gG etc messages, i.e. supported */
+-      case 'g':
+-
+-              ptr = &in_buffer[2];
+-              hex_to_int(&ptr, &threadid);
+-              thread = get_thread(threadid);
+-
+-              /* If we haven't found it */
+-              if (!thread) {
+-                      send_err_msg();
+-                      break;
+-              }
+-
+-              /* Set current_thread (or not) */
+-              if (thread == trapped_thread)
+-                      current_thread = NULL;
+-              else
+-                      current_thread = thread;
+-              send_ok_msg();
+-              break;
+-
+-      /* To select which thread for cCsS messages, i.e. unsupported */
+-      case 'c':
+-              send_ok_msg();
+-              break;
+-
+-      default:
+-              send_empty_msg();
+-              break;
+-      }
+-}
+-
+-/* Is a thread alive? */
+-static void thread_status_msg(void)
+-{
+-      char *ptr;
+-      int threadid;
+-      struct task_struct *thread = NULL;
+-
+-      ptr = &in_buffer[1];
+-      hex_to_int(&ptr, &threadid);
+-      thread = get_thread(threadid);
+-      if (thread)
+-              send_ok_msg();
+-      else
+-              send_err_msg();
+-}
+-/* Send the current thread ID */
+-static void thread_id_msg(void)
+-{
+-      int threadid;
+-      threadref thref;
+-
+-      out_buffer[0] = 'Q';
+-      out_buffer[1] = 'C';
+-
+-      if (current_thread)
+-              threadid = current_thread->pid;
+-      else if (trapped_thread)
+-              threadid = trapped_thread->pid;
+-      else /* Impossible, but just in case! */
+-      {
+-              send_err_msg();
+-              return;
+-      }
+-
+-      /* Translate pid 0 to PID_MAX for gdb */
+-      if (threadid == 0) threadid = PID_MAX;
+-
+-      int_to_threadref(&thref, threadid);
+-      pack_threadid(out_buffer + 2, &thref);
+-      out_buffer[2 + BUF_THREAD_ID_SIZE] = '\0';
+-      put_packet(out_buffer);
+-}
+-
+-/* Send thread info */
+-static void thread_info_msg(void)
+-{
+-      struct task_struct *thread = NULL;
+-      int threadid;
+-      char *pos;
+-      threadref thref;
+-
+-      /* Start with 'm' */
+-      out_buffer[0] = 'm';
+-      pos = &out_buffer[1];
+-
+-      /* For all possible thread IDs - this will overrun if > 44 threads! */
+-      /* Start at 1 and include PID_MAX (since GDB won't use pid 0...) */
+-      for (threadid = 1; threadid <= PID_MAX; threadid++) {
+-
+-              read_lock(&tasklist_lock);
+-              thread = get_thread(threadid);
+-              read_unlock(&tasklist_lock);
+-
+-              /* If it's a valid thread */
+-              if (thread) {
+-                      int_to_threadref(&thref, threadid);
+-                      pack_threadid(pos, &thref);
+-                      pos += BUF_THREAD_ID_SIZE;
+-                      *pos++ = ',';
+-              }
+-      }
+-      *--pos = 0;             /* Lose final comma */
+-      put_packet(out_buffer);
+-
+-}
+-
+-/* Return printable info for gdb's 'info threads' command */
+-static void thread_extra_info_msg(void)
+-{
+-      int threadid;
+-      struct task_struct *thread = NULL;
+-      char buffer[20], *ptr;
+-      int i;
+-
+-      /* Extract thread ID */
+-      ptr = &in_buffer[17];
+-      hex_to_int(&ptr, &threadid);
+-      thread = get_thread(threadid);
+-
+-      /* If we don't recognise it, say so */
+-      if (thread == NULL)
+-              strcpy(buffer, "(unknown)");
+-      else
+-              strcpy(buffer, thread->comm);
+-
+-      /* Construct packet */
+-      for (i = 0, ptr = out_buffer; buffer[i]; i++)
+-              ptr = pack_hex_byte(ptr, buffer[i]);
+-
+-      if (thread->thread.pc == (unsigned long)ret_from_fork) {
+-              strcpy(buffer, "<new fork>");
+-              for (i = 0; buffer[i]; i++)
+-                      ptr = pack_hex_byte(ptr, buffer[i]);
+-      }
+-
+-      *ptr = '\0';
+-      put_packet(out_buffer);
+-}
+-
+-/* Handle all qFooBarBaz messages - have to use an if statement as
+-   opposed to a switch because q messages can have > 1 char id. */
+-static void query_msg(void)
+-{
+-      const char *q_start = &in_buffer[1];
+-
+-      /* qC = return current thread ID */
+-      if (strncmp(q_start, "C", 1) == 0)
+-              thread_id_msg();
+-
+-      /* qfThreadInfo = query all threads (first) */
+-      else if (strncmp(q_start, "fThreadInfo", 11) == 0)
+-              thread_info_msg();
+-
+-      /* qsThreadInfo = query all threads (subsequent). We know we have sent
+-         them all after the qfThreadInfo message, so there are no to send */
+-      else if (strncmp(q_start, "sThreadInfo", 11) == 0)
+-              put_packet("l");        /* el = last */
+-
+-      /* qThreadExtraInfo = supply printable information per thread */
+-      else if (strncmp(q_start, "ThreadExtraInfo", 15) == 0)
+-              thread_extra_info_msg();
+-
+-      /* Unsupported - empty message as per spec */
+-      else
+-              send_empty_msg();
+-}
+-#endif /* CONFIG_KGDB_THREAD */
+-
+-/*
+- * Bring up the ports..
+- */
+-static int kgdb_serial_setup(void)
+-{
+-      extern int kgdb_console_setup(struct console *co, char *options);
+-      struct console dummy;
+-
+-      kgdb_console_setup(&dummy, 0);
+-
+-      return 0;
+-}
+-
+-/* The command loop, read and act on requests */
+-static void kgdb_command_loop(const int excep_code, const int trapa_value)
+-{
+-      int sigval;
+-
+-      if (excep_code == NMI_VEC) {
+-#ifndef CONFIG_KGDB_NMI
+-              KGDB_PRINTK("Ignoring unexpected NMI?\n");
+-              return;
+-#else /* CONFIG_KGDB_NMI */
+-              if (!kgdb_enabled) {
+-                      kgdb_enabled = 1;
+-                      kgdb_init();
+-              }
+-#endif /* CONFIG_KGDB_NMI */
+-      }
+-
+-      /* Ignore if we're disabled */
+-      if (!kgdb_enabled)
+-              return;
+-
+-#ifdef CONFIG_KGDB_THREAD
+-      /* Until GDB specifies a thread */
+-      current_thread = NULL;
+-      trapped_thread = current;
+-#endif
+-
+-      /* Enter GDB mode (e.g. after detach) */
+-      if (!kgdb_in_gdb_mode) {
+-              /* Do serial setup, notify user, issue preemptive ack */
+-              kgdb_serial_setup();
+-              KGDB_PRINTK("Waiting for GDB (on %s%d at %d baud)\n",
+-                          (kgdb_porttype ? kgdb_porttype->name : ""),
+-                          kgdb_portnum, kgdb_baud);
+-              kgdb_in_gdb_mode = 1;
+-              put_debug_char('+');
+-      }
+-
+-      /* Reply to host that an exception has occurred */
+-      sigval = compute_signal(excep_code);
+-      send_signal_msg(sigval);
+-
+-      /* TRAP_VEC exception indicates a software trap inserted in place of
+-         code by GDB so back up PC by one instruction, as this instruction
+-         will later be replaced by its original one.  Do NOT do this for
+-         trap 0xff, since that indicates a compiled-in breakpoint which
+-         will not be replaced (and we would retake the trap forever) */
+-      if ((excep_code == TRAP_VEC) && (trapa_value != (0xff << 2))) {
+-              trap_registers.pc -= 2;
+-      }
+-
+-      /* Undo any stepping we may have done */
+-      undo_single_step();
+-
+-      while (1) {
+-
+-              out_buffer[0] = 0;
+-              get_packet(in_buffer, BUFMAX);
+-
+-              /* Examine first char of buffer to see what we need to do */
+-              switch (in_buffer[0]) {
+-
+-              case '?':       /* Send which signal we've received */
+-                      send_signal_msg(sigval);
+-                      break;
+-
+-              case 'g':       /* Return the values of the CPU registers */
+-                      send_regs_msg();
+-                      break;
+-
+-              case 'G':       /* Set the value of the CPU registers */
+-                      set_regs_msg();
+-                      break;
+-
+-              case 'm':       /* Read LLLL bytes address AA..AA */
+-                      read_mem_msg();
+-                      break;
+-
+-              case 'M':       /* Write LLLL bytes address AA..AA, ret OK */
+-                      write_mem_msg(0);       /* 0 = data in hex */
+-                      break;
+-
+-              case 'X':       /* Write LLLL bytes esc bin address AA..AA */
+-                      if (kgdb_bits == '8')
+-                              write_mem_msg(1); /* 1 = data in binary */
+-                      else
+-                              send_empty_msg();
+-                      break;
+-
+-              case 'C':       /* Continue, signum included, we ignore it */
+-                      continue_with_sig_msg();
+-                      return;
+-
+-              case 'c':       /* Continue at address AA..AA (optional) */
+-                      continue_msg();
+-                      return;
+-
+-              case 'S':       /* Step, signum included, we ignore it */
+-                      step_with_sig_msg();
+-                      return;
+-
+-              case 's':       /* Step one instruction from AA..AA */
+-                      step_msg();
+-                      return;
+-
+-#ifdef CONFIG_KGDB_THREAD
+-
+-              case 'H':       /* Task related */
+-                      set_thread_msg();
+-                      break;
+-
+-              case 'T':       /* Query thread status */
+-                      thread_status_msg();
+-                      break;
+-
+-              case 'q':       /* Handle query - currently thread-related */
+-                      query_msg();
+-                      break;
+-#endif
+-
+-              case 'k':       /* 'Kill the program' with a kernel ? */
+-                      break;
+-
+-              case 'D':       /* Detach from program, send reply OK */
+-                      kgdb_in_gdb_mode = 0;
+-                      send_ok_msg();
+-                      get_debug_char();
+-                      return;
+-
+-              default:
+-                      send_empty_msg();
+-                      break;
+-              }
+-      }
+-}
+-
+-/* There has been an exception, most likely a breakpoint. */
+-void kgdb_handle_exception(struct pt_regs *regs)
+-{
+-      int excep_code, vbr_val;
+-      int count;
+-      int trapa_value = ctrl_inl(TRA);
+-
+-      /* Copy kernel regs (from stack) */
+-      for (count = 0; count < 16; count++)
+-              trap_registers.regs[count] = regs->regs[count];
+-      trap_registers.pc = regs->pc;
+-      trap_registers.pr = regs->pr;
+-      trap_registers.sr = regs->sr;
+-      trap_registers.gbr = regs->gbr;
+-      trap_registers.mach = regs->mach;
+-      trap_registers.macl = regs->macl;
+-
+-      asm("stc vbr, %0":"=r"(vbr_val));
+-      trap_registers.vbr = vbr_val;
+-
+-      /* Get excode for command loop call, user access */
+-      asm("stc r2_bank, %0":"=r"(excep_code));
+-      kgdb_excode = excep_code;
+-
+-      /* Other interesting environment items for reference */
+-      asm("stc r6_bank, %0":"=r"(kgdb_g_imask));
+-      kgdb_current = current;
+-      kgdb_trapa_val = trapa_value;
+-
+-      /* Act on the exception */
+-      kgdb_command_loop(excep_code >> 5, trapa_value);
+-
+-      kgdb_current = NULL;
+-
+-      /* Copy back the (maybe modified) registers */
+-      for (count = 0; count < 16; count++)
+-              regs->regs[count] = trap_registers.regs[count];
+-      regs->pc = trap_registers.pc;
+-      regs->pr = trap_registers.pr;
+-      regs->sr = trap_registers.sr;
+-      regs->gbr = trap_registers.gbr;
+-      regs->mach = trap_registers.mach;
+-      regs->macl = trap_registers.macl;
+-
+-      vbr_val = trap_registers.vbr;
+-      asm("ldc %0, vbr": :"r"(vbr_val));
+-
+-      return;
+-}
+-
+-/* Trigger a breakpoint by function */
+-void breakpoint(void)
+-{
+-      if (!kgdb_enabled) {
+-              kgdb_enabled = 1;
+-              kgdb_init();
+-      }
+-      BREAKPOINT();
+-}
+-
+-/* Initialise the KGDB data structures and serial configuration */
+-int kgdb_init(void)
+-{
+-      if (!kgdb_enabled)
+-              return 1;
+-
+-      in_nmi = 0;
+-      kgdb_nofault = 0;
+-      stepped_opcode = 0;
+-      kgdb_in_gdb_mode = 0;
+-
+-      if (kgdb_serial_setup() != 0) {
+-              KGDB_PRINTK("serial setup error\n");
+-              return -1;
+-      }
+-
+-      /* Init ptr to exception handler */
+-      kgdb_debug_hook = kgdb_handle_exception;
+-      kgdb_bus_err_hook = kgdb_handle_bus_error;
+-
+-      /* Enter kgdb now if requested, or just report init done */
+-      if (kgdb_halt) {
+-              kgdb_in_gdb_mode = 1;
+-              put_debug_char('+');
+-              breakpoint();
+-      }
+-      else
+-      {
+-              KGDB_PRINTK("stub is initialized.\n");
+-      }
+-
+-      return 0;
+-}
+-
+-/* Make function available for "user messages"; console will use it too. */
+-
+-char gdbmsgbuf[BUFMAX];
+-#define MAXOUT ((BUFMAX-2)/2)
+-
+-static void kgdb_msg_write(const char *s, unsigned count)
+-{
+-      int i;
+-      int wcount;
+-      char *bufptr;
+-
+-      /* 'O'utput */
+-      gdbmsgbuf[0] = 'O';
+-
+-      /* Fill and send buffers... */
+-      while (count > 0) {
+-              bufptr = gdbmsgbuf + 1;
+-
+-              /* Calculate how many this time */
+-              wcount = (count > MAXOUT) ? MAXOUT : count;
+-              
+-              /* Pack in hex chars */
+-              for (i = 0; i < wcount; i++)
+-                      bufptr = pack_hex_byte(bufptr, s[i]);
+-              *bufptr = '\0';
+-
+-              /* Move up */
+-              s += wcount;
+-              count -= wcount;
+-
+-              /* Write packet */
+-              put_packet(gdbmsgbuf);
+-      }
+-}
+-
+-static void kgdb_to_gdb(const char *s)
+-{
+-      kgdb_msg_write(s, strlen(s));
+-}
+-
+-#ifdef CONFIG_SH_KGDB_CONSOLE
+-void kgdb_console_write(struct console *co, const char *s, unsigned count)
+-{
+-      /* Bail if we're not talking to GDB */
+-      if (!kgdb_in_gdb_mode)
+-              return;
+-
+-      kgdb_msg_write(s, count);
+-}
+-#endif
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/setup.c linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/setup.c
+--- linux-2.6.18-53.1.14/arch/sh/kernel/setup.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/setup.c   2008-06-10 15:38:50.000000000 +0400
+@@ -28,10 +28,6 @@
+ #include <asm/setup.h>
+ #include <asm/clock.h>
+ 
+-#ifdef CONFIG_SH_KGDB
+-#include <asm/kgdb.h>
+-static int kgdb_parse_options(char *options);
+-#endif
+ extern void * __rd_start, * __rd_end;
+ /*
+  * Machine setup..
+@@ -528,93 +524,3 @@ struct seq_operations cpuinfo_op = {
+       .show   = show_cpuinfo,
+ };
+ #endif /* CONFIG_PROC_FS */
+-
+-#ifdef CONFIG_SH_KGDB
+-/*
+- * Parse command-line kgdb options.  By default KGDB is enabled,
+- * entered on error (or other action) using default serial info.
+- * The command-line option can include a serial port specification
+- * and an action to override default or configured behavior.
+- */
+-struct kgdb_sermap kgdb_sci_sermap =
+-{ "ttySC", 5, kgdb_sci_setup, NULL };
+-
+-struct kgdb_sermap *kgdb_serlist = &kgdb_sci_sermap;
+-struct kgdb_sermap *kgdb_porttype = &kgdb_sci_sermap;
+-
+-void kgdb_register_sermap(struct kgdb_sermap *map)
+-{
+-      struct kgdb_sermap *last;
+-
+-      for (last = kgdb_serlist; last->next; last = last->next)
+-              ;
+-      last->next = map;
+-      if (!map->namelen) {
+-              map->namelen = strlen(map->name);
+-      }
+-}
+-
+-static int __init kgdb_parse_options(char *options)
+-{
+-      char c;
+-      int baud;
+-
+-      /* Check for port spec (or use default) */
+-
+-      /* Determine port type and instance */
+-      if (!memcmp(options, "tty", 3)) {
+-              struct kgdb_sermap *map = kgdb_serlist;
+-
+-              while (map && memcmp(options, map->name, map->namelen))
+-                      map = map->next;
+-
+-              if (!map) {
+-                      KGDB_PRINTK("unknown port spec in %s\n", options);
+-                      return -1;
+-              }
+-
+-              kgdb_porttype = map;
+-              kgdb_serial_setup = map->setup_fn;
+-              kgdb_portnum = options[map->namelen] - '0';
+-              options += map->namelen + 1;
+-
+-              options = (*options == ',') ? options+1 : options;
+-
+-              /* Read optional parameters (baud/parity/bits) */
+-              baud = simple_strtoul(options, &options, 10);
+-              if (baud != 0) {
+-                      kgdb_baud = baud;
+-
+-                      c = toupper(*options);
+-                      if (c == 'E' || c == 'O' || c == 'N') {
+-                              kgdb_parity = c;
+-                              options++;
+-                      }
+-
+-                      c = *options;
+-                      if (c == '7' || c == '8') {
+-                              kgdb_bits = c;
+-                              options++;
+-                      }
+-                      options = (*options == ',') ? options+1 : options;
+-              }
+-      }
+-
+-      /* Check for action specification */
+-      if (!memcmp(options, "halt", 4)) {
+-              kgdb_halt = 1;
+-              options += 4;
+-      } else if (!memcmp(options, "disabled", 8)) {
+-              kgdb_enabled = 0;
+-              options += 8;
+-      }
+-
+-      if (*options) {
+-                KGDB_PRINTK("ignored unknown options: %s\n", options);
+-              return 0;
+-      }
+-      return 1;
+-}
+-__setup("kgdb=", kgdb_parse_options);
+-#endif /* CONFIG_SH_KGDB */
+-
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/time.c linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/time.c
+--- linux-2.6.18-53.1.14/arch/sh/kernel/time.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/time.c    2008-06-10 15:38:50.000000000 +0400
+@@ -184,12 +184,4 @@ void __init time_init(void)
+        */
+       sys_timer = get_sys_timer();
+       printk(KERN_INFO "Using %s for system timer\n", sys_timer->name);
+-
+-#if defined(CONFIG_SH_KGDB)
+-      /*
+-       * Set up kgdb as requested. We do it here because the serial
+-       * init uses the timer vars we just set up for figuring baud.
+-       */
+-      kgdb_init();
+-#endif
+ }
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/traps.c linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/traps.c
+--- linux-2.6.18-53.1.14/arch/sh/kernel/traps.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/traps.c   2008-06-10 15:38:50.000000000 +0400
+@@ -26,6 +26,7 @@
+ #include <linux/spinlock.h>
+ #include <linux/module.h>
+ #include <linux/kallsyms.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/system.h>
+ #include <asm/uaccess.h>
+@@ -34,17 +35,8 @@
+ #include <asm/processor.h>
+ #include <asm/sections.h>
+ 
+-#ifdef CONFIG_SH_KGDB
+-#include <asm/kgdb.h>
+-#define CHK_REMOTE_DEBUG(regs)                                               \
+-{                                                                            \
+-  if ((kgdb_debug_hook != (kgdb_debug_hook_t *) NULL) && (!user_mode(regs))) \
+-  {                                                                          \
+-    (*kgdb_debug_hook)(regs);                                                \
+-  }                                                                          \
+-}
+-#else
+-#define CHK_REMOTE_DEBUG(regs)
++#ifndef CONFIG_KGDB
++#define kgdb_handle_exception(t, s, e, r)
+ #endif
+ 
+ #define DO_ERROR(trapnr, signr, str, name, tsk)                               \
+@@ -65,7 +57,7 @@ asmlinkage void do_##name(unsigned long 
+       local_irq_enable();                                             \
+       tsk->thread.error_code = error_code;                            \
+       tsk->thread.trap_no = trapnr;                                   \
+-        CHK_REMOTE_DEBUG(&regs);                                      \
++      kgdb_handle_exception(trapnr, signr, error_code, &regs);        \
+       force_sig(signr, tsk);                                          \
+       die_if_no_fixup(str,&regs,error_code);                          \
+ }
+@@ -92,10 +84,12 @@ void die(const char * str, struct pt_reg
+ {
+       static int die_counter;
+ 
++#ifdef CONFIG_KGDB
++      kgdb_handle_exception(1, SIGTRAP, err, regs);
++#endif
+       console_verbose();
+       spin_lock_irq(&die_lock);
+       printk("%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
+-      CHK_REMOTE_DEBUG(regs);
+       show_regs(regs);
+       spin_unlock_irq(&die_lock);
+       do_exit(SIGSEGV);
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/mm/extable.c linux-2.6.18-53.1.14.kgdb/arch/sh/mm/extable.c
+--- linux-2.6.18-53.1.14/arch/sh/mm/extable.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/mm/extable.c     2008-06-10 15:38:50.000000000 +0400
+@@ -5,6 +5,7 @@
+  */
+ 
+ #include <linux/module.h>
++#include <linux/kgdb.h>
+ #include <asm/uaccess.h>
+ 
+ int fixup_exception(struct pt_regs *regs)
+@@ -16,6 +17,12 @@ int fixup_exception(struct pt_regs *regs
+               regs->pc = fixup->fixup;
+               return 1;
+       }
++#ifdef CONFIG_KGDB
++      if (atomic_read(&debugger_active) && kgdb_may_fault)
++              /* Restore our previous state. */
++              kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++              /* Never reached. */
++#endif
+ 
+       return 0;
+ }
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/mm/fault-nommu.c linux-2.6.18-53.1.14.kgdb/arch/sh/mm/fault-nommu.c
+--- linux-2.6.18-53.1.14/arch/sh/mm/fault-nommu.c      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/mm/fault-nommu.c 2008-06-10 15:38:50.000000000 +0400
+@@ -29,10 +29,6 @@
+ #include <asm/mmu_context.h>
+ #include <asm/cacheflush.h>
+ 
+-#if defined(CONFIG_SH_KGDB)
+-#include <asm/kgdb.h>
+-#endif
+-
+ extern void die(const char *,struct pt_regs *,long);
+ 
+ /*
+@@ -43,11 +39,6 @@ extern void die(const char *,struct pt_r
+ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
+                             unsigned long address)
+ {
+-#if defined(CONFIG_SH_KGDB)
+-      if (kgdb_nofault && kgdb_bus_err_hook)
+-              kgdb_bus_err_hook();
+-#endif
+-
+       /*
+        * Oops. The kernel tried to access some bad page. We'll have to
+        * terminate things with extreme prejudice.
+@@ -69,11 +60,6 @@ asmlinkage void do_page_fault(struct pt_
+ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
+                              unsigned long address)
+ {
+-#if defined(CONFIG_SH_KGDB)
+-      if (kgdb_nofault && kgdb_bus_err_hook)
+-              kgdb_bus_err_hook();
+-#endif
+-
+       if (address >= TASK_SIZE)
+               return 1;
+ 
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/mm/fault.c linux-2.6.18-53.1.14.kgdb/arch/sh/mm/fault.c
+--- linux-2.6.18-53.1.14/arch/sh/mm/fault.c    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/sh/mm/fault.c       2008-06-10 15:38:50.000000000 +0400
+@@ -28,7 +28,6 @@
+ #include <asm/pgalloc.h>
+ #include <asm/mmu_context.h>
+ #include <asm/cacheflush.h>
+-#include <asm/kgdb.h>
+ 
+ extern void die(const char *,struct pt_regs *,long);
+ 
+@@ -45,11 +44,6 @@ asmlinkage void do_page_fault(struct pt_
+       struct vm_area_struct * vma;
+       unsigned long page;
+ 
+-#ifdef CONFIG_SH_KGDB
+-      if (kgdb_nofault && kgdb_bus_err_hook)
+-              kgdb_bus_err_hook();
+-#endif
+-
+       tsk = current;
+       mm = tsk->mm;
+ 
+@@ -153,6 +147,7 @@ no_context:
+       }
+       die("Oops", regs, writeaccess);
+       do_exit(SIGKILL);
++      dump_stack();
+ 
+ /*
+  * We ran out of memory, or some other thing happened to us that made
+@@ -202,11 +197,6 @@ asmlinkage int __do_page_fault(struct pt
+       spinlock_t *ptl;
+       int ret = 1;
+ 
+-#ifdef CONFIG_SH_KGDB
+-      if (kgdb_nofault && kgdb_bus_err_hook)
+-              kgdb_bus_err_hook();
+-#endif
+-
+ #ifdef CONFIG_SH_STORE_QUEUES
+       addrmax = P4SEG_STORE_QUE + 0x04000000;
+ #endif
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/x86_64/Kconfig.debug linux-2.6.18-53.1.14.kgdb/arch/x86_64/Kconfig.debug
+--- linux-2.6.18-53.1.14/arch/x86_64/Kconfig.debug     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/x86_64/Kconfig.debug        2008-06-10 15:38:44.000000000 +0400
+@@ -55,7 +55,4 @@ config DEBUG_STACK_USAGE
+ 
+         This option will slow down process creation somewhat.
+ 
+-#config X86_REMOTE_DEBUG
+-#       bool "kgdb debugging stub"
+-
+ endmenu
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/x86_64/kernel/Makefile linux-2.6.18-53.1.14.kgdb/arch/x86_64/kernel/Makefile
+--- linux-2.6.18-53.1.14/arch/x86_64/kernel/Makefile   2008-03-06 05:54:48.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/x86_64/kernel/Makefile      2008-06-10 15:38:44.000000000 +0400
+@@ -35,6 +35,7 @@ obj-$(CONFIG_IOMMU)          += pci-gart.o apert
+ obj-$(CONFIG_CALGARY_IOMMU)   += pci-calgary.o tce.o
+ obj-$(CONFIG_SWIOTLB)         += pci-swiotlb.o
+ obj-$(CONFIG_KPROBES)         += kprobes.o
++obj-$(CONFIG_KGDB)            += kgdb.o kgdb-jmp.o
+ obj-$(CONFIG_X86_PM_TIMER)    += pmtimer.o
+ obj-$(CONFIG_X86_VSMP)                += vsmp.o
+ obj-$(CONFIG_K8_NB)           += k8.o
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/x86_64/kernel/entry.S linux-2.6.18-53.1.14.kgdb/arch/x86_64/kernel/entry.S
+--- linux-2.6.18-53.1.14/arch/x86_64/kernel/entry.S    2008-03-06 05:54:50.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/x86_64/kernel/entry.S       2008-06-10 15:39:01.000000000 +0400
+@@ -42,6 +42,7 @@
+ #include <asm/hw_irq.h>
+ #include <asm/page.h>
+ #include <asm/irqflags.h>
++#include <asm/kgdb.h>
+ 
+       .code64
+ 
+@@ -887,6 +888,7 @@ error_exit:                
+       RESTORE_ARGS 0,8,0                                              
+       jmp iret_label
+       CFI_ENDPROC
++      CFI_END_FRAME(kernel_thread)
+ 
+ error_kernelspace:
+       incl %ebx
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/x86_64/kernel/kgdb-jmp.S linux-2.6.18-53.1.14.kgdb/arch/x86_64/kernel/kgdb-jmp.S
+--- linux-2.6.18-53.1.14/arch/x86_64/kernel/kgdb-jmp.S 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/x86_64/kernel/kgdb-jmp.S    2008-06-10 15:38:44.000000000 +0400
+@@ -0,0 +1,65 @@
++/*
++ * arch/x86_64/kernel/kgdb-jmp.S
++ *
++ * Save and restore system registers so that within a limited frame we
++ * may have a fault and "jump back" to a known safe location.
++ *
++ * Author: Tom Rini <trini@kernel.crashing.org>
++ *
++ * Cribbed from glibc, which carries the following:
++ * Copyright (C) 2001, 2003, 2004 Free Software Foundation, Inc.
++ * Copyright (C) 2005 by MontaVista Software.
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of
++ * any kind, whether express or implied.
++ */
++
++#include <linux/linkage.h>
++
++#define JB_RBX                0
++#define JB_RBP                1
++#define JB_R12                2
++#define JB_R13                3
++#define JB_R14                4
++#define JB_R15                5
++#define JB_RSP                6
++#define JB_PC         7
++
++      .code64
++
++/* This must be called prior to kgdb_fault_longjmp and
++ * kgdb_fault_longjmp must not be called outside of the context of the
++ * last call to kgdb_fault_setjmp.
++ */
++ENTRY(kgdb_fault_setjmp)
++      /* Save registers. */
++      movq %rbx, (JB_RBX*8)(%rdi)
++      movq %rbp, (JB_RBP*8)(%rdi)
++      movq %r12, (JB_R12*8)(%rdi)
++      movq %r13, (JB_R13*8)(%rdi)
++      movq %r14, (JB_R14*8)(%rdi)
++      movq %r15, (JB_R15*8)(%rdi)
++      leaq 8(%rsp), %rdx      /* Save SP as it will be after we return. */
++      movq %rdx, (JB_RSP*8)(%rdi)
++      movq (%rsp), %rax       /* Save PC we are returning to now. */
++      movq %rax, (JB_PC*8)(%rdi)
++      /* Set return value for setjmp. */
++      mov $0,%eax
++      movq (JB_PC*8)(%rdi),%rdx
++      movq (JB_RSP*8)(%rdi),%rsp
++      jmpq *%rdx
++
++ENTRY(kgdb_fault_longjmp)
++      /* Restore registers. */
++      movq (JB_RBX*8)(%rdi),%rbx
++      movq (JB_RBP*8)(%rdi),%rbp
++      movq (JB_R12*8)(%rdi),%r12
++      movq (JB_R13*8)(%rdi),%r13
++      movq (JB_R14*8)(%rdi),%r14
++      movq (JB_R15*8)(%rdi),%r15
++      /* Set return value for setjmp. */
++      movq (JB_PC*8)(%rdi),%rdx
++      movq (JB_RSP*8)(%rdi),%rsp
++      mov $1,%eax
++      jmpq *%rdx
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/x86_64/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/arch/x86_64/kernel/kgdb.c
+--- linux-2.6.18-53.1.14/arch/x86_64/kernel/kgdb.c     1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/x86_64/kernel/kgdb.c        2008-06-10 15:38:44.000000000 +0400
+@@ -0,0 +1,474 @@
++/*
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License as published by the
++ * Free Software Foundation; either version 2, or (at your option) any
++ * later version.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * General Public License for more details.
++ *
++ */
++
++/*
++ * Copyright (C) 2004 Amit S. Kale <amitkale@linsyssoft.com>
++ * Copyright (C) 2000-2001 VERITAS Software Corporation.
++ * Copyright (C) 2002 Andi Kleen, SuSE Labs
++ * Copyright (C) 2004 LinSysSoft Technologies Pvt. Ltd.
++ */
++/****************************************************************************
++ *  Contributor:     Lake Stevens Instrument Division$
++ *  Written by:      Glenn Engel $
++ *  Updated by:            Amit Kale<akale@veritas.com>
++ *  Modified for 386 by Jim Kingdon, Cygnus Support.
++ *  Original kgdb, compatibility with 2.1.xx kernel by
++ *  David Grothe <dave@gcom.com>
++ *  Integrated into 2.2.5 kernel by Tigran Aivazian <tigran@sco.com>
++ *  X86_64 changes from Andi Kleen's patch merged by Jim Houston
++ */
++
++#include <linux/string.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/smp.h>
++#include <linux/spinlock.h>
++#include <linux/delay.h>
++#include <asm/system.h>
++#include <asm/ptrace.h>               /* for linux pt_regs struct */
++#include <linux/kgdb.h>
++#include <linux/init.h>
++#include <asm/apicdef.h>
++#include <asm/mach_apic.h>
++#include <asm/kdebug.h>
++#include <asm/debugreg.h>
++
++/* Put the error code here just in case the user cares.  */
++int gdb_x86_64errcode;
++/* Likewise, the vector number here (since GDB only gets the signal
++   number through the usual means, and that's not very specific).  */
++int gdb_x86_64vector = -1;
++
++extern atomic_t cpu_doing_single_step;
++
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      gdb_regs[_RAX] = regs->rax;
++      gdb_regs[_RBX] = regs->rbx;
++      gdb_regs[_RCX] = regs->rcx;
++      gdb_regs[_RDX] = regs->rdx;
++      gdb_regs[_RSI] = regs->rsi;
++      gdb_regs[_RDI] = regs->rdi;
++      gdb_regs[_RBP] = regs->rbp;
++      gdb_regs[_PS] = regs->eflags;
++      gdb_regs[_PC] = regs->rip;
++      gdb_regs[_R8] = regs->r8;
++      gdb_regs[_R9] = regs->r9;
++      gdb_regs[_R10] = regs->r10;
++      gdb_regs[_R11] = regs->r11;
++      gdb_regs[_R12] = regs->r12;
++      gdb_regs[_R13] = regs->r13;
++      gdb_regs[_R14] = regs->r14;
++      gdb_regs[_R15] = regs->r15;
++      gdb_regs[_RSP] = regs->rsp;
++}
++
++extern void thread_return(void);
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
++{
++      gdb_regs[_RAX] = 0;
++      gdb_regs[_RBX] = 0;
++      gdb_regs[_RCX] = 0;
++      gdb_regs[_RDX] = 0;
++      gdb_regs[_RSI] = 0;
++      gdb_regs[_RDI] = 0;
++      gdb_regs[_RBP] = *(unsigned long *)p->thread.rsp;
++      gdb_regs[_PS] = *(unsigned long *)(p->thread.rsp + 8);
++      gdb_regs[_PC] = (unsigned long)&thread_return;
++      gdb_regs[_R8] = 0;
++      gdb_regs[_R9] = 0;
++      gdb_regs[_R10] = 0;
++      gdb_regs[_R11] = 0;
++      gdb_regs[_R12] = 0;
++      gdb_regs[_R13] = 0;
++      gdb_regs[_R14] = 0;
++      gdb_regs[_R15] = 0;
++      gdb_regs[_RSP] = p->thread.rsp;
++}
++
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      regs->rax = gdb_regs[_RAX];
++      regs->rbx = gdb_regs[_RBX];
++      regs->rcx = gdb_regs[_RCX];
++      regs->rdx = gdb_regs[_RDX];
++      regs->rsi = gdb_regs[_RSI];
++      regs->rdi = gdb_regs[_RDI];
++      regs->rbp = gdb_regs[_RBP];
++      regs->eflags = gdb_regs[_PS];
++      regs->rip = gdb_regs[_PC];
++      regs->r8 = gdb_regs[_R8];
++      regs->r9 = gdb_regs[_R9];
++      regs->r10 = gdb_regs[_R10];
++      regs->r11 = gdb_regs[_R11];
++      regs->r12 = gdb_regs[_R12];
++      regs->r13 = gdb_regs[_R13];
++      regs->r14 = gdb_regs[_R14];
++      regs->r15 = gdb_regs[_R15];
++#if 0                         /* can't change these */
++      regs->rsp = gdb_regs[_RSP];
++      regs->ss = gdb_regs[_SS];
++      regs->fs = gdb_regs[_FS];
++      regs->gs = gdb_regs[_GS];
++#endif
++
++}                             /* gdb_regs_to_regs */
++
++struct hw_breakpoint {
++      unsigned enabled;
++      unsigned type;
++      unsigned len;
++      unsigned long addr;
++} breakinfo[4] = { {
++enabled:0}, {
++enabled:0}, {
++enabled:0}, {
++enabled:0}};
++
++/* Make the debug registers agree with breakinfo[]: arm each slot that
++ * is enabled in the table but not yet in DR7, and disarm each slot that
++ * is set in DR7 but no longer enabled in the table. */
++void kgdb_correct_hw_break(void)
++{
++      int breakno, breakbit, correctit = 0;
++      unsigned long dr7;
++
++      asm volatile ("movq %%db7, %0\n":"=r" (dr7):);
++      do {
++              unsigned long addr0, addr1, addr2, addr3;
++              asm volatile ("movq %%db0, %0\n"
++                            "movq %%db1, %1\n"
++                            "movq %%db2, %2\n"
++                            "movq %%db3, %3\n":"=r" (addr0), "=r"(addr1),
++                            "=r"(addr2), "=r"(addr3):);
++      } while (0);
++      for (breakno = 0; breakno < 4; breakno++) {     /* all breakinfo[] slots */
++              breakbit = 2 << (breakno << 1);
++              if (!(dr7 & breakbit) && breakinfo[breakno].enabled) {
++                      correctit = 1;
++                      dr7 |= breakbit;
++                      dr7 &= ~(0xf0000UL << (breakno << 2));
++                      dr7 |= (((breakinfo[breakno].len << 2) |
++                               breakinfo[breakno].type) << 16) <<
++                          (breakno << 2);
++                      switch (breakno) {
++                      case 0:
++                              asm volatile ("movq %0, %%dr0\n"::"r"
++                                            (breakinfo[breakno].addr));
++                              break;
++
++                      case 1:
++                              asm volatile ("movq %0, %%dr1\n"::"r"
++                                            (breakinfo[breakno].addr));
++                              break;
++
++                      case 2:
++                              asm volatile ("movq %0, %%dr2\n"::"r"
++                                            (breakinfo[breakno].addr));
++                              break;
++
++                      case 3:
++                              asm volatile ("movq %0, %%dr3\n"::"r"
++                                            (breakinfo[breakno].addr));
++                              break;
++                      }
++              } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) {
++                      correctit = 1;
++                      dr7 &= ~breakbit;
++                      dr7 &= ~(0xf0000UL << (breakno << 2));
++              }
++      }
++      if (correctit) {
++              asm volatile ("movq %0, %%db7\n"::"r" (dr7));
++      }
++}
++
++int kgdb_remove_hw_break(unsigned long addr)
++{
++      int slot;
++
++      /* Disable the first enabled slot that watches @addr. */
++      for (slot = 0; slot < 4; slot++) {
++              if (!breakinfo[slot].enabled || breakinfo[slot].addr != addr)
++                      continue;
++              breakinfo[slot].enabled = 0;
++              return 0;
++      }
++
++      /* No enabled breakpoint at @addr. */
++      return -1;
++}
++
++int kgdb_set_hw_break(unsigned long addr)
++{
++      struct hw_breakpoint *slot = NULL;
++      int n;
++
++      /* Claim the lowest-numbered slot that is not in use. */
++      for (n = 0; n < 4 && !slot; n++)
++              if (!breakinfo[n].enabled)
++                      slot = &breakinfo[n];
++      if (!slot)
++              return -1;
++
++      slot->enabled = 1;
++      slot->type = 1;
++      slot->len = 1;
++      slot->addr = addr;
++      return 0;
++}
++
++int remove_hw_break(unsigned breakno)
++{
++      /* Fail if the slot is not in use; otherwise release it. */
++      if (!breakinfo[breakno].enabled)
++              return -1;
++      breakinfo[breakno].enabled = 0;
++      return 0;
++}
++
++int set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned long addr)
++{
++      /* addr is unsigned long so a 64-bit address is stored untruncated. */
++      if (breakinfo[breakno].enabled)
++              return -1;
++      breakinfo[breakno].enabled = 1;
++      breakinfo[breakno].type = type;
++      breakinfo[breakno].len = len;
++      breakinfo[breakno].addr = addr;
++      return 0;
++}
++
++void kgdb_disable_hw_debug(struct pt_regs *regs)
++{
++      /* Disable hardware debugging while we are in kgdb */
++      asm volatile ("movq %0,%%db7": /* no output */ :"r" (0UL));
++}
++
++void kgdb_post_master_code(struct pt_regs *regs, int e_vector, int err_code)
++{
++      /* Master processor is completely in the debugger */
++      gdb_x86_64vector = e_vector;
++      gdb_x86_64errcode = err_code;
++}
++
++void kgdb_roundup_cpus(unsigned long flags)
++{
++      send_IPI_allbutself(APIC_DM_NMI);
++}
++
++int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
++                             char *remcomInBuffer, char *remcomOutBuffer,
++                             struct pt_regs *linux_regs)
++{
++      unsigned long addr, length;
++      unsigned long breakno, breaktype;
++      char *ptr;
++      int newPC;
++      unsigned long dr6;
++
++      switch (remcomInBuffer[0]) {
++      case 'c':
++      case 's':
++              /* try to read optional parameter, pc unchanged if no parm */
++              ptr = &remcomInBuffer[1];
++              if (kgdb_hex2long(&ptr, &addr))
++                      linux_regs->rip = addr;
++              newPC = linux_regs->rip;
++
++              /* clear the trace bit */
++              linux_regs->eflags &= ~TF_MASK;
++
++              atomic_set(&cpu_doing_single_step, -1);
++              /* set the trace bit if we're stepping */
++              if (remcomInBuffer[0] == 's') {
++                      linux_regs->eflags |= TF_MASK;
++                      debugger_step = 1;
++                      if (kgdb_contthread)
++                              atomic_set(&cpu_doing_single_step,
++                                         smp_processor_id());
++
++              }
++
++              asm volatile ("movq %%db6, %0\n":"=r" (dr6));
++              if (!(dr6 & 0x4000)) {
++                      for (breakno = 0; breakno < 4; ++breakno) {
++                              if (dr6 & (1 << breakno)) {
++                                      if (breakinfo[breakno].type == 0) {
++                                              /* Set restore flag */
++                                              linux_regs->eflags |=
++                                                  X86_EFLAGS_RF;
++                                              break;
++                                      }
++                              }
++                      }
++              }
++              kgdb_correct_hw_break();
++              asm volatile ("movq %0, %%db6\n"::"r" (0UL));
++
++              return (0);
++
++      case 'Y':
++              ptr = &remcomInBuffer[1];
++              kgdb_hex2long(&ptr, &breakno);
++              ptr++;
++              kgdb_hex2long(&ptr, &breaktype);
++              ptr++;
++              kgdb_hex2long(&ptr, &length);
++              ptr++;
++              kgdb_hex2long(&ptr, &addr);
++              if (set_hw_break(breakno & 0x3, breaktype & 0x3,
++                               length & 0x3, addr) == 0)
++                      strcpy(remcomOutBuffer, "OK");
++              else
++                      strcpy(remcomOutBuffer, "ERROR");
++              break;
++
++              /* Remove hardware breakpoint */
++      case 'y':
++              ptr = &remcomInBuffer[1];
++              kgdb_hex2long(&ptr, &breakno);
++              if (remove_hw_break(breakno & 0x3) == 0)
++                      strcpy(remcomOutBuffer, "OK");
++              else
++                      strcpy(remcomOutBuffer, "ERROR");
++              break;
++
++      }                       /* switch */
++      return -1;
++}
++
++static struct pt_regs *in_interrupt_stack(unsigned long rsp, int cpu)
++{
++      struct pt_regs *regs;
++      unsigned long end = (unsigned long)cpu_pda(cpu)->irqstackptr;
++      if (rsp <= end && rsp >= end - IRQSTACKSIZE + 8) {
++              regs = *(((struct pt_regs **)end) - 1);
++              return regs;
++      }
++      return NULL;
++}
++
++static struct pt_regs *in_exception_stack(unsigned long rsp, int cpu)
++{
++      int i;
++      struct tss_struct *tss = &per_cpu(init_tss, cpu); /* @cpu's TSS */
++      for (i = 0; i < N_EXCEPTION_STACKS; i++)
++              if (rsp >= tss->ist[i] &&
++                  rsp <= tss->ist[i] + EXCEPTION_STKSZ) {
++                      struct pt_regs *r =
++                          (void *)tss->ist[i] + EXCEPTION_STKSZ;
++                      return r - 1;
++              }
++      return NULL;
++}
++
++void kgdb_shadowinfo(struct pt_regs *regs, char *buffer, unsigned threadid)
++{
++      static char intr_desc[] = "Stack at interrupt entrypoint";
++      static char exc_desc[] = "Stack at exception entrypoint";
++      struct pt_regs *stregs;
++      int cpu = hard_smp_processor_id();
++
++      if ((stregs = in_interrupt_stack(regs->rsp, cpu)))
++              kgdb_mem2hex(intr_desc, buffer, strlen(intr_desc));
++      else if ((stregs = in_exception_stack(regs->rsp, cpu)))
++              kgdb_mem2hex(exc_desc, buffer, strlen(exc_desc));
++}
++
++struct task_struct *kgdb_get_shadow_thread(struct pt_regs *regs, int threadid)
++{
++      struct pt_regs *stregs;
++      int cpu = hard_smp_processor_id();
++
++      if ((stregs = in_interrupt_stack(regs->rsp, cpu)))
++              return current;
++      else if ((stregs = in_exception_stack(regs->rsp, cpu)))
++              return current;
++
++      return NULL;
++}
++
++struct pt_regs *kgdb_shadow_regs(struct pt_regs *regs, int threadid)
++{
++      struct pt_regs *stregs;
++      int cpu = hard_smp_processor_id();
++
++      if ((stregs = in_interrupt_stack(regs->rsp, cpu)))
++              return stregs;
++      else if ((stregs = in_exception_stack(regs->rsp, cpu)))
++              return stregs;
++
++      return NULL;
++}
++
++/* die_chain notifier: filters the die events KGDB cares about and passes
++ * those to kgdb_handle_exception(). */
++static int kgdb_notify(struct notifier_block *self, unsigned long cmd,
++                     void *ptr)
++{
++      struct die_args *args = ptr;
++      struct pt_regs *regs = args->regs;
++
++      if (cmd == DIE_PAGE_FAULT_NO_CONTEXT && atomic_read(&debugger_active)
++                      && kgdb_may_fault) {
++              kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++              return NOTIFY_STOP;
++      /* CPU roundup? */
++      } else if (atomic_read(&debugger_active) && cmd == DIE_NMI_IPI) {
++              kgdb_nmihook(smp_processor_id(), regs);
++              return NOTIFY_STOP;
++              /* See if KGDB is interested. */
++      } else if (cmd == DIE_PAGE_FAULT || user_mode(regs) ||
++                 cmd == DIE_NMI_IPI || (cmd == DIE_DEBUG &&
++                                        atomic_read(&debugger_active)))
++              /* Userspace events, normal watchdog event, or spurious
++               * debug exception.  Ignore. */
++              return NOTIFY_DONE;
++
++      kgdb_handle_exception(args->trapnr, args->signr, args->err, regs);
++
++      return NOTIFY_STOP;
++}
++
++static struct notifier_block kgdb_notifier = {
++      .notifier_call = kgdb_notify,
++      .priority = 0x7fffffff, /* we need to be notified first */
++};
++
++int kgdb_arch_init(void)
++{
++      atomic_notifier_chain_register(&die_chain, &kgdb_notifier);
++      return 0;
++}
++/*
++ * Skip an int3 exception when it occurs after a breakpoint has been
++ * removed. Backtrack rip by 1 since the int3 would have caused it to
++ * increment by 1.
++ */
++
++int kgdb_skipexception(int exception, struct pt_regs *regs)
++{
++      if (exception == 3 && kgdb_isremovedbreak(regs->rip - 1)) {
++              regs->rip -= 1;
++              return 1;
++      }
++      return 0;
++}
++
++struct kgdb_arch arch_kgdb_ops = {
++      .gdb_bpt_instr = {0xcc},
++      .flags = KGDB_HW_BREAKPOINT,
++      .shadowth = 1,
++};
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/x86_64/mm/fault.c linux-2.6.18-53.1.14.kgdb/arch/x86_64/mm/fault.c
+--- linux-2.6.18-53.1.14/arch/x86_64/mm/fault.c        2008-03-06 05:54:27.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/x86_64/mm/fault.c   2008-06-10 15:38:41.000000000 +0400
+@@ -557,6 +557,10 @@ no_context:
+       if (is_errata93(regs, address))
+               return; 
+ 
++      if (notify_die(DIE_PAGE_FAULT_NO_CONTEXT, "no context", regs,
++                              error_code, 14, SIGSEGV) == NOTIFY_STOP)
++              return;
++
+ /*
+  * Oops. The kernel tried to access some bad page. We'll have to
+  * terminate things with extreme prejudice.
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/char/keyboard.c linux-2.6.18-53.1.14.kgdb/drivers/char/keyboard.c
+--- linux-2.6.18-53.1.14/drivers/char/keyboard.c       2008-03-06 05:54:23.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/drivers/char/keyboard.c  2008-06-10 15:39:11.000000000 +0400
+@@ -1174,6 +1174,7 @@ static void kbd_keycode(unsigned int key
+               sysrq_down = 0;
+       if (sysrq_down && down && !rep) {
+               handle_sysrq(kbd_sysrq_xlate[keycode], regs, tty);
++              sysrq_down = 0;         /* In case we miss the 'up' event. */
+               return;
+       }
+ #endif
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/net/Makefile linux-2.6.18-53.1.14.kgdb/drivers/net/Makefile
+--- linux-2.6.18-53.1.14/drivers/net/Makefile  2008-03-06 05:54:59.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/drivers/net/Makefile     2008-06-10 15:37:55.000000000 +0400
+@@ -221,6 +221,7 @@ obj-$(CONFIG_ETRAX_ETHERNET) += cris/
+ obj-$(CONFIG_ENP2611_MSF_NET) += ixp2000/
+ 
+ obj-$(CONFIG_NETCONSOLE) += netconsole.o
++obj-$(CONFIG_KGDBOE) += kgdboe.o
+ 
+ obj-$(CONFIG_FS_ENET) += fs_enet/
+ 
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/net/kgdboe.c linux-2.6.18-53.1.14.kgdb/drivers/net/kgdboe.c
+--- linux-2.6.18-53.1.14/drivers/net/kgdboe.c  1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/drivers/net/kgdboe.c     2008-06-10 15:37:55.000000000 +0400
+@@ -0,0 +1,294 @@
++/*
++ * drivers/net/kgdboe.c
++ *
++ * A network interface for GDB.
++ * Based upon 'gdbserial' by David Grothe <dave@gcom.com>
++ * and Scott Foehner <sfoehner@engr.sgi.com>
++ *
++ * Maintainers: Amit S. Kale <amitkale@linsyssoft.com> and
++ *            Tom Rini <trini@kernel.crashing.org>
++ *
++ * 2004 (c) Amit S. Kale <amitkale@linsyssoft.com>
++ * 2004-2005 (c) MontaVista Software, Inc.
++ * 2005 (c) Wind River Systems, Inc.
++ *
++ * Contributors at various stages not listed above:
++ * San Mehat <nettwerk@biodome.org>, Robert Walsh <rjwalsh@durables.org>,
++ * wangdi <wangdi@clusterfs.com>, Matt Mackall <mpm@selenic.com>,
++ * Pavel Machek <pavel@suse.cz>, Jason Wessel <jason.wessel@windriver.com>
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of any
++ * kind, whether express or implied.
++ */
++
++#include <linux/kernel.h>
++#include <linux/interrupt.h>
++#include <linux/string.h>
++#include <linux/kgdb.h>
++#include <linux/netpoll.h>
++#include <linux/init.h>
++
++#include <asm/atomic.h>
++
++#define IN_BUF_SIZE 512               /* power of 2, please */
++#define NOT_CONFIGURED_STRING "not_configured"
++#define OUT_BUF_SIZE 30               /* We don't want to send too big of a packet. */
++#define MAX_KGDBOE_CONFIG_STR 256
++
++static char in_buf[IN_BUF_SIZE], out_buf[OUT_BUF_SIZE];
++static int in_head, in_tail, out_count;
++static atomic_t in_count;
++/* 0 = unconfigured, 1 = netpoll options parsed, 2 = fully configured. */
++static int configured;
++static struct kgdb_io local_kgdb_io_ops;
++static int use_dynamic_mac;
++
++MODULE_DESCRIPTION("KGDB driver for network interfaces");
++MODULE_LICENSE("GPL");
++static char config[MAX_KGDBOE_CONFIG_STR] = NOT_CONFIGURED_STRING;
++static struct kparam_string kps = {
++      .string = config,
++      .maxlen = MAX_KGDBOE_CONFIG_STR,
++};
++
++static void rx_hook(struct netpoll *np, int port, char *msg, int len,
++                  struct sk_buff *skb)
++{
++      int i;
++
++      np->remote_port = port;
++
++      /* Copy the MAC address if we need to. */
++      if (use_dynamic_mac) {
++              memcpy(np->remote_mac, eth_hdr(skb)->h_source,
++                              sizeof(np->remote_mac));
++              use_dynamic_mac = 0;
++      }
++
++      /*
++       * This could be GDB trying to attach.  But it could also be GDB
++       * finishing up a session, with kgdb_connected=0 but GDB sending
++       * an ACK for the final packet.  To make sure we don't try and
++       * make a breakpoint when GDB is leaving, make sure that if
++       * !kgdb_connected the only len == 1 packet we allow is ^C.
++       */
++      if (!kgdb_connected && (len != 1 || msg[0] == 3) &&
++          !atomic_read(&kgdb_setting_breakpoint)) {
++              tasklet_schedule(&kgdb_tasklet_breakpoint);
++      }
++
++      for (i = 0; i < len; i++) {
++              if (msg[i] == 3)
++                      tasklet_schedule(&kgdb_tasklet_breakpoint);
++
++              if (atomic_read(&in_count) >= IN_BUF_SIZE) {
++                      /* buffer overflow, clear it */
++                      in_head = in_tail = 0;
++                      atomic_set(&in_count, 0);
++                      break;
++              }
++              in_buf[in_head++] = msg[i];
++              in_head &= (IN_BUF_SIZE - 1);
++              atomic_inc(&in_count);
++      }
++}
++
++static struct netpoll np = {
++      .dev_name = "eth0",
++      .name = "kgdboe",
++      .rx_hook = rx_hook,
++      .local_port = 6443,
++      .remote_port = 6442,
++      .remote_mac = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++};
++
++static void eth_pre_exception_handler(void)
++{
++      /* Increment the module count when the debugger is active */
++      if (!kgdb_connected)
++              try_module_get(THIS_MODULE);
++      netpoll_set_trap(1);
++}
++
++static void eth_post_exception_handler(void)
++{
++      /* decrement the module count when the debugger detaches */
++      if (!kgdb_connected)
++              module_put(THIS_MODULE);
++      netpoll_set_trap(0);
++}
++
++static int eth_get_char(void)
++{
++      int chr;
++
++      while (atomic_read(&in_count) == 0)
++              netpoll_poll(&np);
++
++      chr = in_buf[in_tail++];
++      in_tail &= (IN_BUF_SIZE - 1);
++      atomic_dec(&in_count);
++      return chr;
++}
++
++static void eth_flush_buf(void)
++{
++      if (out_count && np.dev)
++              netpoll_send_udp(&np, out_buf, out_count);
++      /* Always reset, or eth_put_char() would index past out_buf. */
++      memset(out_buf, 0, sizeof(out_buf));
++      out_count = 0;
++}
++
++static void eth_put_char(u8 chr)
++{
++      out_buf[out_count++] = chr;
++      if (out_count == OUT_BUF_SIZE)
++              eth_flush_buf();
++}
++
++static int option_setup(char *opt)
++{
++      char opt_scratch[MAX_KGDBOE_CONFIG_STR];
++
++      /* If we're being given a new configuration, copy it in (bounded). */
++      if (opt != config)
++              strlcpy(config, opt, sizeof(config));
++      /* But work on a copy as netpoll_parse_options will eat it. */
++      strlcpy(opt_scratch, opt, sizeof(opt_scratch));
++      configured = !netpoll_parse_options(&np, opt_scratch);
++      /* Have rx_hook() take the peer's MAC from the next packet it sees. */
++      use_dynamic_mac = 1;
++
++      return 0;
++}
++__setup("kgdboe=", option_setup);
++
++/* With our config string set by some means, configure kgdboe. */
++static int configure_kgdboe(void)
++{
++      /* Try out the string. */
++      option_setup(config);
++
++      if (!configured) {
++              printk(KERN_ERR "kgdboe: configuration incorrect - kgdboe not "
++                     "loaded.\n");
++              printk(KERN_ERR "  Usage: kgdboe=[src-port]@[src-ip]/[dev],"
++                              "[tgt-port]@<tgt-ip>/<tgt-macaddr>\n");
++              return -EINVAL;
++      }
++
++      /* Bring it up. */
++      if (netpoll_setup(&np)) {
++              printk(KERN_ERR "kgdboe: netpoll_setup failed kgdboe failed\n");
++              return -EINVAL;
++      }
++
++      if (kgdb_register_io_module(&local_kgdb_io_ops)) {
++              netpoll_cleanup(&np);
++              return -EINVAL;
++      }
++
++      configured = 2;
++
++      return 0;
++}
++
++static int init_kgdboe(void)
++{
++      int ret;
++
++      /* Already done? */
++      if (configured == 2)
++              return 0;
++
++      /* OK, go ahead and do it. */
++      ret = configure_kgdboe();
++
++      if (configured == 2)
++              printk(KERN_INFO "kgdboe: debugging over ethernet enabled\n");
++
++      return ret;
++}
++
++static void cleanup_kgdboe(void)
++{
++      netpoll_cleanup(&np);
++      configured = 0;
++      kgdb_unregister_io_module(&local_kgdb_io_ops);
++}
++
++static int param_set_kgdboe_var(const char *kmessage, struct kernel_param *kp)
++{
++      char kmessage_save[MAX_KGDBOE_CONFIG_STR];
++      int msg_len = strlen(kmessage);
++
++      if (msg_len + 1 > MAX_KGDBOE_CONFIG_STR) {
++              printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
++                     kp->name, MAX_KGDBOE_CONFIG_STR - 1);
++              return -ENOSPC;
++      }
++
++      if (kgdb_connected) {
++              printk(KERN_ERR "kgdboe: Cannot reconfigure while KGDB is "
++                              "connected.\n");
++              return 0;
++      }
++
++      /* Start the reconfiguration process by saving the old string */
++      strncpy(kmessage_save, config, sizeof(kmessage_save));
++
++
++      /* Copy in the new param and strip out invalid characters so we
++       * can optionally specify the MAC.
++       */
++      strncpy(config, kmessage, sizeof(config));
++      msg_len--;
++      while (msg_len > 0 &&
++                      (config[msg_len] < ',' || config[msg_len] > 'f')) {
++              config[msg_len] = '\0';
++              msg_len--;
++      }
++
++      /* Check to see if we are unconfiguring the io module and that it
++       * was in a fully configured state, as this is the only time that
++       * netpoll_cleanup should get called
++       */
++      if (configured == 2 && strcmp(config, NOT_CONFIGURED_STRING) == 0) {
++              printk(KERN_INFO "kgdboe: reverting to unconfigured state\n");
++              cleanup_kgdboe();
++              return 0;
++      } else
++              /* Go and configure with the new params. */
++              configure_kgdboe();
++
++      if (configured == 2)
++              return 0;
++
++      /* If the new string was invalid, revert to the previous state, which
++       * is at a minimum not_configured. */
++      strncpy(config, kmessage_save, sizeof(config));
++      if (strcmp(kmessage_save, NOT_CONFIGURED_STRING) != 0) {
++              printk(KERN_INFO "kgdboe: reverting to prior configuration\n");
++              /* revert back to the original config */
++              strncpy(config, kmessage_save, sizeof(config));
++              configure_kgdboe();
++      }
++      return 0;
++}
++
++static struct kgdb_io local_kgdb_io_ops = {
++      .read_char = eth_get_char,
++      .write_char = eth_put_char,
++      .init = init_kgdboe,
++      .flush = eth_flush_buf,
++      .pre_exception = eth_pre_exception_handler,
++      .post_exception = eth_post_exception_handler
++};
++
++module_init(init_kgdboe);
++module_exit(cleanup_kgdboe);
++module_param_call(kgdboe, param_set_kgdboe_var, param_get_string, &kps, 0644);
++MODULE_PARM_DESC(kgdboe, " kgdboe=[src-port]@[src-ip]/[dev],"
++               "[tgt-port]@<tgt-ip>/<tgt-macaddr>\n");
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/8250.c linux-2.6.18-53.1.14.kgdb/drivers/serial/8250.c
+--- linux-2.6.18-53.1.14/drivers/serial/8250.c 2008-03-06 05:54:43.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/8250.c    2008-06-10 15:37:43.000000000 +0400
+@@ -2656,6 +2656,25 @@ void serial8250_unregister_port(int line
+ }
+ EXPORT_SYMBOL(serial8250_unregister_port);
+ 
++/**
++ *    serial8250_unregister_by_port - remove a 16x50 serial port
++ *    at runtime.
++ *    @port: A &struct uart_port that describes the port to remove.
++ *
++ *    Remove one serial port.  This may not be called from interrupt
++ *    context.  We hand the port back to our control.
++ */
++void serial8250_unregister_by_port(struct uart_port *port)
++{
++      struct uart_8250_port *uart;
++
++      uart = serial8250_find_match_or_unused(port);
++
++      if (uart)
++              serial8250_unregister_port(uart->port.line);
++}
++EXPORT_SYMBOL(serial8250_unregister_by_port);
++
+ static int __init serial8250_init(void)
+ {
+       int ret, i;
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/8250_kgdb.c linux-2.6.18-53.1.14.kgdb/drivers/serial/8250_kgdb.c
+--- linux-2.6.18-53.1.14/drivers/serial/8250_kgdb.c    1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/8250_kgdb.c       2008-06-10 15:37:43.000000000 +0400
+@@ -0,0 +1,516 @@
++/*
++ * 8250 interface for kgdb.
++ *
++ * This is a merging of many different drivers, and all of the people have
++ * had an impact in some form or another:
++ *
++ * 2004-2005 (c) MontaVista Software, Inc.
++ * 2005-2006 (c) Wind River Systems, Inc.
++ *
++ * Amit Kale <amitkale@emsyssoft.com>, David Grothe <dave@gcom.com>,
++ * Scott Foehner <sfoehner@engr.sgi.com>, George Anzinger <george@mvista.com>,
++ * Robert Walsh <rjwalsh@durables.org>, wangdi <wangdi@clusterfs.com>,
++ * San Mehat, Tom Rini <trini@mvista.com>,
++ * Jason Wessel <jason.wessel@windriver.com>
++ */
++
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/kgdb.h>
++#include <linux/interrupt.h>
++#include <linux/tty.h>
++#include <linux/serial.h>
++#include <linux/serial_reg.h>
++#include <linux/serialP.h>
++#include <linux/ioport.h>
++
++#include <asm/io.h>
++#include <asm/serial.h>               /* For BASE_BAUD and SERIAL_PORT_DFNS */
++
++#include "8250.h"
++
++#define GDB_BUF_SIZE  512     /* power of 2, please */
++
++MODULE_DESCRIPTION("KGDB driver for the 8250");
++MODULE_LICENSE("GPL");
++/* These will conflict with early_param otherwise. */
++#ifdef CONFIG_KGDB_8250_MODULE
++static char config[256];
++module_param_string(kgdb8250, config, 256, 0);
++MODULE_PARM_DESC(kgdb8250,
++               " kgdb8250=<io or mmio>,<address>,<baud rate>,<irq>\n");
++static struct kgdb_io local_kgdb_io_ops;
++#endif                                /* CONFIG_KGDB_8250_MODULE */
++
++/* Speed of the UART. */
++static int kgdb8250_baud;
++
++/* Flag for if we need to call request_mem_region */
++static int kgdb8250_needs_request_mem_region;
++
++static char kgdb8250_buf[GDB_BUF_SIZE];
++static atomic_t kgdb8250_buf_in_cnt;
++static int kgdb8250_buf_out_inx;
++
++/* Old-style serial definitions, if existent, and a copy-once flag. */
++#ifdef CONFIG_KGDB_SIMPLE_SERIAL
++static int __initdata should_copy_rs_table = 1;
++static struct serial_state old_rs_table[] __initdata = {
++#ifdef SERIAL_PORT_DFNS
++      SERIAL_PORT_DFNS
++#endif
++};
++#endif
++
++/* Our internal table of UARTS. */
++#define UART_NR       CONFIG_SERIAL_8250_NR_UARTS
++static struct uart_port kgdb8250_ports[UART_NR];
++
++static struct uart_port *current_port;
++
++/* Base of the UART. */
++static void *kgdb8250_addr;
++
++/* Forward declarations. */
++static int kgdb8250_uart_init(void);
++static int __init kgdb_init_io(void);
++static int __init kgdb8250_opt(char *str);
++
++/* These are much shorter calls to ioread8/iowrite8 that take into
++ * account our shifts, etc. */
++static inline unsigned int kgdb_ioread(u8 mask)
++{
++      return ioread8(kgdb8250_addr + (mask << current_port->regshift));
++}
++
++static inline void kgdb_iowrite(u8 val, u8 mask)
++{
++      iowrite8(val, kgdb8250_addr + (mask << current_port->regshift));
++}
++
++/*
++ * Wait until the interface can accept a char, then write it.
++ */
++static void kgdb_put_debug_char(u8 chr)
++{
++      while (!(kgdb_ioread(UART_LSR) & UART_LSR_THRE)) ;
++
++      kgdb_iowrite(chr, UART_TX);
++}
++
++/*
++ * Get a byte from the hardware data buffer and return it
++ */
++static int read_data_bfr(void)
++{
++      char it = kgdb_ioread(UART_LSR);
++
++      if (it & UART_LSR_DR)
++              return kgdb_ioread(UART_RX);
++
++      /*
++       * If we have a framing error assume somebody messed with
++       * our uart.  Reprogram it and send '-' both ways...
++       */
++      if (it & 0xc) {
++              kgdb8250_uart_init();
++              kgdb_put_debug_char('-');
++              return '-';
++      }
++
++      return -1;
++}
++
++/*
++ * Get a char, polling the UART until one arrives (never returns -1).
++ * Empty the receive buffer first, then look at the interface hardware.
++ */
++static int kgdb_get_debug_char(void)
++{
++      int retchr;
++
++      /* intr routine has q'd chars */
++      if (atomic_read(&kgdb8250_buf_in_cnt) != 0) {
++              retchr = kgdb8250_buf[kgdb8250_buf_out_inx++];
++              kgdb8250_buf_out_inx &= (GDB_BUF_SIZE - 1);
++              atomic_dec(&kgdb8250_buf_in_cnt);
++              return retchr;
++      }
++
++      do {
++              retchr = read_data_bfr();
++      } while (retchr < 0);
++
++      return retchr;
++}
++
++/*
++ * This is the receiver interrupt routine for the GDB stub.
++ * All that we need to do is verify that the interrupt happened on the
++ * line we're in charge of.  If this is true, schedule a breakpoint and
++ * return.
++ */
++static irqreturn_t
++kgdb8250_interrupt(int irq, void *dev_id, struct pt_regs *regs)
++{
++      if (kgdb_ioread(UART_IIR) & UART_IIR_RDI) {
++              /* Throw away the data if another I/O routine is active. */
++              if (kgdb_io_ops.read_char != kgdb_get_debug_char &&
++                              (kgdb_ioread(UART_LSR) & UART_LSR_DR))
++                      kgdb_ioread(UART_RX);
++              else
++                      breakpoint();
++      }
++
++      return IRQ_HANDLED;
++}
++
++/*
++ *  Initializes the UART.
++ *  Returns:
++ *    0 on success, -1 on failure.
++ */
++static int
++kgdb8250_uart_init (void)
++{
++      unsigned int ier, base_baud = current_port->uartclk ?
++              current_port->uartclk / 16 : BASE_BAUD;
++
++      /* test uart existance */
++      if(kgdb_ioread(UART_LSR) == 0xff)
++              return -1;
++
++      /* disable interrupts */
++      kgdb_iowrite(0, UART_IER);
++
++#if defined(CONFIG_ARCH_OMAP1510)
++      /* Workaround to enable 115200 baud on OMAP1510 internal ports */
++      if (cpu_is_omap1510() && is_omap_port((void *)kgdb8250_addr)) {
++              if (kgdb8250_baud == 115200) {
++                      base_baud = 1;
++                      kgdb8250_baud = 1;
++                      kgdb_iowrite(1, UART_OMAP_OSC_12M_SEL);
++              } else
++                      kgdb_iowrite(0, UART_OMAP_OSC_12M_SEL);
++      }
++#endif
++      /* set DLAB */
++      kgdb_iowrite(UART_LCR_DLAB, UART_LCR);
++
++      /* set baud */
++      kgdb_iowrite((base_baud / kgdb8250_baud) & 0xff, UART_DLL);
++      kgdb_iowrite((base_baud / kgdb8250_baud) >> 8, UART_DLM);
++
++      /* reset DLAB, set LCR */
++      kgdb_iowrite(UART_LCR_WLEN8, UART_LCR);
++
++      /* set DTR and RTS */
++      kgdb_iowrite(UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS, UART_MCR);
++
++      /* setup fifo */
++      kgdb_iowrite(UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR
++              | UART_FCR_CLEAR_XMIT | UART_FCR_TRIGGER_8,
++              UART_FCR);
++
++      /* clear pending interrupts */
++      kgdb_ioread(UART_IIR);
++      kgdb_ioread(UART_RX);
++      kgdb_ioread(UART_LSR);
++      kgdb_ioread(UART_MSR);
++
++      /* turn on RX interrupt only */
++      kgdb_iowrite(UART_IER_RDI, UART_IER);
++
++      /*
++       * Borrowed from the main 8250 driver.
++       * Try writing and reading the UART_IER_UUE bit (b6).
++       * If it works, this is probably one of the Xscale platform's
++       * internal UARTs.
++       * We're going to explicitly set the UUE bit to 0 before
++       * trying to write and read a 1 just to make sure it's not
++       * already a 1 and maybe locked there before we even start start.
++       */
++      ier = kgdb_ioread(UART_IER);
++      kgdb_iowrite(ier & ~UART_IER_UUE, UART_IER);
++      if (!(kgdb_ioread(UART_IER) & UART_IER_UUE)) {
++              /*
++               * OK it's in a known zero state, try writing and reading
++               * without disturbing the current state of the other bits.
++               */
++              kgdb_iowrite(ier | UART_IER_UUE, UART_IER);
++              if (kgdb_ioread(UART_IER) & UART_IER_UUE)
++                      /*
++                       * It's an Xscale.
++                       */
++                      ier |= UART_IER_UUE | UART_IER_RTOIE;
++      }
++      kgdb_iowrite(ier, UART_IER);
++      return 0;
++}
++
++/*
++ * Copy the old serial_state table to our uart_port table if we haven't
++ * had values specifically configured in.  We need to make sure this only
++ * happens once.
++ */
++static void __init kgdb8250_copy_rs_table(void)
++{
++#ifdef CONFIG_KGDB_SIMPLE_SERIAL
++      int i;
++
++      if (!should_copy_rs_table)
++              return;
++
++      for (i = 0; i < ARRAY_SIZE(old_rs_table); i++) {
++              kgdb8250_ports[i].iobase = old_rs_table[i].port;
++              kgdb8250_ports[i].irq = irq_canonicalize(old_rs_table[i].irq);
++              kgdb8250_ports[i].uartclk = old_rs_table[i].baud_base * 16;
++              kgdb8250_ports[i].membase = old_rs_table[i].iomem_base;
++              kgdb8250_ports[i].iotype = old_rs_table[i].io_type;
++              kgdb8250_ports[i].regshift = old_rs_table[i].iomem_reg_shift;
++              kgdb8250_ports[i].line = i;
++      }
++
++      should_copy_rs_table = 0;
++#endif
++}
++
++/*
++ * Hookup our IRQ line now that it is safe to do so, after we grab any
++ * memory regions we might need to.  If we haven't been initialized yet,
++ * go ahead and copy the old_rs_table in.
++ */
++static void __init kgdb8250_late_init(void)
++{
++      /* Try and copy the old_rs_table. */
++      kgdb8250_copy_rs_table();
++
++#if defined(CONFIG_SERIAL_8250) || defined(CONFIG_SERIAL_8250_MODULE)
++      /* Take the port away from the main driver. */
++      serial8250_unregister_by_port(current_port);
++
++      /* Now reinit the port as the above has disabled things. */
++      kgdb8250_uart_init();
++#endif
++      /* We may need to call request_mem_region() first. */
++      if (kgdb8250_needs_request_mem_region)
++              request_mem_region(current_port->mapbase,
++                                 8 << current_port->regshift, "kgdb");
++      if (request_irq(current_port->irq, kgdb8250_interrupt, SA_SHIRQ,
++                      "GDB-stub", current_port) < 0)
++              printk(KERN_ERR "KGDB failed to request the serial IRQ (%d)\n",
++                     current_port->irq);
++}
++
++static __init int kgdb_init_io(void)
++{
++      /* Give us the basic table of uarts. */
++      kgdb8250_copy_rs_table();
++
++      /* We're either a module and parse a config string, or we have a
++       * semi-static config. */
++#ifdef CONFIG_KGDB_8250_MODULE
++      if (strlen(config)) {
++              if (kgdb8250_opt(config))
++                      return -EINVAL;
++      } else {
++              printk(KERN_ERR "kgdb8250: argument error, usage: "
++                     "kgdb8250=<io or mmio>,<address>,<baud rate>,<irq>\n");
++              return -EINVAL;
++      }
++#elif defined(CONFIG_KGDB_SIMPLE_SERIAL)
++      kgdb8250_baud = CONFIG_KGDB_BAUDRATE;
++
++      /* Setup our pointer to the serial port now. */
++      current_port = &kgdb8250_ports[CONFIG_KGDB_PORT_NUM];
++#else
++      if (kgdb8250_opt(CONFIG_KGDB_8250_CONF_STRING))
++              return -EINVAL;
++#endif
++
++
++      /* Internal driver setup. */
++      switch (current_port->iotype) {
++      case UPIO_MEM:
++              if (current_port->mapbase)
++                      kgdb8250_needs_request_mem_region = 1;
++              if (current_port->flags & UPF_IOREMAP) {
++                      current_port->membase = ioremap(current_port->mapbase,
++                                              8 << current_port->regshift);
++                      if (!current_port->membase)
++                              return -EIO;    /* ioremap() failed. */
++              }
++              kgdb8250_addr = current_port->membase;
++              break;
++      case UPIO_PORT:
++      default:
++              kgdb8250_addr = ioport_map(current_port->iobase,
++                                         8 << current_port->regshift);
++              if (!kgdb8250_addr)
++                      return -EIO;    /* ioport_map() failed. */
++      }
++
++      if (kgdb8250_uart_init() == -1) {
++              printk(KERN_ERR "kgdb8250: init failed\n");
++              return -EIO;
++      }
++#ifdef CONFIG_KGDB_8250_MODULE
++      /* Attach the kgdb irq. When this is built into the kernel, it
++       * is called as a part of late_init sequence.
++       */
++      kgdb8250_late_init();
++      if (kgdb_register_io_module(&local_kgdb_io_ops))
++              return -EINVAL;
++
++      printk(KERN_INFO "kgdb8250: debugging enabled\n");
++#endif                                /* CONFIG_KGDB_8250_MODULE */
++
++      return 0;
++}
++
++#ifdef CONFIG_KGDB_8250_MODULE
++/* If it is a module the kgdb_io_ops should be a static which
++ * is passed to the KGDB I/O initialization
++ */
++static struct kgdb_io local_kgdb_io_ops = {
++#else                         /* ! CONFIG_KGDB_8250_MODULE */
++struct kgdb_io kgdb_io_ops = {
++#endif                                /* ! CONFIG_KGDB_8250_MODULE */
++      .read_char = kgdb_get_debug_char,
++      .write_char = kgdb_put_debug_char,
++      .init = kgdb_init_io,
++      .late_init = kgdb8250_late_init,
++};
++
++/**
++ *    kgdb8250_add_port - Define a serial port for use with KGDB
++ *    @i: The index of the port being added
++ *    @serial_req: The &struct uart_port describing the port
++ *
++ *    On platforms where we must register the serial device
++ *    dynamically, this is the best option if a platform also normally
++ *    calls early_serial_setup().
++ */
++void __init kgdb8250_add_port(int i, struct uart_port *serial_req)
++{
++      /* Make sure we've got the built-in data before we override. */
++      kgdb8250_copy_rs_table();
++
++      /* Copy the whole thing over. */
++      if (current_port != &kgdb8250_ports[i])
++                memcpy(&kgdb8250_ports[i], serial_req, sizeof(struct uart_port));
++}
++
++/**
++ *    kgdb8250_add_platform_port - Define a serial port for use with KGDB
++ *    @i: The index of the port being added
++ *    @p: The &struct plat_serial8250_port describing the port
++ *
++ *    On platforms where we must register the serial device
++ *    dynamically, this is the best option if a platform normally
++ *    handles uart setup with an array of &struct plat_serial8250_port.
++ */
++void __init kgdb8250_add_platform_port(int i, struct plat_serial8250_port *p)
++{
++      /* Make sure we've got the built-in data before we override. */
++      kgdb8250_copy_rs_table();
++
++      kgdb8250_ports[i].iobase = p->iobase;
++      kgdb8250_ports[i].membase = p->membase;
++      kgdb8250_ports[i].irq = p->irq;
++      kgdb8250_ports[i].uartclk = p->uartclk;
++      kgdb8250_ports[i].regshift = p->regshift;
++      kgdb8250_ports[i].iotype = p->iotype;
++      kgdb8250_ports[i].flags = p->flags;
++      kgdb8250_ports[i].mapbase = p->mapbase;
++}
++
++/*
++ * Syntax for this cmdline option is:
++ * kgdb8250=<io, mmap or mmio>,<address>,<baud rate>,<irq>
++ */
++static int __init kgdb8250_opt(char *str)
++{
++      /* We'll fill out and use the first slot. */
++      current_port = &kgdb8250_ports[0];
++
++      if (!strncmp(str, "io", 2)) {
++              current_port->iotype = UPIO_PORT;
++              str += 2;
++      } else if (!strncmp(str, "mmap", 4)) {
++              current_port->iotype = UPIO_MEM;
++              current_port->flags |= UPF_IOREMAP;
++              str += 4;
++      } else if (!strncmp(str, "mmio", 4)) {
++              current_port->iotype = UPIO_MEM;
++              current_port->flags &= ~UPF_IOREMAP;
++              str += 4;
++      } else
++              goto errout;
++
++      if (*str != ',')
++              goto errout;
++      str++;
++
++      if (current_port->iotype == UPIO_PORT)
++              current_port->iobase = simple_strtoul(str, &str, 16);
++      else {
++              if (current_port->flags & UPF_IOREMAP)
++                      current_port->mapbase =
++                              (unsigned long) simple_strtoul(str, &str, 16);
++              else
++                      current_port->membase =
++                              (void *) simple_strtoul(str, &str, 16);
++      }
++
++      if (*str != ',')
++              goto errout;
++      str++;
++
++      kgdb8250_baud = simple_strtoul(str, &str, 10);
++      if (!kgdb8250_baud)
++              goto errout;
++
++      if (*str != ',')
++              goto errout;
++      str++;
++
++      current_port->irq = simple_strtoul(str, &str, 10);
++
++#ifdef CONFIG_KGDB_SIMPLE_SERIAL
++      should_copy_rs_table = 0;
++#endif
++
++      return 0;
++
++      errout:
++      printk(KERN_ERR "Invalid syntax for option kgdb8250=\n");
++      return 1;
++}
++
++#ifdef CONFIG_KGDB_8250_MODULE
++static void cleanup_kgdb8250(void)
++{
++      kgdb_unregister_io_module(&local_kgdb_io_ops);
++
++      /* Clean up the irq and memory */
++      free_irq(current_port->irq, current_port);
++
++      if (kgdb8250_needs_request_mem_region)
++              release_mem_region(current_port->mapbase,
++                                 8 << current_port->regshift);
++      /* Hook up the serial port back to what it was previously
++       * hooked up to.
++       */
++#if defined(CONFIG_SERIAL_8250) || defined(CONFIG_SERIAL_8250_MODULE)
++      /* Give the port back to the 8250 driver. */
++      serial8250_register_port(current_port);
++#endif
++}
++
++module_init(kgdb_init_io);
++module_exit(cleanup_kgdb8250);
++#else                         /* ! CONFIG_KGDB_8250_MODULE */
++early_param("kgdb8250", kgdb8250_opt);
++#endif                                /* ! CONFIG_KGDB_8250_MODULE */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/Kconfig linux-2.6.18-53.1.14.kgdb/drivers/serial/Kconfig
+--- linux-2.6.18-53.1.14/drivers/serial/Kconfig        2008-03-06 05:54:47.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/Kconfig   2008-06-10 15:37:43.000000000 +0400
+@@ -107,7 +107,7 @@ config SERIAL_8250_CS
+ 
+ config SERIAL_8250_NR_UARTS
+       int "Maximum number of 8250/16550 serial ports"
+-      depends on SERIAL_8250
++      depends on SERIAL_8250 || KGDB_8250
+       default "4"
+       help
+         Set this to the number of serial ports you want the driver
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/Makefile linux-2.6.18-53.1.14.kgdb/drivers/serial/Makefile
+--- linux-2.6.18-53.1.14/drivers/serial/Makefile       2008-03-06 05:54:47.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/Makefile  2008-06-10 15:38:14.000000000 +0400
+@@ -47,6 +47,7 @@ obj-$(CONFIG_SERIAL_IMX) += imx.o
+ obj-$(CONFIG_SERIAL_MPC52xx) += mpc52xx_uart.o
+ obj-$(CONFIG_SERIAL_ICOM) += icom.o
+ obj-$(CONFIG_SERIAL_M32R_SIO) += m32r_sio.o
++obj-$(CONFIG_KGDB_MPSC) += mpsc_kgdb.o
+ obj-$(CONFIG_SERIAL_MPSC) += mpsc.o
+ obj-$(CONFIG_ETRAX_SERIAL) += crisv10.o
+ obj-$(CONFIG_SERIAL_JSM) += jsm/
+@@ -57,3 +58,4 @@ obj-$(CONFIG_SERIAL_SGI_IOC3) += ioc3_se
+ obj-$(CONFIG_SERIAL_AT91) += at91_serial.o
+ obj-$(CONFIG_SERIAL_NETX) += netx-serial.o
+ obj-$(CONFIG_SERIAL_OF_PLATFORM) += of_serial.o
++obj-$(CONFIG_KGDB_8250) += 8250_kgdb.o
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/amba-pl011.c linux-2.6.18-53.1.14.kgdb/drivers/serial/amba-pl011.c
+--- linux-2.6.18-53.1.14/drivers/serial/amba-pl011.c   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/amba-pl011.c      2008-06-10 15:38:56.000000000 +0400
+@@ -340,7 +340,7 @@ static int pl011_startup(struct uart_por
+       /*
+        * Allocate the IRQ
+        */
+-      retval = request_irq(uap->port.irq, pl011_int, 0, "uart-pl011", uap);
++      retval = request_irq(uap->port.irq, pl011_int, SA_SHIRQ, "uart-pl011", uap);
+       if (retval)
+               goto clk_dis;
+ 
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/cpm_uart/Makefile linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/Makefile
+--- linux-2.6.18-53.1.14/drivers/serial/cpm_uart/Makefile      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/Makefile 2008-06-10 15:38:14.000000000 +0400
+@@ -7,5 +7,6 @@ obj-$(CONFIG_SERIAL_CPM) += cpm_uart.o
+ # Select the correct platform objects.
+ cpm_uart-objs-$(CONFIG_CPM2)  += cpm_uart_cpm2.o
+ cpm_uart-objs-$(CONFIG_8xx)   += cpm_uart_cpm1.o
++cpm_uart-objs-$(CONFIG_KGDB_CPM_UART) += cpm_uart_kgdb.o
+ 
+ cpm_uart-objs := cpm_uart_core.o $(cpm_uart-objs-y)
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart.h linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart.h
+--- linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart.h    2008-03-06 05:54:12.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart.h       2008-06-10 15:38:14.000000000 +0400
+@@ -50,6 +50,39 @@
+ 
+ #define SCC_WAIT_CLOSING 100
+ 
++#ifdef CONFIG_KGDB_CPM_UART
++
++/* Speed of the debug UART. */
++#if defined(CONFIG_KGDB_9600BAUD)
++#define KGDB_BAUD B9600
++#elif defined(CONFIG_KGDB_19200BAUD)
++#define KGDB_BAUD B19200
++#elif defined(CONFIG_KGDB_38400BAUD)
++#define KGDB_BAUD B38400
++#elif defined(CONFIG_KGDB_57600BAUD)
++#define KGDB_BAUD B57600
++#else
++#define KGDB_BAUD B115200     /* Start with this if not given */
++#endif
++
++#if defined(CONFIG_KGDB_CPM_UART_SCC1)        /* pick the port kgdb owns */
++#define KGDB_PINFO_INDEX      UART_SCC1
++#elif defined(CONFIG_KGDB_CPM_UART_SCC2)
++#define KGDB_PINFO_INDEX      UART_SCC2
++#elif defined(CONFIG_KGDB_CPM_UART_SCC3)
++#define KGDB_PINFO_INDEX      UART_SCC3
++#elif defined(CONFIG_KGDB_CPM_UART_SCC4)
++#define KGDB_PINFO_INDEX      UART_SCC4
++#elif defined(CONFIG_KGDB_CPM_UART_SMC1)
++#define KGDB_PINFO_INDEX      UART_SMC1
++#elif defined(CONFIG_KGDB_CPM_UART_SMC2)
++#define KGDB_PINFO_INDEX      UART_SMC2
++#else
++#error The S(M)CC for kgdb console is undefined
++#endif
++
++#endif /* CONFIG_KGDB_CPM_UART */
++
+ struct uart_cpm_port {
+       struct uart_port        port;
+       u16                     rx_nrfifos;
+@@ -86,6 +119,9 @@ extern int cpm_uart_port_map[UART_NR];
+ extern int cpm_uart_nr;
+ extern struct uart_cpm_port cpm_uart_ports[UART_NR];
+ 
++void cpm_uart_early_write(int index, const char *s, u_int count);
++int cpm_uart_early_setup(int index,int early);
++
+ /* these are located in their respective files */
+ void cpm_line_cr_cmd(int line, int cmd);
+ int cpm_uart_init_portdesc(void);
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart_core.c linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart_core.c
+--- linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart_core.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart_core.c  2008-06-10 15:38:14.000000000 +0400
+@@ -1070,22 +1070,17 @@ int cpm_uart_drv_get_platform_data(struc
+       return 0;
+ }
+ 
+-#ifdef CONFIG_SERIAL_CPM_CONSOLE
+-/*
+- *    Print a string to the serial port trying not to disturb
+- *    any possible real use of the port...
+- *
+- *    Note that this is called with interrupts already disabled
+- */
+-static void cpm_uart_console_write(struct console *co, const char *s,
++void cpm_uart_early_write(int index, const char *s,
+                                  u_int count)
+ {
+-      struct uart_cpm_port *pinfo =
+-          &cpm_uart_ports[cpm_uart_port_map[co->index]];
++      struct uart_cpm_port *pinfo;
+       unsigned int i;
+       volatile cbd_t *bdp, *bdbase;
+       volatile unsigned char *cp;
+ 
++      BUG_ON(index < 0 || index >= UART_NR);  /* cpm_uart_ports[UART_NR] */
++      pinfo = &cpm_uart_ports[index];
++
+       /* Get the address of the host memory buffer.
+        */
+       bdp = pinfo->tx_cur;
+@@ -1149,16 +1144,11 @@ static void cpm_uart_console_write(struc
+       pinfo->tx_cur = (volatile cbd_t *) bdp;
+ }
+ 
+-
+-static int __init cpm_uart_console_setup(struct console *co, char *options)
++int cpm_uart_early_setup(int index, int early)
+ {
++      int ret;
+       struct uart_port *port;
+       struct uart_cpm_port *pinfo;
+-      int baud = 38400;
+-      int bits = 8;
+-      int parity = 'n';
+-      int flow = 'n';
+-      int ret;
+ 
+       struct fs_uart_platform_info *pdata;
+       struct platform_device* pdev = early_uart_get_pdev(co->index);
+@@ -1169,8 +1159,9 @@ static int __init cpm_uart_console_setup
+               cpm_uart_init_portdesc();
+       }
+ 
++      BUG_ON(index < 0 || index >= UART_NR);  /* cpm_uart_ports[UART_NR] */
+       port =
+-          (struct uart_port *)&cpm_uart_ports[cpm_uart_port_map[co->index]];
++              (struct uart_port *)&cpm_uart_ports[index];
+       pinfo = (struct uart_cpm_port *)port;
+       if (!pdev) {
+               if (pinfo->set_lineif)
+@@ -1184,19 +1175,6 @@ static int __init cpm_uart_console_setup
+               cpm_uart_drv_get_platform_data(pdev, 1);
+       }
+ 
+-      pinfo->flags |= FLAG_CONSOLE;
+-
+-      if (options) {
+-              uart_parse_options(options, &baud, &parity, &bits, &flow);
+-      } else {
+-              bd_t *bd = (bd_t *) __res;
+-
+-              if (bd->bi_baudrate)
+-                      baud = bd->bi_baudrate;
+-              else
+-                      baud = 9600;
+-      }
+-
+       if (IS_SMC(pinfo)) {
+               pinfo->smcp->smc_smcm &= ~(SMCM_RX | SMCM_TX);
+               pinfo->smcp->smc_smcmr &= ~(SMCMR_REN | SMCMR_TEN);
+@@ -1204,8 +1182,7 @@ static int __init cpm_uart_console_setup
+               pinfo->sccp->scc_sccm &= ~(UART_SCCM_TX | UART_SCCM_RX);
+               pinfo->sccp->scc_gsmrl &= ~(SCC_GSMRL_ENR | SCC_GSMRL_ENT);
+       }
+-
+-      ret = cpm_uart_allocbuf(pinfo, 1);
++      ret = cpm_uart_allocbuf(pinfo, early);
+ 
+       if (ret)
+               return ret;
+@@ -1217,6 +1194,56 @@ static int __init cpm_uart_console_setup
+       else
+               cpm_uart_init_scc(pinfo);
+ 
++      return 0;
++}
++
++#ifdef CONFIG_SERIAL_CPM_CONSOLE
++/*
++ *    Print a string to the serial port trying not to disturb
++ *    any possible real use of the port...
++ *
++ *    Note that this is called with interrupts already disabled
++ */
++
++static void cpm_uart_console_write(struct console *co, const char *s,
++                                 u_int count)
++{
++      cpm_uart_early_write(cpm_uart_port_map[co->index],s,count);
++}
++
++/*
++ * Set up the console.  Be careful: this is called early!
++ */
++static int __init cpm_uart_console_setup(struct console *co, char *options)
++{
++      struct uart_port *port;
++      struct uart_cpm_port *pinfo;
++      int baud = 115200;
++      int bits = 8;
++      int parity = 'n';
++      int flow = 'n';
++      int ret;
++
++      port =
++          (struct uart_port *)&cpm_uart_ports[cpm_uart_port_map[co->index]];
++      pinfo = (struct uart_cpm_port *)port;
++
++      pinfo->flags |= FLAG_CONSOLE;
++
++      if (options) {
++              uart_parse_options(options, &baud, &parity, &bits, &flow);
++      } else {
++              bd_t *bd = (bd_t *) __res;
++
++              if (bd->bi_baudrate)
++                      baud = bd->bi_baudrate;
++              else
++                      baud = 9600;
++      }
++
++      ret = cpm_uart_early_setup(cpm_uart_port_map[co->index], 1);
++      if(ret)
++              return ret;
+       uart_set_options(port, co, baud, parity, bits, flow);
+ 
+       return 0;
+@@ -1364,6 +1391,12 @@ static int cpm_uart_init(void) {
+ 
+               for (i = 0; i < cpm_uart_nr; i++) {
+                       int con = cpm_uart_port_map[i];
++
++#ifdef CONFIG_KGDB_CPM_UART
++              /* Skip the port that is already in use by kgdb */
++              if(con == KGDB_PINFO_INDEX)
++                      continue;
++#endif
+                       cpm_uart_ports[con].port.line = i;
+                       cpm_uart_ports[con].port.flags = UPF_BOOT_AUTOCONF;
+                       uart_add_one_port(&cpm_reg, &cpm_uart_ports[con].port);
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart_cpm1.c linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart_cpm1.c
+--- linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart_cpm1.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart_cpm1.c  2008-06-10 15:38:14.000000000 +0400
+@@ -52,6 +52,7 @@ void cpm_line_cr_cmd(int line, int cmd)
+ {
+       ushort val;
+       volatile cpm8xx_t *cp = cpmp;
++      unsigned *bcsr_io;
+ 
+       switch (line) {
+       case UART_SMC1:
+@@ -94,12 +95,35 @@ void scc1_lineif(struct uart_cpm_port *p
+ {
+       /* XXX SCC1: insert port configuration here */
++      unsigned *bcsr_io;      /* temporary mapping of BCSR1 */
+       pinfo->brg = 1;
++
++#if defined (CONFIG_MPC885ADS) || defined (CONFIG_MPC86XADS)
++      bcsr_io = ioremap(BCSR1, sizeof(unsigned long));
++      if (bcsr_io == NULL) {
++              printk(KERN_CRIT "Could not remap BCSR\n");
++              return;
++      }
++      out_be32(bcsr_io, in_be32(bcsr_io) & ~BCSR1_RS232EN_1);
++      iounmap(bcsr_io);
++#endif
+ }
+ 
+ void scc2_lineif(struct uart_cpm_port *pinfo)
+ {
+       /* XXX SCC2: insert port configuration here */
++      unsigned *bcsr_io;      /* temporary mapping of BCSR1 */
+       pinfo->brg = 2;
++
++#if defined (CONFIG_MPC885ADS) || defined (CONFIG_MPC86XADS)
++      bcsr_io = ioremap(BCSR1, sizeof(unsigned long));
++
++      if (bcsr_io == NULL) {
++              printk(KERN_CRIT "Could not remap BCSR\n");
++              return;
++      }
++      out_be32(bcsr_io, in_be32(bcsr_io) & ~BCSR1_RS232EN_2);
++      iounmap(bcsr_io);
++#endif
+ }
+ 
+ void scc3_lineif(struct uart_cpm_port *pinfo)
+@@ -188,6 +212,10 @@ int cpm_uart_init_portdesc(void)
+ {
+       pr_debug("CPM uart[-]:init portdesc\n");
+ 
++      /* Bail out if this has already been called; that may happen when an
++       * early kgdb breakpoint is on. */
++      if(cpm_uart_nr)
++              return 0;
+       cpm_uart_nr = 0;
+ #ifdef CONFIG_SERIAL_CPM_SMC1
+       cpm_uart_ports[UART_SMC1].smcp = &cpmp->cp_smc[0];
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart_cpm2.c linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart_cpm2.c
+--- linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart_cpm2.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart_cpm2.c  2008-06-10 15:38:14.000000000 +0400
+@@ -256,6 +256,10 @@ int cpm_uart_init_portdesc(void)
+ {
+       pr_debug("CPM uart[-]:init portdesc\n");
+ 
++      /* Bail out if this has already been called; that may happen when an
++       * early kgdb breakpoint is on. */
++      if(cpm_uart_nr)
++              return 0;
+       cpm_uart_nr = 0;
+ #ifdef CONFIG_SERIAL_CPM_SMC1
+       cpm_uart_ports[UART_SMC1].smcp = (smc_t *) & cpm2_immr->im_smc[0];
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart_kgdb.c linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart_kgdb.c
+--- linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart_kgdb.c       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart_kgdb.c  2008-06-10 15:38:14.000000000 +0400
+@@ -0,0 +1,195 @@
++/*
++ * drivers/serial/cpm_uart/cpm_uart_kgdb.c
++ *
++ * CPM UART interface for kgdb.
++ *
++ * Author: Vitaly Bordug <vbordug@ru.mvista.com>
++ *
++ * Used some bits from drivers/serial/kgdb_8250.c as a template
++ *
++ * 2005 (c) MontaVista Software, Inc. This file is licensed under
++ * the terms of the GNU General Public License version 2. This program
++ * is licensed "as is" without any warranty of any kind, whether express
++ * or implied.
++ */
++
++#include <linux/kgdb.h>
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/interrupt.h>
++#include <linux/tty.h>
++#include <linux/serial.h>
++#include <linux/serial_core.h>
++#include <linux/serial_reg.h>
++
++#include <asm/io.h>
++#include <asm/serial.h>               /* For BASE_BAUD and SERIAL_PORT_DFNS */
++
++#include "cpm_uart.h"
++
++#define GDB_BUF_SIZE  512     /* power of 2, please */
++
++
++static char kgdb_buf[GDB_BUF_SIZE], *kgdbp;
++static int kgdb_chars;
++
++/* Forward declarations. */
++
++/*
++ * Receive character from the serial port.  This only works well
++ * before the port is initialized for real use.
++ */
++static int kgdb_wait_key(char *obuf)
++{
++      struct uart_cpm_port *pinfo;
++
++      u_char                          c, *cp;
++      volatile        cbd_t           *bdp;
++      int                             i;
++
++      pinfo = &cpm_uart_ports[KGDB_PINFO_INDEX];
++
++      /* Get the address of the host memory buffer.
++       */
++      bdp = pinfo->rx_cur;
++      while (bdp->cbd_sc & BD_SC_EMPTY);
++
++      /* If the buffer address is in the CPM DPRAM, don't
++       * convert it.
++       */
++      cp = cpm2cpu_addr(bdp->cbd_bufaddr);
++
++      if (obuf) {
++              i = c = bdp->cbd_datlen;
++              while (i-- > 0)
++              {
++                      *obuf++ = *cp++;
++              }
++      } else {
++              c = *cp;
++      }
++      bdp->cbd_sc |= BD_SC_EMPTY;
++
++      if (bdp->cbd_sc & BD_SC_WRAP) {
++              bdp = pinfo->rx_bd_base;
++      } else {
++              bdp++;
++      }
++      pinfo->rx_cur = (cbd_t *)bdp;
++
++      return((int)c);
++}
++
++
++/*
++ * Wait until the interface can accept a char, then write it.
++ */
++static void
++kgdb_put_debug_char(int chr)
++{
++      static char ch[2];
++      ch[0]=(char)chr;
++      cpm_uart_early_write(KGDB_PINFO_INDEX, ch, 1);
++}
++
++
++/*
++ * Return the next received char, busy-waiting in kgdb_wait_key() if needed.
++ * Drain the local buffer first, then refill it from the interface hardware.
++ */
++static int
++kgdb_get_debug_char(void)
++{
++      if (kgdb_chars<=0) {
++              kgdb_chars = kgdb_wait_key(kgdb_buf);
++              kgdbp = kgdb_buf;
++      }
++      kgdb_chars--;
++
++      return (*kgdbp++);
++}
++
++static void termios_set_options(int index,
++               int baud, int parity, int bits, int flow)
++{
++      struct termios termios;
++      struct uart_port *port;
++      struct uart_cpm_port *pinfo;
++
++      BUG_ON(index < 0 || index >= UART_NR);  /* cpm_uart_ports[UART_NR] */
++
++      port =
++          (struct uart_port *)&cpm_uart_ports[index];
++      pinfo = (struct uart_cpm_port *)port;
++
++      /*
++       * Ensure that the serial console lock is initialised
++       * early.
++       */
++      spin_lock_init(&port->lock);
++
++      memset(&termios, 0, sizeof(struct termios));
++
++      termios.c_cflag = CREAD | HUPCL | CLOCAL;
++
++      termios.c_cflag |= baud;
++
++      if (bits == 7)
++              termios.c_cflag |= CS7;
++      else
++              termios.c_cflag |= CS8;
++
++      switch (parity) {
++      case 'o': case 'O':
++              termios.c_cflag |= PARODD;
++              /*fall through*/
++      case 'e': case 'E':
++              termios.c_cflag |= PARENB;
++              break;
++      }
++
++      if (flow == 'r')
++              termios.c_cflag |= CRTSCTS;
++
++      port->ops->set_termios(port, &termios, NULL);
++}
++
++/*
++ *  Returns:
++ *    0 on success, 1 on failure.
++ */
++static int kgdb_init(void)
++{
++      struct uart_port *port;
++      struct uart_cpm_port *pinfo;
++
++      int use_bootmem = 0; /* use dma by default */
++
++      if(!cpm_uart_nr)
++      {
++              use_bootmem = 1;
++              cpm_uart_init_portdesc();
++      }
++      port = (struct uart_port *)&cpm_uart_ports[KGDB_PINFO_INDEX];
++      pinfo = (struct uart_cpm_port *)port;
++
++      if (cpm_uart_early_setup(KGDB_PINFO_INDEX, use_bootmem))
++              return 1;
++
++      termios_set_options(KGDB_PINFO_INDEX, KGDB_BAUD,'n',8,'n');
++        if (IS_SMC(pinfo))
++                pinfo->smcp->smc_smcm |= SMCM_TX;
++        else
++                pinfo->sccp->scc_sccm |= UART_SCCM_TX;
++
++      return 0;
++}
++
++
++struct kgdb_io kgdb_io_ops = {
++      .read_char = kgdb_get_debug_char,
++      .write_char = kgdb_put_debug_char,
++      .init = kgdb_init,
++};
++
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/mpsc.c linux-2.6.18-53.1.14.kgdb/drivers/serial/mpsc.c
+--- linux-2.6.18-53.1.14/drivers/serial/mpsc.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/mpsc.c    2008-06-10 15:38:14.000000000 +0400
+@@ -242,6 +242,11 @@ struct mpsc_port_info *mpsc_device_remov
+ #define       MPSC_RCRR                       0x0004
+ #define       MPSC_TCRR                       0x0008
+ 
++/* MPSC Interrupt registers (offset from MV64x60_SDMA_INTR_OFFSET) */
++#define MPSC_INTR_CAUSE                        0x0004
++#define MPSC_INTR_MASK                 0x0084
++#define MPSC_INTR_CAUSE_RCC            (1<<6)
++
+ /* Serial DMA Controller Interface Registers */
+ #define       SDMA_SDC                        0x0000
+ #define       SDMA_SDCM                       0x0008
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/mpsc_kgdb.c linux-2.6.18-53.1.14.kgdb/drivers/serial/mpsc_kgdb.c
+--- linux-2.6.18-53.1.14/drivers/serial/mpsc_kgdb.c    1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/mpsc_kgdb.c       2008-06-10 15:38:14.000000000 +0400
+@@ -0,0 +1,299 @@
++/*
++ * drivers/serial/mpsc_kgdb.c
++ *
++ * KGDB driver for the Marvell MultiProtocol Serial Controller (MPSC)
++ *
++ * Based on the polled boot loader driver by Ajit Prem (ajit.prem@motorola.com)
++ *
++ * Author: Randy Vinson <rvinson@mvista.com>
++ *
++ * 2005 (c) MontaVista Software, Inc.
++ * This program is free software; you can redistribute  it and/or modify it
++ * under  the terms of  the GNU General  Public License as published by the
++ * Free Software Foundation;  either version 2 of the  License, or (at your
++ * option) any later version.
++ */
++
++#include <linux/config.h>
++#include <linux/kgdb.h>
++#include <linux/mv643xx.h>
++#include <linux/device.h>
++#include <asm/mv64x60.h>
++#include <asm/serial.h>
++#include <asm/io.h>
++#include <asm/delay.h>
++
++#include "mpsc.h"
++
++/* Speed of the UART. */
++static int kgdbmpsc_baud = CONFIG_KGDB_BAUDRATE;
++
++/* Index of the UART, matches ttyMX naming. */
++static int kgdbmpsc_ttyMM = CONFIG_KGDB_PORT_NUM;
++
++#define MPSC_INTR_REG_SELECT(x)       ((x) + (8 * kgdbmpsc_ttyMM))
++
++static int kgdbmpsc_init(void);
++
++static struct platform_device mpsc_dev, shared_dev;
++
++static void __iomem *mpsc_base;
++static void __iomem *brg_base;
++static void __iomem *routing_base;
++static void __iomem *sdma_base;
++
++static unsigned int mpsc_irq;
++
++static void kgdb_write_debug_char(int c)
++{
++      u32 data;
++
++      data = readl(mpsc_base + MPSC_MPCR);
++      writeb(c, mpsc_base + MPSC_CHR_1);
++      mb();
++      data = readl(mpsc_base + MPSC_CHR_2);
++      data |= MPSC_CHR_2_TTCS;
++      writel(data, mpsc_base + MPSC_CHR_2);
++      mb();
++
++      while (readl(mpsc_base + MPSC_CHR_2) & MPSC_CHR_2_TTCS) ;
++}
++
++static int kgdb_get_debug_char(void)
++{
++      unsigned char c;
++
++      while (!(readl(sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_CAUSE)) &
++               MPSC_INTR_CAUSE_RCC)) ;
++
++      c = readb(mpsc_base + MPSC_CHR_10 + (1 << 1));
++      mb();
++      writeb(c, mpsc_base + MPSC_CHR_10 + (1 << 1));
++      mb();
++      writel(~MPSC_INTR_CAUSE_RCC, sdma_base +
++             MPSC_INTR_REG_SELECT(MPSC_INTR_CAUSE));
++      return (c);
++}
++
++/*
++ * This is the receiver interrupt routine for the GDB stub.
++ * All that we need to do is verify that the interrupt happened on the
++ * line we're in charge of.  If this is true, schedule a breakpoint and
++ * return.
++ */
++static irqreturn_t
++kgdbmpsc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
++{
++      if (irq != mpsc_irq)
++              return IRQ_NONE;
++      /*
++       * If some other CPU is already in KGDB then this is a
++       * spurious interrupt, so return without even checking for a byte.
++       */
++      if (atomic_read(&debugger_active))
++              return IRQ_NONE;
++
++      if (readl(sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_CAUSE)) &
++          MPSC_INTR_CAUSE_RCC)
++              breakpoint();
++
++      return IRQ_HANDLED;
++}
++
++static int __init kgdbmpsc_init(void)
++{
++      struct mpsc_pdata *pdata;
++      u32 cdv;
++
++      if (!brg_base || !mpsc_base || !routing_base || !sdma_base)
++              return -1;
++
++      /* Set MPSC Routing to enable both ports */
++      writel(0x0, routing_base + MPSC_MRR);
++
++      /* MPSC 0/1 Rx & Tx get clocks BRG0/1 */
++      writel(0x00000100, routing_base + MPSC_RCRR);
++      writel(0x00000100, routing_base + MPSC_TCRR);
++
++      /* Disable all MPSC interrupts and clear any pending interrupts */
++      writel(0, sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_MASK));
++      writel(0, sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_CAUSE));
++
++      pdata = (struct mpsc_pdata *)mpsc_dev.dev.platform_data;
++
++      /* cdv = (clock/(2*16*baud rate)) for 16X mode. */
++      cdv = ((pdata->brg_clk_freq / (32 * kgdbmpsc_baud)) - 1);
++      writel((pdata->brg_clk_src << 18) | (1 << 16) | cdv,
++             brg_base + BRG_BCR);
++
++      /* Put MPSC into UART mode, no null modem, 16x clock mode */
++      writel(0x000004c4, mpsc_base + MPSC_MMCRL);
++      writel(0x04400400, mpsc_base + MPSC_MMCRH);
++
++      writel(0, mpsc_base + MPSC_CHR_1);
++      writel(0, mpsc_base + MPSC_CHR_9);
++      writel(0, mpsc_base + MPSC_CHR_10);
++      writel(4, mpsc_base + MPSC_CHR_3);
++      writel(0x20000000, mpsc_base + MPSC_CHR_4);
++      writel(0x9000, mpsc_base + MPSC_CHR_5);
++      writel(0, mpsc_base + MPSC_CHR_6);
++      writel(0, mpsc_base + MPSC_CHR_7);
++      writel(0, mpsc_base + MPSC_CHR_8);
++
++      /* 8 data bits, 1 stop bit */
++      writel((3 << 12), mpsc_base + MPSC_MPCR);
++
++      /* Enter "hunt" mode */
++      writel((1 << 31), mpsc_base + MPSC_CHR_2);
++
++      udelay(100);
++      return 0;
++}
++
++static void __iomem *__init
++kgdbmpsc_map_resource(struct platform_device *pd, int type, int num)
++{
++      void __iomem *base = NULL;
++      struct resource *r;
++
++      if ((r = platform_get_resource(pd, IORESOURCE_MEM, num)))
++              base = ioremap(r->start, r->end - r->start + 1);
++      return base;
++}
++
++static void __iomem *__init
++kgdbmpsc_unmap_resource(struct platform_device *pd, int type, int num,
++                      void __iomem * base)
++{
++      if (base)
++              iounmap(base);
++      return NULL;
++}
++
++static void __init
++kgdbmpsc_reserve_resource(struct platform_device *pd, int type, int num)
++{
++      struct resource *r;
++
++      if ((r = platform_get_resource(pd, IORESOURCE_MEM, num)))
++              request_mem_region(r->start, r->end - r->start + 1, "kgdb");
++}
++
++static int __init kgdbmpsc_local_init(void)
++{
++      if (!mpsc_dev.num_resources || !shared_dev.num_resources)
++              return 1;       /* failure */
++
++      mpsc_base = kgdbmpsc_map_resource(&mpsc_dev, IORESOURCE_MEM,
++                                        MPSC_BASE_ORDER);
++      brg_base = kgdbmpsc_map_resource(&mpsc_dev, IORESOURCE_MEM,
++                                       MPSC_BRG_BASE_ORDER);
++
++      /* get the platform data for the shared registers and get them mapped */
++      routing_base = kgdbmpsc_map_resource(&shared_dev,
++                                           IORESOURCE_MEM,
++                                           MPSC_ROUTING_BASE_ORDER);
++      sdma_base =
++          kgdbmpsc_map_resource(&shared_dev, IORESOURCE_MEM,
++                                MPSC_SDMA_INTR_BASE_ORDER);
++
++      mpsc_irq = platform_get_irq(&mpsc_dev, 1);
++
++      if (mpsc_base && brg_base && routing_base && sdma_base)
++              return 0;       /* success */
++
++      return 1;               /* failure */
++}
++
++static void __init kgdbmpsc_local_exit(void)
++{
++      if (sdma_base)
++              sdma_base = kgdbmpsc_unmap_resource(&shared_dev, IORESOURCE_MEM,
++                                                  MPSC_SDMA_INTR_BASE_ORDER,
++                                                  sdma_base);
++      if (routing_base)
++              routing_base = kgdbmpsc_unmap_resource(&shared_dev,
++                                                     IORESOURCE_MEM,
++                                                     MPSC_ROUTING_BASE_ORDER,
++                                                     routing_base);
++      if (brg_base)
++              brg_base = kgdbmpsc_unmap_resource(&mpsc_dev, IORESOURCE_MEM,
++                                                 MPSC_BRG_BASE_ORDER,
++                                                 brg_base);
++      if (mpsc_base)
++              mpsc_base = kgdbmpsc_unmap_resource(&mpsc_dev, IORESOURCE_MEM,
++                                                  MPSC_BASE_ORDER, mpsc_base);
++}
++
++static void __init kgdbmpsc_update_pdata(struct platform_device *pdev)
++{
++
++      snprintf(pdev->dev.bus_id, BUS_ID_SIZE, "%s%u", pdev->name, pdev->id);
++}
++
++static int __init kgdbmpsc_pdev_init(void)
++{
++      struct platform_device *pdev;
++
++      /* get the platform data for the specified port. */
++      pdev = mv64x60_early_get_pdev_data(MPSC_CTLR_NAME, kgdbmpsc_ttyMM, 1);
++      if (pdev) {
++              memcpy(&mpsc_dev, pdev, sizeof(struct platform_device));
++              if (platform_notify) {
++                      kgdbmpsc_update_pdata(&mpsc_dev);
++                      platform_notify(&mpsc_dev.dev);
++              }
++
++              /* get the platform data for the shared registers. */
++              pdev = mv64x60_early_get_pdev_data(MPSC_SHARED_NAME, 0, 0);
++              if (pdev) {
++                      memcpy(&shared_dev, pdev,
++                             sizeof(struct platform_device));
++                      if (platform_notify) {
++                              kgdbmpsc_update_pdata(&shared_dev);
++                              platform_notify(&shared_dev.dev);
++                      }
++              }
++      }
++      return 0;
++}
++
++postcore_initcall(kgdbmpsc_pdev_init);
++
++static int __init kgdbmpsc_init_io(void)
++{
++
++      kgdbmpsc_pdev_init();
++
++      if (kgdbmpsc_local_init()) {
++              kgdbmpsc_local_exit();
++              return -1;
++      }
++
++      if (kgdbmpsc_init() == -1)
++              return -1;
++      return 0;
++}
++
++static void __init kgdbmpsc_hookup_irq(void)
++{
++      unsigned int msk;
++      if (!request_irq(mpsc_irq, kgdbmpsc_interrupt, 0, "kgdb mpsc", NULL)) {
++              /* Enable interrupt */
++              msk = readl(sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_MASK));
++              msk |= MPSC_INTR_CAUSE_RCC;
++              writel(msk, sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_MASK));
++
++              kgdbmpsc_reserve_resource(&mpsc_dev, IORESOURCE_MEM,
++                                        MPSC_BASE_ORDER);
++              kgdbmpsc_reserve_resource(&mpsc_dev, IORESOURCE_MEM,
++                                        MPSC_BRG_BASE_ORDER);
++      }
++}
++
++struct kgdb_io kgdb_io_ops = {
++      .read_char = kgdb_get_debug_char,
++      .write_char = kgdb_write_debug_char,
++      .init = kgdbmpsc_init_io,
++      .late_init = kgdbmpsc_hookup_irq,
++};
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/pxa.c linux-2.6.18-53.1.14.kgdb/drivers/serial/pxa.c
+--- linux-2.6.18-53.1.14/drivers/serial/pxa.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/pxa.c     2008-06-10 15:38:56.000000000 +0400
+@@ -42,6 +42,9 @@
+ #include <linux/tty.h>
+ #include <linux/tty_flip.h>
+ #include <linux/serial_core.h>
++#ifdef CONFIG_KGDB_CONSOLE
++#include <linux/kgdb.h>
++#endif
+ 
+ #include <asm/io.h>
+ #include <asm/hardware.h>
+@@ -692,6 +695,8 @@ serial_pxa_console_init(void)
+ console_initcall(serial_pxa_console_init);
+ 
+ #define PXA_CONSOLE   &serial_pxa_console
++#elif defined(CONFIG_KGDB_CONSOLE)
++#define PXA_CONSOLE   &kgdbcons
+ #else
+ #define PXA_CONSOLE   NULL
+ #endif
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/serial_core.c linux-2.6.18-53.1.14.kgdb/drivers/serial/serial_core.c
+--- linux-2.6.18-53.1.14/drivers/serial/serial_core.c  2008-03-06 05:54:07.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/serial_core.c     2008-06-10 15:37:43.000000000 +0400
+@@ -33,6 +33,7 @@
+ #include <linux/serial.h> /* for serial_state and serial_icounter_struct */
+ #include <linux/delay.h>
+ #include <linux/mutex.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/irq.h>
+ #include <asm/uaccess.h>
+@@ -65,6 +66,12 @@ static struct lock_class_key port_lock_k
+ #define uart_console(port)    (0)
+ #endif
+ 
++#ifdef CONFIG_KGDB_CONSOLE    /* uart_kgdb(): is the port's console named "kgdb"? */
++#define uart_kgdb(port)               ((port)->cons && !strcmp((port)->cons->name, "kgdb"))
++#else
++#define uart_kgdb(port)               (0)
++#endif /* CONFIG_KGDB_CONSOLE */
++
+ static void uart_change_speed(struct uart_state *state, struct termios *old_termios);
+ static void uart_wait_until_sent(struct tty_struct *tty, int timeout);
+ static void uart_change_pm(struct uart_state *state, int pm_state);
+@@ -1673,6 +1680,9 @@ static int uart_line_info(char *buf, str
+                       port->iotype == UPIO_MEM ? port->mapbase :
+                                               (unsigned long) port->iobase,
+                       port->irq);
++      if (port->iotype == UPIO_MEM)
++              ret += sprintf(buf+ret, " membase 0x%08lX",
++                                         (unsigned long) port->membase);
+ 
+       if (port->type == PORT_UNKNOWN) {
+               strcat(buf, "\n");
+@@ -2043,7 +2053,8 @@ uart_report_port(struct uart_driver *drv
+       case UPIO_AU:
+       case UPIO_TSI:
+               snprintf(address, sizeof(address),
+-                       "MMIO 0x%lx", port->mapbase);
++                      "MMIO map 0x%lx mem 0x%lx", port->mapbase,
++                      (unsigned long) port->membase);
+               break;
+       default:
+               strlcpy(address, "*unknown*", sizeof(address));
+@@ -2095,9 +2106,9 @@ uart_configure_port(struct uart_driver *
+ 
+               /*
+                * Power down all ports by default, except the
+-               * console if we have one.
++               * console (real or kgdb) if we have one.
+                */
+-              if (!uart_console(port))
++              if (!uart_console(port) && !uart_kgdb(port))
+                       uart_change_pm(state, 3);
+       }
+ }
+@@ -2289,6 +2300,12 @@ int uart_add_one_port(struct uart_driver
+        */
+       port->flags &= ~UPF_DEAD;
+ 
++#if defined(CONFIG_KGDB_8250)
++      /* Add any 8250-like ports we find later. */
++      if (port->type <= PORT_MAX_8250)
++              kgdb8250_add_port(port->line, port);
++#endif
++
+  out:
+       mutex_unlock(&state->mutex);
+       mutex_unlock(&port_mutex);
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/serial_txx9.c linux-2.6.18-53.1.14.kgdb/drivers/serial/serial_txx9.c
+--- linux-2.6.18-53.1.14/drivers/serial/serial_txx9.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/serial_txx9.c     2008-06-10 15:38:24.000000000 +0400
+@@ -1164,6 +1164,96 @@ static struct pci_driver serial_txx9_pci
+ MODULE_DEVICE_TABLE(pci, serial_txx9_pci_tbl);
+ #endif /* ENABLE_SERIAL_TXX9_PCI */
+ 
++/******************************************************************************/
++/* BEG: KDBG Routines                                                         */
++/******************************************************************************/
++
++#ifdef CONFIG_KGDB
++int kgdb_init_count = 0;
++
++void txx9_sio_kgdb_hook(unsigned int port, unsigned int baud_rate)
++{
++      static struct resource kgdb_resource;
++      int ret;
++      struct uart_txx9_port *up = &serial_txx9_ports[port];
++
++      /* prevent initialization by driver */
++      kgdb_resource.name = "serial_txx9(debug)";
++      kgdb_resource.start = (unsigned long)up->port.membase;
++      kgdb_resource.end = (unsigned long)(up->port.membase + 36 - 1);
++      kgdb_resource.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
++
++      ret = request_resource(&iomem_resource, &kgdb_resource);
++      if(ret == -EBUSY)
++              printk(" serial_txx9(debug): request_resource failed\n");
++
++      return;
++}
++void
++txx9_sio_kdbg_init( unsigned int port_number )
++{
++      if (port_number == 1) {
++              txx9_sio_kgdb_hook(port_number, 38400);
++      } else {
++              printk("Bad Port Number [%u] != [1]\n",port_number);
++      }
++      return;
++}
++
++u8
++txx9_sio_kdbg_rd( void )
++{
++      unsigned int status,ch;
++      struct uart_txx9_port *up = &serial_txx9_ports[1];
++
++      if (kgdb_init_count == 0) {
++              txx9_sio_kdbg_init(1);
++              kgdb_init_count = 1;
++      }
++
++      while (1) {
++              status = sio_in(up, TXX9_SIDISR);
++              if ( status & 0x1f ) {
++                      ch = sio_in(up, TXX9_SIRFIFO );
++                      break;
++              }
++      }
++
++      return (ch);
++}
++
++int
++txx9_sio_kdbg_wr( u8 ch )
++{
++      unsigned int status;
++      struct uart_txx9_port *up = &serial_txx9_ports[1];
++
++      if (kgdb_init_count == 0) {
++              txx9_sio_kdbg_init(1);
++              kgdb_init_count = 1;
++      }
++
++      while (1) {
++              status = sio_in(up, TXX9_SICISR);
++              if (status & TXX9_SICISR_TRDY) {
++                      if ( ch == '\n' ) {
++                              txx9_sio_kdbg_wr( '\r' );
++                      }
++                      sio_out(up, TXX9_SITFIFO, (u32)ch );
++
++                      break;
++              }
++      }
++
++      return (1);
++}
++#endif /* CONFIG_KGDB */
++
++
++/******************************************************************************/
++/* END: KDBG Routines                                                         */
++/******************************************************************************/
++
+ static int __init serial_txx9_init(void)
+ {
+       int ret;
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/sh-sci.c linux-2.6.18-53.1.14.kgdb/drivers/serial/sh-sci.c
+--- linux-2.6.18-53.1.14/drivers/serial/sh-sci.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/sh-sci.c  2008-06-10 15:38:50.000000000 +0400
+@@ -42,6 +42,7 @@
+ #include <linux/console.h>
+ #include <linux/bitops.h>
+ #include <linux/generic_serial.h>
++#include <linux/kgdb.h>
+ 
+ #ifdef CONFIG_CPU_FREQ
+ #include <linux/notifier.h>
+@@ -67,14 +68,16 @@
+ 
+ #include "sh-sci.h"
+ 
+-#ifdef CONFIG_SH_KGDB
+-#include <asm/kgdb.h>
+-
+-static int kgdb_get_char(struct sci_port *port);
+-static void kgdb_put_char(struct sci_port *port, char c);
+-static void kgdb_handle_error(struct sci_port *port);
+-static struct sci_port *kgdb_sci_port;
+-#endif /* CONFIG_SH_KGDB */
++#ifdef CONFIG_KGDB_SH_SCI
++/* Speed of the UART (initializer needs its terminating ';' to compile). */
++static int kgdbsci_baud = CONFIG_KGDB_BAUDRATE;
++
++/* Index of the UART, matches ttySCX naming. */
++static int kgdbsci_ttySC = CONFIG_KGDB_PORT_NUM;
++
++/* Make life easier on us. */
++#define KGDBPORT      sci_ports[kgdbsci_ttySC]
++#endif /* CONFIG_KGDB_SH_SCI */
+ 
+ #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
+ static struct sci_port *serial_console_port = 0;
+@@ -87,20 +90,17 @@ static void sci_start_rx(struct uart_por
+ static void sci_stop_rx(struct uart_port *port);
+ static int sci_request_irq(struct sci_port *port);
+ static void sci_free_irq(struct sci_port *port);
++static void sci_set_termios(struct uart_port *port, struct termios *termios,
++                      struct termios *old);
++static int kgdbsci_init(void);
+ 
+ static struct sci_port sci_ports[];
+ static struct uart_driver sci_uart_driver;
+ 
+ #define SCI_NPORTS sci_uart_driver.nr
+ 
+-#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_SH_KGDB)
+-
+-static void handle_error(struct uart_port *port)
+-{                             /* Clear error flags */
+-      sci_out(port, SCxSR, SCxSR_ERROR_CLEAR(port));
+-}
+-
+-static int get_char(struct uart_port *port)
++#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_KGDB_SH_SCI)
++static int get_char_for_gdb(struct uart_port *port)
+ {
+       unsigned long flags;
+       unsigned short status;
+@@ -110,7 +110,8 @@ static int get_char(struct uart_port *po
+         do {
+               status = sci_in(port, SCxSR);
+               if (status & SCxSR_ERRORS(port)) {
+-                      handle_error(port);
++                      /* Clear error flags. */
++                      sci_out(port, SCxSR, SCxSR_ERROR_CLEAR(port));
+                       continue;
+               }
+       } while (!(status & SCxSR_RDxF(port)));
+@@ -121,21 +122,7 @@ static int get_char(struct uart_port *po
+ 
+       return c;
+ }
+-
+-/* Taken from sh-stub.c of GDB 4.18 */
+-static const char hexchars[] = "0123456789abcdef";
+-
+-static __inline__ char highhex(int  x)
+-{
+-      return hexchars[(x >> 4) & 0xf];
+-}
+-
+-static __inline__ char lowhex(int  x)
+-{
+-      return hexchars[x & 0xf];
+-}
+-
+-#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_SH_KGDB */
++#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_KGDB_SH_SCI */
+ 
+ /*
+  * Send the packet in buffer.  The host gets one chance to read it.
+@@ -167,21 +154,14 @@ static void put_string(struct sci_port *
+       const unsigned char *p = buffer;
+       int i;
+ 
+-#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_SH_KGDB)
++#ifdef CONFIG_SH_STANDARD_BIOS
+       int checksum;
+-      int usegdb=0;
++       const char hexchars[] = "0123456789abcdef";
+ 
+-#ifdef CONFIG_SH_STANDARD_BIOS
+       /* This call only does a trap the first time it is
+        * called, and so is safe to do here unconditionally
+        */
+-      usegdb |= sh_bios_in_gdb_mode();
+-#endif
+-#ifdef CONFIG_SH_KGDB
+-      usegdb |= (kgdb_in_gdb_mode && (port == kgdb_sci_port));
+-#endif
+-
+-      if (usegdb) {
++      if (sh_bios_in_gdb_mode()) {
+           /*  $<packet info>#<checksum>. */
+           do {
+               unsigned char c;
+@@ -193,18 +173,18 @@ static void put_string(struct sci_port *
+                       int h, l;
+ 
+                       c = *p++;
+-                      h = highhex(c);
+-                      l = lowhex(c);
++                      h = hexchars[c >> 4];
++                      l = hexchars[c % 16];
+                       put_char(port, h);
+                       put_char(port, l);
+                       checksum += h + l;
+               }
+               put_char(port, '#');
+-              put_char(port, highhex(checksum));
+-              put_char(port, lowhex(checksum));
++              put_char(port, hexchars[checksum >> 4]);
++              put_char(port, hexchars[checksum % 16]);
+           } while  (get_char(port) != '+');
+       } else
+-#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_SH_KGDB */
++#endif /* CONFIG_SH_STANDARD_BIOS */
+       for (i=0; i<count; i++) {
+               if (*p == 10)
+                       put_char(port, '\r');
+@@ -214,90 +194,163 @@ static void put_string(struct sci_port *
+ #endif /* CONFIG_SERIAL_SH_SCI_CONSOLE */
+ 
+ 
+-#ifdef CONFIG_SH_KGDB
+-
+-/* Is the SCI ready, ie is there a char waiting? */
+-static int kgdb_is_char_ready(struct sci_port *port)
++#ifdef CONFIG_KGDB_SH_SCI
++static int kgdbsci_read_char(void)
+ {
+-        unsigned short status = sci_in(port, SCxSR);
+-
+-        if (status & (SCxSR_ERRORS(port) | SCxSR_BRK(port)))
+-                kgdb_handle_error(port);
+-
+-        return (status & SCxSR_RDxF(port));
++      return get_char_for_gdb(&KGDBPORT.port);
+ }
+ 
+-/* Write a char */
+-static void kgdb_put_char(struct sci_port *port, char c)
++/* Called from kgdbstub.c to put a character, just a wrapper */
++static void kgdbsci_write_char(int c)
+ {
+         unsigned short status;
+ 
+         do
+-                status = sci_in(port, SCxSR);
+-        while (!(status & SCxSR_TDxE(port)));
++              status = sci_in(&KGDBPORT.port, SCxSR);
++      while (!(status & SCxSR_TDxE(&KGDBPORT.port)));
+ 
+-        sci_out(port, SCxTDR, c);
+-        sci_in(port, SCxSR);    /* Dummy read */
+-        sci_out(port, SCxSR, SCxSR_TDxE_CLEAR(port));
++      sci_out(&KGDBPORT.port, SCxTDR, c);
++      sci_in(&KGDBPORT.port, SCxSR);  /* Dummy read */
++      sci_out(&KGDBPORT.port, SCxSR, SCxSR_TDxE_CLEAR(&KGDBPORT.port));
+ }
+ 
+-/* Get a char if there is one, else ret -1 */
+-static int kgdb_get_char(struct sci_port *port)
++#ifndef CONFIG_SERIAL_SH_SCI_CONSOLE
++/* If we don't have console, we never hookup IRQs.  But we need to
++ * hookup one so that we can interrupt the system.
++ */
++static irqreturn_t kgdbsci_rx_interrupt(int irq, void *ptr,
++              struct pt_regs *regs)
+ {
+-        int c;
++      struct uart_port *port = ptr;
+ 
+-        if (kgdb_is_char_ready(port) == 0)
+-                c = -1;
+-        else {
+-                c = sci_in(port, SCxRDR);
+-                sci_in(port, SCxSR);    /* Dummy read */
++      if (!(sci_in(port, SCxSR) & SCxSR_RDxF(port)))
++              return IRQ_NONE;
++
++      if (kgdb_io_ops.init != kgdbsci_init) {
++              /* Throw away the data if another I/O routine is active */
++              get_char_for_gdb(&KGDBPORT.port);
++      } else
++              /* We've got an interrupt, so go ahead and call breakpoint() */
++              breakpoint();
++
++      sci_in(port, SCxSR); /* dummy read */
+                 sci_out(port, SCxSR, SCxSR_RDxF_CLEAR(port));
+-        }
+ 
+-        return c;
++      return IRQ_HANDLED;
+ }
+ 
+-/* Called from kgdbstub.c to get a character, i.e. is blocking */
+-static int kgdb_sci_getchar(void)
++static irqreturn_t kgdbsci_mpxed_interrupt(int irq, void *ptr,
++              struct pt_regs *regs)
+ {
+-        volatile int c;
+-
+-        /* Keep trying to read a character, this could be neater */
+-        while ((c = kgdb_get_char(kgdb_sci_port)) < 0);
++        unsigned short ssr_status, scr_status;
++        struct uart_port *port = ptr;
+ 
+-        return c;
+-}
++        ssr_status = sci_in(port,SCxSR);
++        scr_status = sci_in(port,SCSCR);
+ 
+-/* Called from kgdbstub.c to put a character, just a wrapper */
+-static void kgdb_sci_putchar(int c)
+-{
++      /* Rx Interrupt */
++        if ((ssr_status&0x0002) && (scr_status&0x0040))
++              kgdbsci_rx_interrupt(irq, ptr, regs);
+ 
+-        kgdb_put_char(kgdb_sci_port, c);
++      return IRQ_HANDLED;
+ }
+ 
+-/* Clear any errors on the SCI */
+-static void kgdb_handle_error(struct sci_port *port)
++static void __init kgdbsci_lateinit(void)
+ {
+-        sci_out(port, SCxSR, SCxSR_ERROR_CLEAR(port));  /* Clear error flags */
++      if (KGDBPORT.irqs[0] == KGDBPORT.irqs[1]) {
++              if (!KGDBPORT.irqs[0]) {
++                      printk(KERN_ERR "kgdbsci: Cannot allocate irq.\n");
++                      return;
++              }
++              if (request_irq(KGDBPORT.irqs[0], kgdbsci_mpxed_interrupt,
++                                      SA_INTERRUPT, "kgdbsci",
++                                      &KGDBPORT.port)) {
++                      printk(KERN_ERR "kgdbsci: Cannot allocate irq.\n");
++                      return;
++              }
++      } else {
++              if (KGDBPORT.irqs[1])
++                      request_irq(KGDBPORT.irqs[1],
++                                      kgdbsci_rx_interrupt, SA_INTERRUPT,
++                                      "kgdbsci", &KGDBPORT.port);
++      }
+ }
++#endif
+ 
+-/* Breakpoint if there's a break sent on the serial port */
+-static void kgdb_break_interrupt(int irq, void *ptr, struct pt_regs *regs)
++/*
++ * We use the normal init routine to setup the port, so we can't be
++ * in here too early.
++ */
++static int kgdbsci_init(void)
+ {
+-        struct sci_port *port = ptr;
+-        unsigned short status = sci_in(port, SCxSR);
+-
+-        if (status & SCxSR_BRK(port)) {
++      struct termios termios;
+ 
+-                /* Break into the debugger if a break is detected */
+-                BREAKPOINT();
++      memset(&termios, 0, sizeof(struct termios));
+ 
+-                /* Clear */
+-                sci_out(port, SCxSR, SCxSR_BREAK_CLEAR(port));
++      termios.c_cflag = CREAD | HUPCL | CLOCAL | CS8;
++      switch (kgdbsci_baud) {
++      case 9600:
++              termios.c_cflag |= B9600;
++              break;
++      case 19200:
++              termios.c_cflag |= B19200;
++              break;
++      case 38400:
++              termios.c_cflag |= B38400;
++              break;
++      case 57600:
++              termios.c_cflag |= B57600;
++              break;
++      case 115200:
++              termios.c_cflag |= B115200;
++              break;
+         }
++      sci_set_termios(&KGDBPORT.port, &termios, NULL);
++
++      return 0;
+ }
+ 
+-#endif /* CONFIG_SH_KGDB */
++struct kgdb_io kgdb_io_ops = {
++      .read_char = kgdbsci_read_char,
++      .write_char = kgdbsci_write_char,
++      .init = kgdbsci_init,
++#ifndef CONFIG_SERIAL_SH_SCI_CONSOLE
++      .late_init = kgdbsci_lateinit,
++#else /* ! CONFIG_SERIAL_SH_SCI_CONSOLE */
++      .late_init = NULL,
++#endif /* ! CONFIG_SERIAL_SH_SCI_CONSOLE */
++      .pre_exception = NULL,
++      .post_exception = NULL
++};
++
++/* Parse the "kgdbsci=ttyno,baudrate" cmdline option; str follows the '='.
++ * Returns 0 on success, 1 on a syntax error.
++ * NOTE(review): __setup() handlers usually return 1 when handled - confirm. */
++static int __init
++kgdbsci_opt(char *str)
++{
++      /* Valid port indices are the single digits 0 .. SCI_NPORTS - 1. */
++      if (*str < '0' || *str >= SCI_NPORTS + '0')
++               goto errout;
++      kgdbsci_ttySC = *str - '0';
++      str++;
++      if (*str != ',')
++               goto errout;
++      str++;
++      kgdbsci_baud = simple_strtoul(str, &str, 10);
++      if (kgdbsci_baud != 9600 && kgdbsci_baud != 19200 &&
++          kgdbsci_baud != 38400 && kgdbsci_baud != 57600 &&
++          kgdbsci_baud != 115200)
++               goto errout;
++
++      return 0;
++
++errout:
++      printk(KERN_ERR "Invalid syntax for option kgdbsci=\n");
++      return 1;
++}
++__setup("kgdbsci=", kgdbsci_opt);
++#endif /* CONFIG_KGDB_SH_SCI */
+ 
+ #if defined(__H8300S__)
+ enum { sci_disable, sci_enable };
+@@ -555,6 +608,16 @@ static inline void sci_receive_chars(str
+                                       continue;
+                               }
+ 
++#ifdef CONFIG_KGDB_SH_SCI
++                              /* We assume that a ^C on the port KGDB
++                               * is using means that KGDB wants to
++                               * interrupt the running system.
++                               */
++                              if (port->line == KGDBPORT.port.line &&
++                                              c == 3)
++                                      breakpoint();
++#endif
++
+                               /* Store data and status */
+                               if (status&SCxSR_FER(port)) {
+                                       flag = TTY_FRAME;
+@@ -1618,6 +1681,7 @@ static int __init sci_console_init(void)
+ console_initcall(sci_console_init);
+ #endif /* CONFIG_SERIAL_SH_SCI_CONSOLE */
+ 
++#if 0
+ #ifdef CONFIG_SH_KGDB
+ /*
+  * FIXME: Most of this can go away.. at the moment, we rely on
+@@ -1663,30 +1727,9 @@ int __init kgdb_console_setup(struct con
+       return uart_set_options(port, co, baud, parity, bits, flow);
+ }
+ #endif /* CONFIG_SH_KGDB */
++#endif /* 0 */
+ 
+-#ifdef CONFIG_SH_KGDB_CONSOLE
+-static struct console kgdb_console = {
+-        .name         = "ttySC",
+-        .write                = kgdb_console_write,
+-        .setup                = kgdb_console_setup,
+-        .flags                = CON_PRINTBUFFER | CON_ENABLED,
+-        .index                = -1,
+-      .data           = &sci_uart_driver,
+-};
+-
+-/* Register the KGDB console so we get messages (d'oh!) */
+-static int __init kgdb_console_init(void)
+-{
+-      register_console(&kgdb_console);
+-      return 0;
+-}
+-
+-console_initcall(kgdb_console_init);
+-#endif /* CONFIG_SH_KGDB_CONSOLE */
+-
+-#if defined(CONFIG_SH_KGDB_CONSOLE)
+-#define SCI_CONSOLE   &kgdb_console
+-#elif defined(CONFIG_SERIAL_SH_SCI_CONSOLE)
++#ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
+ #define SCI_CONSOLE   &serial_console
+ #else
+ #define SCI_CONSOLE   0
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-arm/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-arm/kgdb.h
+--- linux-2.6.18-53.1.14/include/asm-arm/kgdb.h        1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/include/asm-arm/kgdb.h   2008-06-10 15:39:01.000000000 +0400
+@@ -0,0 +1,92 @@
++/*
++ * include/asm-arm/kgdb.h
++ *
++ * ARM KGDB support
++ *
++ * Author: Deepak Saxena <dsaxena@mvista.com>
++ *
++ * Copyright (C) 2002 MontaVista Software Inc.
++ *
++ */
++
++#ifndef __ASM_KGDB_H__
++#define __ASM_KGDB_H__
++
++#include <linux/config.h>
++#include <asm/ptrace.h>
++#include <asm-generic/kgdb.h>
++
++
++/*
++ * GDB assumes that we're a user process being debugged, so
++ * it will send us an SWI command to write into memory as the
++ * debug trap. When an SWI occurs, the next instruction addr is
++ * placed into R14_svc before jumping to the vector trap.
++ * This doesn't work for kernel debugging: as we are already in SVC
++ * mode, the SWI would overwrite the kernel's LR and we would lose it,
++ * which is a bad thing.
++ *
++ * By doing this as an undefined instruction trap, we force a mode
++ * switch from SVC to UND mode, allowing us to save full kernel state.
++ *
++ * We also define a KGDB_COMPILED_BREAK which can be used to compile
++ * in breakpoints. This is important for things like sysrq-G and for
++ * the initial breakpoint from trap_init().
++ *
++ * Note to ARM HW designers: Add real trap support like SH && PPC to
++ * make our lives much much simpler. :)
++ */
++#define       BREAK_INSTR_SIZE                4
++#define GDB_BREAKINST                   0xef9f0001
++#define KGDB_BREAKINST                  0xe7ffdefe
++#define KGDB_COMPILED_BREAK             0xe7ffdeff
++#define CACHE_FLUSH_IS_SAFE           1
++
++#ifndef       __ASSEMBLY__
++
++#define       BREAKPOINT()                    asm(".word      0xe7ffdeff")
++
++
++extern void kgdb_handle_bus_error(void);
++extern int kgdb_fault_expected;
++#endif /* !__ASSEMBLY__ */
++
++/*
++ * From Amit S. Kale:
++ *
++ * In the register packet, words 0-15 are R0 to R10, FP, IP, SP, LR, PC. But
++ * Register 16 isn't cpsr. GDB passes CPSR in word 25. There are 9 words in
++ * between which are unused. Passing only 26 words to gdb is sufficient.
++ * GDB can figure out that floating point registers are not passed.
++ * GDB_MAX_REGS should be 26.
++ */
++#define       GDB_MAX_REGS            (26)
++
++#define       KGDB_MAX_NO_CPUS        1
++#define       BUFMAX                  400
++#define       NUMREGBYTES             (GDB_MAX_REGS << 2)
++#define       NUMCRITREGBYTES         (32 << 2)
++
++#define       _R0             0
++#define       _R1             1
++#define       _R2             2
++#define       _R3             3
++#define       _R4             4
++#define       _R5             5
++#define       _R6             6
++#define       _R7             7
++#define       _R8             8
++#define       _R9             9
++#define       _R10            10
++#define       _FP             11
++#define       _IP             12
++#define       _SP             13
++#define       _LR             14
++#define       _PC             15
++#define       _CPSR           (GDB_MAX_REGS - 1)
++
++/* So that we can denote the end of a frame for tracing, in the simple
++ * case. */
++#define CFI_END_FRAME(func)   __CFI_END_FRAME(_PC,_SP,func)
++
++#endif /* __ASM_KGDB_H__ */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-arm/system.h linux-2.6.18-53.1.14.kgdb/include/asm-arm/system.h
+--- linux-2.6.18-53.1.14/include/asm-arm/system.h      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/include/asm-arm/system.h 2008-06-10 15:38:56.000000000 +0400
+@@ -444,6 +444,47 @@ static inline unsigned long __xchg(unsig
+ extern void disable_hlt(void);
+ extern void enable_hlt(void);
+ 
++#define       __HAVE_ARCH_CMPXCHG     1
++
++#include <asm/types.h>
++/* Compare *m with old and store new on a match; returns the value read from *m. */
++static inline unsigned long __cmpxchg_u32(volatile int *m, unsigned long old,
++                                      unsigned long new)
++{
++      u32 retval;
++      unsigned long flags;
++
++      local_irq_save(flags);  /* read/compare/store below is bracketed by irq save/restore */
++      retval = *m;
++      if (retval == old)
++              *m = new;
++      local_irq_restore(flags);       /* implies memory barrier  */
++
++      return retval;
++}
++
++/* This function doesn't exist, so you'll get a linker error
++   if something tries to do an invalid cmpxchg().  */
++extern void __cmpxchg_called_with_bad_pointer(void);
++/* Size dispatcher behind cmpxchg(): only 4-byte operands are handled. */
++static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old,
++      unsigned long new, int size)
++{
++      switch (size) {
++      case 4:
++              return __cmpxchg_u32(ptr, old, new);
++      }
++      __cmpxchg_called_with_bad_pointer();    /* any other size: call the intentionally missing symbol */
++      return old;
++}
++/* cmpxchg(ptr, o, n): copy o and n into temporaries of *ptr's type, call __cmpxchg() with sizeof(*(ptr)), cast the result back. */
++#define cmpxchg(ptr,o,n)                                               \
++  ({                                                                   \
++     __typeof__(*(ptr)) _o_ = (o);                                     \
++     __typeof__(*(ptr)) _n_ = (n);                                     \
++     (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_,                 \
++                                  (unsigned long)_n_, sizeof(*(ptr))); \
++  })
+ #endif /* __ASSEMBLY__ */
+ 
+ #define arch_align_stack(x) (x)
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-generic/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-generic/kgdb.h
+--- linux-2.6.18-53.1.14/include/asm-generic/kgdb.h    1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/include/asm-generic/kgdb.h       2008-06-10 15:39:01.000000000 +0400
+@@ -0,0 +1,34 @@
++/*
++ * include/asm-generic/kgdb.h
++ *
++ * This provides the assembly level information so that KGDB can provide
++ * a GDB that has been patched with enough information to know to stop
++ * trying to unwind the function.
++ *
++ * Author: Tom Rini <trini@kernel.crashing.org>
++ *
++ * 2005 (c) MontaVista Software, Inc. This file is licensed under the terms
++ * of the GNU General Public License version 2. This program is licensed
++ * "as is" without any warranty of any kind, whether express or implied.
++ */
++
++#ifndef __ASM_GENERIC_KGDB_H__
++#define __ASM_GENERIC_KGDB_H__
++
++#include <linux/dwarf2-lang.h>
++#ifdef __ASSEMBLY__
++#ifdef CONFIG_KGDB
++/* This MUST be put at the end of a given assembly function */
++#define __CFI_END_FRAME(pc,sp,func)                   \
++CAT3(.Lend_,func,:)                                   \
++      CFI_preamble(func,pc,0x1,-DATA_ALIGN_FACTOR)    \
++      CFA_define_reference(sp, 0)                     \
++      CFA_undefine_reg(pc)                            \
++      CFI_postamble()                                 \
++      FDE_preamble(func,func,CAT3(.Lend,_,func))      \
++      FDE_postamble()
++#else
++#define __CFI_END_FRAME(pc,sp,fn)
++#endif                                /* CONFIG_KGDB */
++#endif                                /* __ASSEMBLY__ */
++#endif                                /* __ASM_GENERIC_KGDB_H__ */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-i386/kdebug.h linux-2.6.18-53.1.14.kgdb/include/asm-i386/kdebug.h
+--- linux-2.6.18-53.1.14/include/asm-i386/kdebug.h     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/include/asm-i386/kdebug.h        2008-06-10 15:38:03.000000000 +0400
+@@ -39,6 +39,7 @@ enum die_val {
+       DIE_CALL,
+       DIE_NMI_IPI,
+       DIE_PAGE_FAULT,
++      DIE_PAGE_FAULT_NO_CONTEXT,
+ };
+ 
+ static inline int notify_die(enum die_val val, const char *str,
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-i386/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-i386/kgdb.h
+--- linux-2.6.18-53.1.14/include/asm-i386/kgdb.h       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/include/asm-i386/kgdb.h  2008-06-10 15:39:01.000000000 +0400
+@@ -0,0 +1,58 @@
++#ifdef __KERNEL__
++#ifndef _ASM_KGDB_H_
++#define _ASM_KGDB_H_
++
++/*
++ * Copyright (C) 2001-2004 Amit S. Kale
++ */
++
++#include <asm-generic/kgdb.h>
++
++/*
++ *  Note that this register image is in a different order than
++ *  the register image that Linux produces at interrupt time.
++ *
++ *  Linux's register image is defined by struct pt_regs in ptrace.h.
++ *  Just why GDB uses a different order is a historical mystery.
++ */
++#define _EAX  0
++#define _ECX  1
++#define _EDX  2
++#define _EBX  3
++#define _ESP  4
++#define _EBP  5
++#define _ESI  6
++#define _EDI  7
++#define _PC   8
++#define _EIP  8
++#define _PS   9
++#define _EFLAGS       9
++#define _CS   10
++#define _SS   11
++#define _DS   12
++#define _ES   13
++#define _FS   14
++#define _GS   15
++
++/* So that we can denote the end of a frame for tracing, in the simple
++ * case. */
++#define CFI_END_FRAME(func)   __CFI_END_FRAME(_EIP,_ESP,func)
++
++#ifndef __ASSEMBLY__
++/************************************************************************/
++/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/
++/* at least NUMREGBYTES*2 are needed for register packets */
++/* Longer buffer is needed to list all threads */
++#define BUFMAX                        1024
++
++/* Number of bytes of registers.  */
++#define NUMREGBYTES           64
++/* Number of bytes of registers we need to save for a setjmp/longjmp. */
++#define NUMCRITREGBYTES               24
++
++#define BREAKPOINT()          asm("   int $3");
++#define BREAK_INSTR_SIZE      1
++#define CACHE_FLUSH_IS_SAFE   1
++#endif                                /* !__ASSEMBLY__ */
++#endif                                /* _ASM_KGDB_H_ */
++#endif                                /* __KERNEL__ */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-ia64/kdebug.h linux-2.6.18-53.1.14.kgdb/include/asm-ia64/kdebug.h
+--- linux-2.6.18-53.1.14/include/asm-ia64/kdebug.h     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/include/asm-ia64/kdebug.h        2008-06-10 15:38:32.000000000 +0400
+@@ -72,6 +72,7 @@ enum die_val {
+       DIE_KDEBUG_LEAVE,
+       DIE_KDUMP_ENTER,
+       DIE_KDUMP_LEAVE,
++      DIE_PAGE_FAULT_NO_CONTEXT,
+ };
+ 
+ static inline int notify_die(enum die_val val, char *str, struct pt_regs *regs,
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-ia64/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-ia64/kgdb.h
+--- linux-2.6.18-53.1.14/include/asm-ia64/kgdb.h       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/include/asm-ia64/kgdb.h  2008-06-10 15:38:32.000000000 +0400
+@@ -0,0 +1,36 @@
++#ifdef __KERNEL__
++#ifndef _ASM_KGDB_H_
++#define _ASM_KGDB_H_
++
++/*
++ * Copyright (C) 2001-2004 Amit S. Kale
++ */
++
++#include <linux/threads.h>
++
++/************************************************************************/
++/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/
++/* at least NUMREGBYTES*2 are needed for register packets */
++/* Longer buffer is needed to list all threads */
++#define BUFMAX                        1024
++
++/* Number of bytes of registers.  We set this to 0 so that certain GDB
++ * packets will fail, forcing the use of others, which are more friendly
++ * on ia64. */
++#define NUMREGBYTES           0
++
++#define NUMCRITREGBYTES               (70*8)
++#define JMP_REGS_ALIGNMENT    __attribute__ ((aligned (16)))
++
++#define BREAKNUM              0x00003333300LL
++#define KGDBBREAKNUM          0x6665UL
++#define BREAKPOINT()          asm volatile ("break.m 0x6665")
++#define BREAK_INSTR_SIZE      16
++#define CACHE_FLUSH_IS_SAFE   1
++
++struct pt_regs;
++extern volatile int kgdb_hwbreak_sstep[NR_CPUS];
++extern void smp_send_nmi_allbutself(void);
++extern void kgdb_wait_ipi(struct pt_regs *);
++#endif                                /* _ASM_KGDB_H_ */
++#endif                                /* __KERNEL__ */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-mips/kdebug.h linux-2.6.18-53.1.14.kgdb/include/asm-mips/kdebug.h
+--- linux-2.6.18-53.1.14/include/asm-mips/kdebug.h     1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/include/asm-mips/kdebug.h        2008-06-10 15:38:24.000000000 +0400
+@@ -0,0 +1,47 @@
++/*
++ *
++ * Copyright (C) 2004  MontaVista Software Inc.
++ * Author: Manish Lachwani, mlachwani@mvista.com or manish@koffee-break.com
++ *
++ * This program is free software; you can redistribute  it and/or modify it
++ * under  the terms of  the GNU General  Public License as published by the
++ * Free Software Foundation;  either version 2 of the  License, or (at your
++ * option) any later version.
++ *
++ */
++#ifndef _MIPS_KDEBUG_H
++#define _MIPS_KDEBUG_H
++
++#include <linux/notifier.h>
++
++struct pt_regs;
++
++struct die_args {
++      struct pt_regs *regs;
++      const char *str;
++      long err;
++};
++
++int register_die_notifier(struct notifier_block *nb);
++extern struct notifier_block *mips_die_chain;
++
++enum die_val {
++      DIE_OOPS = 1,
++      DIE_PANIC,
++      DIE_DIE,
++      DIE_KERNELDEBUG,
++      DIE_TRAP,
++      DIE_PAGE_FAULT,
++};
++
++/*
++ * Wrap regs/str/err in a die_args and call mips_die_chain with event val.
++ * Trap number can be computed from regs, signr using compute_signal().
++ */
++static inline int notify_die(enum die_val val,char *str,struct pt_regs *regs,long err)
++{
++      struct die_args args = { .regs=regs, .str=str, .err=err };
++      return notifier_call_chain(&mips_die_chain, val, &args);
++}
++
++#endif /* _MIPS_KDEBUG_H */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-mips/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-mips/kgdb.h
+--- linux-2.6.18-53.1.14/include/asm-mips/kgdb.h       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/include/asm-mips/kgdb.h  2008-06-10 15:39:01.000000000 +0400
+@@ -0,0 +1,34 @@
++#ifdef __KERNEL__
++#ifndef _ASM_KGDB_H_
++#define _ASM_KGDB_H_
++/* MIPS KGDB definitions: GDB register type, buffer/register sizes, breakpoint. */
++#ifndef __ASSEMBLY__
++#if (_MIPS_ISA == _MIPS_ISA_MIPS1) || (_MIPS_ISA == _MIPS_ISA_MIPS2)
++typedef u32 gdb_reg_t;
++#elif (_MIPS_ISA == _MIPS_ISA_MIPS3) || (_MIPS_ISA == _MIPS_ISA_MIPS4)
++typedef u64 gdb_reg_t;
++#else
++#error need to do
++#endif /* _MIPS_ISA */
++#endif /* !__ASSEMBLY__ (gdb_reg_t typedef only) */
++#include <asm-generic/kgdb.h>
++
++#ifndef __ASSEMBLY__
++#define BUFMAX                        2048
++#define NUMREGBYTES           (90*sizeof(gdb_reg_t))
++#define NUMCRITREGBYTES               (12*sizeof(gdb_reg_t))
++#define BREAK_INSTR_SIZE      4
++#define BREAKPOINT()          __asm__ __volatile__(           \
++                                      ".globl breakinst\n\t"  \
++                                      ".set\tnoreorder\n\t"   \
++                                      "nop\n"                 \
++                                      "breakinst:\tbreak\n\t" \
++                                      "nop\n\t"               \
++                                      ".set\treorder")
++#define CACHE_FLUSH_IS_SAFE   0
++
++extern int kgdb_early_setup;
++
++#endif                                /* !__ASSEMBLY__ */
++#endif                                /* _ASM_KGDB_H_ */
++#endif                                /* __KERNEL__ */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-powerpc/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-powerpc/kgdb.h
+--- linux-2.6.18-53.1.14/include/asm-powerpc/kgdb.h    1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/include/asm-powerpc/kgdb.h       2008-06-10 15:39:01.000000000 +0400
+@@ -0,0 +1,74 @@
++/*
++ * include/asm-powerpc/kgdb.h
++ *
++ * The PowerPC (32/64) specific defines / externs for KGDB.  Based on
++ * the previous 32bit and 64bit specific files, which had the following
++ * copyrights:
++ *
++ * PPC64 Mods (C) 2005 Frank Rowand (frowand@mvista.com)
++ * PPC Mods (C) 2004 Tom Rini (trini@mvista.com)
++ * PPC Mods (C) 2003 John Whitney (john.whitney@timesys.com)
++ * PPC Mods (C) 1998 Michael Tesch (tesch@cs.wisc.edu)
++ *
++ *
++ * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
++ * Author: Tom Rini <trini@kernel.crashing.org>
++ *
++ * 2006 (c) MontaVista Software, Inc. This file is licensed under
++ * the terms of the GNU General Public License version 2. This program
++ * is licensed "as is" without any warranty of any kind, whether express
++ * or implied.
++ */
++#ifdef __KERNEL__
++#ifndef __POWERPC_KGDB_H__
++#define __POWERPC_KGDB_H__
++
++#include <asm-generic/kgdb.h>
++#ifndef __ASSEMBLY__
++
++#define BREAK_INSTR_SIZE      4
++#define BUFMAX                        ((NUMREGBYTES * 2) + 512)
++#define OUTBUFMAX             ((NUMREGBYTES * 2) + 512)
++#define BREAKPOINT()          asm(".long 0x7d821008"); /* twge r2, r2 */
++#define CACHE_FLUSH_IS_SAFE   1
++
++/* The number of bytes of registers we have to save depends on a few
++ * things.  For 64bit we default to not including vector registers and
++ * vector state registers. */
++#ifdef CONFIG_PPC64
++/*
++ * 64 bit (8 byte) registers:
++ *   32 gpr, 32 fpr, nip, msr, link, ctr
++ * 32 bit (4 byte) registers:
++ *   ccr, xer, fpscr
++ */
++#define NUMREGBYTES           ((68 * 8) + (3 * 4))
++#if 0
++/* The following adds in vector registers and vector state registers. */
++/* 128 bit (16 byte) registers:
++ *   32 vr
++ * 64 bit (8 byte) registers:
++ *   32 gpr, 32 fpr, nip, msr, link, ctr
++ * 32 bit (4 byte) registers:
++ *   ccr, xer, fpscr, vscr, vrsave
++ */
++#define NUMREGBYTES           ((128 * 16) + (68 * 8) + (5 * 4))
++#endif
++#define NUMCRITREGBYTES               184
++#else /* CONFIG_PPC32 */
++/* On non-E500 family PPC32 we determine the size by picking the last
++ * register we need, but on E500 we skip sections so we list what we
++ * need to store, and add it up. */
++#ifndef CONFIG_E500
++#define MAXREG                        (PT_FPSCR+1)
++#else
++/* 32 GPRs (8 bytes), nip, msr, ccr, link, ctr, xer, acc (8 bytes), spefscr*/
++#define MAXREG                 ((32*2)+6+2+1)
++#endif
++#define NUMREGBYTES           (MAXREG * sizeof(int))
++/* CR/LR, R1, R2, R13-R31 inclusive. */
++#define NUMCRITREGBYTES               (23 * sizeof(int))
++#endif /* 32/64 */
++#endif /* !(__ASSEMBLY__) */
++#endif /* !__POWERPC_KGDB_H__ */
++#endif /* __KERNEL__ */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-ppc/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-ppc/kgdb.h
+--- linux-2.6.18-53.1.14/include/asm-ppc/kgdb.h        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/include/asm-ppc/kgdb.h   2008-06-10 15:38:14.000000000 +0400
+@@ -1,57 +1,18 @@
+-/*
+- * kgdb.h: Defines and declarations for serial line source level
+- *         remote debugging of the Linux kernel using gdb.
+- *
+- * PPC Mods (C) 1998 Michael Tesch (tesch@cs.wisc.edu)
+- *
+- * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
+- */
+ #ifdef __KERNEL__
+-#ifndef _PPC_KGDB_H
+-#define _PPC_KGDB_H
+-
++#ifndef __PPC_KGDB_H__
++#define __PPC_KGDB_H__
++#include <asm-powerpc/kgdb.h>
+ #ifndef __ASSEMBLY__
+-
+-/* Things specific to the gen550 backend. */
+-struct uart_port;
+-
+-extern void gen550_progress(char *, unsigned short);
+-extern void gen550_kgdb_map_scc(void);
+-extern void gen550_init(int, struct uart_port *);
+-
+-/* Things specific to the pmac backend. */
+-extern void zs_kgdb_hook(int tty_num);
+-
+-/* To init the kgdb engine. (called by serial hook)*/
+-extern void set_debug_traps(void);
+-
+-/* To enter the debugger explicitly. */
+-extern void breakpoint(void);
+-
+-/* For taking exceptions
++ /* For taking exceptions
+  * these are defined in traps.c
+  */
+-extern int (*debugger)(struct pt_regs *regs);
++struct pt_regs;
++extern void (*debugger)(struct pt_regs *regs);
+ extern int (*debugger_bpt)(struct pt_regs *regs);
+ extern int (*debugger_sstep)(struct pt_regs *regs);
+ extern int (*debugger_iabr_match)(struct pt_regs *regs);
+ extern int (*debugger_dabr_match)(struct pt_regs *regs);
+ extern void (*debugger_fault_handler)(struct pt_regs *regs);
+-
+-/* What we bring to the party */
+-int kgdb_bpt(struct pt_regs *regs);
+-int kgdb_sstep(struct pt_regs *regs);
+-void kgdb(struct pt_regs *regs);
+-int kgdb_iabr_match(struct pt_regs *regs);
+-int kgdb_dabr_match(struct pt_regs *regs);
+-
+-/*
+- * external low-level support routines (ie macserial.c)
+- */
+-extern void kgdb_interruptible(int); /* control interrupts from serial */
+-extern void putDebugChar(char);   /* write a single character      */
+-extern char getDebugChar(void);   /* read and return a single char */
+-
+-#endif /* !(__ASSEMBLY__) */
+-#endif /* !(_PPC_KGDB_H) */
++#endif /* !__ASSEMBLY__ */
++#endif /* __PPC_KGDB_H__ */
+ #endif /* __KERNEL__ */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-ppc/machdep.h linux-2.6.18-53.1.14.kgdb/include/asm-ppc/machdep.h
+--- linux-2.6.18-53.1.14/include/asm-ppc/machdep.h     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/include/asm-ppc/machdep.h        2008-06-10 15:38:14.000000000 +0400
+@@ -72,9 +72,7 @@ struct machdep_calls {
+       unsigned long   (*find_end_of_memory)(void);
+       void            (*setup_io_mappings)(void);
+ 
+-      void            (*early_serial_map)(void);
+       void            (*progress)(char *, unsigned short);
+-      void            (*kgdb_map_scc)(void);
+ 
+       unsigned char   (*nvram_read_val)(int addr);
+       void            (*nvram_write_val)(int addr, unsigned char val);
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-ppc/mv64x60.h linux-2.6.18-53.1.14.kgdb/include/asm-ppc/mv64x60.h
+--- linux-2.6.18-53.1.14/include/asm-ppc/mv64x60.h     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/include/asm-ppc/mv64x60.h        2008-06-10 15:38:14.000000000 +0400
+@@ -348,6 +348,8 @@ u32 mv64x60_calc_mem_size(struct mv64x60
+ 
+ void mv64x60_progress_init(u32 base);
+ void mv64x60_mpsc_progress(char *s, unsigned short hex);
++struct platform_device * mv64x60_early_get_pdev_data(const char *name,
++              int id, int remove);
+ 
+ extern struct mv64x60_32bit_window
+       gt64260_32bit_windows[MV64x60_32BIT_WIN_COUNT];
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-ppc/mv64x60_defs.h linux-2.6.18-53.1.14.kgdb/include/asm-ppc/mv64x60_defs.h
+--- linux-2.6.18-53.1.14/include/asm-ppc/mv64x60_defs.h        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/include/asm-ppc/mv64x60_defs.h   2008-06-10 15:38:14.000000000 +0400
+@@ -57,7 +57,8 @@
+ #define       MV64x60_IRQ_I2C                         37
+ #define       MV64x60_IRQ_BRG                         39
+ #define       MV64x60_IRQ_MPSC_0                      40
+-#define       MV64x60_IRQ_MPSC_1                      42
++#define       MV64360_IRQ_MPSC_1                      41
++#define       GT64260_IRQ_MPSC_1                      42
+ #define       MV64x60_IRQ_COMM                        43
+ #define       MV64x60_IRQ_P0_GPP_0_7                  56
+ #define       MV64x60_IRQ_P0_GPP_8_15                 57
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-sh/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-sh/kgdb.h
+--- linux-2.6.18-53.1.14/include/asm-sh/kgdb.h 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/include/asm-sh/kgdb.h    2008-06-10 15:39:01.000000000 +0400
+@@ -2,94 +2,40 @@
+  * May be copied or modified under the terms of the GNU General Public
+  * License.  See linux/COPYING for more information.
+  *
+- * Based on original code by Glenn Engel, Jim Kingdon,
+- * David Grothe <dave@gcom.com>, Tigran Aivazian, <tigran@sco.com> and
+- * Amit S. Kale <akale@veritas.com>
++ * Based on a file that was modified or based on files by: Glenn Engel,
++ * Jim Kingdon, David Grothe <dave@gcom.com>, Tigran Aivazian <tigran@sco.com>,
++ * Amit S. Kale <akale@veritas.com>, sh-stub.c from Ben Lee and
++ * Steve Chamberlain, Henry Bell <henry.bell@st.com>
+  * 
+- * Super-H port based on sh-stub.c (Ben Lee and Steve Chamberlain) by
+- * Henry Bell <henry.bell@st.com>
+- * 
+- * Header file for low-level support for remote debug using GDB. 
++ * Maintainer: Tom Rini <trini@kernel.crashing.org>
+  *
+  */
+ 
+ #ifndef __KGDB_H
+ #define __KGDB_H
+ 
+-#include <asm/ptrace.h>
+-
+-struct console;
++#include <asm-generic/kgdb.h>
++/* Based on sh-gdb.c from gdb-6.1, Glenn
++     Engel at HP  Ben Lee and Steve Chamberlain */
++#define NUMREGBYTES   112     /* 92 */
++#define NUMCRITREGBYTES       (9 << 2)
++#define BUFMAX                400
+ 
+-/* Same as pt_regs but has vbr in place of syscall_nr */
++#ifndef __ASSEMBLY__
+ struct kgdb_regs {
+         unsigned long regs[16];
+         unsigned long pc;
+         unsigned long pr;
+-        unsigned long sr;
+         unsigned long gbr;
++        unsigned long vbr;
+         unsigned long mach;
+         unsigned long macl;
+-        unsigned long vbr;
+-};
+-
+-/* State info */
+-extern char kgdb_in_gdb_mode;
+-extern int kgdb_done_init;
+-extern int kgdb_enabled;
+-extern int kgdb_nofault;      /* Ignore bus errors (in gdb mem access) */
+-extern int kgdb_halt;         /* Execute initial breakpoint at startup */
+-extern char in_nmi;           /* Debounce flag to prevent NMI reentry*/
+-
+-/* SCI */
+-extern int kgdb_portnum;
+-extern int kgdb_baud;
+-extern char kgdb_parity;
+-extern char kgdb_bits;
+-extern int kgdb_console_setup(struct console *, char *);
+-
+-/* Init and interface stuff */
+-extern int kgdb_init(void);
+-extern int (*kgdb_serial_setup)(void);
+-extern int (*kgdb_getchar)(void);
+-extern void (*kgdb_putchar)(int);
+-
+-struct kgdb_sermap {
+-      char *name;
+-      int namelen;
+-      int (*setup_fn)(struct console *, char *);
+-      struct kgdb_sermap *next;
++        unsigned long sr;
+ };
+-extern void kgdb_register_sermap(struct kgdb_sermap *map);
+-extern struct kgdb_sermap *kgdb_porttype;
+ 
+-/* Trap functions */
+-typedef void (kgdb_debug_hook_t)(struct pt_regs *regs); 
+-typedef void (kgdb_bus_error_hook_t)(void);
+-extern kgdb_debug_hook_t  *kgdb_debug_hook;
+-extern kgdb_bus_error_hook_t *kgdb_bus_err_hook;
+-
+-extern void breakpoint(void);
+-
+-/* Console */
+-struct console;
+-void kgdb_console_write(struct console *co, const char *s, unsigned count);
+-void kgdb_console_init(void);
+-
+-/* Prototypes for jmp fns */
+-#define _JBLEN 9
+-typedef        int jmp_buf[_JBLEN];
+-extern void    longjmp(jmp_buf __jmpb, int __retval);
+-extern int     setjmp(jmp_buf __jmpb);
+-
+-/* Variadic macro to print our own message to the console */
+-#define KGDB_PRINTK(...) printk("KGDB: " __VA_ARGS__)
+-
+-/* Forced breakpoint */
+-#define BREAKPOINT() do {                                     \
+-  if (kgdb_enabled) {                                         \
+-    asm volatile("trapa   #0xff");                            \
+-  }                                                           \
+-} while (0)
++#define BREAKPOINT()          asm("trapa #0xff");
++#define BREAK_INSTR_SIZE      2
++#define CACHE_FLUSH_IS_SAFE   1
+ 
+ /* KGDB should be able to flush all kernel text space */
+ #if defined(CONFIG_CPU_SH4)
+@@ -102,30 +48,5 @@ extern int     setjmp(jmp_buf __jmpb);
+ #else
+ #define kgdb_flush_icache_range(start, end)   do { } while (0)
+ #endif
+-
+-/* Kernel assert macros */
+-#ifdef CONFIG_KGDB_KERNEL_ASSERTS
+-
+-/* Predefined conditions */
+-#define KA_VALID_ERRNO(errno) ((errno) > 0 && (errno) <= EMEDIUMTYPE)
+-#define KA_VALID_PTR_ERR(ptr) KA_VALID_ERRNO(-PTR_ERR(ptr))
+-#define KA_VALID_KPTR(ptr)  (!(ptr) || \
+-              ((void *)(ptr) >= (void *)PAGE_OFFSET &&  \
+-               (void *)(ptr) < ERR_PTR(-EMEDIUMTYPE)))
+-#define KA_VALID_PTRORERR(errptr) \
+-               (KA_VALID_KPTR(errptr) || KA_VALID_PTR_ERR(errptr))
+-#define KA_HELD_GKL()  (current->lock_depth >= 0)
+-
+-/* The actual assert */
+-#define KGDB_ASSERT(condition, message) do {                   \
+-       if (!(condition) && (kgdb_enabled)) {                   \
+-               KGDB_PRINTK("Assertion failed at %s:%d: %s\n",  \
+-                                  __FILE__, __LINE__, message);\
+-               BREAKPOINT();                                   \
+-       }                                                       \
+-} while (0)
+-#else
+-#define KGDB_ASSERT(condition, message)
+-#endif
+-
++#endif                                /* !__ASSEMBLY__ */
+ #endif
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-sh/system.h linux-2.6.18-53.1.14.kgdb/include/asm-sh/system.h
+--- linux-2.6.18-53.1.14/include/asm-sh/system.h       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/include/asm-sh/system.h  2008-06-10 15:38:50.000000000 +0400
+@@ -6,6 +6,7 @@
+  * Copyright (C) 2002 Paul Mundt
+  */
+ 
++#include <asm/types.h>
+ 
+ /*
+  *    switch_to() should switch tasks to task nr n, first
+@@ -260,6 +261,45 @@ static __inline__ unsigned long __xchg(u
+       return x;
+ }
+ 
++static inline unsigned long __cmpxchg_u32(volatile int * m, unsigned long old,
++      unsigned long new)      /* returns the value read from *m */
++{
++      __u32 retval;
++      unsigned long flags;
++
++      local_irq_save(flags);  /* read/compare/store below is bracketed by irq save/restore */
++      retval = *m;
++      if (retval == old)      /* store new only when *m held old */
++              *m = new;
++      local_irq_restore(flags);       /* implies memory barrier  */
++      return retval;
++}
++
++/* This function doesn't exist, so you'll get a linker error
++ * if something tries to do an invalid cmpxchg(). */
++extern void __cmpxchg_called_with_bad_pointer(void);
++
++#define __HAVE_ARCH_CMPXCHG   1
++/* Size dispatcher behind cmpxchg(): only 4-byte operands are handled. */
++static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old,
++              unsigned long new, int size)
++{
++      switch (size) {
++      case 4:
++              return __cmpxchg_u32(ptr, old, new);
++      }
++      __cmpxchg_called_with_bad_pointer();    /* any other size: call the intentionally missing symbol */
++      return old;
++}
++/* cmpxchg(ptr, o, n): copy o and n into temporaries of *ptr's type, call __cmpxchg() with sizeof(*(ptr)), cast the result back. */
++#define cmpxchg(ptr,o,n)                                               \
++  ({                                                                   \
++     __typeof__(*(ptr)) _o_ = (o);                                     \
++     __typeof__(*(ptr)) _n_ = (n);                                     \
++     (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_,                 \
++                                  (unsigned long)_n_, sizeof(*(ptr))); \
++  })
++
+ /* XXX
+  * disable hlt during certain critical i/o operations
+  */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-x86_64/kdebug.h linux-2.6.18-53.1.14.kgdb/include/asm-x86_64/kdebug.h
+--- linux-2.6.18-53.1.14/include/asm-x86_64/kdebug.h   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/include/asm-x86_64/kdebug.h      2008-06-10 15:38:41.000000000 +0400
+@@ -34,6 +34,7 @@ enum die_val {
+       DIE_CALL,
+       DIE_NMI_IPI,
+       DIE_PAGE_FAULT,
++      DIE_PAGE_FAULT_NO_CONTEXT,
+ };
+ 
+ static inline int notify_die(enum die_val val, const char *str,
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-x86_64/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-x86_64/kgdb.h
+--- linux-2.6.18-53.1.14/include/asm-x86_64/kgdb.h     1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/include/asm-x86_64/kgdb.h        2008-06-10 15:39:01.000000000 +0400
+@@ -0,0 +1,54 @@
++#ifdef __KERNEL__
++#ifndef _ASM_KGDB_H_
++#define _ASM_KGDB_H_
++
++/*
++ * Copyright (C) 2001-2004 Amit S. Kale
++ */
++
++#include <asm-generic/kgdb.h>
++
++/*
++ *  Note that this register image is in a different order than
++ *  the register image that Linux produces at interrupt time.
++ *
++ *  Linux's register image is defined by struct pt_regs in ptrace.h.
++ *  Just why GDB uses a different order is a historical mystery.
++ */
++#define _RAX  0
++#define _RDX  1
++#define _RCX  2
++#define _RBX  3
++#define _RSI  4
++#define _RDI  5
++#define _RBP  6
++#define _RSP  7
++#define _R8   8
++#define _R9   9
++#define _R10  10
++#define _R11  11
++#define _R12  12
++#define _R13  13
++#define _R14  14
++#define _R15  15
++#define _PC   16
++#define _PS   17
++
++/* Number of bytes of registers.  */
++#define NUMREGBYTES           ((_PS+1)*8)
++#define NUMCRITREGBYTES               (8 * 8)         /* 8 registers. */
++
++/* Help GDB to know when to stop backtracing. */
++#define CFI_END_FRAME(func)   __CFI_END_FRAME(_PC,_RSP,func)
++#ifndef __ASSEMBLY__
++/* BUFMAX defines the maximum number of characters in inbound/outbound
++ * buffers at least NUMREGBYTES*2 are needed for register packets, and
++ * a longer buffer is needed to list all threads. */
++#define BUFMAX                        1024
++#define BREAKPOINT()          asm("   int $3");
++#define CHECK_EXCEPTION_STACK() ((&__get_cpu_var(init_tss))[0].ist[0])
++#define BREAK_INSTR_SIZE      1
++#define CACHE_FLUSH_IS_SAFE   1
++#endif                                /* !__ASSEMBLY__ */
++#endif                                /* _ASM_KGDB_H_ */
++#endif                                /* __KERNEL__ */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-x86_64/system.h linux-2.6.18-53.1.14.kgdb/include/asm-x86_64/system.h
+--- linux-2.6.18-53.1.14/include/asm-x86_64/system.h   2008-03-06 05:54:38.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/include/asm-x86_64/system.h      2008-06-10 15:38:44.000000000 +0400
+@@ -22,7 +22,9 @@
+ 
+ /* Save restore flags to clear handle leaking NT */
+ #define switch_to(prev,next,last) \
+-      asm volatile(SAVE_CONTEXT                                                   \
++       asm volatile(".globl __switch_to_begin\n\t"                                \
++                   "__switch_to_begin:\n\t"                                     \
++                   SAVE_CONTEXT                                                 \
+                    "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
+                    "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
+                    "call __switch_to\n\t"                                       \
+@@ -34,6 +36,8 @@
+                    "movq %%rax,%%rdi\n\t"                                       \
+                    "jc   ret_from_fork\n\t"                                     \
+                    RESTORE_CONTEXT                                                \
++                   ".globl __switch_to_end\n\t"                                 \
++                   "__switch_to_end:\n\t"                                       \
+                    : "=a" (last)                                                \
+                    : [next] "S" (next), [prev] "D" (prev),                      \
+                      [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/linux/dwarf2-lang.h linux-2.6.18-53.1.14.kgdb/include/linux/dwarf2-lang.h
+--- linux-2.6.18-53.1.14/include/linux/dwarf2-lang.h   1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/include/linux/dwarf2-lang.h      2008-06-10 15:39:01.000000000 +0400
+@@ -0,0 +1,300 @@
++#ifndef DWARF2_LANG
++#define DWARF2_LANG
++
++/*
++ * This is free software; you can redistribute it and/or modify it under
++ * the terms of the GNU General Public License as published by the Free
++ * Software Foundation; either version 2, or (at your option) any later
++ * version.
++ */
++/*
++ * This file defines macros that allow generation of DWARF debug records
++ * for asm files.  This file is platform independent.  Register numbers
++ * (which are about the only thing that is platform dependent) are to be
++ * supplied by a platform defined file.
++ */
++/*
++ * We need this to work for both asm and C.  In asm we are using the
++ * old comment trick to concatenate while C uses the new ANSI thing.
++ * Here we have concat macro...  The multi level thing is to allow any
++ * macros used in the names to be resolved prior to the cat (at which
++ * time they are no longer the same string).
++ */
++#define CAT3(a,b,c) _CAT3(a,b,c)
++#define _CAT3(a,b,c) __CAT3(a,b,c)
++#ifndef __STDC__
++#define __CAT3(a,b,c) a/**/b/**/c
++#else
++#define __CAT3(a,b,c) a##b##c
++#endif
++#ifdef __ASSEMBLY__
++#define IFC(a)
++#define IFN_C(a) a
++#define NL ;
++#define QUOTE_THIS(a) a
++#define DWARF_preamble .section .debug_frame,"",%progbits;
++#else
++#define IFC(a) a
++#define IFN_C(a)
++#define NL \n\t
++#define QUOTE_THIS(a) _QUOTE_THIS(a)
++#define _QUOTE_THIS(a) #a
++/* Don't let CPP see the " and , \042=" \054=, */
++#define DWARF_preamble .section .debug_frame \054\042\042\054%progbits
++#endif
++
++#ifdef CONFIG_64BIT
++#define DATA_ALIGN_FACTOR     8
++#define ADDR_LOC              .quad
++#else
++#define DATA_ALIGN_FACTOR     4
++#define ADDR_LOC              .long
++#endif
++
++#include <linux/dwarf2-defs.h>
++/*
++ * This macro starts a debug frame section.  The debug_frame describes
++ * where to find the registers that the enclosing function saved on
++ * entry.
++ *
++ * ORD is used by the label generator and should be the same as what is
++ * passed to CFI_postamble.
++ *
++ * pc,        pc register gdb ordinal.
++ *
++ * code_align this is the factor used to define locations or regions
++ * where the given definitions apply.  If you use labels to define these
++ * this should be 1.
++ *
++ * data_align this is the factor used to define register offsets.  If
++ * you use struct offset, this should be the size of the register in
++ * bytes or the negative of that.  This is how it is used: you will
++ * define a register as the reference register, say the stack pointer,
++ * then you will say where a register is located relative to this
++ * reference registers value, say 40 for register 3 (the gdb register
++ * number).  The <40> will be multiplied by <data_align> to define the
++ * byte offset of the given register (3, in this example).  So if your
++ * <40> is the byte offset and the reference register points at the
++ * beginning, you would want 1 for the data_align.  If <40> was the 40th
++ * 4-byte element in that structure you would want 4.  And if your
++ * reference register points at the end of the structure you would want
++ * a negative data_align value(and you would have to do other math as
++ * well).
++ */
++
++#define CFI_preamble(ORD, pc, code_align, data_align) \
++         DWARF_preamble       NL                              \
++      .align DATA_ALIGN_FACTOR NL                     \
++        .globl CAT3(frame,_,ORD) NL                   \
++CAT3(frame,_,ORD): NL                                 \
++      .long 7f-6f NL                                  \
++6:                                                    \
++      .long   DW_CIE_ID NL                            \
++      .byte   DW_CIE_VERSION NL                       \
++      .byte 0  NL                                     \
++      .uleb128 code_align NL                          \
++      .sleb128 data_align NL                          \
++      .byte pc NL
++
++/*
++ * After the above macro and prior to the CFI_postamble, you need to
++ * define the initial state.  This starts with defining the reference
++ * register and, usually the pc.  Here are some helper macros:
++ */
++
++#define CFA_define_reference(reg, offset)     \
++      .byte DW_CFA_def_cfa NL                 \
++      .uleb128 reg NL                         \
++      .uleb128 (offset) NL
++
++#define CFA_define_offset(reg, offset)                \
++      .byte (DW_CFA_offset + reg) NL          \
++      .uleb128 (offset) NL
++
++#define CFA_restore(reg)                      \
++        .byte (DW_CFA_restore + reg) NL
++
++#define CFI_postamble()                               \
++      .align DATA_ALIGN_FACTOR NL                             \
++7: NL                                         \
++.previous NL
++
++/*
++ * So now your code pushes stuff on the stack, you need a new location
++ * and the rules for what to do.  This starts a running description of
++ * the call frame.  You need to describe what changes with respect to
++ * the call registers as the location of the pc moves through the code.
++ * The following builds an FDE (frame description entry).  Like the
++ * above, it has a preamble and a postamble.  It also is tied to the CFI
++ * above.
++ * The preamble macro is tied to the CFI thru the first parameter.  The
++ * second is the code start address and then the code end address+1.
++ */
++#define FDE_preamble(ORD, initial_address, end_address)       \
++        DWARF_preamble NL                             \
++      .align DATA_ALIGN_FACTOR NL                                     \
++      .long 9f-8f NL                                  \
++8:                                                    \
++      .long CAT3(frame,_,ORD) NL                      \
++      ADDR_LOC initial_address NL                     \
++      ADDR_LOC (end_address - initial_address) NL
++
++#define FDE_postamble()                               \
++      .align DATA_ALIGN_FACTOR NL                             \
++9:     NL                                     \
++.previous NL
++
++/*
++ * That done, you can now add registers, subtract registers, move the
++ * reference and even change the reference.  You can also define a new
++ * area of code the info applies to.  For discontinuous bits you should
++ * start a new FDE.  You may have as many as you like.
++ */
++
++/*
++ * To advance the code location by <bytes> (0x3f max)
++ */
++
++#define CFA_advance_loc(bytes)                        \
++      .byte DW_CFA_advance_loc+bytes NL
++
++/*
++ * This one is good for 0xff or 255
++ */
++#define CFA_advance_loc1(bytes)                       \
++      .byte DW_CFA_advance_loc1 NL            \
++        .byte bytes NL
++
++#define CFA_undefine_reg(reg)                 \
++        .byte DW_CFA_undefined NL             \
++      .uleb128 reg NL
++/*
++ * With the above you can define all the register locations.  But
++ * suppose the reference register moves... Takes the new offset NOT an
++ * increment.  This is how esp is tracked if it is not saved.
++ */
++
++#define CFA_define_cfa_offset(offset)         \
++      .byte DW_CFA_def_cfa_offset NL          \
++      .uleb128 (offset) NL
++/*
++ * Or suppose you want to use a different reference register...
++ */
++#define CFA_define_cfa_register(reg)          \
++      .byte DW_CFA_def_cfa_register NL        \
++      .uleb128 reg NL
++
++/*
++ * If you want to mess with the stack pointer, here is the expression.
++ * The stack starts empty.
++ */
++#define CFA_def_cfa_expression                        \
++        .byte DW_CFA_def_cfa_expression       NL      \
++      .uleb128 20f-10f NL                     \
++10:     NL
++/*
++ * This expression is to be used for other regs.  The stack starts with the
++ * stack address.
++ */
++
++#define CFA_expression(reg)                   \
++        .byte DW_CFA_expression        NL             \
++        .uleb128 reg NL                               \
++      .uleb128 20f-10f NL                     \
++10:     NL
++/*
++ * Here we do the expression stuff.  You should code the above followed
++ *  by expression OPs followed by CFA_expression_end.
++ */
++
++
++#define CFA_expression_end                    \
++20:    NL
++
++#define CFA_exp_OP_const4s(a)                 \
++        .byte DW_OP_const4s NL                        \
++        .long a NL
++
++#define  CFA_exp_OP_swap  .byte DW_OP_swap NL
++#define  CFA_exp_OP_dup  .byte DW_OP_dup NL
++#define  CFA_exp_OP_drop  .byte DW_OP_drop NL
++/*
++ * All these work on the top two elements on the stack, replacing them
++ * with the result.  Top comes first where it matters.  True is 1, false 0.
++ */
++#define  CFA_exp_OP_deref .byte DW_OP_deref NL
++#define  CFA_exp_OP_and   .byte DW_OP_and NL
++#define  CFA_exp_OP_div   .byte DW_OP_div NL
++#define  CFA_exp_OP_minus .byte DW_OP_minus NL
++#define  CFA_exp_OP_mod   .byte DW_OP_mod NL
++#define  CFA_exp_OP_neg   .byte DW_OP_neg NL
++#define  CFA_exp_OP_plus  .byte DW_OP_plus NL
++#define  CFA_exp_OP_not   .byte DW_OP_not NL
++#define  CFA_exp_OP_or    .byte DW_OP_or NL
++#define  CFA_exp_OP_xor   .byte DW_OP_xor NL
++#define  CFA_exp_OP_le    .byte DW_OP_le NL
++#define  CFA_exp_OP_ge    .byte DW_OP_ge NL
++#define  CFA_exp_OP_eq    .byte DW_OP_eq NL
++#define  CFA_exp_OP_lt    .byte DW_OP_lt NL
++#define  CFA_exp_OP_gt    .byte DW_OP_gt NL
++#define  CFA_exp_OP_ne    .byte DW_OP_ne NL
++/*
++ * These take a parameter as noted
++ */
++/*
++ * Unconditional skip to loc. loc is a label (loc:)
++ */
++#define CFA_exp_OP_skip(loc)                  \
++         .byte DW_OP_skip  NL                         \
++       .hword  loc-.-2 NL
++/*
++ * Conditional skip to loc (TOS != 0, TOS--) (loc is a label)
++ */
++#define CFA_exp_OP_bra(loc)                   \
++         .byte DW_OP_bra NL                   \
++       .hword loc-.-2 NL
++
++/*
++ * TOS += no (an unsigned number)
++ */
++#define CFA_exp_OP_plus_uconst(no)            \
++         .byte DW_OP_plus_uconst NL           \
++         .uleb128 no NL
++
++/*
++ * ++TOS = no (an unsigned number)
++ */
++#define CFA_exp_OP_constu(no)                 \
++         .byte DW_OP_constu NL                        \
++       .uleb128 no NL
++/*
++ * ++TOS = no (a signed number)
++ */
++#define CFA_exp_OP_consts(no)                 \
++         .byte DW_OP_consts NL                        \
++       .sleb128 no NL
++/*
++ * ++TOS = no (an unsigned byte)
++ */
++#define CFA_exp_OP_const1u(no)                        \
++         .byte DW_OP_const1u NL                       \
++       .byte no NL
++
++
++/*
++ * ++TOS = no (an address; operand is address-sized, hence ADDR_LOC)
++ */
++#define CFA_exp_OP_addr(no)                   \
++         .byte DW_OP_addr NL                  \
++       ADDR_LOC no NL
++
++/*
++ * Push current frames value for "reg" + offset
++ * We take advantage of the opcode assignments to make this a literal reg
++ * rather than use the DW_OP_bregx opcode.
++ */
++
++#define CFA_exp_OP_breg(reg,offset)           \
++         .byte DW_OP_breg0+reg NL             \
++         .sleb128 offset NL
++#endif
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/linux/dwarf2.h linux-2.6.18-53.1.14.kgdb/include/linux/dwarf2.h
+--- linux-2.6.18-53.1.14/include/linux/dwarf2.h        1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/include/linux/dwarf2.h   2008-06-10 15:39:01.000000000 +0400
+@@ -0,0 +1,775 @@
++/* Declarations and definitions of codes relating to the DWARF2 symbolic
++   debugging information format.
++   Copyright (C) 1992, 1993, 1995, 1996, 1997, 1999, 2000, 2001, 2002,
++   2003 Free Software Foundation, Inc.
++
++   Written by Gary Funck (gary@intrepid.com) The Ada Joint Program
++   Office (AJPO), Florida State University and Silicon Graphics Inc.
++   provided support for this effort -- June 21, 1995.
++
++   Derived from the DWARF 1 implementation written by Ron Guilmette
++   (rfg@netcom.com), November 1990.
++
++   This file is part of GCC.
++
++   GCC is free software; you can redistribute it and/or modify it under
++   the terms of the GNU General Public License as published by the Free
++   Software Foundation; either version 2, or (at your option) any later
++   version.
++
++   GCC is distributed in the hope that it will be useful, but WITHOUT
++   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
++   License for more details.
++
++   You should have received a copy of the GNU General Public License
++   along with GCC; see the file COPYING.  If not, write to the Free
++   Software Foundation, 59 Temple Place - Suite 330, Boston, MA
++   02111-1307, USA.  */
++
++/* This file is derived from the DWARF specification (a public document)
++   Revision 2.0.0 (July 27, 1993) developed by the UNIX International
++   Programming Languages Special Interest Group (UI/PLSIG) and distributed
++   by UNIX International.  Copies of this specification are available from
++   UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054.
++
++   This file also now contains definitions from the DWARF 3 specification.  */
++
++/* This file is shared between GCC and GDB, and should not contain
++   prototypes.  */
++
++#ifndef _ELF_DWARF2_H
++#define _ELF_DWARF2_H
++
++/* Structure found in the .debug_line section.  */
++typedef struct
++{
++  unsigned char li_length          [4];
++  unsigned char li_version         [2];
++  unsigned char li_prologue_length [4];
++  unsigned char li_min_insn_length [1];
++  unsigned char li_default_is_stmt [1];
++  unsigned char li_line_base       [1];
++  unsigned char li_line_range      [1];
++  unsigned char li_opcode_base     [1];
++}
++DWARF2_External_LineInfo;
++
++typedef struct
++{
++  unsigned long  li_length;
++  unsigned short li_version;
++  unsigned int   li_prologue_length;
++  unsigned char  li_min_insn_length;
++  unsigned char  li_default_is_stmt;
++  int            li_line_base;
++  unsigned char  li_line_range;
++  unsigned char  li_opcode_base;
++}
++DWARF2_Internal_LineInfo;
++
++/* Structure found in .debug_pubnames section.  */
++typedef struct
++{
++  unsigned char pn_length  [4];
++  unsigned char pn_version [2];
++  unsigned char pn_offset  [4];
++  unsigned char pn_size    [4];
++}
++DWARF2_External_PubNames;
++
++typedef struct
++{
++  unsigned long  pn_length;
++  unsigned short pn_version;
++  unsigned long  pn_offset;
++  unsigned long  pn_size;
++}
++DWARF2_Internal_PubNames;
++
++/* Structure found in .debug_info section.  */
++typedef struct
++{
++  unsigned char  cu_length        [4];
++  unsigned char  cu_version       [2];
++  unsigned char  cu_abbrev_offset [4];
++  unsigned char  cu_pointer_size  [1];
++}
++DWARF2_External_CompUnit;
++
++typedef struct
++{
++  unsigned long  cu_length;
++  unsigned short cu_version;
++  unsigned long  cu_abbrev_offset;
++  unsigned char  cu_pointer_size;
++}
++DWARF2_Internal_CompUnit;
++
++typedef struct
++{
++  unsigned char  ar_length       [4];
++  unsigned char  ar_version      [2];
++  unsigned char  ar_info_offset  [4];
++  unsigned char  ar_pointer_size [1];
++  unsigned char  ar_segment_size [1];
++}
++DWARF2_External_ARange;
++
++typedef struct
++{
++  unsigned long  ar_length;
++  unsigned short ar_version;
++  unsigned long  ar_info_offset;
++  unsigned char  ar_pointer_size;
++  unsigned char  ar_segment_size;
++}
++DWARF2_Internal_ARange;
++
++
++/* Tag names and codes.  */
++enum dwarf_tag
++  {
++    DW_TAG_padding = 0x00,
++    DW_TAG_array_type = 0x01,
++    DW_TAG_class_type = 0x02,
++    DW_TAG_entry_point = 0x03,
++    DW_TAG_enumeration_type = 0x04,
++    DW_TAG_formal_parameter = 0x05,
++    DW_TAG_imported_declaration = 0x08,
++    DW_TAG_label = 0x0a,
++    DW_TAG_lexical_block = 0x0b,
++    DW_TAG_member = 0x0d,
++    DW_TAG_pointer_type = 0x0f,
++    DW_TAG_reference_type = 0x10,
++    DW_TAG_compile_unit = 0x11,
++    DW_TAG_string_type = 0x12,
++    DW_TAG_structure_type = 0x13,
++    DW_TAG_subroutine_type = 0x15,
++    DW_TAG_typedef = 0x16,
++    DW_TAG_union_type = 0x17,
++    DW_TAG_unspecified_parameters = 0x18,
++    DW_TAG_variant = 0x19,
++    DW_TAG_common_block = 0x1a,
++    DW_TAG_common_inclusion = 0x1b,
++    DW_TAG_inheritance = 0x1c,
++    DW_TAG_inlined_subroutine = 0x1d,
++    DW_TAG_module = 0x1e,
++    DW_TAG_ptr_to_member_type = 0x1f,
++    DW_TAG_set_type = 0x20,
++    DW_TAG_subrange_type = 0x21,
++    DW_TAG_with_stmt = 0x22,
++    DW_TAG_access_declaration = 0x23,
++    DW_TAG_base_type = 0x24,
++    DW_TAG_catch_block = 0x25,
++    DW_TAG_const_type = 0x26,
++    DW_TAG_constant = 0x27,
++    DW_TAG_enumerator = 0x28,
++    DW_TAG_file_type = 0x29,
++    DW_TAG_friend = 0x2a,
++    DW_TAG_namelist = 0x2b,
++    DW_TAG_namelist_item = 0x2c,
++    DW_TAG_packed_type = 0x2d,
++    DW_TAG_subprogram = 0x2e,
++    DW_TAG_template_type_param = 0x2f,
++    DW_TAG_template_value_param = 0x30,
++    DW_TAG_thrown_type = 0x31,
++    DW_TAG_try_block = 0x32,
++    DW_TAG_variant_part = 0x33,
++    DW_TAG_variable = 0x34,
++    DW_TAG_volatile_type = 0x35,
++    /* DWARF 3.  */
++    DW_TAG_dwarf_procedure = 0x36,
++    DW_TAG_restrict_type = 0x37,
++    DW_TAG_interface_type = 0x38,
++    DW_TAG_namespace = 0x39,
++    DW_TAG_imported_module = 0x3a,
++    DW_TAG_unspecified_type = 0x3b,
++    DW_TAG_partial_unit = 0x3c,
++    DW_TAG_imported_unit = 0x3d,
++    /* SGI/MIPS Extensions.  */
++    DW_TAG_MIPS_loop = 0x4081,
++    /* HP extensions.  See: ftp://ftp.hp.com/pub/lang/tools/WDB/wdb-4.0.tar.gz .  */
++    DW_TAG_HP_array_descriptor = 0x4090,
++    /* GNU extensions.  */
++    DW_TAG_format_label = 0x4101,     /* For FORTRAN 77 and Fortran 90.  */
++    DW_TAG_function_template = 0x4102,        /* For C++.  */
++    DW_TAG_class_template = 0x4103,   /* For C++.  */
++    DW_TAG_GNU_BINCL = 0x4104,
++    DW_TAG_GNU_EINCL = 0x4105,
++    /* Extensions for UPC.  See: http://upc.gwu.edu/~upc.  */
++    DW_TAG_upc_shared_type = 0x8765,
++    DW_TAG_upc_strict_type = 0x8766,
++    DW_TAG_upc_relaxed_type = 0x8767,
++    /* PGI (STMicroelectronics) extensions.  No documentation available.  */
++    DW_TAG_PGI_kanji_type      = 0xA000,
++    DW_TAG_PGI_interface_block = 0xA020
++  };
++
++#define DW_TAG_lo_user        0x4080
++#define DW_TAG_hi_user        0xffff
++
++/* Flag that tells whether entry has a child or not.  */
++#define DW_children_no   0
++#define       DW_children_yes  1
++
++/* Form names and codes.  */
++enum dwarf_form
++  {
++    DW_FORM_addr = 0x01,
++    DW_FORM_block2 = 0x03,
++    DW_FORM_block4 = 0x04,
++    DW_FORM_data2 = 0x05,
++    DW_FORM_data4 = 0x06,
++    DW_FORM_data8 = 0x07,
++    DW_FORM_string = 0x08,
++    DW_FORM_block = 0x09,
++    DW_FORM_block1 = 0x0a,
++    DW_FORM_data1 = 0x0b,
++    DW_FORM_flag = 0x0c,
++    DW_FORM_sdata = 0x0d,
++    DW_FORM_strp = 0x0e,
++    DW_FORM_udata = 0x0f,
++    DW_FORM_ref_addr = 0x10,
++    DW_FORM_ref1 = 0x11,
++    DW_FORM_ref2 = 0x12,
++    DW_FORM_ref4 = 0x13,
++    DW_FORM_ref8 = 0x14,
++    DW_FORM_ref_udata = 0x15,
++    DW_FORM_indirect = 0x16
++  };
++
++/* Attribute names and codes.  */
++enum dwarf_attribute
++  {
++    DW_AT_sibling = 0x01,
++    DW_AT_location = 0x02,
++    DW_AT_name = 0x03,
++    DW_AT_ordering = 0x09,
++    DW_AT_subscr_data = 0x0a,
++    DW_AT_byte_size = 0x0b,
++    DW_AT_bit_offset = 0x0c,
++    DW_AT_bit_size = 0x0d,
++    DW_AT_element_list = 0x0f,
++    DW_AT_stmt_list = 0x10,
++    DW_AT_low_pc = 0x11,
++    DW_AT_high_pc = 0x12,
++    DW_AT_language = 0x13,
++    DW_AT_member = 0x14,
++    DW_AT_discr = 0x15,
++    DW_AT_discr_value = 0x16,
++    DW_AT_visibility = 0x17,
++    DW_AT_import = 0x18,
++    DW_AT_string_length = 0x19,
++    DW_AT_common_reference = 0x1a,
++    DW_AT_comp_dir = 0x1b,
++    DW_AT_const_value = 0x1c,
++    DW_AT_containing_type = 0x1d,
++    DW_AT_default_value = 0x1e,
++    DW_AT_inline = 0x20,
++    DW_AT_is_optional = 0x21,
++    DW_AT_lower_bound = 0x22,
++    DW_AT_producer = 0x25,
++    DW_AT_prototyped = 0x27,
++    DW_AT_return_addr = 0x2a,
++    DW_AT_start_scope = 0x2c,
++    DW_AT_stride_size = 0x2e,
++    DW_AT_upper_bound = 0x2f,
++    DW_AT_abstract_origin = 0x31,
++    DW_AT_accessibility = 0x32,
++    DW_AT_address_class = 0x33,
++    DW_AT_artificial = 0x34,
++    DW_AT_base_types = 0x35,
++    DW_AT_calling_convention = 0x36,
++    DW_AT_count = 0x37,
++    DW_AT_data_member_location = 0x38,
++    DW_AT_decl_column = 0x39,
++    DW_AT_decl_file = 0x3a,
++    DW_AT_decl_line = 0x3b,
++    DW_AT_declaration = 0x3c,
++    DW_AT_discr_list = 0x3d,
++    DW_AT_encoding = 0x3e,
++    DW_AT_external = 0x3f,
++    DW_AT_frame_base = 0x40,
++    DW_AT_friend = 0x41,
++    DW_AT_identifier_case = 0x42,
++    DW_AT_macro_info = 0x43,
++    DW_AT_namelist_items = 0x44,
++    DW_AT_priority = 0x45,
++    DW_AT_segment = 0x46,
++    DW_AT_specification = 0x47,
++    DW_AT_static_link = 0x48,
++    DW_AT_type = 0x49,
++    DW_AT_use_location = 0x4a,
++    DW_AT_variable_parameter = 0x4b,
++    DW_AT_virtuality = 0x4c,
++    DW_AT_vtable_elem_location = 0x4d,
++    /* DWARF 3 values.  */
++    DW_AT_allocated     = 0x4e,
++    DW_AT_associated    = 0x4f,
++    DW_AT_data_location = 0x50,
++    DW_AT_stride        = 0x51,
++    DW_AT_entry_pc      = 0x52,
++    DW_AT_use_UTF8      = 0x53,
++    DW_AT_extension     = 0x54,
++    DW_AT_ranges        = 0x55,
++    DW_AT_trampoline    = 0x56,
++    DW_AT_call_column   = 0x57,
++    DW_AT_call_file     = 0x58,
++    DW_AT_call_line     = 0x59,
++    /* SGI/MIPS extensions.  */
++    DW_AT_MIPS_fde = 0x2001,
++    DW_AT_MIPS_loop_begin = 0x2002,
++    DW_AT_MIPS_tail_loop_begin = 0x2003,
++    DW_AT_MIPS_epilog_begin = 0x2004,
++    DW_AT_MIPS_loop_unroll_factor = 0x2005,
++    DW_AT_MIPS_software_pipeline_depth = 0x2006,
++    DW_AT_MIPS_linkage_name = 0x2007,
++    DW_AT_MIPS_stride = 0x2008,
++    DW_AT_MIPS_abstract_name = 0x2009,
++    DW_AT_MIPS_clone_origin = 0x200a,
++    DW_AT_MIPS_has_inlines = 0x200b,
++    /* HP extensions.  */
++    DW_AT_HP_block_index         = 0x2000,
++    DW_AT_HP_unmodifiable        = 0x2001, /* Same as DW_AT_MIPS_fde.  */
++    DW_AT_HP_actuals_stmt_list   = 0x2010,
++    DW_AT_HP_proc_per_section    = 0x2011,
++    DW_AT_HP_raw_data_ptr        = 0x2012,
++    DW_AT_HP_pass_by_reference   = 0x2013,
++    DW_AT_HP_opt_level           = 0x2014,
++    DW_AT_HP_prof_version_id     = 0x2015,
++    DW_AT_HP_opt_flags           = 0x2016,
++    DW_AT_HP_cold_region_low_pc  = 0x2017,
++    DW_AT_HP_cold_region_high_pc = 0x2018,
++    DW_AT_HP_all_variables_modifiable = 0x2019,
++    DW_AT_HP_linkage_name        = 0x201a,
++    DW_AT_HP_prof_flags          = 0x201b,  /* In comp unit of procs_info for -g.  */
++    /* GNU extensions.  */
++    DW_AT_sf_names   = 0x2101,
++    DW_AT_src_info   = 0x2102,
++    DW_AT_mac_info   = 0x2103,
++    DW_AT_src_coords = 0x2104,
++    DW_AT_body_begin = 0x2105,
++    DW_AT_body_end   = 0x2106,
++    DW_AT_GNU_vector = 0x2107,
++    /* VMS extensions.  */
++    DW_AT_VMS_rtnbeg_pd_address = 0x2201,
++    /* UPC extension.  */
++    DW_AT_upc_threads_scaled = 0x3210,
++    /* PGI (STMicroelectronics) extensions.  */
++    DW_AT_PGI_lbase    = 0x3a00,
++    DW_AT_PGI_soffset  = 0x3a01,
++    DW_AT_PGI_lstride  = 0x3a02
++  };
++
++#define DW_AT_lo_user 0x2000  /* Implementation-defined range start.  */
++#define DW_AT_hi_user 0x3ff0  /* Implementation-defined range end.  */
++
++/* Location atom names and codes.  */
++enum dwarf_location_atom
++  {
++    DW_OP_addr = 0x03,
++    DW_OP_deref = 0x06,
++    DW_OP_const1u = 0x08,
++    DW_OP_const1s = 0x09,
++    DW_OP_const2u = 0x0a,
++    DW_OP_const2s = 0x0b,
++    DW_OP_const4u = 0x0c,
++    DW_OP_const4s = 0x0d,
++    DW_OP_const8u = 0x0e,
++    DW_OP_const8s = 0x0f,
++    DW_OP_constu = 0x10,
++    DW_OP_consts = 0x11,
++    DW_OP_dup = 0x12,
++    DW_OP_drop = 0x13,
++    DW_OP_over = 0x14,
++    DW_OP_pick = 0x15,
++    DW_OP_swap = 0x16,
++    DW_OP_rot = 0x17,
++    DW_OP_xderef = 0x18,
++    DW_OP_abs = 0x19,
++    DW_OP_and = 0x1a,
++    DW_OP_div = 0x1b,
++    DW_OP_minus = 0x1c,
++    DW_OP_mod = 0x1d,
++    DW_OP_mul = 0x1e,
++    DW_OP_neg = 0x1f,
++    DW_OP_not = 0x20,
++    DW_OP_or = 0x21,
++    DW_OP_plus = 0x22,
++    DW_OP_plus_uconst = 0x23,
++    DW_OP_shl = 0x24,
++    DW_OP_shr = 0x25,
++    DW_OP_shra = 0x26,
++    DW_OP_xor = 0x27,
++    DW_OP_bra = 0x28,
++    DW_OP_eq = 0x29,
++    DW_OP_ge = 0x2a,
++    DW_OP_gt = 0x2b,
++    DW_OP_le = 0x2c,
++    DW_OP_lt = 0x2d,
++    DW_OP_ne = 0x2e,
++    DW_OP_skip = 0x2f,
++    DW_OP_lit0 = 0x30,
++    DW_OP_lit1 = 0x31,
++    DW_OP_lit2 = 0x32,
++    DW_OP_lit3 = 0x33,
++    DW_OP_lit4 = 0x34,
++    DW_OP_lit5 = 0x35,
++    DW_OP_lit6 = 0x36,
++    DW_OP_lit7 = 0x37,
++    DW_OP_lit8 = 0x38,
++    DW_OP_lit9 = 0x39,
++    DW_OP_lit10 = 0x3a,
++    DW_OP_lit11 = 0x3b,
++    DW_OP_lit12 = 0x3c,
++    DW_OP_lit13 = 0x3d,
++    DW_OP_lit14 = 0x3e,
++    DW_OP_lit15 = 0x3f,
++    DW_OP_lit16 = 0x40,
++    DW_OP_lit17 = 0x41,
++    DW_OP_lit18 = 0x42,
++    DW_OP_lit19 = 0x43,
++    DW_OP_lit20 = 0x44,
++    DW_OP_lit21 = 0x45,
++    DW_OP_lit22 = 0x46,
++    DW_OP_lit23 = 0x47,
++    DW_OP_lit24 = 0x48,
++    DW_OP_lit25 = 0x49,
++    DW_OP_lit26 = 0x4a,
++    DW_OP_lit27 = 0x4b,
++    DW_OP_lit28 = 0x4c,
++    DW_OP_lit29 = 0x4d,
++    DW_OP_lit30 = 0x4e,
++    DW_OP_lit31 = 0x4f,
++    DW_OP_reg0 = 0x50,
++    DW_OP_reg1 = 0x51,
++    DW_OP_reg2 = 0x52,
++    DW_OP_reg3 = 0x53,
++    DW_OP_reg4 = 0x54,
++    DW_OP_reg5 = 0x55,
++    DW_OP_reg6 = 0x56,
++    DW_OP_reg7 = 0x57,
++    DW_OP_reg8 = 0x58,
++    DW_OP_reg9 = 0x59,
++    DW_OP_reg10 = 0x5a,
++    DW_OP_reg11 = 0x5b,
++    DW_OP_reg12 = 0x5c,
++    DW_OP_reg13 = 0x5d,
++    DW_OP_reg14 = 0x5e,
++    DW_OP_reg15 = 0x5f,
++    DW_OP_reg16 = 0x60,
++    DW_OP_reg17 = 0x61,
++    DW_OP_reg18 = 0x62,
++    DW_OP_reg19 = 0x63,
++    DW_OP_reg20 = 0x64,
++    DW_OP_reg21 = 0x65,
++    DW_OP_reg22 = 0x66,
++    DW_OP_reg23 = 0x67,
++    DW_OP_reg24 = 0x68,
++    DW_OP_reg25 = 0x69,
++    DW_OP_reg26 = 0x6a,
++    DW_OP_reg27 = 0x6b,
++    DW_OP_reg28 = 0x6c,
++    DW_OP_reg29 = 0x6d,
++    DW_OP_reg30 = 0x6e,
++    DW_OP_reg31 = 0x6f,
++    DW_OP_breg0 = 0x70,
++    DW_OP_breg1 = 0x71,
++    DW_OP_breg2 = 0x72,
++    DW_OP_breg3 = 0x73,
++    DW_OP_breg4 = 0x74,
++    DW_OP_breg5 = 0x75,
++    DW_OP_breg6 = 0x76,
++    DW_OP_breg7 = 0x77,
++    DW_OP_breg8 = 0x78,
++    DW_OP_breg9 = 0x79,
++    DW_OP_breg10 = 0x7a,
++    DW_OP_breg11 = 0x7b,
++    DW_OP_breg12 = 0x7c,
++    DW_OP_breg13 = 0x7d,
++    DW_OP_breg14 = 0x7e,
++    DW_OP_breg15 = 0x7f,
++    DW_OP_breg16 = 0x80,
++    DW_OP_breg17 = 0x81,
++    DW_OP_breg18 = 0x82,
++    DW_OP_breg19 = 0x83,
++    DW_OP_breg20 = 0x84,
++    DW_OP_breg21 = 0x85,
++    DW_OP_breg22 = 0x86,
++    DW_OP_breg23 = 0x87,
++    DW_OP_breg24 = 0x88,
++    DW_OP_breg25 = 0x89,
++    DW_OP_breg26 = 0x8a,
++    DW_OP_breg27 = 0x8b,
++    DW_OP_breg28 = 0x8c,
++    DW_OP_breg29 = 0x8d,
++    DW_OP_breg30 = 0x8e,
++    DW_OP_breg31 = 0x8f,
++    DW_OP_regx = 0x90,
++    DW_OP_fbreg = 0x91,
++    DW_OP_bregx = 0x92,
++    DW_OP_piece = 0x93,
++    DW_OP_deref_size = 0x94,
++    DW_OP_xderef_size = 0x95,
++    DW_OP_nop = 0x96,
++    /* DWARF 3 extensions.  */
++    DW_OP_push_object_address = 0x97,
++    DW_OP_call2 = 0x98,
++    DW_OP_call4 = 0x99,
++    DW_OP_call_ref = 0x9a,
++    /* GNU extensions.  */
++    DW_OP_GNU_push_tls_address = 0xe0,
++    /* HP extensions.  */
++    DW_OP_HP_unknown     = 0xe0, /* Ouch, the same as GNU_push_tls_address.  */
++    DW_OP_HP_is_value    = 0xe1,
++    DW_OP_HP_fltconst4   = 0xe2,
++    DW_OP_HP_fltconst8   = 0xe3,
++    DW_OP_HP_mod_range   = 0xe4,
++    DW_OP_HP_unmod_range = 0xe5,
++    DW_OP_HP_tls         = 0xe6
++  };
++
++#define DW_OP_lo_user 0xe0    /* Implementation-defined range start.  */
++#define DW_OP_hi_user 0xff    /* Implementation-defined range end.  */
++
++/* Type encodings.  */
++enum dwarf_type
++  {
++    DW_ATE_void = 0x0,
++    DW_ATE_address = 0x1,
++    DW_ATE_boolean = 0x2,
++    DW_ATE_complex_float = 0x3,
++    DW_ATE_float = 0x4,
++    DW_ATE_signed = 0x5,
++    DW_ATE_signed_char = 0x6,
++    DW_ATE_unsigned = 0x7,
++    DW_ATE_unsigned_char = 0x8,
++    /* DWARF 3.  */
++    DW_ATE_imaginary_float = 0x9,
++    /* HP extensions.  */
++    DW_ATE_HP_float80            = 0x80, /* Floating-point (80 bit).  */
++    DW_ATE_HP_complex_float80    = 0x81, /* Complex floating-point (80 bit).  */
++    DW_ATE_HP_float128           = 0x82, /* Floating-point (128 bit).  */
++    DW_ATE_HP_complex_float128   = 0x83, /* Complex floating-point (128 bit).  */
++    DW_ATE_HP_floathpintel       = 0x84, /* Floating-point (82 bit IA64).  */
++    DW_ATE_HP_imaginary_float80  = 0x85,
++    DW_ATE_HP_imaginary_float128 = 0x86
++  };
++
++#define       DW_ATE_lo_user 0x80
++#define       DW_ATE_hi_user 0xff
++
++/* Array ordering names and codes.  */
++enum dwarf_array_dim_ordering
++  {
++    DW_ORD_row_major = 0,
++    DW_ORD_col_major = 1
++  };
++
++/* Access attribute.  */
++enum dwarf_access_attribute
++  {
++    DW_ACCESS_public = 1,
++    DW_ACCESS_protected = 2,
++    DW_ACCESS_private = 3
++  };
++
++/* Visibility.  */
++enum dwarf_visibility_attribute
++  {
++    DW_VIS_local = 1,
++    DW_VIS_exported = 2,
++    DW_VIS_qualified = 3
++  };
++
++/* Virtuality.  */
++enum dwarf_virtuality_attribute
++  {
++    DW_VIRTUALITY_none = 0,
++    DW_VIRTUALITY_virtual = 1,
++    DW_VIRTUALITY_pure_virtual = 2
++  };
++
++/* Case sensitivity.  */
++enum dwarf_id_case
++  {
++    DW_ID_case_sensitive = 0,
++    DW_ID_up_case = 1,
++    DW_ID_down_case = 2,
++    DW_ID_case_insensitive = 3
++  };
++
++/* Calling convention.  */
++enum dwarf_calling_convention
++  {
++    DW_CC_normal = 0x1,
++    DW_CC_program = 0x2,
++    DW_CC_nocall = 0x3
++  };
++
++#define DW_CC_lo_user 0x40
++#define DW_CC_hi_user 0xff
++
++/* Inline attribute.  */
++enum dwarf_inline_attribute
++  {
++    DW_INL_not_inlined = 0,
++    DW_INL_inlined = 1,
++    DW_INL_declared_not_inlined = 2,
++    DW_INL_declared_inlined = 3
++  };
++
++/* Discriminant lists.  */
++enum dwarf_discrim_list
++  {
++    DW_DSC_label = 0,
++    DW_DSC_range = 1
++  };
++
++/* Line number opcodes.  */
++enum dwarf_line_number_ops
++  {
++    DW_LNS_extended_op = 0,
++    DW_LNS_copy = 1,
++    DW_LNS_advance_pc = 2,
++    DW_LNS_advance_line = 3,
++    DW_LNS_set_file = 4,
++    DW_LNS_set_column = 5,
++    DW_LNS_negate_stmt = 6,
++    DW_LNS_set_basic_block = 7,
++    DW_LNS_const_add_pc = 8,
++    DW_LNS_fixed_advance_pc = 9,
++    /* DWARF 3.  */
++    DW_LNS_set_prologue_end = 10,
++    DW_LNS_set_epilogue_begin = 11,
++    DW_LNS_set_isa = 12
++  };
++
++/* Line number extended opcodes.  */
++enum dwarf_line_number_x_ops
++  {
++    DW_LNE_end_sequence = 1,
++    DW_LNE_set_address = 2,
++    DW_LNE_define_file = 3,
++    /* HP extensions.  */
++    DW_LNE_HP_negate_is_UV_update      = 0x11,
++    DW_LNE_HP_push_context             = 0x12,
++    DW_LNE_HP_pop_context              = 0x13,
++    DW_LNE_HP_set_file_line_column     = 0x14,
++    DW_LNE_HP_set_routine_name         = 0x15,
++    DW_LNE_HP_set_sequence             = 0x16,
++    DW_LNE_HP_negate_post_semantics    = 0x17,
++    DW_LNE_HP_negate_function_exit     = 0x18,
++    DW_LNE_HP_negate_front_end_logical = 0x19,
++    DW_LNE_HP_define_proc              = 0x20
++  };
++
++/* Call frame information.  */
++enum dwarf_call_frame_info
++  {
++    DW_CFA_advance_loc = 0x40,
++    DW_CFA_offset = 0x80,
++    DW_CFA_restore = 0xc0,
++    DW_CFA_nop = 0x00,
++    DW_CFA_set_loc = 0x01,
++    DW_CFA_advance_loc1 = 0x02,
++    DW_CFA_advance_loc2 = 0x03,
++    DW_CFA_advance_loc4 = 0x04,
++    DW_CFA_offset_extended = 0x05,
++    DW_CFA_restore_extended = 0x06,
++    DW_CFA_undefined = 0x07,
++    DW_CFA_same_value = 0x08,
++    DW_CFA_register = 0x09,
++    DW_CFA_remember_state = 0x0a,
++    DW_CFA_restore_state = 0x0b,
++    DW_CFA_def_cfa = 0x0c,
++    DW_CFA_def_cfa_register = 0x0d,
++    DW_CFA_def_cfa_offset = 0x0e,
++    /* DWARF 3.  */
++    DW_CFA_def_cfa_expression = 0x0f,
++    DW_CFA_expression = 0x10,
++    DW_CFA_offset_extended_sf = 0x11,
++    DW_CFA_def_cfa_sf = 0x12,
++    DW_CFA_def_cfa_offset_sf = 0x13,
++    /* SGI/MIPS specific.  */
++    DW_CFA_MIPS_advance_loc8 = 0x1d,
++    /* GNU extensions.  */
++    DW_CFA_GNU_window_save = 0x2d,
++    DW_CFA_GNU_args_size = 0x2e,
++    DW_CFA_GNU_negative_offset_extended = 0x2f
++  };
++
++#define DW_CIE_ID       0xffffffff
++#define DW_CIE_VERSION          1
++
++#define DW_CFA_extended   0
++#define DW_CFA_lo_user    0x1c
++#define DW_CFA_hi_user    0x3f
++
++#define DW_CHILDREN_no                     0x00
++#define DW_CHILDREN_yes                    0x01
++
++#define DW_ADDR_none          0
++
++/* Source language names and codes.  */
++enum dwarf_source_language
++  {
++    DW_LANG_C89 = 0x0001,
++    DW_LANG_C = 0x0002,
++    DW_LANG_Ada83 = 0x0003,
++    DW_LANG_C_plus_plus = 0x0004,
++    DW_LANG_Cobol74 = 0x0005,
++    DW_LANG_Cobol85 = 0x0006,
++    DW_LANG_Fortran77 = 0x0007,
++    DW_LANG_Fortran90 = 0x0008,
++    DW_LANG_Pascal83 = 0x0009,
++    DW_LANG_Modula2 = 0x000a,
++    DW_LANG_Java = 0x000b,
++    /* DWARF 3.  */
++    DW_LANG_C99 = 0x000c,
++    DW_LANG_Ada95 = 0x000d,
++    DW_LANG_Fortran95 = 0x000e,
++    /* MIPS.  */
++    DW_LANG_Mips_Assembler = 0x8001,
++    /* UPC.  */
++    DW_LANG_Upc = 0x8765
++  };
++
++#define DW_LANG_lo_user 0x8000        /* Implementation-defined range start.  */
++#define DW_LANG_hi_user 0xffff        /* Implementation-defined range end.  */
++
++/* Names and codes for macro information.  */
++enum dwarf_macinfo_record_type
++  {
++    DW_MACINFO_define = 1,
++    DW_MACINFO_undef = 2,
++    DW_MACINFO_start_file = 3,
++    DW_MACINFO_end_file = 4,
++    DW_MACINFO_vendor_ext = 255
++  };
++\f
++/* @@@ For use with GNU frame unwind information.  */
++
++#define DW_EH_PE_absptr               0x00
++#define DW_EH_PE_omit         0xff
++
++#define DW_EH_PE_uleb128      0x01
++#define DW_EH_PE_udata2               0x02
++#define DW_EH_PE_udata4               0x03
++#define DW_EH_PE_udata8               0x04
++#define DW_EH_PE_sleb128      0x09
++#define DW_EH_PE_sdata2               0x0A
++#define DW_EH_PE_sdata4               0x0B
++#define DW_EH_PE_sdata8               0x0C
++#define DW_EH_PE_signed               0x08
++
++#define DW_EH_PE_pcrel                0x10
++#define DW_EH_PE_textrel      0x20
++#define DW_EH_PE_datarel      0x30
++#define DW_EH_PE_funcrel      0x40
++#define DW_EH_PE_aligned      0x50
++
++#define DW_EH_PE_indirect     0x80
++
++#endif /* _ELF_DWARF2_H */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/linux/kgdb.h linux-2.6.18-53.1.14.kgdb/include/linux/kgdb.h
+--- linux-2.6.18-53.1.14/include/linux/kgdb.h  1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/include/linux/kgdb.h     2008-06-10 15:39:21.000000000 +0400
+@@ -0,0 +1,279 @@
++/*
++ * include/linux/kgdb.h
++ *
++ * This provides the hooks and functions that KGDB needs to share between
++ * the core, I/O and arch-specific portions.
++ *
++ * Author: Amit Kale <amitkale@linsyssoft.com> and
++ *         Tom Rini <trini@kernel.crashing.org>
++ *
++ * 2001-2004 (c) Amit S. Kale and 2003-2005 (c) MontaVista Software, Inc.
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of any
++ * kind, whether express or implied.
++ */
++#ifdef __KERNEL__
++#ifndef _KGDB_H_
++#define _KGDB_H_
++
++#include <asm/atomic.h>
++
++#ifdef CONFIG_KGDB
++#include <asm/kgdb.h>
++#include <linux/serial_8250.h>
++#include <linux/linkage.h>
++#include <linux/init.h>
++
++#ifndef CHECK_EXCEPTION_STACK
++#define CHECK_EXCEPTION_STACK()       1
++#endif
++
++struct tasklet_struct;
++struct pt_regs;
++struct task_struct;
++struct uart_port;
++
++#ifdef CONFIG_KGDB_CONSOLE
++extern struct console kgdbcons;
++#endif
++
++/* To enter the debugger explicitly. */
++extern void breakpoint(void);
++extern int kgdb_connected;
++extern int kgdb_may_fault;
++extern struct tasklet_struct kgdb_tasklet_breakpoint;
++
++extern atomic_t kgdb_setting_breakpoint;
++extern atomic_t cpu_doing_single_step;
++extern atomic_t kgdb_sync_softlockup[NR_CPUS];
++
++extern struct task_struct *kgdb_usethread, *kgdb_contthread;
++
++enum kgdb_bptype {
++      bp_breakpoint = '0',
++      bp_hardware_breakpoint,
++      bp_write_watchpoint,
++      bp_read_watchpoint,
++      bp_access_watchpoint
++};
++
++enum kgdb_bpstate {
++      bp_none = 0,
++      bp_removed,
++      bp_set,
++      bp_active
++};
++
++struct kgdb_bkpt {
++      unsigned long bpt_addr;
++      unsigned char saved_instr[BREAK_INSTR_SIZE];
++      enum kgdb_bptype type;
++      enum kgdb_bpstate state;
++};
++
++/* The maximum number of KGDB I/O modules that can be loaded */
++#define MAX_KGDB_IO_HANDLERS 3
++
++#ifndef MAX_BREAKPOINTS
++#define MAX_BREAKPOINTS               1000
++#endif
++
++#define KGDB_HW_BREAKPOINT    1
++
++/* Required functions. */
++/**
++ *    regs_to_gdb_regs - Convert ptrace regs to GDB regs
++ *    @gdb_regs: A pointer to hold the registers in the order GDB wants.
++ *    @regs: The &struct pt_regs of the current process.
++ *
++ *    Convert the pt_regs in @regs into the format for registers that
++ *    GDB expects, stored in @gdb_regs.
++ */
++extern void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs);
++
++/**
++ *    sleeping_thread_to_gdb_regs - Convert ptrace regs to GDB regs
++ *    @gdb_regs: A pointer to hold the registers in the order GDB wants.
++ *    @p: The &struct task_struct of the desired process.
++ *
++ *    Convert the register values of the sleeping process in @p to
++ *    the format that GDB expects.
++ *    This function is called when kgdb does not have access to the
++ *    &struct pt_regs and therefore it should fill the gdb registers
++ *    @gdb_regs with what has been saved in &struct thread_struct
++ *    thread field during switch_to.
++ */
++extern void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs,
++                                      struct task_struct *p);
++
++/**
++ *    gdb_regs_to_regs - Convert GDB regs to ptrace regs.
++ *    @gdb_regs: A pointer to hold the registers we've received from GDB.
++ *    @regs: A pointer to a &struct pt_regs to hold these values in.
++ *
++ *    Convert the GDB regs in @gdb_regs into the pt_regs, and store them
++ *    in @regs.
++ */
++extern void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs);
++
++/**
++ *    kgdb_arch_handle_exception - Handle architecture specific GDB packets.
++ *    @vector: The error vector of the exception that happened.
++ *    @signo: The signal number of the exception that happened.
++ *    @err_code: The error code of the exception that happened.
++ *    @remcom_in_buffer: The buffer of the packet we have read.
++ *    @remcom_out_buffer: The buffer, of %BUFMAX to write a packet into.
++ *    @regs: The &struct pt_regs of the current process.
++ *
++ *    This function MUST handle the 'c' and 's' command packets,
++ *    as well as packets to set / remove a hardware breakpoint, if used.
++ *    If there are additional packets which the hardware needs to handle,
++ *    they are handled here.  The code should return -1 if it wants to
++ *    process more packets, and a %0 or %1 if it wants to exit from the
++ *    kgdb hook.
++ */
++extern int kgdb_arch_handle_exception(int vector, int signo, int err_code,
++                                    char *remcom_in_buffer,
++                                    char *remcom_out_buffer,
++                                    struct pt_regs *regs);
++
++#ifndef JMP_REGS_ALIGNMENT
++#define JMP_REGS_ALIGNMENT
++#endif
++
++extern unsigned long kgdb_fault_jmp_regs[];
++
++/**
++ *    kgdb_fault_setjmp - Store state in case we fault.
++ *    @curr_context: An array to store state into.
++ *
++ *    Certain functions may try and access memory, and in doing so may
++ *    cause a fault.  When this happens, we trap it, restore state to
++ *    this call, and let ourself know that something bad has happened.
++ */
++extern asmlinkage int kgdb_fault_setjmp(unsigned long *curr_context);
++
++/**
++ *    kgdb_fault_longjmp - Restore state when we have faulted.
++ *    @curr_context: The previously stored state.
++ *
++ *    When something bad does happen, this function is called to
++ *    restore the known good state, and set the return value to 1, so
++ *    we know something bad happened.
++ */
++extern asmlinkage void kgdb_fault_longjmp(unsigned long *curr_context);
++
++/* Optional functions. */
++extern int kgdb_arch_init(void);
++extern void kgdb_disable_hw_debug(struct pt_regs *regs);
++extern void kgdb_post_master_code(struct pt_regs *regs, int e_vector,
++                                int err_code);
++extern void kgdb_roundup_cpus(unsigned long flags);
++extern int kgdb_set_hw_break(unsigned long addr);
++extern int kgdb_remove_hw_break(unsigned long addr);
++extern void kgdb_remove_all_hw_break(void);
++extern void kgdb_correct_hw_break(void);
++extern void kgdb_shadowinfo(struct pt_regs *regs, char *buffer,
++                          unsigned threadid);
++extern struct task_struct *kgdb_get_shadow_thread(struct pt_regs *regs,
++                                                int threadid);
++extern struct pt_regs *kgdb_shadow_regs(struct pt_regs *regs, int threadid);
++extern int kgdb_validate_break_address(unsigned long addr);
++extern int kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr);
++extern int kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle);
++
++/**
++ * struct kgdb_arch - Describe architecture specific values.
++ * @gdb_bpt_instr: The instruction to trigger a breakpoint.
++ * @flags: Flags for the breakpoint, currently just %KGDB_HW_BREAKPOINT.
++ * @shadowth: A value of %1 indicates we shadow information on processes.
++ * @set_breakpoint: Allow an architecture to specify how to set a software
++ * breakpoint.
++ * @remove_breakpoint: Allow an architecture to specify how to remove a
++ * software breakpoint.
++ * @set_hw_breakpoint: Allow an architecture to specify how to set a hardware
++ * breakpoint.
++ * @remove_hw_breakpoint: Allow an architecture to specify how to remove a
++ * hardware breakpoint.
++ *
++ * The @shadowth flag is an option to shadow information not retrievable by
++ * gdb otherwise.  This is deprecated in favor of a binutils which supports
++ * CFI macros.
++ */
++struct kgdb_arch {
++      unsigned char gdb_bpt_instr[BREAK_INSTR_SIZE];
++      unsigned long flags;
++      unsigned shadowth;
++      int (*set_breakpoint) (unsigned long, char *);
++      int (*remove_breakpoint)(unsigned long, char *);
++      int (*set_hw_breakpoint)(unsigned long, int, enum kgdb_bptype);
++      int (*remove_hw_breakpoint)(unsigned long, int, enum kgdb_bptype);
++};
++
++/* Thread reference */
++typedef unsigned char threadref[8];
++
++/**
++ * struct kgdb_io - Describe the interface for an I/O driver to talk with KGDB.
++ * @read_char: Pointer to a function that will return one char.
++ * @write_char: Pointer to a function that will write one char.
++ * @flush: Pointer to a function that will flush any pending writes.
++ * @init: Pointer to a function that will initialize the device.
++ * @late_init: Pointer to a function that will do any setup that has
++ * other dependencies.
++ * @pre_exception: Pointer to a function that will do any prep work for
++ * the I/O driver.
++ * @post_exception: Pointer to a function that will do any cleanup work
++ * for the I/O driver.
++ *
++ * The @init and @late_init function pointers allow for an I/O driver
++ * such as a serial driver to fully initialize the port with @init and
++ * be called very early, yet safely call request_irq() later in the boot
++ * sequence.
++ *
++ * @init is allowed to return a non-0 return value to indicate failure.
++ * If this is called early on, then KGDB will try again when it would call
++ * @late_init.  If it has failed later in boot as well, the user will be
++ * notified.
++ */
++struct kgdb_io {
++      int (*read_char) (void);
++      void (*write_char) (u8);
++      void (*flush) (void);
++      int (*init) (void);
++      void (*late_init) (void);
++      void (*pre_exception) (void);
++      void (*post_exception) (void);
++};
++
++extern struct kgdb_io kgdb_io_ops;
++extern struct kgdb_arch arch_kgdb_ops;
++extern int kgdb_initialized;
++
++extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops);
++extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops);
++
++extern void __init kgdb8250_add_port(int i, struct uart_port *serial_req);
++extern void __init kgdb8250_add_platform_port(int i, struct plat_serial8250_port *serial_req);
++
++extern int kgdb_hex2long(char **ptr, long *long_val);
++extern char *kgdb_mem2hex(char *mem, char *buf, int count);
++extern char *kgdb_hex2mem(char *buf, char *mem, int count);
++extern int kgdb_get_mem(char *addr, unsigned char *buf, int count);
++extern int kgdb_set_mem(char *addr, unsigned char *buf, int count);
++
++int kgdb_isremovedbreak(unsigned long addr);
++int kgdb_skipexception(int exception, struct pt_regs *regs);
++
++extern int kgdb_handle_exception(int ex_vector, int signo, int err_code,
++                              struct pt_regs *regs);
++extern void kgdb_nmihook(int cpu, void *regs);
++extern int debugger_step;
++extern atomic_t debugger_active;
++extern struct kgdb_arch *kgdb_ops;
++#else
++/* Stubs for when KGDB is not set. */
++static const atomic_t debugger_active = ATOMIC_INIT(0);
++#endif                                /* CONFIG_KGDB */
++#endif                                /* _KGDB_H_ */
++#endif                                /* __KERNEL__ */
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/linux/module.h linux-2.6.18-53.1.14.kgdb/include/linux/module.h
+--- linux-2.6.18-53.1.14/include/linux/module.h        2008-03-06 05:54:41.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/include/linux/module.h   2008-06-10 15:39:15.000000000 +0400
+@@ -229,8 +229,17 @@ enum module_state
+       MODULE_STATE_LIVE,
+       MODULE_STATE_COMING,
+       MODULE_STATE_GOING,
++      MODULE_STATE_GONE,
+ };
+ 
++#ifdef CONFIG_KGDB
++#define MAX_SECTNAME 31
++struct mod_section {
++       void *address;
++       char name[MAX_SECTNAME + 1];
++};
++#endif
++
+ /* Similar stuff for section attributes. */
+ #define MODULE_SECT_NAME_LEN 32
+ struct module_sect_attr
+@@ -258,6 +267,13 @@ struct module
+       /* Unique handle for this module */
+       char name[MODULE_NAME_LEN];
+ 
++#ifdef CONFIG_KGDB
++      /* keep kgdb info at the beginning so that gdb doesn't have a chance to
++       * miss out any fields */
++      unsigned long num_sections;
++      struct mod_section *mod_sections;
++#endif
++
+       /* Sysfs stuff. */
+       struct module_kobject mkobj;
+       struct module_param_attrs *param_attrs;
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/linux/netpoll.h linux-2.6.18-53.1.14.kgdb/include/linux/netpoll.h
+--- linux-2.6.18-53.1.14/include/linux/netpoll.h       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/include/linux/netpoll.h  2008-06-10 15:37:49.000000000 +0400
+@@ -17,7 +17,7 @@ struct netpoll;
+ struct netpoll {
+       struct net_device *dev;
+       char dev_name[16], *name;
+-      void (*rx_hook)(struct netpoll *, int, char *, int);
++      void (*rx_hook)(struct netpoll *, int, char *, int, struct sk_buff *);
+       void (*drop)(struct sk_buff *skb);
+       u32 local_ip, remote_ip;
+       u16 local_port, remote_port;
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/linux/serial_8250.h linux-2.6.18-53.1.14.kgdb/include/linux/serial_8250.h
+--- linux-2.6.18-53.1.14/include/linux/serial_8250.h   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/include/linux/serial_8250.h      2008-06-10 15:37:43.000000000 +0400
+@@ -56,6 +56,7 @@ struct uart_port;
+ 
+ int serial8250_register_port(struct uart_port *);
+ void serial8250_unregister_port(int line);
++void serial8250_unregister_by_port(struct uart_port *port);
+ void serial8250_suspend_port(int line);
+ void serial8250_resume_port(int line);
+ 
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/kernel/Makefile linux-2.6.18-53.1.14.kgdb/kernel/Makefile
+--- linux-2.6.18-53.1.14/kernel/Makefile       2008-03-06 05:54:50.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/kernel/Makefile  2008-06-10 15:37:25.000000000 +0400
+@@ -42,6 +42,7 @@ obj-$(CONFIG_STOP_MACHINE) += stop_machi
+ obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
+ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_tree.o
+ obj-$(CONFIG_KPROBES) += kprobes.o
++obj-$(CONFIG_KGDB) += kgdb.o kgdbarchlib.o
+ obj-$(CONFIG_SYSFS) += ksysfs.o
+ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/kernel/kgdb.c
+--- linux-2.6.18-53.1.14/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/kernel/kgdb.c    2008-06-10 15:39:21.000000000 +0400
+@@ -0,0 +1,1778 @@
++/*
++ * kernel/kgdb.c
++ *
++ * Maintainer: Tom Rini <trini@kernel.crashing.org>
++ *
++ * Copyright (C) 2000-2001 VERITAS Software Corporation.
++ * Copyright (C) 2002-2004 Timesys Corporation
++ * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
++ * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
++ * Copyright (C) 2004-2005 Tom Rini <trini@kernel.crashing.org>
++ * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
++ * Copyright (C) 2005 Wind River Systems, Inc.
++ *
++ * Contributors at various stages not listed above:
++ *  Jason Wessel ( jason.wessel@windriver.com )
++ *  George Anzinger <george@mvista.com>
++ *  Anurekh Saxena (anurekh.saxena@timesys.com)
++ *  Lake Stevens Instrument Division (Glenn Engel)
++ *  Jim Kingdon, Cygnus Support.
++ *
++ * Original KGDB stub: David Grothe <dave@gcom.com>,
++ * Tigran Aivazian <tigran@sco.com>
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of any
++ * kind, whether express or implied.
++ */
++
++#include <linux/string.h>
++#include <linux/kernel.h>
++#include <linux/interrupt.h>
++#include <linux/sched.h>
++#include <linux/smp.h>
++#include <linux/spinlock.h>
++#include <linux/delay.h>
++#include <linux/mm.h>
++#include <linux/threads.h>
++#include <linux/reboot.h>
++#include <asm/system.h>
++#include <asm/ptrace.h>
++#include <asm/uaccess.h>
++#include <linux/kgdb.h>
++#include <asm/atomic.h>
++#include <linux/notifier.h>
++#include <linux/module.h>
++#include <asm/cacheflush.h>
++#include <linux/init.h>
++#include <linux/sysrq.h>
++#include <linux/console.h>
++#include <linux/sched.h>
++#include <asm/byteorder.h>
++
++extern int pid_max;
++/* How many times to count all of the waiting CPUs */
++#define ROUNDUP_WAIT          640000  /* Arbitrary, increase if needed. */
++#define BUF_THREAD_ID_SIZE    16
++
++/*
++ * kgdb_initialized with a value of 1 indicates that kgdb is setup and is
++ * all ready to serve breakpoints and other kernel exceptions.  A value of
++ * -1 indicates that we have tried to initialize early, and need to try
++ * again later.
++ */
++int kgdb_initialized;
++/* Is a host GDB connected to us? */
++int kgdb_connected;
++/* Could we be about to try and access a bad memory location? If so we
++ * also need to flag this has happened. */
++int kgdb_may_fault;
++/* All the KGDB handlers are installed */
++int kgdb_from_module_registered = 0;
++
++/* We provide a kgdb_io_ops structure that may be overridden. */
++struct kgdb_io __attribute__ ((weak)) kgdb_io_ops;
++
++static struct kgdb_io kgdb_io_ops_prev[MAX_KGDB_IO_HANDLERS];
++static int kgdb_io_handler_cnt = 0;
++
++/* Export the following symbols for use with kernel modules */
++EXPORT_SYMBOL(kgdb_io_ops);
++EXPORT_SYMBOL(kgdb_tasklet_breakpoint);
++EXPORT_SYMBOL(kgdb_connected);
++EXPORT_SYMBOL(kgdb_register_io_module);
++EXPORT_SYMBOL(kgdb_unregister_io_module);
++EXPORT_SYMBOL(debugger_active);
++
++/*
++ * Holds information about breakpoints in a kernel. These breakpoints are
++ * added and removed by gdb.
++ */
++struct kgdb_bkpt kgdb_break[MAX_BREAKPOINTS];
++
++static const char hexchars[] = "0123456789abcdef";
++
++static spinlock_t slavecpulocks[NR_CPUS];
++static atomic_t procindebug[NR_CPUS];
++atomic_t kgdb_setting_breakpoint;
++EXPORT_SYMBOL(kgdb_setting_breakpoint);
++struct task_struct *kgdb_usethread, *kgdb_contthread;
++
++int debugger_step;
++atomic_t debugger_active;
++
++/* Our I/O buffers. */
++static char remcom_in_buffer[BUFMAX];
++static char remcom_out_buffer[BUFMAX];
++/* Storage for the registers, in GDB format. */
++static unsigned long gdb_regs[(NUMREGBYTES + sizeof(unsigned long) - 1) /
++                            sizeof(unsigned long)];
++/* Storage of registers for handling a fault. */
++unsigned long kgdb_fault_jmp_regs[NUMCRITREGBYTES / sizeof(unsigned long)]
++ JMP_REGS_ALIGNMENT;
++static int kgdb_notify_reboot(struct notifier_block *this,
++                              unsigned long code ,void *x);
++struct debuggerinfo_struct {
++      void *debuggerinfo;
++      struct task_struct *task;
++} kgdb_info[NR_CPUS];
++
++/* to keep track of the CPU which is doing the single stepping*/
++atomic_t cpu_doing_single_step = ATOMIC_INIT(-1);
++
++atomic_t  kgdb_sync_softlockup[NR_CPUS] = {ATOMIC_INIT(0)};
++
++/* reboot notifier block */
++static struct notifier_block kgdb_reboot_notifier = {
++      .notifier_call  = kgdb_notify_reboot,
++      .next           = NULL,
++      .priority       = INT_MAX,
++};
++
++static int hex(char ch)
++{
++      if ((ch >= 'a') && (ch <= 'f'))
++              return (ch - 'a' + 10);
++      if ((ch >= '0') && (ch <= '9'))
++              return (ch - '0');
++      if ((ch >= 'A') && (ch <= 'F'))
++              return (ch - 'A' + 10);
++      return (-1);
++}
++
++/* scan for the sequence $<data>#<checksum>   */
++static void get_packet(char *buffer)
++{
++      unsigned char checksum;
++      unsigned char xmitcsum;
++      int count;
++      char ch;
++      if (!kgdb_io_ops.read_char)
++              return;
++      do {
++              /* Spin and wait around for the start character, ignore all
++               * other characters */
++              while ((ch = (kgdb_io_ops.read_char())) != '$') ;
++              kgdb_connected = 1;
++              checksum = 0;
++              xmitcsum = -1;
++
++              count = 0;
++
++              /* now, read until a # or end of buffer is found */
++              while (count < (BUFMAX - 1)) {
++                      ch = kgdb_io_ops.read_char();
++                      if (ch == '#')
++                              break;
++                      checksum = checksum + ch;
++                      buffer[count] = ch;
++                      count = count + 1;
++              }
++              buffer[count] = 0;
++
++              if (ch == '#') {
++                      xmitcsum = hex(kgdb_io_ops.read_char()) << 4;
++                      xmitcsum += hex(kgdb_io_ops.read_char());
++
++                      if (checksum != xmitcsum)
++                              /* failed checksum */
++                              kgdb_io_ops.write_char('-');
++                      else
++                              /* successful transfer */
++                              kgdb_io_ops.write_char('+');
++                      if (kgdb_io_ops.flush)
++                              kgdb_io_ops.flush();
++              }
++      } while (checksum != xmitcsum);
++}
++
++/*
++ * Send the packet in buffer.
++ * Check for gdb connection if asked for.
++ */
++static void put_packet(char *buffer)
++{
++      unsigned char checksum;
++      int count;
++      char ch;
++
++      if (!kgdb_io_ops.write_char)
++              return;
++      /* $<packet info>#<checksum>. */
++      while (1) {
++              kgdb_io_ops.write_char('$');
++              checksum = 0;
++              count = 0;
++
++              while ((ch = buffer[count])) {
++                      kgdb_io_ops.write_char(ch);
++                      checksum += ch;
++                      count++;
++              }
++
++              kgdb_io_ops.write_char('#');
++              kgdb_io_ops.write_char(hexchars[checksum >> 4]);
++              kgdb_io_ops.write_char(hexchars[checksum % 16]);
++              if (kgdb_io_ops.flush)
++                      kgdb_io_ops.flush();
++
++              /* Now see what we get in reply. */
++              ch = kgdb_io_ops.read_char();
++
++              if (ch == 3)
++                      ch = kgdb_io_ops.read_char();
++
++              /* If we get an ACK, we are done. */
++              if (ch == '+')
++                      return;
++
++              /* If we get the start of another packet, this means
++               * that GDB is attempting to reconnect.  We will NAK
++               * the packet being sent, and stop trying to send this
++               * packet. */
++              if (ch == '$') {
++                      kgdb_io_ops.write_char('-');
++                      if (kgdb_io_ops.flush)
++                              kgdb_io_ops.flush();
++                      return;
++              }
++      }
++}
++
++/*
++ * convert the memory pointed to by mem into hex, placing result in buf
++ * return a pointer to the last char put in buf (null). May return an error.
++ */
++char *kgdb_mem2hex(char *mem, char *buf, int count)
++{
++      kgdb_may_fault = 1;
++      if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) {
++              kgdb_may_fault = 0;
++              return ERR_PTR(-EINVAL);
++      }
++      /* Accessing some registers in a single load instruction is
++       * required to avoid bad side effects for some I/O registers.
++       */
++      if ((count == 2) && (((long)mem & 1) == 0)) {
++              unsigned short tmp_s = *(unsigned short *)mem;
++              mem += 2;
++#ifdef __BIG_ENDIAN
++              *buf++ = hexchars[(tmp_s >> 12) & 0xf];
++              *buf++ = hexchars[(tmp_s >> 8) & 0xf];
++              *buf++ = hexchars[(tmp_s >> 4) & 0xf];
++              *buf++ = hexchars[tmp_s & 0xf];
++#else
++              *buf++ = hexchars[(tmp_s >> 4) & 0xf];
++              *buf++ = hexchars[tmp_s & 0xf];
++              *buf++ = hexchars[(tmp_s >> 12) & 0xf];
++              *buf++ = hexchars[(tmp_s >> 8) & 0xf];
++#endif
++      } else if ((count == 4) && (((long)mem & 3) == 0)) {
++              unsigned long tmp_l = *(unsigned int *)mem;
++              mem += 4;
++#ifdef __BIG_ENDIAN
++              *buf++ = hexchars[(tmp_l >> 28) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 24) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 20) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 16) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 12) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 8) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 4) & 0xf];
++              *buf++ = hexchars[tmp_l & 0xf];
++#else
++              *buf++ = hexchars[(tmp_l >> 4) & 0xf];
++              *buf++ = hexchars[tmp_l & 0xf];
++              *buf++ = hexchars[(tmp_l >> 12) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 8) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 20) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 16) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 28) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 24) & 0xf];
++#endif
++#ifdef CONFIG_64BIT
++      } else if ((count == 8) && (((long)mem & 7) == 0)) {
++              unsigned long long tmp_ll = *(unsigned long long *)mem;
++              mem += 8;
++#ifdef __BIG_ENDIAN
++              *buf++ = hexchars[(tmp_ll >> 60) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 56) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 52) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 48) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 44) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 40) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 36) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 32) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 28) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 24) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 20) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 16) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 12) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 8) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 4) & 0xf];
++              *buf++ = hexchars[tmp_ll & 0xf];
++#else
++              *buf++ = hexchars[(tmp_ll >> 4) & 0xf];
++              *buf++ = hexchars[tmp_ll & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 12) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 8) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 20) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 16) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 28) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 24) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 36) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 32) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 44) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 40) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 52) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 48) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 60) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 56) & 0xf];
++#endif
++#endif
++      } else {
++              while (count-- > 0) {
++                      unsigned char ch = *mem++;
++                      *buf++ = hexchars[ch >> 4];
++                      *buf++ = hexchars[ch & 0xf];
++              }
++      }
++      kgdb_may_fault = 0;
++      *buf = 0;
++      return (buf);
++}
++
++/*
++ * Copy the binary array pointed to by buf into mem.  Fix $, #, and
++ * 0x7d escaped with 0x7d.  Return a pointer to the character after
++ * the last byte written.
++ */
++static char *kgdb_ebin2mem(char *buf, char *mem, int count)
++{
++      kgdb_may_fault = 1;
++      if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) {
++              kgdb_may_fault = 0;
++              return ERR_PTR(-EINVAL);
++      }
++      for (; count > 0; count--, buf++) {
++              if (*buf == 0x7d)
++                      *mem++ = *(++buf) ^ 0x20;
++              else
++                      *mem++ = *buf;
++      }
++      kgdb_may_fault = 0;
++      return mem;
++}
++
++/*
++ * Convert count bytes of hex text at buf into binary stored at mem; aligned
++ * 2- and 4-byte requests use a single store of exactly that width.  Returns
++ * a pointer just past the last byte written, or ERR_PTR(-EINVAL) on a fault.
++ */
++char *kgdb_hex2mem(char *buf, char *mem, int count)
++{
++      kgdb_may_fault = 1;
++      if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) {
++              kgdb_may_fault = 0;
++              return ERR_PTR(-EINVAL);
++      }
++      if ((count == 2) && (((long)mem & 1) == 0)) {
++              unsigned short tmp_s = 0;
++#ifdef __BIG_ENDIAN
++              tmp_s |= hex(*buf++) << 12;
++              tmp_s |= hex(*buf++) << 8;
++              tmp_s |= hex(*buf++) << 4;
++              tmp_s |= hex(*buf++);
++#else
++              tmp_s |= hex(*buf++) << 4;
++              tmp_s |= hex(*buf++);
++              tmp_s |= hex(*buf++) << 12;
++              tmp_s |= hex(*buf++) << 8;
++#endif
++              *(unsigned short *)mem = tmp_s;
++              mem += 2;
++      } else if ((count == 4) && (((long)mem & 3) == 0)) {
++              unsigned int tmp_l = 0; /* 32 bits, matches kgdb_mem2hex */
++#ifdef __BIG_ENDIAN
++              tmp_l |= (unsigned int)hex(*buf++) << 28;
++              tmp_l |= hex(*buf++) << 24;
++              tmp_l |= hex(*buf++) << 20;
++              tmp_l |= hex(*buf++) << 16;
++              tmp_l |= hex(*buf++) << 12;
++              tmp_l |= hex(*buf++) << 8;
++              tmp_l |= hex(*buf++) << 4;
++              tmp_l |= hex(*buf++);
++#else
++              tmp_l |= hex(*buf++) << 4;
++              tmp_l |= hex(*buf++);
++              tmp_l |= hex(*buf++) << 12;
++              tmp_l |= hex(*buf++) << 8;
++              tmp_l |= hex(*buf++) << 20;
++              tmp_l |= hex(*buf++) << 16;
++              tmp_l |= (unsigned int)hex(*buf++) << 28;
++              tmp_l |= hex(*buf++) << 24;
++#endif
++              *(unsigned int *)mem = tmp_l;   /* 4 bytes even if long is 8 */
++              mem += 4;
++      } else {
++              int i;
++              for (i = 0; i < count; i++) {
++                      unsigned char ch = hex(*buf++) << 4;
++                      ch |= hex(*buf++);
++                      *mem++ = ch;
++              }
++      }
++      kgdb_may_fault = 0;
++      return (mem);
++}
++
++/*
++ * While we find nice hex chars, build a long_val.
++ * Return number of chars processed.
++ *
++ * *ptr is advanced past every digit consumed and is left pointing at the
++ * first character for which hex() returns a negative value, or at the
++ * terminating NUL.  *long_val is always written (0 when no digit is found).
++ */
++int kgdb_hex2long(char **ptr, long *long_val)
++{
++      int hex_val, num = 0;
++
++      *long_val = 0;
++
++      while (**ptr) {
++              hex_val = hex(**ptr);
++              if (hex_val < 0)
++                      break;
++
++              /* Accumulate as unsigned: left-shifting a signed long whose
++               * top nibble is already in use is undefined behaviour. */
++              *long_val = (long)(((unsigned long)*long_val << 4) |
++                                 (unsigned long)hex_val);
++              num++;
++              (*ptr)++;
++      }
++
++      return (num);
++}
++
++/*
++ * Write memory due to an 'M' or 'X' packet.
++ *
++ * Parses "<addr>,<length>:<data>" starting at remcom_in_buffer[1].  The
++ * data is copied with kgdb_ebin2mem() when binary is non-zero and with
++ * kgdb_hex2mem() otherwise.
++ *
++ * Returns NULL on success, the ERR_PTR() value from the copy helper if the
++ * copy failed, or ERR_PTR(-EINVAL) if the packet header is malformed.
++ */
++static char *write_mem_msg(int binary)
++{
++      char *ptr = &remcom_in_buffer[1];
++      unsigned long addr, length;
++
++      /* kgdb_hex2long() takes "long *"; the parsed values are used as
++       * unsigned quantities here. */
++      if (kgdb_hex2long(&ptr, (long *)&addr) > 0 && *(ptr++) == ',' &&
++          kgdb_hex2long(&ptr, (long *)&length) > 0 && *(ptr++) == ':') {
++              if (binary)
++                      ptr = kgdb_ebin2mem(ptr, (char *)addr, length);
++              else
++                      ptr = kgdb_hex2mem(ptr, (char *)addr, length);
++              /* Check the result before flushing: after a failed copy
++               * the range is not known to be accessible, and the flush
++               * runs with kgdb_may_fault already cleared. */
++              if (IS_ERR(ptr))
++                      return ptr;
++              if (CACHE_FLUSH_IS_SAFE)
++                      flush_icache_range(addr, addr + length + 1);
++              return NULL;
++      }
++
++      return ERR_PTR(-EINVAL);
++}
++
++/*
++ * Store the low eight bits of byte at pkt as two characters taken from
++ * hexchars, high nibble first.  Returns the position just past the pair.
++ */
++static inline char *pack_hex_byte(char *pkt, int byte)
++{
++      int hi_nibble = (byte >> 4) & 0xf;
++      int lo_nibble = byte & 0xf;
++
++      pkt[0] = hexchars[hi_nibble];
++      pkt[1] = hexchars[lo_nibble];
++      return pkt + 2;
++}
++
++/*
++ * Build an "Exx" error reply in pkt (four bytes including the NUL).
++ * Callers pass a negative errno; it is negated and its tens and units
++ * digits are each looked up in hexchars.
++ */
++static inline void error_packet(char *pkt, int error)
++{
++      int code = -error;
++
++      pkt[0] = 'E';
++      pkt[1] = hexchars[code / 10];
++      pkt[2] = hexchars[code % 10];
++      pkt[3] = '\0';
++}
++
++/*
++ * Hex-encode the bytes of *id into pkt, stopping once BUF_THREAD_ID_SIZE
++ * output characters have been produced.  Returns the position just past
++ * the last character written.
++ */
++static char *pack_threadid(char *pkt, threadref * id)
++{
++      unsigned char *src = (unsigned char *)id;
++      char *end = pkt + BUF_THREAD_ID_SIZE;
++
++      for (; pkt < end; src++)
++              pkt = pack_hex_byte(pkt, *src);
++
++      return pkt;
++}
++
++/*
++ * Fill the first eight bytes of *id from value: four zero bytes followed
++ * by the four bytes of value, most significant first.
++ */
++void int_to_threadref(threadref * id, int value)
++{
++      unsigned char *bytes = (unsigned char *)id;
++      int pos, shift;
++
++      for (pos = 0; pos < 4; pos++)
++              bytes[pos] = 0;
++
++      for (shift = 24; shift >= 0; shift -= 8)
++              bytes[pos++] = (value >> shift) & 0xff;
++}
++
++/*
++ * Map a thread id as used on the wire to a task.
++ *
++ * The id space is split into three ranges:
++ *   1 .. pid_max-1                      looked up with find_task_by_pid()
++ *   pid_max .. pid_max+num_online_cpus()-1
++ *                                       idle_task(tid - pid_max)
++ *   above that, when kgdb_ops->shadowth is non-zero
++ *                                       kgdb_get_shadow_thread()
++ *
++ * Returns NULL for tid 0 and for ids past the last shadow thread.  While
++ * last_pid is still 0 every id maps to current.
++ */
++static struct task_struct *getthread(struct pt_regs *regs, int tid)
++{
++      if (last_pid == 0)
++              return current;
++
++      /* Past the end of the shadow-thread range: no such thread. */
++      if (num_online_cpus() &&
++          (tid >= pid_max + num_online_cpus() + kgdb_ops->shadowth))
++              return NULL;
++
++      if (kgdb_ops->shadowth && (tid >= pid_max + num_online_cpus()))
++              return kgdb_get_shadow_thread(regs, tid - pid_max -
++                                            num_online_cpus());
++
++      if (tid >= pid_max)
++              return idle_task(tid - pid_max);
++
++      if (!tid)
++              return NULL;
++
++      return find_task_by_pid(tid);
++}
++
++#ifdef CONFIG_SMP
++/*
++ * Park the calling CPU while the master CPU runs the debugger.
++ *
++ * Publishes regs and current in kgdb_info[] for this CPU, marks the CPU
++ * in procindebug[], waits for the master to mark itself, and then blocks
++ * on this CPU's slavecpulocks[] entry.  Once the lock is obtained the
++ * published state is cleared again and the lock is dropped.  Local
++ * interrupts are disabled for the whole function.
++ */
++static void kgdb_wait(struct pt_regs *regs)
++{
++      unsigned long flags;
++      int processor;
++
++      local_irq_save(flags);
++      processor = smp_processor_id();
++      kgdb_info[processor].debuggerinfo = regs;
++      kgdb_info[processor].task = current;
++      atomic_set(&procindebug[processor], 1);
++      atomic_set(&kgdb_sync_softlockup[smp_processor_id()], 1);
++
++      /* Wait till master processor goes completely into the debugger.
++       * FIXME: this looks racy
++       *
++       * debugger_active is read as (master cpu + 1) here, hence the -1
++       * when indexing procindebug[]. */
++      while (!atomic_read(&procindebug[atomic_read(&debugger_active) - 1])) {
++              int i = 10;     /* an arbitrary number */
++
++              while (--i)
++                      cpu_relax();
++      }
++
++      /* Wait till master processor is done with debugging */
++      /* NOTE(review): presumably the master holds every slavecpulocks[]
++       * entry while it is in the debugger -- confirm where it unlocks. */
++      spin_lock(&slavecpulocks[processor]);
++
++      /* This has been taken from x86 kgdb implementation and
++       * will be needed by architectures that have SMP support
++       */
++      kgdb_correct_hw_break();
++
++      kgdb_info[processor].debuggerinfo = NULL;
++      kgdb_info[processor].task = NULL;
++
++      /* Signal the master processor that we are done */
++      atomic_set(&procindebug[processor], 0);
++      spin_unlock(&slavecpulocks[processor]);
++      local_irq_restore(flags);
++}
++#endif
++
++/*
++ * Copy count bytes from addr into buf, one byte at a time.
++ *
++ * Returns 0 on success.  Returns -EINVAL if an address below TASK_SIZE is
++ * reached (bytes before it have already been copied) or if
++ * kgdb_fault_setjmp() returns non-zero.  kgdb_may_fault is cleared on
++ * every return path.
++ */
++int kgdb_get_mem(char *addr, unsigned char *buf, int count)
++{
++      kgdb_may_fault = 1;
++      if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) {
++              kgdb_may_fault = 0;
++              return -EINVAL;
++      }
++      while (count) {
++              if ((unsigned long)addr < TASK_SIZE) {
++                      /* Do not leave the fault-recovery flag set once
++                       * this function has returned. */
++                      kgdb_may_fault = 0;
++                      return -EINVAL;
++              }
++              *buf++ = *addr++;
++              count--;
++      }
++      kgdb_may_fault = 0;
++      return 0;
++}
++
++/*
++ * Copy count bytes from buf to addr, one byte at a time.
++ *
++ * Returns 0 on success.  Returns -EINVAL if an address below TASK_SIZE is
++ * reached (bytes before it have already been written) or if
++ * kgdb_fault_setjmp() returns non-zero.  kgdb_may_fault is cleared on
++ * every return path.
++ */
++int kgdb_set_mem(char *addr, unsigned char *buf, int count)
++{
++      kgdb_may_fault = 1;
++      if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) {
++              kgdb_may_fault = 0;
++              return -EINVAL;
++      }
++      while (count) {
++              if ((unsigned long)addr < TASK_SIZE) {
++                      /* Do not leave the fault-recovery flag set once
++                       * this function has returned. */
++                      kgdb_may_fault = 0;
++                      return -EINVAL;
++              }
++              *addr++ = *buf++;
++              count--;
++      }
++      kgdb_may_fault = 0;
++      return 0;
++}
++/*
++ * Install every breakpoint in the bp_set state through
++ * kgdb_arch_set_breakpoint() and mark it bp_active.  Stops at, and
++ * returns, the first non-zero error from the arch hook; returns 0
++ * otherwise.
++ */
++int kgdb_activate_sw_breakpoints(void)
++{
++      unsigned long bp_addr, bp_end;
++      int slot, rc;
++
++      for (slot = 0; slot < MAX_BREAKPOINTS; slot++) {
++              if (kgdb_break[slot].state != bp_set)
++                      continue;
++
++              bp_addr = kgdb_break[slot].bpt_addr;
++              rc = kgdb_arch_set_breakpoint(bp_addr,
++                                            kgdb_break[slot].saved_instr);
++              if (rc)
++                      return rc;
++
++              if (CACHE_FLUSH_IS_SAFE) {
++                      bp_end = bp_addr + BREAK_INSTR_SIZE;
++                      /* User-range address with an mm: flush through the
++                       * mm; otherwise flush the instruction cache. */
++                      if (current->mm && bp_addr < TASK_SIZE)
++                              flush_cache_range(current->mm->mmap_cache,
++                                                bp_addr, bp_end);
++                      else
++                              flush_icache_range(bp_addr, bp_end);
++              }
++
++              kgdb_break[slot].state = bp_active;
++      }
++      return 0;
++}
++
++/*
++ * Record a software breakpoint at addr in kgdb_break[].
++ *
++ * A slot that is bp_removed at the same address is reused in preference
++ * to the first bp_none slot.  Returns 0 on success, the negative result
++ * of kgdb_validate_break_address(), -EEXIST if addr is already bp_set, or
++ * -E2BIG when no slot is available.
++ */
++static int kgdb_set_sw_break(unsigned long addr)
++{
++      int reuse = -1;         /* first bp_removed slot at addr */
++      int spare = -1;         /* first bp_none slot */
++      int slot, idx, rc;
++
++      rc = kgdb_validate_break_address(addr);
++      if (rc < 0)
++              return rc;
++
++      /* One pass collects both candidates and detects duplicates. */
++      for (idx = 0; idx < MAX_BREAKPOINTS; idx++) {
++              if (kgdb_break[idx].state == bp_none) {
++                      if (spare == -1)
++                              spare = idx;
++                      continue;
++              }
++              if (kgdb_break[idx].bpt_addr != addr)
++                      continue;
++              if (kgdb_break[idx].state == bp_set)
++                      return -EEXIST;
++              if (kgdb_break[idx].state == bp_removed && reuse == -1)
++                      reuse = idx;
++      }
++
++      slot = (reuse != -1) ? reuse : spare;
++      if (slot == -1)
++              return -E2BIG;
++
++      kgdb_break[slot].state = bp_set;
++      kgdb_break[slot].type = bp_breakpoint;
++      kgdb_break[slot].bpt_addr = addr;
++
++      return 0;
++}
++
++/*
++ * Take every bp_active breakpoint back out through
++ * kgdb_arch_remove_breakpoint() and return it to the bp_set state.
++ * Stops at, and returns, the first non-zero error from the arch hook;
++ * returns 0 otherwise.
++ */
++int kgdb_deactivate_sw_breakpoints(void)
++{
++      unsigned long bp_addr, bp_end;
++      int slot, rc;
++
++      for (slot = 0; slot < MAX_BREAKPOINTS; slot++) {
++              if (kgdb_break[slot].state != bp_active)
++                      continue;
++
++              bp_addr = kgdb_break[slot].bpt_addr;
++              rc = kgdb_arch_remove_breakpoint(bp_addr,
++                                               kgdb_break[slot].saved_instr);
++              if (rc)
++                      return rc;
++
++              if (CACHE_FLUSH_IS_SAFE) {
++                      bp_end = bp_addr + BREAK_INSTR_SIZE;
++                      if (current->mm && bp_addr < TASK_SIZE)
++                              flush_cache_range(current->mm->mmap_cache,
++                                                bp_addr, bp_end);
++                      else
++                              flush_icache_range(bp_addr, bp_end);
++              }
++
++              kgdb_break[slot].state = bp_set;
++      }
++      return 0;
++}
++
++/*
++ * Mark the first bp_set breakpoint at addr as bp_removed.
++ * Returns 0 if one was found, -ENOENT otherwise.
++ */
++static int kgdb_remove_sw_break(unsigned long addr)
++{
++      int slot;
++
++      for (slot = 0; slot < MAX_BREAKPOINTS; slot++) {
++              if (kgdb_break[slot].state != bp_set)
++                      continue;
++              if (kgdb_break[slot].bpt_addr != addr)
++                      continue;
++
++              kgdb_break[slot].state = bp_removed;
++              return 0;
++      }
++      return -ENOENT;
++}
++
++/*
++ * Return 1 if kgdb_break[] holds a bp_removed entry at addr, else 0.
++ */
++int kgdb_isremovedbreak(unsigned long addr)
++{
++      int slot;
++
++      for (slot = 0; slot < MAX_BREAKPOINTS; slot++) {
++              if (kgdb_break[slot].state != bp_removed)
++                      continue;
++              if (kgdb_break[slot].bpt_addr == addr)
++                      return 1;
++      }
++      return 0;
++}
++
++/*
++ * Pass every bp_set breakpoint to kgdb_arch_remove_breakpoint() and mark
++ * it bp_removed, then clear the hardware breakpoints.  Returns the first
++ * non-zero error from the arch hook (hardware breakpoints are then left
++ * untouched), or 0.
++ */
++int remove_all_break(void)
++{
++      unsigned long where;
++      int slot, rc;
++
++      /* Clear memory breakpoints. */
++      for (slot = 0; slot < MAX_BREAKPOINTS; slot++) {
++              if (kgdb_break[slot].state == bp_set) {
++                      where = kgdb_break[slot].bpt_addr;
++                      rc = kgdb_arch_remove_breakpoint(where,
++                                              kgdb_break[slot].saved_instr);
++                      if (rc)
++                              return rc;
++                      kgdb_break[slot].state = bp_removed;
++              }
++      }
++
++      /* Clear hardware breakpoints. */
++      kgdb_remove_all_hw_break();
++
++      return 0;
++}
++
++/*
++ * Thread id reported for a task: its pid when non-zero, otherwise
++ * pid_max plus the id of the CPU this is running on.
++ */
++static inline int shadow_pid(int realpid)
++{
++      return realpid ? realpid : pid_max + smp_processor_id();
++}
++
++static char gdbmsgbuf[BUFMAX + 1];
++
++/*
++ * Send len bytes of s as one or more 'O' packets.  Each packet carries the
++ * hex encoding of at most (BUFMAX - 2) / 2 input bytes.
++ */
++static void kgdb_msg_write(const char *s, int len)
++{
++      char *out;
++      int chunk, k;
++
++      /* 'O'utput */
++      gdbmsgbuf[0] = 'O';
++
++      for (; len > 0; s += chunk, len -= chunk) {
++              /* How many input bytes fit in this packet */
++              chunk = ((len << 1) > (BUFMAX - 2)) ? (BUFMAX - 2) >> 1 : len;
++
++              /* Hex-encode them after the leading 'O' */
++              out = gdbmsgbuf + 1;
++              for (k = 0; k < chunk; k++)
++                      out = pack_hex_byte(out, s[k]);
++              *out = '\0';
++
++              put_packet(gdbmsgbuf);
++      }
++}
++
++/*
++ * This function does all command processing for interfacing to gdb.
++ *
++ * Locking hierarchy:
++ *    interface locks, if any (begin_session)
++ *    kgdb lock (debugger_active)
++ *
++ * Note that since we can be in here prior to our cpumask being filled
++ * out, we err on the side of caution and loop over NR_CPUS instead
++ * of a for_each_online_cpu.
++ *
++ */
++int kgdb_handle_exception(int ex_vector, int signo, int err_code,
++                        struct pt_regs *linux_regs)
++{
++      unsigned long length, addr;
++      char *ptr;
++      unsigned long flags;
++      unsigned i;
++      long threadid;
++      threadref thref;
++      struct task_struct *thread = NULL;
++      unsigned procid;
++      int numshadowth = num_online_cpus() + kgdb_ops->shadowth;
++      long kgdb_usethreadid = 0;
++      int error = 0, all_cpus_synced = 0;
++      struct pt_regs *shadowregs;
++      int processor = smp_processor_id();
++      void *local_debuggerinfo;
++
++      /* Panic on recursive debugger calls. */
++      if (atomic_read(&debugger_active) == smp_processor_id() + 1)
++              return 0;
++
++      acquirelock:
++
++      /* Call the I/O drivers pre_exception routine if the I/O
++       * driver defined one
++       */
++      if (kgdb_io_ops.pre_exception)
++              kgdb_io_ops.pre_exception();
++
++      /*
++       * Interrupts will be restored by the 'trap return' code, except when
++       * single stepping.
++       */
++      local_irq_save(flags);
++
++      /* Hold debugger_active */
++      procid = smp_processor_id();
++
++      while (cmpxchg(&atomic_read(&debugger_active), 0, (procid + 1)) != 0) {
++              int i = 25;     /* an arbitrary number */
++
++              while (--i)
++                      cpu_relax();
++
++              if (atomic_read(&cpu_doing_single_step) != -1 &&
++                              atomic_read(&cpu_doing_single_step) != procid)
++                      udelay(1);
++      }
++
++      atomic_set(&kgdb_sync_softlockup[smp_processor_id()], 1);
++
++      /*
++       * Don't enter if the last instance of the exception handler wanted to
++       * come into the debugger again.
++       */
++      if (atomic_read(&cpu_doing_single_step) != -1 &&
++          atomic_read(&cpu_doing_single_step) != procid) {
++              atomic_set(&debugger_active, 0);
++              local_irq_restore(flags);
++              goto acquirelock;
++      }
++
++      /*
++      * Don't enter if we have hit a removed breakpoint.
++      */
++      if (kgdb_skipexception(ex_vector, linux_regs))
++              goto kgdb_restore;
++
++      kgdb_info[processor].debuggerinfo = linux_regs;
++      kgdb_info[processor].task = current;
++
++      kgdb_disable_hw_debug(linux_regs);
++
++      if (!debugger_step || !kgdb_contthread)
++              for (i = 0; i < NR_CPUS; i++)
++                      spin_lock(&slavecpulocks[i]);
++
++      /* Make sure we get the other CPUs */
++      if (!debugger_step || !kgdb_contthread)
++              kgdb_roundup_cpus(flags);
++
++      /* spin_lock code is good enough as a barrier so we don't
++       * need one here */
++      atomic_set(&procindebug[processor], 1);
++
++      /* Wait a reasonable time for the other CPUs to be notified and
++       * be waiting for us.  Very early on this could be imperfect
++       * as num_online_cpus() could be 0.*/
++      for (i = 0; i < ROUNDUP_WAIT; i++) {
++              int cpu, num = 0;
++              for (cpu = 0; cpu < NR_CPUS; cpu++) {
++                      if (atomic_read(&procindebug[cpu]))
++                              num++;
++              }
++              if (num >= num_online_cpus()) {
++                      all_cpus_synced = 1;
++                      break;
++              }
++      }
++
++      /* Clear the out buffer. */
++      memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
++
++      /* Master processor is completely in the debugger */
++      kgdb_post_master_code(linux_regs, ex_vector, err_code);
++      kgdb_deactivate_sw_breakpoints();
++      debugger_step = 0;
++      kgdb_contthread = NULL;
++
++      if (kgdb_connected) {
++              /* If we're still unable to roundup all of the CPUs,
++               * send an 'O' packet informing the user again. */
++              if (!all_cpus_synced)
++                      kgdb_msg_write("Not all CPUs have been synced for "
++                                     "KGDB\n", 39);
++              /* Reply to host that an exception has occurred */
++              ptr = remcom_out_buffer;
++              *ptr++ = 'T';
++              *ptr++ = hexchars[(signo >> 4) % 16];
++              *ptr++ = hexchars[signo % 16];
++              ptr += strlen(strcpy(ptr, "thread:"));
++              int_to_threadref(&thref, shadow_pid(current->pid));
++              ptr = pack_threadid(ptr, &thref);
++              *ptr++ = ';';
++
++              put_packet(remcom_out_buffer);
++      }
++
++      kgdb_usethread = kgdb_info[processor].task;
++      kgdb_usethreadid = shadow_pid(kgdb_info[processor].task->pid);
++
++      while (kgdb_io_ops.read_char) {
++              char *bpt_type;
++              error = 0;
++
++              /* Clear the out buffer. */
++              memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
++
++              get_packet(remcom_in_buffer);
++
++              switch (remcom_in_buffer[0]) {
++              case '?':
++                      /* We know that this packet is only sent
++                       * during initial connect.  So to be safe,
++                       * we clear out our breakpoints now incase
++                       * GDB is reconnecting. */
++                      remove_all_break();
++                      /* Also, if we haven't been able to roundup all
++                       * CPUs, send an 'O' packet informing the user
++                       * as much.  Only need to do this once. */
++                      if (!all_cpus_synced)
++                              kgdb_msg_write("Not all CPUs have been "
++                                             "synced for KGDB\n", 39);
++                      remcom_out_buffer[0] = 'S';
++                      remcom_out_buffer[1] = hexchars[signo >> 4];
++                      remcom_out_buffer[2] = hexchars[signo % 16];
++                      break;
++
++              case 'g':       /* return the value of the CPU registers */
++                      thread = kgdb_usethread;
++
++                      if (!thread) {
++                              thread = kgdb_info[processor].task;
++                              local_debuggerinfo =
++                                  kgdb_info[processor].debuggerinfo;
++                      } else {
++                              local_debuggerinfo = NULL;
++                              for (i = 0; i < NR_CPUS; i++) {
++                                      /* Try to find the task on some other
++                                       * or possibly this node if we do not
++                                       * find the matching task then we try
++                                       * to approximate the results.
++                                       */
++                                      if (thread == kgdb_info[i].task)
++                                              local_debuggerinfo =
++                                                  kgdb_info[i].debuggerinfo;
++                              }
++                      }
++
++                      /* All threads that don't have debuggerinfo should be
++                       * in __schedule() sleeping, since all other CPUs
++                       * are in kgdb_wait, and thus have debuggerinfo. */
++                      if (kgdb_ops->shadowth &&
++                          kgdb_usethreadid >= pid_max + num_online_cpus()) {
++                              shadowregs = kgdb_shadow_regs(linux_regs,
++                                                            kgdb_usethreadid -
++                                                            pid_max -
++                                                            num_online_cpus
++                                                            ());
++                              if (!shadowregs) {
++                                      error_packet(remcom_out_buffer,
++                                                   -EINVAL);
++                                      break;
++                              }
++                              regs_to_gdb_regs(gdb_regs, shadowregs);
++                      } else if (local_debuggerinfo)
++                              regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
++                      else {
++                              /* Pull stuff saved during
++                               * switch_to; nothing else is
++                               * accessible (or even particularly relevant).
++                               * This should be enough for a stack trace. */
++                              sleeping_thread_to_gdb_regs(gdb_regs, thread);
++                      }
++                      kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer,
++                                   NUMREGBYTES);
++                      break;
++
++                      /* set the value of the CPU registers - return OK */
++              case 'G':
++                      kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs,
++                                   NUMREGBYTES);
++
++                      if (kgdb_usethread && kgdb_usethread != current)
++                              error_packet(remcom_out_buffer, -EINVAL);
++                      else {
++                              gdb_regs_to_regs(gdb_regs, linux_regs);
++                              strcpy(remcom_out_buffer, "OK");
++                      }
++                      break;
++
++                      /* mAA..AA,LLLL  Read LLLL bytes at address AA..AA */
++              case 'm':
++                      ptr = &remcom_in_buffer[1];
++                      if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
++                          kgdb_hex2long(&ptr, &length) > 0) {
++                              if (IS_ERR(ptr = kgdb_mem2hex((char *)addr,
++                                                            remcom_out_buffer,
++                                                            length)))
++                                      error_packet(remcom_out_buffer,
++                                                   PTR_ERR(ptr));
++                      } else
++                              error_packet(remcom_out_buffer, -EINVAL);
++                      break;
++
++                      /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
++              case 'M':
++                      if (IS_ERR(ptr = write_mem_msg(0)))
++                              error_packet(remcom_out_buffer, PTR_ERR(ptr));
++                      else
++                              strcpy(remcom_out_buffer, "OK");
++                      break;
++                      /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
++              case 'X':
++                      if (IS_ERR(ptr = write_mem_msg(1)))
++                              error_packet(remcom_out_buffer, PTR_ERR(ptr));
++                      else
++                              strcpy(remcom_out_buffer, "OK");
++                      break;
++
++                      /* kill or detach. KGDB should treat this like a
++                       * continue.
++                       */
++              case 'D':
++                      if ((error = remove_all_break()) < 0) {
++                              error_packet(remcom_out_buffer, error);
++                      } else {
++                              strcpy(remcom_out_buffer, "OK");
++                              kgdb_connected = 0;
++                      }
++                      put_packet(remcom_out_buffer);
++                      goto default_handle;
++
++              case 'k':
++                      /* Don't care about error from remove_all_break */
++                      remove_all_break();
++                      kgdb_connected = 0;
++                      goto default_handle;
++
++                      /* Reboot */
++              case 'R':
++                      /* For now, only honor R0 */
++                      if (strcmp(remcom_in_buffer, "R0") == 0) {
++                              printk(KERN_CRIT "Executing reboot\n");
++                              strcpy(remcom_out_buffer, "OK");
++                              put_packet(remcom_out_buffer);
++                              emergency_sync();
++                              /* Execution should not return from
++                               * machine_restart()
++                               */
++                              machine_restart(NULL);
++                              kgdb_connected = 0;
++                              goto default_handle;
++                      }
++
++                      /* query */
++              case 'q':
++                      switch (remcom_in_buffer[1]) {
++                      case 's':
++                      case 'f':
++                              if (memcmp(remcom_in_buffer + 2, "ThreadInfo",
++                                         10)) {
++                                      error_packet(remcom_out_buffer,
++                                                   -EINVAL);
++                                      break;
++                              }
++
++                              /*
++                               * If we have not yet completed in
++                               * pidhash_init() there isn't much we
++                               * can give back.
++                               */
++                              if (last_pid == 0) {
++                                      if (remcom_in_buffer[1] == 'f')
++                                              strcpy(remcom_out_buffer,
++                                                     "m0000000000000001");
++                                      break;
++                              }
++
++                              if (remcom_in_buffer[1] == 'f') {
++                                      threadid = 1;
++                              }
++                              remcom_out_buffer[0] = 'm';
++                              ptr = remcom_out_buffer + 1;
++                              for (i = 0; i < 17 && threadid < pid_max +
++                                   numshadowth; threadid++) {
++                                      thread = getthread(linux_regs,
++                                                         threadid);
++                                      if (thread) {
++                                              int_to_threadref(&thref,
++                                                               threadid);
++                                              pack_threadid(ptr, &thref);
++                                              ptr += 16;
++                                              *(ptr++) = ',';
++                                              i++;
++                                      }
++                              }
++                              *(--ptr) = '\0';
++                              break;
++
++                      case 'C':
++                              /* Current thread id */
++                              strcpy(remcom_out_buffer, "QC");
++
++                              threadid = shadow_pid(current->pid);
++
++                              int_to_threadref(&thref, threadid);
++                              pack_threadid(remcom_out_buffer + 2, &thref);
++                              break;
++                      case 'T':
++                              if (memcmp(remcom_in_buffer + 1,
++                                         "ThreadExtraInfo,", 16)) {
++                                      error_packet(remcom_out_buffer,
++                                                   -EINVAL);
++                                      break;
++                              }
++                              threadid = 0;
++                              ptr = remcom_in_buffer + 17;
++                              kgdb_hex2long(&ptr, &threadid);
++                              if (!getthread(linux_regs, threadid)) {
++                                      error_packet(remcom_out_buffer,
++                                                   -EINVAL);
++                                      break;
++                              }
++                              if (threadid < pid_max) {
++                                      kgdb_mem2hex(getthread(linux_regs,
++                                                             threadid)->comm,
++                                                   remcom_out_buffer, 16);
++                              } else if (threadid >= pid_max +
++                                         num_online_cpus()) {
++                                      kgdb_shadowinfo(linux_regs,
++                                                      remcom_out_buffer,
++                                                      threadid - pid_max -
++                                                      num_online_cpus());
++                              } else {
++                                      static char tmpstr[23 +
++                                                         BUF_THREAD_ID_SIZE];
++                                      sprintf(tmpstr, "Shadow task %d"
++                                              " for pid 0",
++                                              (int)(threadid - pid_max));
++                                      kgdb_mem2hex(tmpstr, remcom_out_buffer,
++                                                   strlen(tmpstr));
++                              }
++                              break;
++                      }
++                      break;
++
++                      /* task related */
++              case 'H':
++                      switch (remcom_in_buffer[1]) {
++                      case 'g':
++                              ptr = &remcom_in_buffer[2];
++                              kgdb_hex2long(&ptr, &threadid);
++                              thread = getthread(linux_regs, threadid);
++                              if (!thread && threadid > 0) {
++                                      error_packet(remcom_out_buffer,
++                                                   -EINVAL);
++                                      break;
++                              }
++                              kgdb_usethread = thread;
++                              kgdb_usethreadid = threadid;
++                              strcpy(remcom_out_buffer, "OK");
++                              break;
++
++                      case 'c':
++                              ptr = &remcom_in_buffer[2];
++                              kgdb_hex2long(&ptr, &threadid);
++                              if (!threadid) {
++                                      kgdb_contthread = NULL;
++                              } else {
++                                      thread = getthread(linux_regs,
++                                                         threadid);
++                                      if (!thread && threadid > 0) {
++                                              error_packet(remcom_out_buffer,
++                                                           -EINVAL);
++                                              break;
++                                      }
++                                      kgdb_contthread = thread;
++                              }
++                              strcpy(remcom_out_buffer, "OK");
++                              break;
++                      }
++                      break;
++
++                      /* Query thread status */
++              case 'T':
++                      ptr = &remcom_in_buffer[1];
++                      kgdb_hex2long(&ptr, &threadid);
++                      thread = getthread(linux_regs, threadid);
++                      if (thread)
++                              strcpy(remcom_out_buffer, "OK");
++                      else
++                              error_packet(remcom_out_buffer, -EINVAL);
++                      break;
++              /* Since GDB-5.3, it's been drafted that '0' is a software
++               * breakpoint, '1' is a hardware breakpoint, so let's do
++               * that.
++               */
++              case 'z':
++              case 'Z':
++                      bpt_type = &remcom_in_buffer[1];
++                      ptr = &remcom_in_buffer[2];
++
++                      if (kgdb_ops->set_hw_breakpoint && *bpt_type >= '1') {
++                              /* Unsupported */
++                              if (*bpt_type > '4')
++                                      break;
++                      } else if (*bpt_type != '0' && *bpt_type != '1')
++                              /* Unsupported. */
++                              break;
++                      /* Test if this is a hardware breakpoint, and
++                       * if we support it. */
++                      if (*bpt_type == '1' &&
++                          !kgdb_ops->flags & KGDB_HW_BREAKPOINT)
++                              /* Unsupported. */
++                              break;
++
++                      if (*(ptr++) != ',') {
++                              error_packet(remcom_out_buffer, -EINVAL);
++                              break;
++                      } else if (kgdb_hex2long(&ptr, &addr)) {
++                              if (*(ptr++) != ',' ||
++                                  !kgdb_hex2long(&ptr, &length)) {
++                                      error_packet(remcom_out_buffer,
++                                                   -EINVAL);
++                                      break;
++                              }
++                      } else {
++                              error_packet(remcom_out_buffer, -EINVAL);
++                              break;
++                      }
++
++                      if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
++                              error = kgdb_set_sw_break(addr);
++                      else if (remcom_in_buffer[0] == 'Z' && *bpt_type == '1')
++                              error = kgdb_set_hw_break(addr);
++                      else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
++                              error = kgdb_remove_sw_break(addr);
++                      else if (remcom_in_buffer[0] == 'z' && *bpt_type == '1')
++                              error = kgdb_remove_hw_break(addr);
++                      else if (remcom_in_buffer[0] == 'Z')
++                              error = kgdb_ops->set_hw_breakpoint(addr,
++                                                                  (int)length,
++                                                                  *bpt_type);
++                      else if (remcom_in_buffer[0] == 'z')
++                              error = kgdb_ops->remove_hw_breakpoint(addr,
++                                                                     (int)
++                                                                     length,
++                                                                     *bpt_type);
++
++                      if (error == 0)
++                              strcpy(remcom_out_buffer, "OK");
++                      else
++                              error_packet(remcom_out_buffer, error);
++
++                      break;
++              case 'c':
++              case 's':
++                      if (kgdb_contthread && kgdb_contthread != current) {
++                              /* Can't switch threads in kgdb */
++                              error_packet(remcom_out_buffer, -EINVAL);
++                              break;
++                      }
++                      kgdb_activate_sw_breakpoints();
++                      /* Followthrough to default processing */
++              default:
++                    default_handle:
++                      error = kgdb_arch_handle_exception(ex_vector, signo,
++                                                         err_code,
++                                                         remcom_in_buffer,
++                                                         remcom_out_buffer,
++                                                         linux_regs);
++
++                      if (error >= 0 || remcom_in_buffer[0] == 'D' ||
++                          remcom_in_buffer[0] == 'k')
++                              goto kgdb_exit;
++
++              }               /* switch */
++
++              /* reply to the request */
++              put_packet(remcom_out_buffer);
++      }
++
++      kgdb_exit:
++      /* Call the I/O driver's post_exception routine if the I/O
++       * driver defined one.
++       */
++      if (kgdb_io_ops.post_exception)
++              kgdb_io_ops.post_exception();
++
++      kgdb_info[processor].debuggerinfo = NULL;
++      kgdb_info[processor].task = NULL;
++      atomic_set(&procindebug[processor], 0);
++
++      if (!debugger_step || !kgdb_contthread) {
++              for (i = 0; i < NR_CPUS; i++)
++                      spin_unlock(&slavecpulocks[i]);
++              /* Wait till all the processors have quit
++               * from the debugger. */
++              for (i = 0; i < NR_CPUS; i++) {
++                      while (atomic_read(&procindebug[i])) {
++                              int j = 10;     /* an arbitrary number */
++
++                              while (--j)
++                                      cpu_relax();
++                      }
++              }
++      }
++
++#ifdef CONFIG_SMP
++      /* This delay has a real purpose.  The problem is that if you
++       * are single-stepping, you are sending an NMI to all the
++       * other processors to stop them.  Interrupts come in, but
++       * don't get handled.  Then you let them go just long enough
++       * to get into their interrupt routines and use up some stack.
++       * You stop them again, and then do the same thing.  After a
++       * while you blow the stack on the other processors.  This
++       * delay gives some time for interrupts to be cleared out on
++       * the other processors.
++       */
++      if (debugger_step)
++              mdelay(2);
++#endif
++kgdb_restore:
++      /* Free debugger_active */
++      atomic_set(&debugger_active, 0);
++      local_irq_restore(flags);
++
++      return error;
++}
++
++/*
++ * GDB places a breakpoint on this function to learn about dynamically
++ * loaded objects.  The body does nothing but return 0.  It is not defined
++ * static so that only one instance with this name exists in the kernel.
++ */
++
++int module_event(struct notifier_block *self, unsigned long val, void *data)
++{
++      return 0;
++}
++
++static struct notifier_block kgdb_module_load_nb = {
++      .notifier_call = module_event,
++};
++
++void kgdb_nmihook(int cpu, void *regs)
++{
++#ifdef CONFIG_SMP      /* the hook compiles to an empty body without SMP */
++      if (atomic_read(&procindebug[cpu]) == 0 && (cpu + 1) != atomic_read(&debugger_active))
++              kgdb_wait((struct pt_regs *)regs);      /* this cpu is not in the debugger yet */
++#endif
++}
++
++/*
++ * Panic notifier callback.  All it does is call breakpoint() and
++ * return 0; none of its arguments are looked at.
++ */
++static int kgdb_panic_notify(struct notifier_block *self, unsigned long cmd,
++                           void *ptr)
++{
++      breakpoint();
++
++      return 0;
++}
++
++static struct notifier_block kgdb_panic_notifier = {
++      .notifier_call = kgdb_panic_notify,
++};
++
++/*
++ * Common setup shared by both entry points (early and late).
++ */
++static void __init kgdb_internal_init(void)
++{
++      int n;
++
++      /* Bring every slave cpu lock into its initial state. */
++      for (n = 0; n != NR_CPUS; ++n)
++              spin_lock_init(&slavecpulocks[n]);
++      /* Mark every breakpoint slot as unused. */
++      for (n = 0; n != MAX_BREAKPOINTS; ++n)
++              kgdb_break[n].state = bp_none;
++
++      /* Forget any previously saved I/O handlers. */
++      memset(&kgdb_io_ops_prev, 0, sizeof(kgdb_io_ops_prev));
++
++      /* The result is not checked: we can't do much if this fails. */
++      register_module_notifier(&kgdb_module_load_nb);
++
++      kgdb_initialized = 1;
++}
++
++static void kgdb_register_for_panic(void)
++{
++      /* Hook KGDB into the panic notifier chain, at most once.
++       * This lives in its own routine because KGDB should not try to
++       * handle a panic when no kgdb_io_ops are set up; callers are
++       * assumed to have set up kgdb_io_ops before calling this.
++       */
++      if (kgdb_from_module_registered)
++              return;
++
++      atomic_notifier_chain_register(&panic_notifier_list,
++                                     &kgdb_panic_notifier);
++      kgdb_from_module_registered = 1;
++}
++
++static void kgdb_unregister_for_panic(void)
++{
++      /* Take KGDB off the panic notifier chain if it is on it.  KGDB
++       * should not be handling a break exception when this is called;
++       * nothing in here checks for that.
++       */
++      if (!kgdb_from_module_registered)
++              return;
++      kgdb_from_module_registered = 0;
++      atomic_notifier_chain_unregister(&panic_notifier_list,
++                                       &kgdb_panic_notifier);
++}
++
++int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops)
++{
++      /* The I/O driver may not be swapped under a live gdb session. */
++      if (kgdb_connected) {
++              printk(KERN_ERR "kgdb: Cannot load I/O module while KGDB "
++                     "connected.\n");
++              return -EINVAL;
++      }
++
++      /* Refuse when the table of saved handlers is already full. */
++      if (MAX_KGDB_IO_HANDLERS <= kgdb_io_handler_cnt) {
++              printk(KERN_ERR "kgdb: No more I/O handles available.\n");
++              return -EINVAL;
++      }
++
++      /* When a different driver is currently installed, save its ops
++       * so that they can be restored when this one unregisters.  A
++       * driver that registers again (same read_char) is not saved
++       * a second time.
++       */
++      if (kgdb_io_ops.read_char &&
++          local_kgdb_io_ops->read_char != kgdb_io_ops.read_char) {
++              memcpy(&kgdb_io_ops_prev[kgdb_io_handler_cnt++],
++                     &kgdb_io_ops, sizeof(struct kgdb_io));
++      }
++
++      /* Install the ops of the module being registered. */
++      memcpy(&kgdb_io_ops, local_kgdb_io_ops, sizeof(struct kgdb_io));
++
++      /* I/O ops are in place now, so hook into the panic notifier. */
++      kgdb_register_for_panic();
++
++      return 0;
++}
++
++void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops)
++{
++      int i;
++
++      /* While saved handlers exist, local_kgdb_io_ops is taken out of
++       * the stack of handlers; with none left, KGDB I/O is shut down
++       * and KGDB unregisters from the panic notifier.
++       */
++      if (kgdb_io_handler_cnt > 0 && kgdb_io_ops_prev[0].read_char != NULL) {
++              if (kgdb_io_ops.read_char == local_kgdb_io_ops->read_char) {
++                      /* The handler in use goes away: fall back to the
++                       * newest saved one, whose slot is then freed. */
++                      i = kgdb_io_handler_cnt - 1;
++                      memcpy(&kgdb_io_ops, &kgdb_io_ops_prev[i],
++                             sizeof(struct kgdb_io));
++              } else {
++                      /* A handler that is not in use: find its slot. */
++                      for (i = 0; i < kgdb_io_handler_cnt; i++) {
++                              if (kgdb_io_ops_prev[i].read_char ==
++                                  local_kgdb_io_ops->read_char)
++                                      break;
++                      }
++                      /* Never registered: leave the saved handlers alone
++                       * instead of dropping the newest one by mistake. */
++                      if (i == kgdb_io_handler_cnt)
++                              return;
++              }
++
++              /* Close the gap at slot i so the saved handlers stay
++               * ordered from oldest to newest. */
++              kgdb_io_handler_cnt--;
++              for (; i < kgdb_io_handler_cnt; i++) {
++                      memcpy(&kgdb_io_ops_prev[i], &kgdb_io_ops_prev[i + 1],
++                             sizeof(struct kgdb_io));
++              }
++              /* Clear the slot that is now past the end of the list. */
++              memset(&kgdb_io_ops_prev[kgdb_io_handler_cnt], 0,
++                              sizeof(struct kgdb_io));
++
++              if (kgdb_connected)
++                      printk(KERN_ERR "kgdb: WARNING: I/O method changed "
++                             "while kgdb was connected state.\n");
++      } else {
++              /* KGDB is no longer able to communicate out, so
++               * unregister our hooks and reset state. */
++              kgdb_unregister_for_panic();
++              if (kgdb_connected) {
++                      printk(KERN_CRIT "kgdb: I/O module was unloaded while "
++                                      "a debugging session was running.  "
++                                      "KGDB will be reset.\n");
++                      if (remove_all_break() < 0)
++                              printk(KERN_CRIT "kgdb: Reset failed.\n");
++                      kgdb_connected = 0;
++              }
++              memset(&kgdb_io_ops, 0, sizeof(struct kgdb_io));
++      }
++}
++
++/*
++ * Calling breakpoint() directly might be fatal at certain times, so a
++ * tasklet is used to cause the breakpoint instead.  Tasklets may be
++ * scheduled before the exception stack is set up, hence the check; when
++ * that happens the architecture must schedule this once it is safe.
++ */
++static void kgdb_tasklet_bpt(unsigned long ing)
++{
++      if (!CHECK_EXCEPTION_STACK())
++              return;
++      breakpoint();
++}
++
++DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
++
++/*
++ * Earliest possible entry into KGDB setup; reached via early_param() or
++ * through an explicit breakpoint() made early on.
++ */
++static void __init kgdb_early_entry(void)
++{
++      /*
++       * Nothing can be done before the architecture is able to set up
++       * the exception stack.  Record -1, schedule the breakpoint tasklet
++       * and leave it to the architecture to look at us when it is ready.
++       */
++      if (!CHECK_EXCEPTION_STACK()) {
++              kgdb_initialized = -1;
++              tasklet_schedule(&kgdb_tasklet_breakpoint);
++              return;
++      }
++
++      /* Architecture specific setup goes first. */
++      kgdb_arch_init();
++
++      /* Then the I/O driver, which must provide an init hook for */
++      /* early entry; a non-zero result from it counts as failure. */
++      if (kgdb_io_ops.init == NULL || kgdb_io_ops.init() != 0) {
++              /* Try again later. */
++              kgdb_initialized = -1;
++              return;
++      }
++
++      /* Finish up. */
++      kgdb_internal_init();
++
++      /* An init hook that is present this early did not come from a
++       * kernel module but was set up statically by built-in code, so
++       * the panic registration is performed right away.
++       * (The hook was already found to be non-NULL above.)
++       */
++      if (kgdb_io_ops.init != NULL)
++              kgdb_register_for_panic();
++}
++
++/*
++ * Late initcall that is always run so that KGDB grabs what it needs and
++ * gets involved if something happens while the system is running.  If
++ * kgdb_early_entry() has already completed (kgdb_initialized == 1),
++ * there is little left to do.  Always returns 0.
++ */
++static int __init kgdb_late_entry(void)
++{
++      int need_break = 0;
++
++      /* -1 is left behind by an early entry that could not finish. */
++      if (kgdb_initialized == -1)
++              need_break = 1;
++
++      /*
++       * Only a KGDB that was never tried (0) runs kgdb_arch_init here.
++       * NOTE(review): -1 set before kgdb_arch_init ran skips it - confirm.
++       */
++      if (!kgdb_initialized)
++              kgdb_arch_init();
++
++      if (kgdb_initialized != 1) {
++              if (kgdb_io_ops.init && kgdb_io_ops.init()) {
++                      /* When KGDB allows I/O via modules and the core
++                       * I/O init fails KGDB must default to deferring the
++                       * I/O setup, and appropriately print an error about
++                       * it.  The failed ops are wiped below.
++                       */
++                      printk(KERN_ERR "kgdb: Could not setup core I/O "
++                             "for KGDB.\n");
++                      printk(KERN_INFO "kgdb: Defering I/O setup to kernel "
++                             "module.\n");
++                      memset(&kgdb_io_ops, 0, sizeof(struct kgdb_io));
++              }
++
++              kgdb_internal_init();
++
++              /* KGDB can assume that if kgdb_io_ops.init is defined, the
++               * panic registration should be performed now.  This means
++               * kgdb_io_ops.init did not come from a kernel module and was
++               * initialized statically by built-in code.
++               */
++              if (kgdb_io_ops.init)
++                      kgdb_register_for_panic();
++      }
++
++      /* Register with the reboot notifier list. */
++      register_reboot_notifier(&kgdb_reboot_notifier);
++
++      /* Now do any late init of the I/O. */
++      if (kgdb_io_ops.late_init)
++              kgdb_io_ops.late_init();
++
++      if (need_break) {
++              printk(KERN_CRIT "kgdb: Waiting for connection from remote"
++                     " gdb...\n");
++              breakpoint();
++      }
++
++      return 0;
++}
++
++late_initcall(kgdb_late_entry);
++
++/*
++ * Raise a breakpoint exception.  This is used at the beginning of a
++ * program to sync up with a debugger, and otherwise as a quick means to
++ * stop execution and "break" into the debugger.  When KGDB is not set up
++ * yet, kgdb_early_entry() is tried first and nothing is raised on failure.
++ */
++void breakpoint(void)
++{
++      if (kgdb_initialized != 1) {
++              kgdb_early_entry();
++              if (kgdb_initialized != 1) {
++                      printk(KERN_CRIT "KGDB cannot initialize I/O yet.\n");
++                      return;
++              }
++              printk(KERN_CRIT "Waiting for connection from remote "
++                     "gdb...\n");
++      }
++
++      /* kgdb_setting_breakpoint is non-zero only around BREAKPOINT(). */
++      atomic_set(&kgdb_setting_breakpoint, 1);
++      wmb();
++      BREAKPOINT();
++      wmb();
++      atomic_set(&kgdb_setting_breakpoint, 0);
++}
++
++EXPORT_SYMBOL(breakpoint);
++
++#ifdef CONFIG_MAGIC_SYSRQ
++static void sysrq_handle_gdb(int key, struct pt_regs *pt_regs,
++                           struct tty_struct *tty)
++{
++      printk("Entering GDB stub\n");
++      breakpoint();   /* the key, pt_regs and tty arguments are not used */
++}
++static struct sysrq_key_op sysrq_gdb_op = {
++      .handler = sysrq_handle_gdb,
++      .help_msg = "Gdb",
++      .action_msg = "GDB",
++};
++
++static int gdb_register_sysrq(void)
++{
++      printk("Registering GDB sysrq handler\n");
++      register_sysrq_key('g', &sysrq_gdb_op); /* result is not checked */
++      return 0;
++}
++
++module_init(gdb_register_sysrq);
++#endif
++
++static int kgdb_notify_reboot(struct notifier_block *this,
++                            unsigned long code, void *x)
++{
++      unsigned long flags;
++
++      /* If we're debugging, or KGDB has not connected, don't try
++       * and print. */
++      if (!kgdb_connected || atomic_read(&debugger_active) != 0)
++              return NOTIFY_DONE;
++      /* Only restart, halt and power off cause a packet to be sent. */
++      if (code != SYS_RESTART && code != SYS_HALT && code != SYS_POWER_OFF)
++              return NOTIFY_DONE;
++      local_irq_save(flags);
++      put_packet("X00");
++      local_irq_restore(flags);
++      return NOTIFY_DONE;
++}
++
++#ifdef CONFIG_KGDB_CONSOLE
++void kgdb_console_write(struct console *co, const char *s, unsigned count)
++{
++      unsigned long flags;
++
++      /* Console text is handed to kgdb_msg_write() only while KGDB is
++       * connected and the debugger itself is not active; the write
++       * is done with local interrupts disabled. */
++      if (kgdb_connected && atomic_read(&debugger_active) == 0) {
++              local_irq_save(flags);
++              kgdb_msg_write(s, count);
++              local_irq_restore(flags);
++      }
++}
++
++struct console kgdbcons = {
++      .name = "kgdb",
++      .write = kgdb_console_write,
++      .flags = CON_PRINTBUFFER | CON_ENABLED,
++};
++static int __init kgdb_console_init(void)
++{
++      register_console(&kgdbcons);
++      return 0;
++}
++
++console_initcall(kgdb_console_init);
++#endif
++
++static int __init opt_kgdb_enter(char *str)
++{
++      /* A non-zero kgdb_initialized means an explicit breakpoint()
++       * call got here first; otherwise breakpoint() is called now
++       * and takes care of the initialization.  @str is not used.
++       */
++      if (!kgdb_initialized)
++              breakpoint();
++
++      return 0;
++}
++
++early_param("kgdbwait", opt_kgdb_enter);
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/kernel/kgdbarchlib.c linux-2.6.18-53.1.14.kgdb/kernel/kgdbarchlib.c
+--- linux-2.6.18-53.1.14/kernel/kgdbarchlib.c  1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/kernel/kgdbarchlib.c     2008-06-10 15:37:25.000000000 +0400
+@@ -0,0 +1,198 @@
++#include <linux/kgdb.h>
++
++struct kgdb_arch *kgdb_ops = &arch_kgdb_ops;
++
++/**
++ *    kgdb_arch_init - Perform any architecture specific initialization.
++ *
++ *    RETURN:
++ *    The return value is ignored.
++ *
++ *    This function will handle the initialization of any architecture
++ *    specific hooks.  This weak default does nothing and returns 0.
++ */
++int __attribute__ ((weak))
++    kgdb_arch_init(void)
++{
++      return 0;
++}
++
++/**
++ *    kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb.
++ *    @regs: Current &struct pt_regs.
++ *
++ *    This function will be called if the particular architecture must
++ *    disable hardware debugging while it is processing gdb packets or
++ *    handling an exception.  This weak default does nothing.
++ */
++void __attribute__ ((weak))
++    kgdb_disable_hw_debug(struct pt_regs *regs)
++{
++}
++
++/*
++ * Hook for skipping an int3 exception that occurs after a breakpoint has
++ * been removed (backtracking eip by 1, since the int3 incremented it).
++ * This weak default changes nothing and returns 0.
++ */
++int __attribute__ ((weak))
++      kgdb_skipexception(int exception, struct pt_regs *regs)
++{
++      return 0;
++}
++
++/**
++ *    kgdb_set_hw_break - Set a hardware breakpoint at @addr.
++ *    @addr: Breakpoint address; this weak default ignores it and returns 0.
++ */
++int __attribute__ ((weak))
++    kgdb_set_hw_break(unsigned long addr)
++{
++      return 0;
++}
++
++/**
++ *    kgdb_remove_hw_break - Remove a hardware breakpoint at @addr.
++ *    @addr: Breakpoint address; this weak default ignores it and returns 0.
++ */
++int __attribute__ ((weak))
++    kgdb_remove_hw_break(unsigned long addr)
++{
++      return 0;
++}
++
++/**
++ *    kgdb_remove_all_hw_break - Clear all hardware breakpoints (default: no-op).
++ */
++void __attribute__ ((weak))
++    kgdb_remove_all_hw_break(void)
++{
++}
++
++/**
++ *    kgdb_correct_hw_break - Correct hardware breakpoints.
++ *
++ *    A hook to allow for changes to the hardware breakpoint, called
++ *    after a single step (s) or continue (c) packet, and once we're about
++ *    to let the kernel continue running.
++ *
++ *    This is used to set the hardware breakpoint registers for all the
++ *    slave cpus on an SMP configuration. This must be called after any
++ *    changes are made to the hardware breakpoints (such as by a single
++ *    step (s) or continue (c) packet). This is only required on
++ *    architectures that support SMP and where every processor has its own
++ *    set of breakpoint registers.  This weak default does nothing.
++ */
++void __attribute__ ((weak))
++    kgdb_correct_hw_break(void)
++{
++}
++
++/**
++ *    kgdb_post_master_code - Save error vector/code numbers.
++ *    @regs: Original pt_regs.
++ *    @e_vector: Original error vector.
++ *    @err_code: Original error code.
++ *
++ *    This is needed on architectures which support SMP and KGDB.
++ *    This function is called after all the slave cpus have been put
++ *    to a known spin state and the master CPU has control over KGDB.
++ */
++
++void __attribute__ ((weak))
++    kgdb_post_master_code(struct pt_regs *regs, int e_vector, int err_code)
++{
++}
++
++/**
++ *    kgdb_roundup_cpus - Get other CPUs into a holding pattern
++ *    @flags: Current IRQ state
++ *
++ *    On SMP systems, we need to get the attention of the other CPUs
++ *    and get them to be in a known state.  This should do what is needed
++ *    to get the other CPUs to call kgdb_wait(). Note that on some arches,
++ *    the NMI approach is not used for rounding up all the CPUs. For example,
++ *    in case of MIPS, smp_call_function() is used to roundup CPUs. In
++ *    this case, we have to make sure that interrupts are enabled before
++ *    calling smp_call_function(). The argument to this function is
++ *    the flags that will be used when restoring the interrupts. There is
++ *    a local_irq_save() call before kgdb_roundup_cpus().
++ */
++void __attribute__ ((weak))
++    kgdb_roundup_cpus(unsigned long flags)
++{
++}
++
++/**
++ *    kgdb_shadowinfo - Get shadowed information on @threadid.
++ *    @regs: The &struct pt_regs of the current process.
++ *    @buffer: A buffer of %BUFMAX size; the weak default leaves it untouched.
++ *    @threadid: The thread id of the shadowed process to get information on.
++ */
++void __attribute__ ((weak))
++    kgdb_shadowinfo(struct pt_regs *regs, char *buffer, unsigned threadid)
++{
++}
++
++/**
++ *    kgdb_get_shadow_thread - Get the shadowed &task_struct of @threadid.
++ *    @regs: The &struct pt_regs of the current thread.
++ *    @threadid: The thread id of the shadowed process to get information on.
++ *
++ *    RETURN:
++ *    This returns a pointer to the &struct task_struct of the shadowed
++ *    thread, @threadid.  The weak default always returns NULL.
++ */
++struct task_struct __attribute__ ((weak))
++    * kgdb_get_shadow_thread(struct pt_regs *regs, int threadid)
++{
++      return NULL;
++}
++
++/**
++ *    kgdb_shadow_regs - Return the shadowed registers of @threadid.
++ *    @regs: The &struct pt_regs of the current thread.
++ *    @threadid: The thread id we want the &struct pt_regs for.
++ *
++ *    RETURN:
++ *    A pointer to the &struct pt_regs of the shadowed thread @threadid (NULL here).
++ */
++struct pt_regs __attribute__ ((weak))
++    * kgdb_shadow_regs(struct pt_regs *regs, int threadid)
++{
++      return NULL;
++}
++
++int __attribute__ ((weak))
++     kgdb_validate_break_address(unsigned long addr)
++{
++      char probe[BREAK_INSTR_SIZE];
++
++      /* The address is taken as valid if kgdb_get_mem accepts it. */
++      return kgdb_get_mem((char *)addr, probe, BREAK_INSTR_SIZE);
++}
++
++int __attribute__ ((weak))
++     kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
++{
++      int rc;
++
++      /* Save the bytes at addr, then put the breakpoint in their place. */
++      rc = kgdb_get_mem((char *)addr, saved_instr, BREAK_INSTR_SIZE);
++      if (rc < 0)
++              return rc;
++      rc = kgdb_set_mem((char *)addr, kgdb_ops->gdb_bpt_instr,
++                        BREAK_INSTR_SIZE);
++      return rc < 0 ? rc : 0;
++}
++
++int __attribute__ ((weak))
++     kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
++{
++      int rc;
++
++      /* Hand the BREAK_INSTR_SIZE bytes in bundle to kgdb_set_mem for
++       * addr; a negative result is passed on, anything else becomes 0. */
++      rc = kgdb_set_mem((char *)addr, bundle, BREAK_INSTR_SIZE);
++      return rc < 0 ? rc : 0;
++}
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/kernel/module.c linux-2.6.18-53.1.14.kgdb/kernel/module.c
+--- linux-2.6.18-53.1.14/kernel/module.c       2008-03-06 05:54:13.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/kernel/module.c  2008-06-10 15:39:15.000000000 +0400
+@@ -65,6 +65,7 @@ static DEFINE_SPINLOCK(modlist_lock);
+ /* List of modules, protected by module_mutex AND modlist_lock */
+ static DEFINE_MUTEX(module_mutex);
+ static LIST_HEAD(modules);
++static DECLARE_MUTEX(notify_mutex);
+ 
+ static BLOCKING_NOTIFIER_HEAD(module_notify_list);
+ 
+@@ -701,6 +702,12 @@ sys_delete_module(const char __user *nam
+       if (ret != 0)
+               goto out;
+ 
++      down(&notify_mutex);
++      blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING,
++                              mod);
++      up(&notify_mutex);
++
++
+       /* Never wait if forced. */
+       if (!forced && module_refcount(mod) != 0)
+               wait_for_zero_refcount(mod);
+@@ -713,6 +720,11 @@ sys_delete_module(const char __user *nam
+       }
+       free_module(mod);
+ 
++      down(&notify_mutex);
++      blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GONE,
++                      NULL);
++      up(&notify_mutex);
++
+  out:
+       mutex_unlock(&module_mutex);
+       return ret;
+@@ -1119,6 +1131,11 @@ static void free_module(struct module *m
+       /* Arch-specific cleanup. */
+       module_arch_cleanup(mod);
+ 
++#ifdef CONFIG_KGDB
++      /* kgdb info */
++      vfree(mod->mod_sections);
++#endif
++
+       /* Module unload stuff */
+       module_unload_free(mod);
+ 
+@@ -1378,6 +1395,31 @@ static void setup_modinfo(struct module 
+       }
+ }
+ 
++#ifdef CONFIG_KGDB
++int add_modsects (struct module *mod, Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const
++                char *secstrings)
++{
++      int count = hdr->e_shnum - 1;   /* section header 0 is left out */
++      int n;
++
++      mod->num_sections = count;
++      mod->mod_sections = vmalloc(count * sizeof (struct mod_section));
++      if (mod->mod_sections == NULL)
++              return -ENOMEM;
++
++      /* Record address and name of section header n + 1 in slot n. */
++      for (n = 0; n < count; n++) {
++              mod->mod_sections[n].address = (void *)sechdrs[n + 1].sh_addr;
++              strncpy(mod->mod_sections[n].name,
++                      secstrings + sechdrs[n + 1].sh_name, MAX_SECTNAME);
++              /* assumes name[] holds MAX_SECTNAME + 1 bytes - TODO confirm */
++              mod->mod_sections[n].name[MAX_SECTNAME] = '\0';
++      }
++
++      return 0;
++}
++#endif
++
+ #ifdef CONFIG_KALLSYMS
+ int is_exported(const char *name, const struct module *mod)
+ {
+@@ -1796,6 +1838,12 @@ static struct module *load_module(void _
+ 
+       add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
+ 
++#ifdef CONFIG_KGDB
++        if ((err = add_modsects(mod, hdr, sechdrs, secstrings)) < 0) {
++                goto nomodsectinfo;
++        }
++#endif
++
+       err = module_finalize(hdr, sechdrs, mod);
+       if (err < 0)
+               goto cleanup;
+@@ -1856,6 +1904,11 @@ static struct module *load_module(void _
+  arch_cleanup:
+       module_arch_cleanup(mod);
+  cleanup:
++
++#ifdef CONFIG_KGDB
++nomodsectinfo:
++       vfree(mod->mod_sections);
++#endif
+       module_unload_free(mod);
+       module_free(mod, mod->module_init);
+  free_core:
+@@ -1927,6 +1980,10 @@ sys_init_module(void __user *umod,
+               /* Init routine failed: abort.  Try to protect us from
+                    buggy refcounters. */
+               mod->state = MODULE_STATE_GOING;
++              down(&notify_mutex);
++              blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING,
++                              mod);
++              up(&notify_mutex);
+               synchronize_sched();
+               if (mod->unsafe)
+                       printk(KERN_ERR "%s: module is now stuck!\n",
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/kernel/sched.c linux-2.6.18-53.1.14.kgdb/kernel/sched.c
+--- linux-2.6.18-53.1.14/kernel/sched.c        2008-03-06 05:54:44.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/kernel/sched.c   2008-06-10 15:37:25.000000000 +0400
+@@ -52,6 +52,7 @@
+ #include <linux/acct.h>
+ #include <linux/kprobes.h>
+ #include <linux/delayacct.h>
++#include <linux/kgdb.h>
+ #include <asm/tlb.h>
+ 
+ #include <asm/unistd.h>
+@@ -6835,6 +6836,9 @@ void __might_sleep(char *file, int line)
+ #ifdef in_atomic
+       static unsigned long prev_jiffy;        /* ratelimiting */
+ 
++      if (atomic_read(&debugger_active))
++              return;
++
+       if ((in_atomic() || irqs_disabled()) &&
+           system_state == SYSTEM_RUNNING && !oops_in_progress) {
+               if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/kernel/softlockup.c linux-2.6.18-53.1.14.kgdb/kernel/softlockup.c
+--- linux-2.6.18-53.1.14/kernel/softlockup.c   2008-03-06 05:54:44.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/kernel/softlockup.c      2008-06-10 15:39:21.000000000 +0400
+@@ -13,6 +13,7 @@
+ #include <linux/kthread.h>
+ #include <linux/notifier.h>
+ #include <linux/module.h>
++#include <linux/kgdb.h>
+ 
+ static DEFINE_SPINLOCK(print_lock);
+ 
+@@ -37,6 +38,9 @@ static struct notifier_block panic_block
+ void touch_softlockup_watchdog(void)
+ {
+       __raw_get_cpu_var(touch_timestamp) = jiffies;
++#ifdef CONFIG_KGDB
++      atomic_set(&kgdb_sync_softlockup[raw_smp_processor_id()], 0);
++#endif
+ }
+ EXPORT_SYMBOL(touch_softlockup_watchdog);
+ 
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/kernel/timer.c linux-2.6.18-53.1.14.kgdb/kernel/timer.c
+--- linux-2.6.18-53.1.14/kernel/timer.c        2008-03-06 05:54:50.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/kernel/timer.c   2008-06-10 15:39:21.000000000 +0400
+@@ -34,6 +34,7 @@
+ #include <linux/cpu.h>
+ #include <linux/syscalls.h>
+ #include <linux/delay.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/uaccess.h>
+ #include <asm/unistd.h>
+@@ -1385,7 +1386,11 @@ static void run_timer_softirq(struct sof
+  */
+ void run_local_timers(void)
+ {
++      int this_cpu = smp_processor_id();
+       raise_softirq(TIMER_SOFTIRQ);
++#ifdef CONFIG_KGDB
++      if(!atomic_read(&kgdb_sync_softlockup[this_cpu]))
++#endif
+       softlockup_tick();
+ }
+ 
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/lib/Kconfig.debug linux-2.6.18-53.1.14.kgdb/lib/Kconfig.debug
+--- linux-2.6.18-53.1.14/lib/Kconfig.debug     2008-03-06 05:54:32.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/lib/Kconfig.debug        2008-06-10 15:38:56.000000000 +0400
+@@ -324,7 +324,7 @@ config DEBUG_LIST
+ 
+ config FRAME_POINTER
+       bool "Compile the kernel with frame pointers"
+-      depends on DEBUG_KERNEL && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML || S390)
++      depends on DEBUG_KERNEL && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML || S390 || SUPERH)
+       default y if DEBUG_INFO && UML
+       help
+         If you say Y here the resulting kernel image will be slightly larger
+@@ -377,3 +377,158 @@ config RCU_TORTURE_TEST
+         at boot time (you probably don't).
+         Say M if you want the RCU torture tests to build as a module.
+         Say N if you are unsure.
++
++config WANT_EXTRA_DEBUG_INFORMATION
++      bool
++      select DEBUG_INFO
++      select FRAME_POINTER if X86 || SUPERH
++      default n
++
++config KGDB
++      bool "KGDB: kernel debugging with remote gdb"
++      select WANT_EXTRA_DEBUG_INFORMATION
++      depends on DEBUG_KERNEL && (ARM || X86 || MIPS || (SUPERH && !SUPERH64) || IA64 || X86_64 || PPC)
++      help
++        If you say Y here, it will be possible to remotely debug the
++        kernel using gdb. It is strongly suggested that you enable
++        DEBUG_INFO, and if available on your platform, FRAME_POINTER.
++        Documentation of kernel debugger available at
++        http://kgdb.sourceforge.net as well as in DocBook form
++        in Documentation/DocBook/.  If unsure, say N.
++
++config KGDB_CONSOLE
++      bool "KGDB: Console messages through gdb"
++      depends on KGDB
++        help
++          If you say Y here, console messages will appear through gdb.
++          Other consoles such as tty or ttyS will continue to work as usual.
++          Note, that if you use this in conjunction with KGDB_ETH, if the
++          ethernet driver runs into an error condition during use with KGDB
++          it is possible to hit an infinite recursion, causing the kernel
++          to crash, and typically reboot.  For this reason, it is preferable
++          to use NETCONSOLE in conjunction with KGDB_ETH instead of
++          KGDB_CONSOLE.
++
++choice
++      prompt "Method for KGDB communication"
++      depends on KGDB
++      default KGDB_8250_NOMODULE
++      default KGDB_MPSC if SERIAL_MPSC
++      default KGDB_CPM_UART if (8xx || 8260)
++      default KGDB_SIBYTE if SIBYTE_SB1xxx_SOC
++      help
++        There are a number of different ways in which you can communicate
++        with KGDB.  The most common is via serial, with the 8250 driver
++        (should your hardware have an 8250, or ns1655x style uart).
++        Another option is to use the NETPOLL framework and UDP, should
++        your ethernet card support this.  Other options may exist.
++        You can elect to have one core I/O driver that is built into the
++        kernel for debugging as the kernel is booting, or using only
++        kernel modules.
++
++config KGDB_ONLY_MODULES
++      bool "KGDB: Use only kernel modules for I/O"
++      depends on MODULES
++      help
++        Use only kernel modules to configure KGDB I/O after the
++        kernel is booted.
++
++config KGDB_8250_NOMODULE
++      bool "KGDB: On generic serial port (8250)"
++      select KGDB_8250
++      help
++        Uses generic serial port (8250) to communicate with the host
++        GDB.  This is independent of the normal (SERIAL_8250) driver
++        for this chipset.
++
++config KGDBOE_NOMODULE
++      bool "KGDB: On ethernet - in kernel"
++      select KGDBOE
++      select NETPOLL
++      select NETPOLL_TRAP
++      select NETPOLL_RX
++      help
++        Uses the NETPOLL API to communicate with the host GDB via UDP.
++        In order for this to work, the ethernet interface specified must
++        support the NETPOLL API, and this must be initialized at boot.
++        See the documentation for syntax.
++
++config KGDB_MPSC
++      bool "KGDB on MV64x60 MPSC"
++      depends on SERIAL_MPSC
++      help
++        Uses a Marvell GT64260B or MV64x60 Multi-Purpose Serial
++        Controller (MPSC) channel. Note that the GT64260A is not
++        supported.
++
++config KGDB_CPM_UART
++      bool "KGDB: On CPM UART"
++      depends on PPC && (CPM2 || 8xx)
++      help
++        Uses CPM UART to communicate with the host GDB.
++
++config KGDB_SIBYTE
++      bool "KGDB: On the Broadcom SWARM serial port"
++      depends on MIPS && SIBYTE_SB1xxx_SOC
++endchoice
++
++config KGDBOE
++      tristate "KGDB: On ethernet" if !KGDBOE_NOMODULE
++      depends on m && KGDB
++      select NETPOLL
++      select NETPOLL_TRAP
++      select NETPOLL_RX
++      help
++        Uses the NETPOLL API to communicate with the host GDB via UDP.
++        In order for this to work, the ethernet interface specified must
++        support the NETPOLL API, and this must be initialized at boot.
++        See the documentation for syntax.
++
++config KGDB_8250
++      tristate "KGDB: On generic serial port (8250)" if !KGDB_8250_NOMODULE
++      depends on m && KGDB_ONLY_MODULES
++      help
++        Uses generic serial port (8250) to communicate with the host
++        GDB.  This is independent of the normal (SERIAL_8250) driver
++        for this chipset.
++
++config KGDB_SIMPLE_SERIAL
++      bool "Simple selection of KGDB serial port"
++      depends on KGDB_8250_NOMODULE
++      default y
++      help
++        If you say Y here, you will only have to pick the baud rate
++        and port number that you wish to use for KGDB.  Note that this
++        only works on architectures that register known serial ports
++        early on.  If you say N, you will have to provide, either here
++        or on the command line, the type (I/O or MMIO), IRQ and
++        address to use.  If in doubt, say Y.
++
++config KGDB_BAUDRATE
++      int "Debug serial port baud rate"
++      depends on (KGDB_8250 && KGDB_SIMPLE_SERIAL)
++      default "115200"
++      help
++        gdb and the kernel stub need to agree on the baud rate to be
++        used.  Standard rates from 9600 to 115200 are allowed, and this
++        may be overridden via the commandline.
++
++config KGDB_PORT_NUM
++      int "Serial port number for KGDB"
++      range 0 1 if KGDB_MPSC
++      range 0 3
++      depends on (KGDB_8250 && KGDB_SIMPLE_SERIAL) || KGDB_MPSC
++      default "1"
++      help
++        Pick the port number (0 based) for KGDB to use.
++
++config KGDB_8250_CONF_STRING
++      string "Configuration string for KGDB"
++      depends on KGDB_8250_NOMODULE && !KGDB_SIMPLE_SERIAL
++      default "io,2f8,115200,3" if X86
++      help
++        The format of this string should be <io or
++        mmio>,<address>,<baud rate>,<irq>.  For example, to use the
++        serial port on an i386 box located at 0x2f8 and 115200 baud
++        on IRQ 3, use:
++        io,2f8,115200,3
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/net/core/netpoll.c linux-2.6.18-53.1.14.kgdb/net/core/netpoll.c
+--- linux-2.6.18-53.1.14/net/core/netpoll.c    2008-03-06 05:54:27.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/net/core/netpoll.c       2008-06-10 15:37:49.000000000 +0400
+@@ -525,7 +525,8 @@ int __netpoll_rx(struct sk_buff *skb)
+ 
+       np->rx_hook(np, ntohs(uh->source),
+                   (char *)(uh+1),
+-                  ulen - sizeof(struct udphdr));
++                  ulen - sizeof(struct udphdr),
++                  skb);
+ 
+       kfree_skb(skb);
+       return 1;
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/scripts/dwarfh.awk linux-2.6.18-53.1.14.kgdb/scripts/dwarfh.awk
+--- linux-2.6.18-53.1.14/scripts/dwarfh.awk    1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/scripts/dwarfh.awk       2008-06-10 15:39:01.000000000 +0400
+@@ -0,0 +1,19 @@
++BEGIN {
++      print "#ifndef  _ELF_DWARF_H"
++              print "/* Machine generated from dwarf2.h by scripts/dwarfh.awk */"
++}
++$2 == "=" {
++      gsub(/,/, "", $3)
++      print "#define " $1 "\t " $3
++}
++$1 == "#define" {
++      print $0
++      while( index($0,"\\") == length($0)){
++              getline
++              print $0
++      }
++}
++/.*/ {}
++END {
++      print "#endif"
++}
diff --git a/lustre/kernel_patches/patches/kgdb-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/kgdb-2.6.18-vanilla.patch

new file mode 100644 (file)

index 0000000..4d157ad
--- /dev/null
+++ b/lustre/kernel_patches/patches/kgdb-2.6.18-vanilla.patch
@@ -0,0 +1,19778 @@
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/Documentation/DocBook/Makefile linux-2.6.18.kgdb/Documentation/DocBook/Makefile
+--- linux-2.6.18/Documentation/DocBook/Makefile        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/Documentation/DocBook/Makefile   2008-06-10 16:18:58.000000000 +0400
+@@ -11,7 +11,7 @@ DOCBOOKS := wanbook.xml z8530book.xml mc
+           procfs-guide.xml writing_usb_driver.xml \
+           kernel-api.xml journal-api.xml lsm.xml usb.xml \
+           gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
+-          genericirq.xml
++          genericirq.xml kgdb.xml
+ 
+ ###
+ # The build process is as follows (targets):
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/Documentation/DocBook/kgdb.tmpl linux-2.6.18.kgdb/Documentation/DocBook/kgdb.tmpl
+--- linux-2.6.18/Documentation/DocBook/kgdb.tmpl       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/Documentation/DocBook/kgdb.tmpl  2008-06-10 16:19:47.000000000 +0400
+@@ -0,0 +1,250 @@
++<?xml version="1.0" encoding="UTF-8"?>
++<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
++      "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
++
++<book id="kgdbInternals">
++ <bookinfo>
++  <title>KGDB Internals</title>
++
++  <authorgroup>
++   <author>
++    <firstname>Tom</firstname>
++    <surname>Rini</surname>
++    <affiliation>
++     <address>
++      <email>trini@kernel.crashing.org</email>
++     </address>
++    </affiliation>
++   </author>
++  </authorgroup>
++
++  <authorgroup>
++   <author>
++    <firstname>Amit S.</firstname>
++    <surname>Kale</surname>
++    <affiliation>
++     <address>
++      <email>amitkale@linsyssoft.com</email>
++     </address>
++    </affiliation>
++   </author>
++  </authorgroup>
++
++  <copyright>
++   <year>2004-2005</year>
++   <holder>MontaVista Software, Inc.</holder>
++  </copyright>
++  <copyright>
++   <year>2004</year>
++   <holder>Amit S. Kale</holder>
++  </copyright>
++
++  <legalnotice>
++   <para>
++   This file is licensed under the terms of the GNU General Public License
++   version 2. This program is licensed "as is" without any warranty of any
++   kind, whether express or implied.
++   </para>
++
++  </legalnotice>
++ </bookinfo>
++
++<toc></toc>
++  <chapter id="Introduction">
++    <title>Introduction</title>
++    <para>
++    kgdb is a source level debugger for the Linux kernel. It is used along
++    with gdb to debug a linux kernel. Kernel developers can debug a kernel
++    similar to application programs with the use of kgdb. It makes it
++    possible to place breakpoints in kernel code, step through the code
++    and observe variables.
++    </para>
++    <para>
++    Two machines are required for using kgdb. One of these machines is a
++    development machine and the other is a test machine. The machines are
++    typically connected through a serial line, a null-modem cable which
++    connects their serial ports.  It is also possible however, to use an
++    ethernet connection between the machines.  The kernel to be debugged
++    runs on the test machine. gdb runs on the development machine. The
++    serial line or ethernet connection is used by gdb to communicate to
++    the kernel being debugged.
++    </para>
++  </chapter>
++  <chapter id="CompilingAKernel">
++    <title>Compiling a kernel</title>
++    <para>
++    To enable <symbol>CONFIG_KGDB</symbol>, look under the "Kernel debugging"
++    and then select "KGDB: kernel debugging with remote gdb".
++    </para>
++    <para>
++    The first choice for I/O is <symbol>CONFIG_KGDB_ONLY_MODULES</symbol>.
++    This means that you will only be able to use KGDB after loading a
++    kernel module that defines how you want to be able to talk with
++    KGDB.  There are two other choices (more on some architectures) that
++    can be enabled as modules later, if not picked here.
++    </para>
++    <para>The first of these is <symbol>CONFIG_KGDB_8250_NOMODULE</symbol>.
++    This has sub-options such as <symbol>CONFIG_KGDB_SIMPLE_SERIAL</symbol>
++    which toggles choosing the serial port by ttyS number or by specifying
++    a port and IRQ number.
++    </para>
++    <para>
++    The second of these choices on most systems for I/O is
++    <symbol>CONFIG_KGDBOE</symbol>. This requires that the machine to be
++    debugged has an ethernet card which supports the netpoll API, such as
++    the cards supported by <symbol>CONFIG_E100</symbol>.  There are no
++    sub-options for this, but a kernel command line option is required.
++    </para>
++  </chapter>
++  <chapter id="BootingTheKernel">
++    <title>Booting the kernel</title>
++    <para>
++    The Kernel command line option <constant>kgdbwait</constant> makes kgdb
++    wait for gdb connection during booting of a kernel.  If the
++    <symbol>CONFIG_KGDB_8250</symbol> driver is used (or if applicable,
++    another serial driver) this breakpoint will happen very early on, before
++    console output.  If you wish to change serial port information and you
++    have enabled both <symbol>CONFIG_KGDB_8250</symbol> and
++    <symbol>CONFIG_KGDB_SIMPLE_SERIAL</symbol> then you must pass the option
++    <constant>kgdb8250=&lt;io or mmio&gt;,&lt;address&gt;,&lt;baud
++    rate&gt;,&lt;irq&gt;</constant> before <constant>kgdbwait</constant>.
++    The values <constant>io</constant> or <constant>mmio</constant> indicate
++    whether the address being passed next needs to be memory mapped
++    (<constant>mmio</constant>) or not. The <constant>address</constant> must
++    be passed in hex and is the hardware address and will be remapped if
++    passed as <constant>mmio</constant>. The value
++    <constant>baud rate</constant> and <constant>irq</constant> are base-10.
++    The supported values for <constant>baud rate</constant> are
++    <constant>9600</constant>, <constant>19200</constant>,
++    <constant>38400</constant>, <constant>57600</constant>, and
++    <constant>115200</constant>.
++    </para>
++    <para>
++    To have KGDB stop the kernel and wait, with the compiled values for the
++    serial driver, pass in: <constant>kgdbwait</constant>.
++    </para>
++    <para>
++    To specify the values of the SH SCI(F) serial port at boot:
++    <constant>kgdbsci=0,115200</constant>.
++    </para>
++    <para>
++    To specify the values of the serial port at boot:
++    <constant>kgdb8250=io,3f8,115200,3</constant>.
++    On IA64 this could also be:
++    <constant>kgdb8250=mmio,0xff5e0000,115200,74</constant>
++    And to have KGDB also stop the kernel and wait for GDB to connect, pass in
++    <constant>kgdbwait</constant> after this argument.
++    </para>
++    <para>
++    To configure the <symbol>CONFIG_KGDBOE</symbol> driver, pass in
++    <constant>kgdboe=[src-port]@&lt;src-ip&gt;/[dev],[tgt-port]@&lt;tgt-ip&gt;/[tgt-macaddr]</constant>
++    where:
++    <itemizedlist>
++      <listitem><para>src-port (optional): source for UDP packets (defaults to <constant>6443</constant>)</para></listitem>
++      <listitem><para>src-ip: source IP to use (interface address)</para></listitem>
++      <listitem><para>dev (optional): network interface (<constant>eth0</constant>)</para></listitem>
++      <listitem><para>tgt-port (optional): port GDB will use (defaults to <constant>6442</constant>)</para></listitem>
++      <listitem><para>tgt-ip: IP address GDB will be connecting from</para></listitem>
++      <listitem><para>tgt-macaddr (optional): ethernet MAC address for logging agent (default is broadcast)</para></listitem>
++    </itemizedlist>
++    </para>
++    <para>
++    The <symbol>CONFIG_KGDBOE</symbol> driver can be reconfigured at run
++    time, if <symbol>CONFIG_SYSFS</symbol> and
++    <symbol>CONFIG_MODULES</symbol> are enabled, by echoing a new config string to
++    <constant>/sys/module/kgdboe/parameters/kgdboe</constant>.  The
++    driver can be unconfigured with the special string
++    <constant>not_configured</constant>.
++    </para>
++  </chapter>
++  <chapter id="ConnectingGDB">
++  <title>Connecting gdb</title>
++    <para>
++    If you have used any of the methods to have KGDB stop and create
++    an initial breakpoint described in the previous chapter, kgdb prints
++    the message "Waiting for connection from remote gdb..." on the console
++    and waits for connection from gdb. At this point you connect gdb to kgdb.
++    </para>
++    <para>
++    Example (serial):
++    </para>
++    <programlisting>
++    % gdb ./vmlinux
++    (gdb) set remotebaud 115200
++    (gdb) target remote /dev/ttyS0
++    </programlisting>
++    <para>
++    Example (ethernet):
++    </para>
++    <programlisting>
++    % gdb ./vmlinux
++    (gdb) target remote udp:192.168.2.2:6443
++    </programlisting>
++    <para>
++    Once connected, you can debug a kernel the way you would debug an
++    application program.
++    </para>
++  </chapter>
++  <chapter id="ArchitectureNotes">
++    <title>Architecture specific notes</title>
++      <para>
++      SuperH: The NMI switch found on some boards can be used to trigger an
++      initial breakpoint.  Subsequent triggers do nothing.  If console
++      is enabled on the SCI(F) serial port, and that is the port being used
++      for KGDB, then you must trigger a breakpoint via sysrq, NMI, or
++      some other method prior to connecting, or echo a control-c to the
++      serial port.  Also, to use the SCI(F) port for KGDB, the
++      <symbol>CONFIG_SERIAL_SH_SCI</symbol> driver must be enabled.
++      </para>
++  </chapter>
++  <chapter id="CommonBackEndReq">
++    <title>The common backend (required)</title>
++      <para>
++      There are a few flags which must be set on every architecture in
++      their &lt;asm/kgdb.h&gt; file.  These are:
++      <itemizedlist>
++        <listitem>
++        <para>
++        NUMREGBYTES: The size in bytes of all of the registers, so
++        that we can ensure they will all fit into a packet.
++        </para>
++        <para>
++        BUFMAX: The size in bytes of the buffer GDB will read into.
++        This must be larger than NUMREGBYTES.
++        </para>
++        <para>
++        CACHE_FLUSH_IS_SAFE: Set to one if it is always safe to call
++        flush_cache_range or flush_icache_range.  On some architectures,
++        these functions may not be safe to call on SMP since we keep other
++        CPUs in a holding pattern.
++        </para>
++      </listitem>
++      </itemizedlist>
++      </para>
++      <para>
++      There are also the following functions for the common backend,
++      found in kernel/kgdb.c that must be supplied by the
++      architecture-specific backend.  No weak version of these is provided.
++      </para>
++!Iinclude/linux/kgdb.h
++  </chapter>
++  <chapter id="CommonBackEndOpt">
++    <title>The common backend (optional)</title>
++      <para>
++      These functions are part of the common backend, found in kernel/kgdb.c
++      and are optionally implemented.  Some functions (with _hw_ in the name)
++      end up being required on arches which use hardware breakpoints.
++      </para>
++!Ikernel/kgdb.c
++  </chapter>
++  <chapter id="DriverSpecificFunctions">
++    <title>Driver-Specific Functions</title>
++      <para>
++      Some of the I/O drivers have additional functions that can be
++      called, that are specific to the driver.  Calls from other places
++      to these functions must be wrapped in #ifdefs for the driver in
++      question.
++      </para>
++!Idrivers/serial/8250_kgdb.c
++   </chapter>
++</book>
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/MAINTAINERS linux-2.6.18.kgdb/MAINTAINERS
+--- linux-2.6.18/MAINTAINERS   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/MAINTAINERS      2008-06-10 16:18:58.000000000 +0400
+@@ -1685,6 +1685,15 @@ L:      linux-kernel@vger.kernel.org
+ L:    fastboot@osdl.org
+ S:    Maintained
+ 
++KGDB
++P:    Tom Rini
++P:    Amit S. Kale
++M:    trini@kernel.crashing.org
++M:    amitkale@linsyssoft.com
++W:    http://sourceforge.net/projects/kgdb
++L:    kgdb-bugreport@lists.sourceforge.net
++S:    Maintained
++
+ KPROBES
+ P:    Prasanna S Panchamukhi
+ M:    prasanna@in.ibm.com
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/Makefile linux-2.6.18.kgdb/Makefile
+--- linux-2.6.18/Makefile      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/Makefile 2008-06-10 16:19:57.000000000 +0400
+@@ -990,6 +990,7 @@ MRPROPER_DIRS  += include/config include
+ MRPROPER_FILES += .config .config.old include/asm .version .old_version \
+                   include/linux/autoconf.h include/linux/version.h      \
+                   include/linux/utsrelease.h                            \
++                include/linux/dwarf2-defs.h                           \
+                 Module.symvers tags TAGS cscope*
+ 
+ # clean - Delete most, but leave enough to build external modules
+@@ -1416,7 +1417,11 @@ clean := -f $(if $(KBUILD_SRC),$(srctree
+ endif # skip-makefile
+ 
+ PHONY += FORCE
+-FORCE:
++include/linux/dwarf2-defs.h: $(srctree)/include/linux/dwarf2.h $(srctree)/scripts/dwarfh.awk
++      mkdir -p include/linux/
++      awk -f $(srctree)/scripts/dwarfh.awk $(srctree)/include/linux/dwarf2.h > include/linux/dwarf2-defs.h
++
++FORCE: include/linux/dwarf2-defs.h
+ 
+ 
+ # Declare the contents of the .PHONY variable as phony.  We keep that
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/kernel/Makefile linux-2.6.18.kgdb/arch/arm/kernel/Makefile
+--- linux-2.6.18/arch/arm/kernel/Makefile      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/arm/kernel/Makefile 2008-06-10 16:19:51.000000000 +0400
+@@ -20,6 +20,7 @@ obj-$(CONFIG_ISA_DMA)                += dma-isa.o
+ obj-$(CONFIG_PCI)             += bios32.o isa.o
+ obj-$(CONFIG_SMP)             += smp.o
+ obj-$(CONFIG_OABI_COMPAT)     += sys_oabi-compat.o
++obj-$(CONFIG_KGDB)            += kgdb.o kgdb-jmp.o
+ 
+ obj-$(CONFIG_CRUNCH)          += crunch.o crunch-bits.o
+ AFLAGS_crunch-bits.o          := -Wa,-mcpu=ep9312
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/kernel/entry-armv.S linux-2.6.18.kgdb/arch/arm/kernel/entry-armv.S
+--- linux-2.6.18/arch/arm/kernel/entry-armv.S  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/arm/kernel/entry-armv.S     2008-06-10 16:19:58.000000000 +0400
+@@ -15,6 +15,7 @@
+  *  it to save wrong values...  Be aware!
+  */
+ 
++#include <asm/kgdb.h>
+ #include <asm/memory.h>
+ #include <asm/glue.h>
+ #include <asm/vfpmacros.h>
+@@ -232,6 +233,7 @@ svc_preempt:
+       beq     preempt_return                  @ go again
+       b       1b
+ #endif
++      CFI_END_FRAME(__irq_svc)
+ 
+       .align  5
+ __und_svc:
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/kernel/kgdb-jmp.S linux-2.6.18.kgdb/arch/arm/kernel/kgdb-jmp.S
+--- linux-2.6.18/arch/arm/kernel/kgdb-jmp.S    1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/arm/kernel/kgdb-jmp.S       2008-06-10 16:19:51.000000000 +0400
+@@ -0,0 +1,32 @@
++/*
++ * arch/arm/kernel/kgdb-jmp.S
++ *
++ * Trivial setjmp and longjmp procedures to support bus error recovery
++ * which may occur during kgdb memory read/write operations.
++ *
++ * Author: MontaVista Software, Inc. <source@mvista.com>
++ *         source@mvista.com
++ *
++ * 2002-2005 (c) MontaVista Software, Inc.  This file is licensed under the
++ * terms of the GNU General Public License version 2. This program as licensed
++ * "as is" without any warranty of any kind, whether express or implied.
++ */
++#include <linux/linkage.h>
++
++ENTRY (kgdb_fault_setjmp)
++      /* Save registers */
++      stmia   r0, {r0-r14}
++      str     lr,[r0, #60]
++      mrs     r1,cpsr
++      str     r1,[r0,#64]
++      ldr     r1,[r0,#4]
++      mov     r0, #0
++      mov     pc,lr
++
++ENTRY (kgdb_fault_longjmp)
++      /* Restore registers */
++      mov     r1,#1
++      str     r1,[r0]
++      ldr     r1,[r0, #64]
++      msr     spsr,r1
++      ldmia   r0,{r0-pc}^
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/kernel/kgdb.c linux-2.6.18.kgdb/arch/arm/kernel/kgdb.c
+--- linux-2.6.18/arch/arm/kernel/kgdb.c        1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/arm/kernel/kgdb.c   2008-06-10 16:19:51.000000000 +0400
+@@ -0,0 +1,208 @@
++/*
++ * arch/arm/kernel/kgdb.c
++ *
++ * ARM KGDB support
++ *
++ * Copyright (c) 2002-2004 MontaVista Software, Inc
++ *
++ * Authors:  George Davis <davis_g@mvista.com>
++ *           Deepak Saxena <dsaxena@plexity.net>
++ */
++#include <linux/config.h>
++#include <linux/types.h>
++#include <linux/kernel.h>
++#include <linux/signal.h>
++#include <linux/sched.h>
++#include <linux/mm.h>
++#include <linux/spinlock.h>
++#include <linux/personality.h>
++#include <linux/ptrace.h>
++#include <linux/elf.h>
++#include <linux/interrupt.h>
++#include <linux/init.h>
++#include <linux/kgdb.h>
++
++#include <asm/atomic.h>
++#include <asm/io.h>
++#include <asm/pgtable.h>
++#include <asm/system.h>
++#include <asm/uaccess.h>
++#include <asm/unistd.h>
++#include <asm/ptrace.h>
++#include <asm/traps.h>
++
++/* Make a local copy of the registers passed into the handler (bletch) */
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *kernel_regs)
++{
++      int regno;
++
++      /* Initialize all to zero (??) */
++      for (regno = 0; regno < GDB_MAX_REGS; regno++)
++              gdb_regs[regno] = 0;
++
++      gdb_regs[_R0] = kernel_regs->ARM_r0;
++      gdb_regs[_R1] = kernel_regs->ARM_r1;
++      gdb_regs[_R2] = kernel_regs->ARM_r2;
++      gdb_regs[_R3] = kernel_regs->ARM_r3;
++      gdb_regs[_R4] = kernel_regs->ARM_r4;
++      gdb_regs[_R5] = kernel_regs->ARM_r5;
++      gdb_regs[_R6] = kernel_regs->ARM_r6;
++      gdb_regs[_R7] = kernel_regs->ARM_r7;
++      gdb_regs[_R8] = kernel_regs->ARM_r8;
++      gdb_regs[_R9] = kernel_regs->ARM_r9;
++      gdb_regs[_R10] = kernel_regs->ARM_r10;
++      gdb_regs[_FP] = kernel_regs->ARM_fp;
++      gdb_regs[_IP] = kernel_regs->ARM_ip;
++      gdb_regs[_SP] = kernel_regs->ARM_sp;
++      gdb_regs[_LR] = kernel_regs->ARM_lr;
++      gdb_regs[_PC] = kernel_regs->ARM_pc;
++      gdb_regs[_CPSR] = kernel_regs->ARM_cpsr;
++}
++
++/* Copy local gdb registers back to kgdb regs, for later copy to kernel */
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *kernel_regs)
++{
++      kernel_regs->ARM_r0 = gdb_regs[_R0];
++      kernel_regs->ARM_r1 = gdb_regs[_R1];
++      kernel_regs->ARM_r2 = gdb_regs[_R2];
++      kernel_regs->ARM_r3 = gdb_regs[_R3];
++      kernel_regs->ARM_r4 = gdb_regs[_R4];
++      kernel_regs->ARM_r5 = gdb_regs[_R5];
++      kernel_regs->ARM_r6 = gdb_regs[_R6];
++      kernel_regs->ARM_r7 = gdb_regs[_R7];
++      kernel_regs->ARM_r8 = gdb_regs[_R8];
++      kernel_regs->ARM_r9 = gdb_regs[_R9];
++      kernel_regs->ARM_r10 = gdb_regs[_R10];
++      kernel_regs->ARM_fp = gdb_regs[_FP];
++      kernel_regs->ARM_ip = gdb_regs[_IP];
++      kernel_regs->ARM_sp = gdb_regs[_SP];
++      kernel_regs->ARM_lr = gdb_regs[_LR];
++      kernel_regs->ARM_pc = gdb_regs[_PC];
++      kernel_regs->ARM_cpsr = gdb_regs[GDB_MAX_REGS - 1];
++}
++
++static inline struct pt_regs *kgdb_get_user_regs(struct task_struct *task)
++{
++      return (struct pt_regs *)
++          ((unsigned long)task->thread_info + THREAD_SIZE -
++           8 - sizeof(struct pt_regs));
++}
++
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs,
++                               struct task_struct *task)
++{
++      int regno;
++      struct pt_regs *thread_regs;
++
++      /* Just making sure... */
++      if (task == NULL)
++              return;
++
++      /* Initialize to zero */
++      for (regno = 0; regno < GDB_MAX_REGS; regno++)
++              gdb_regs[regno] = 0;
++
++      /* Otherwise, we have only some registers from switch_to() */
++      thread_regs = kgdb_get_user_regs(task);
++      gdb_regs[_R0] = thread_regs->ARM_r0;    /* Not really valid? */
++      gdb_regs[_R1] = thread_regs->ARM_r1;    /* "               " */
++      gdb_regs[_R2] = thread_regs->ARM_r2;    /* "               " */
++      gdb_regs[_R3] = thread_regs->ARM_r3;    /* "               " */
++      gdb_regs[_R4] = thread_regs->ARM_r4;
++      gdb_regs[_R5] = thread_regs->ARM_r5;
++      gdb_regs[_R6] = thread_regs->ARM_r6;
++      gdb_regs[_R7] = thread_regs->ARM_r7;
++      gdb_regs[_R8] = thread_regs->ARM_r8;
++      gdb_regs[_R9] = thread_regs->ARM_r9;
++      gdb_regs[_R10] = thread_regs->ARM_r10;
++      gdb_regs[_FP] = thread_regs->ARM_fp;
++      gdb_regs[_IP] = thread_regs->ARM_ip;
++      gdb_regs[_SP] = thread_regs->ARM_sp;
++      gdb_regs[_LR] = thread_regs->ARM_lr;
++      gdb_regs[_PC] = thread_regs->ARM_pc;
++      gdb_regs[_CPSR] = thread_regs->ARM_cpsr;
++}
++
++static int compiled_break;
++
++int kgdb_arch_handle_exception(int exception_vector, int signo,
++                             int err_code, char *remcom_in_buffer,
++                             char *remcom_out_buffer,
++                             struct pt_regs *linux_regs)
++{
++      long addr;
++      char *ptr;
++
++      switch (remcom_in_buffer[0]) {
++      case 'c':
++              kgdb_contthread = NULL;
++
++              /*
++               * Try to read optional parameter, pc unchanged if no parm.
++               * If this was a compiled breakpoint, we need to move
++               * to the next instruction or we will just breakpoint
++               * over and over again.
++               */
++              ptr = &remcom_in_buffer[1];
++              if (kgdb_hex2long(&ptr, &addr)) {
++                      linux_regs->ARM_pc = addr;
++              } else if (compiled_break == 1) {
++                      linux_regs->ARM_pc += 4;
++              }
++
++              compiled_break = 0;
++
++              return 0;
++      }
++
++      return -1;
++}
++
++static int kgdb_brk_fn(struct pt_regs *regs, unsigned int instr)
++{
++      kgdb_handle_exception(1, SIGTRAP, 0, regs);
++
++      return 0;
++}
++
++static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int instr)
++{
++      compiled_break = 1;
++      kgdb_handle_exception(1, SIGTRAP, 0, regs);
++
++      return 0;
++}
++
++static struct undef_hook kgdb_brkpt_hook = {
++      .instr_mask = 0xffffffff,
++      .instr_val = KGDB_BREAKINST,
++      .fn = kgdb_brk_fn
++};
++
++static struct undef_hook kgdb_compiled_brkpt_hook = {
++      .instr_mask = 0xffffffff,
++      .instr_val = KGDB_COMPILED_BREAK,
++      .fn = kgdb_compiled_brk_fn
++};
++
++/*
++ * Register our undef instruction hooks with ARM undef core.
++ * We register a hook specifically looking for the KGDB break inst
++ * and we handle the normal undef case within the do_undefinstr
++ * handler.
++ */
++int kgdb_arch_init(void)
++{
++      register_undef_hook(&kgdb_brkpt_hook);
++      register_undef_hook(&kgdb_compiled_brkpt_hook);
++
++      return 0;
++}
++
++struct kgdb_arch arch_kgdb_ops = {
++#ifndef __ARMEB__
++      .gdb_bpt_instr = {0xfe, 0xde, 0xff, 0xe7}
++#else
++      .gdb_bpt_instr = {0xe7, 0xff, 0xde, 0xfe}
++#endif
++};
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/kernel/setup.c linux-2.6.18.kgdb/arch/arm/kernel/setup.c
+--- linux-2.6.18/arch/arm/kernel/setup.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/arm/kernel/setup.c  2008-06-10 16:19:51.000000000 +0400
+@@ -829,6 +829,11 @@ void __init setup_arch(char **cmdline_p)
+       conswitchp = &dummy_con;
+ #endif
+ #endif
++
++#if   defined(CONFIG_KGDB)
++      extern void __init early_trap_init(void);
++      early_trap_init();
++#endif
+ }
+ 
+ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/kernel/traps.c linux-2.6.18.kgdb/arch/arm/kernel/traps.c
+--- linux-2.6.18/arch/arm/kernel/traps.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/arm/kernel/traps.c  2008-06-10 16:19:51.000000000 +0400
+@@ -278,6 +278,7 @@ asmlinkage void do_undefinstr(struct pt_
+       unsigned int instr;
+       struct undef_hook *hook;
+       siginfo_t info;
++      mm_segment_t fs;
+       void __user *pc;
+ 
+       /*
+@@ -287,12 +288,15 @@ asmlinkage void do_undefinstr(struct pt_
+        */
+       regs->ARM_pc -= correction;
+ 
++      fs = get_fs();
++      set_fs(KERNEL_DS);
+       pc = (void __user *)instruction_pointer(regs);
+       if (thumb_mode(regs)) {
+               get_user(instr, (u16 __user *)pc);
+       } else {
+               get_user(instr, (u32 __user *)pc);
+       }
++      set_fs(fs);
+ 
+       spin_lock_irq(&undef_lock);
+       list_for_each_entry(hook, &undef_hook, node) {
+@@ -684,6 +688,13 @@ EXPORT_SYMBOL(abort);
+ 
+ void __init trap_init(void)
+ {
++#if   defined(CONFIG_KGDB)
++      return;
++}
++
++void __init early_trap_init(void)
++{
++#endif
+       unsigned long vectors = CONFIG_VECTORS_BASE;
+       extern char __stubs_start[], __stubs_end[];
+       extern char __vectors_start[], __vectors_end[];
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mach-ixp2000/core.c linux-2.6.18.kgdb/arch/arm/mach-ixp2000/core.c
+--- linux-2.6.18/arch/arm/mach-ixp2000/core.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/arm/mach-ixp2000/core.c     2008-06-10 16:19:51.000000000 +0400
+@@ -34,6 +34,7 @@
+ #include <asm/system.h>
+ #include <asm/tlbflush.h>
+ #include <asm/pgtable.h>
++#include <asm/kgdb.h>
+ 
+ #include <asm/mach/map.h>
+ #include <asm/mach/time.h>
+@@ -184,6 +185,9 @@ static struct platform_device ixp2000_se
+ void __init ixp2000_uart_init(void)
+ {
+       platform_device_register(&ixp2000_serial_device);
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &ixp2000_serial_port);
++#endif
+ }
+ 
+ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mach-ixp2000/ixdp2x01.c linux-2.6.18.kgdb/arch/arm/mach-ixp2000/ixdp2x01.c
+--- linux-2.6.18/arch/arm/mach-ixp2000/ixdp2x01.c      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/arm/mach-ixp2000/ixdp2x01.c 2008-06-10 16:19:51.000000000 +0400
+@@ -38,6 +38,7 @@
+ #include <asm/system.h>
+ #include <asm/hardware.h>
+ #include <asm/mach-types.h>
++#include <asm/kgdb.h>
+ 
+ #include <asm/mach/pci.h>
+ #include <asm/mach/map.h>
+@@ -413,6 +414,11 @@ static void __init ixdp2x01_init_machine
+       platform_add_devices(ixdp2x01_devices, ARRAY_SIZE(ixdp2x01_devices));
+       ixp2000_uart_init();
+       ixdp2x01_uart_init();
++
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &ixdp425_serial_ports[0]); /* NOTE(review): ixdp425_* name used in ixdp2x01.c -- confirm it is defined for this board */
++      kgdb8250_add_port(1, &ixdp425_serial_ports[1]);
++#endif
+ }
+ 
+ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mach-ixp4xx/coyote-setup.c linux-2.6.18.kgdb/arch/arm/mach-ixp4xx/coyote-setup.c
+--- linux-2.6.18/arch/arm/mach-ixp4xx/coyote-setup.c   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/arm/mach-ixp4xx/coyote-setup.c      2008-06-10 16:19:51.000000000 +0400
+@@ -96,6 +96,10 @@ static void __init coyote_init(void)
+       }
+ 
+       platform_add_devices(coyote_devices, ARRAY_SIZE(coyote_devices));
++
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &coyote_serial_port);
++#endif
+ }
+ 
+ #ifdef CONFIG_ARCH_ADI_COYOTE
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mach-ixp4xx/ixdp425-setup.c linux-2.6.18.kgdb/arch/arm/mach-ixp4xx/ixdp425-setup.c
+--- linux-2.6.18/arch/arm/mach-ixp4xx/ixdp425-setup.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/arm/mach-ixp4xx/ixdp425-setup.c     2008-06-10 16:19:51.000000000 +0400
+@@ -24,6 +24,7 @@
+ #include <asm/irq.h>
+ #include <asm/mach/arch.h>
+ #include <asm/mach/flash.h>
++#include <asm/kgdb.h>
+ 
+ static struct flash_platform_data ixdp425_flash_data = {
+       .map_name       = "cfi_probe",
+@@ -76,7 +77,8 @@ static struct plat_serial8250_port ixdp4
+               .mapbase        = IXP4XX_UART1_BASE_PHYS,
+               .membase        = (char *)IXP4XX_UART1_BASE_VIRT + REG_OFFSET,
+               .irq            = IRQ_IXP4XX_UART1,
+-              .flags          = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST,
++              .flags          = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST |
++                                      UPF_SHARE_IRQ,
+               .iotype         = UPIO_MEM,
+               .regshift       = 2,
+               .uartclk        = IXP4XX_UART_XTAL,
+@@ -85,7 +87,8 @@ static struct plat_serial8250_port ixdp4
+               .mapbase        = IXP4XX_UART2_BASE_PHYS,
+               .membase        = (char *)IXP4XX_UART2_BASE_VIRT + REG_OFFSET,
+               .irq            = IRQ_IXP4XX_UART2,
+-              .flags          = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST,
++              .flags          = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST |
++                                      UPF_SHARE_IRQ,
+               .iotype         = UPIO_MEM,
+               .regshift       = 2,
+               .uartclk        = IXP4XX_UART_XTAL,
+@@ -116,6 +119,11 @@ static void __init ixdp425_init(void)
+               IXP4XX_EXP_BUS_BASE(0) + ixp4xx_exp_bus_size - 1;
+ 
+       platform_add_devices(ixdp425_devices, ARRAY_SIZE(ixdp425_devices));
++
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &ixdp425_serial_ports[0]);
++      kgdb8250_add_port(1, &ixdp425_serial_ports[1]);
++#endif
+ }
+ 
+ #ifdef CONFIG_ARCH_IXDP425
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mach-omap1/serial.c linux-2.6.18.kgdb/arch/arm/mach-omap1/serial.c
+--- linux-2.6.18/arch/arm/mach-omap1/serial.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/arm/mach-omap1/serial.c     2008-06-10 16:19:51.000000000 +0400
+@@ -15,6 +15,7 @@
+ #include <linux/delay.h>
+ #include <linux/serial.h>
+ #include <linux/tty.h>
++#include <linux/kgdb.h>
+ #include <linux/serial_8250.h>
+ #include <linux/serial_reg.h>
+ #include <linux/clk.h>
+@@ -199,6 +200,9 @@ void __init omap_serial_init(void)
+                       break;
+               }
+               omap_serial_reset(&serial_platform_data[i]);
++#ifdef CONFIG_KGDB_8250
++              kgdb8250_add_platform_port(i, &serial_platform_data[i]);
++#endif
+       }
+ }
+ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mach-pxa/Makefile linux-2.6.18.kgdb/arch/arm/mach-pxa/Makefile
+--- linux-2.6.18/arch/arm/mach-pxa/Makefile    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/arm/mach-pxa/Makefile       2008-06-10 16:19:51.000000000 +0400
+@@ -31,6 +31,7 @@ obj-$(CONFIG_LEDS) += $(led-y)
+ # Misc features
+ obj-$(CONFIG_PM) += pm.o sleep.o
+ obj-$(CONFIG_PXA_SSP) += ssp.o
++obj-$(CONFIG_KGDB_PXA_SERIAL) += kgdb-serial.o
+ 
+ ifeq ($(CONFIG_PXA27x),y)
+ obj-$(CONFIG_PM) += standby.o
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mach-pxa/kgdb-serial.c linux-2.6.18.kgdb/arch/arm/mach-pxa/kgdb-serial.c
+--- linux-2.6.18/arch/arm/mach-pxa/kgdb-serial.c       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/arm/mach-pxa/kgdb-serial.c  2008-06-10 16:19:51.000000000 +0400
+@@ -0,0 +1,98 @@
++/*
++ * linux/arch/arm/mach-pxa/kgdb-serial.c
++ *
++ * Provides low level kgdb serial support hooks for PXA2xx boards
++ *
++ * Author:    Nicolas Pitre
++ * Copyright: (C) 2002-2005 MontaVista Software Inc.
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ */
++
++#include <linux/config.h>
++#include <linux/serial_reg.h>
++#include <linux/kgdb.h>
++#include <asm/processor.h>
++#include <asm/hardware.h>
++#include <asm/arch/pxa-regs.h>
++
++#if   defined(CONFIG_KGDB_PXA_FFUART)
++
++#define UART          FFUART
++#define CKEN_UART     CKEN6_FFUART
++#define GPIO_RX_MD    GPIO34_FFRXD_MD
++#define GPIO_TX_MD    GPIO39_FFTXD_MD
++
++#elif defined(CONFIG_KGDB_PXA_BTUART)
++
++#define UART          BTUART
++#define CKEN_UART     CKEN7_BTUART
++#define GPIO_RX_MD    GPIO42_BTRXD_MD
++#define GPIO_TX_MD    GPIO43_BTTXD_MD
++
++#elif defined(CONFIG_KGDB_PXA_STUART)
++
++#define UART          STUART
++#define CKEN_UART     CKEN5_STUART
++#define GPIO_RX_MD    GPIO46_STRXD_MD
++#define GPIO_TX_MD    GPIO47_STTXD_MD
++
++#endif
++
++#define UART_BAUDRATE (CONFIG_KGDB_BAUDRATE)
++
++static volatile unsigned long *port = (unsigned long *)&UART;
++
++static int kgdb_serial_init(void)
++{
++      pxa_set_cken(CKEN_UART, 1);
++      pxa_gpio_mode(GPIO_RX_MD);
++      pxa_gpio_mode(GPIO_TX_MD);
++
++      port[UART_IER] = 0;
++      port[UART_LCR] = LCR_DLAB;
++      port[UART_DLL] = ((921600 / UART_BAUDRATE) & 0xff);
++      port[UART_DLM] = ((921600 / UART_BAUDRATE) >> 8);
++      port[UART_LCR] = LCR_WLS1 | LCR_WLS0;
++      port[UART_MCR] = 0;
++      port[UART_IER] = IER_UUE;
++      port[UART_FCR] = FCR_ITL_16;
++
++      return 0;
++}
++
++static void kgdb_serial_putchar(int c)
++{
++      if (!(CKEN & CKEN_UART) || port[UART_IER] != IER_UUE)
++              kgdb_serial_init();
++      while (!(port[UART_LSR] & LSR_TDRQ))
++              cpu_relax();
++      port[UART_TX] = c;
++}
++
++static void kgdb_serial_flush(void)
++{
++      if ((CKEN & CKEN_UART) && (port[UART_IER] & IER_UUE))
++              while (!(port[UART_LSR] & LSR_TEMT))
++                      cpu_relax();
++}
++
++static int kgdb_serial_getchar(void)
++{
++      unsigned char c;
++      if (!(CKEN & CKEN_UART) || port[UART_IER] != IER_UUE)
++              kgdb_serial_init();
++      while (!(port[UART_LSR] & UART_LSR_DR))
++              cpu_relax();
++      c = port[UART_RX];
++      return c;
++}
++
++struct kgdb_io kgdb_io_ops = {
++      .init = kgdb_serial_init,
++      .write_char = kgdb_serial_putchar,
++      .flush = kgdb_serial_flush,
++      .read_char = kgdb_serial_getchar,
++};
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mach-versatile/kgdb_serial.c linux-2.6.18.kgdb/arch/arm/mach-versatile/kgdb_serial.c
+--- linux-2.6.18/arch/arm/mach-versatile/kgdb_serial.c 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/arm/mach-versatile/kgdb_serial.c    2008-06-10 16:19:51.000000000 +0400
+@@ -0,0 +1,121 @@
++/*
++ * arch/arm/mach-versatile/kgdb_serial.c
++ *
++ * Author: Manish Lachwani, mlachwani@mvista.com
++ *
++ * 2005 (c) MontaVista Software, Inc. This file is licensed under
++ * the terms of the GNU General Public License version 2. This program
++ * is licensed "as is" without any warranty of any kind, whether express
++ * or implied.
++ *
++ * Support for KGDB on ARM Versatile.
++ */
++#include <linux/config.h>
++#include <linux/serial_reg.h>
++#include <linux/kgdb.h>
++#include <asm/io.h>
++#include <asm/processor.h>
++#include <asm/hardware.h>
++#include <asm/hardware/amba_serial.h>
++#include <asm/arch-versatile/hardware.h>
++
++#define ARM_BAUD_38400                23
++/*
++ * Functions that will be used later
++ */
++#define UART_GET_INT_STATUS(p)        readb((p) + UART010_IIR)
++#define UART_GET_MIS(p)               readw((p) + UART011_MIS)
++#define UART_PUT_ICR(p, c)    writel((c), (p) + UART010_ICR)
++#define UART_GET_FR(p)                readb((p) + UART01x_FR)
++#define UART_GET_CHAR(p)      readb((p) + UART01x_DR)
++#define UART_PUT_CHAR(p, c)     writel((c), (p) + UART01x_DR)
++#define UART_GET_RSR(p)               readb((p) + UART01x_RSR)
++#define UART_GET_CR(p)                readb((p) + UART010_CR)
++#define UART_PUT_CR(p,c)        writel((c), (p) + UART010_CR)
++#define UART_GET_LCRL(p)      readb((p) + UART010_LCRL)
++#define UART_PUT_LCRL(p,c)    writel((c), (p) + UART010_LCRL)
++#define UART_GET_LCRM(p)        readb((p) + UART010_LCRM)
++#define UART_PUT_LCRM(p,c)    writel((c), (p) + UART010_LCRM)
++#define UART_GET_LCRH(p)      readb((p) + UART010_LCRH)
++#define UART_PUT_LCRH(p,c)    writel((c), (p) + UART010_LCRH)
++#define UART_RX_DATA(s)               (((s) & UART01x_FR_RXFE) == 0)
++#define UART_TX_READY(s)      (((s) & UART01x_FR_TXFF) == 0)
++#define UART_TX_EMPTY(p)      ((UART_GET_FR(p) & UART01x_FR_TMSK) == 0)
++
++/*
++ * KGDB IRQ
++ */
++static int kgdb_irq = 12;
++static volatile unsigned char *port = NULL;
++
++static int kgdb_serial_init(void)
++{
++      int rate = ARM_BAUD_38400;
++
++      port = IO_ADDRESS(0x101F1000);
++      UART_PUT_CR(port, 0);
++
++      /* Set baud rate */
++      UART_PUT_LCRM(port, ((rate & 0xf00) >> 8));
++      UART_PUT_LCRL(port, (rate & 0xff));
++      UART_PUT_LCRH(port, UART01x_LCRH_WLEN_8 | UART01x_LCRH_FEN);
++      UART_PUT_CR(port, UART01x_CR_UARTEN);
++
++      return 0;
++}
++
++static void kgdb_serial_putchar(int ch)
++{
++      unsigned int status;
++
++      do {
++              status = UART_GET_FR(port);
++      } while (!UART_TX_READY(status));
++
++      UART_PUT_CHAR(port, ch);
++}
++
++static int kgdb_serial_getchar(void)
++{
++      unsigned int status;
++      int ch;
++
++      do {
++              status = UART_GET_FR(port);
++      } while (!UART_RX_DATA(status));
++      ch = UART_GET_CHAR(port);
++      return ch;
++}
++
++static struct uart_port kgdb_amba_port = {
++      .irq = 12,
++      .iobase = 0,
++      .iotype = UPIO_MEM,
++      .membase = (unsigned char *)IO_ADDRESS(0x101F1000),
++};
++
++static irqreturn_t kgdb_interrupt(int irq, void *dev_id, struct pt_regs *regs)
++{
++      int status = UART_GET_MIS(port);
++
++      if (irq != kgdb_irq)
++              return IRQ_NONE;
++
++      if (status & 0x40)
++              breakpoint();
++
++      return IRQ_HANDLED;
++}
++
++static void __init kgdb_hookup_irq(void)
++{
++      request_irq(kgdb_irq, kgdb_interrupt, SA_SHIRQ, "GDB-stub",
++                  &kgdb_amba_port);
++}
++
++struct kgdb_io kgdb_io_ops = {
++      .init = kgdb_serial_init,
++      .write_char = kgdb_serial_putchar,
++      .read_char = kgdb_serial_getchar,
++      .late_init = kgdb_hookup_irq,
++};
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mm/extable.c linux-2.6.18.kgdb/arch/arm/mm/extable.c
+--- linux-2.6.18/arch/arm/mm/extable.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/arm/mm/extable.c    2008-06-10 16:19:51.000000000 +0400
+@@ -2,6 +2,7 @@
+  *  linux/arch/arm/mm/extable.c
+  */
+ #include <linux/module.h>
++#include <linux/kgdb.h>
+ #include <asm/uaccess.h>
+ 
+ int fixup_exception(struct pt_regs *regs)
+@@ -11,6 +12,12 @@ int fixup_exception(struct pt_regs *regs
+       fixup = search_exception_tables(instruction_pointer(regs));
+       if (fixup)
+               regs->ARM_pc = fixup->fixup;
++#ifdef CONFIG_KGDB
++      if (atomic_read(&debugger_active) && kgdb_may_fault)
++              /* Restore our previous state. */
++              kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++              /* Not reached. */
++#endif
+ 
+       return fixup != NULL;
+ }
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/Makefile linux-2.6.18.kgdb/arch/i386/kernel/Makefile
+--- linux-2.6.18/arch/i386/kernel/Makefile     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/i386/kernel/Makefile        2008-06-10 16:19:17.000000000 +0400
+@@ -39,6 +39,7 @@ obj-$(CONFIG_VM86)           += vm86.o
+ obj-$(CONFIG_EARLY_PRINTK)    += early_printk.o
+ obj-$(CONFIG_HPET_TIMER)      += hpet.o
+ obj-$(CONFIG_K8_NB)           += k8.o
++obj-$(CONFIG_KGDB)            += kgdb.o kgdb-jmp.o
+ 
+ EXTRA_AFLAGS   := -traditional
+ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/entry.S linux-2.6.18.kgdb/arch/i386/kernel/entry.S
+--- linux-2.6.18/arch/i386/kernel/entry.S      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/i386/kernel/entry.S 2008-06-10 16:19:58.000000000 +0400
+@@ -201,7 +201,7 @@ VM_MASK            = 0x00020000
+       CFI_OFFSET ecx, ECX-OLDESP;\
+       CFI_OFFSET ebx, EBX-OLDESP
+ 
+-ENTRY(ret_from_fork)
++KPROBE_ENTRY(ret_from_fork)
+       CFI_STARTPROC
+       pushl %eax
+       CFI_ADJUST_CFA_OFFSET 4
+@@ -664,7 +664,7 @@ ENTRY(simd_coprocessor_error)
+       jmp error_code
+       CFI_ENDPROC
+ 
+-ENTRY(device_not_available)
++KPROBE_ENTRY(device_not_available)
+       RING0_INT_FRAME
+       pushl $-1                       # mark this as an int
+       CFI_ADJUST_CFA_OFFSET 4
+@@ -909,7 +909,7 @@ ENTRY(machine_check)
+       CFI_ENDPROC
+ #endif
+ 
+-ENTRY(spurious_interrupt_bug)
++KPROBE_ENTRY(spurious_interrupt_bug)
+       RING0_INT_FRAME
+       pushl $0
+       CFI_ADJUST_CFA_OFFSET 4
+@@ -953,3 +953,108 @@ ENDPROC(arch_unwind_init_running)
+ #include "syscall_table.S"
+ 
+ syscall_table_size=(.-sys_call_table)
++
++#     Here we do call frames.  We cheat a bit as we only really need
++#     correct frames at locations we can actually look at from a
++#     debugger.  Since the break instruction trap actually goes thru
++#     some of this code, we don't really need info on those areas, but
++#     only after the fact.  I.e. if we can not step or break in a
++#     location or end up with a return address pointing at the
++#     location, we don't need a correct call frame for it.
++
++#ifdef CONFIG_KGDB
++
++#include <linux/dwarf2-lang.h>
++/*
++ * The register numbers as known by gdb
++ */
++
++#define _EAX 0
++#define _ECX 1
++#define _EDX 2
++#define _EBX 3
++#define _ESP 4
++#define _EBP 5
++#define _ESI 6
++#define _EDI 7
++#define _PC  8
++#define _EIP 8
++#define _PS  9
++#define _EFLAGS  9
++#define _CS 10
++#define _SS 11
++#define _DS 12
++#define _ES 13
++#define _FS 14
++#define _GS 15
++      /*
++       * This code uses macros defined in linux/dwarf2-lang.h
++       * They attempt to follow the dwarf2 naming conventions... sort of..
++       */
++ENTRY(end_of_stack_stop_unwind_function)
++      .long   end_of_stack_stop_unwind_function+1
++
++      .text
++
++      CFI_preamble(c1,_PC,1,1)
++      CFA_define_reference(_ESP,OLDESP)       /* Stack pointer */
++      CFA_expression(_EIP)
++         CFA_exp_OP_dup                       /* copy old esp */
++         CFA_exp_OP_consts(CS-OLDESP)         /* offset to CS address */
++         CFA_exp_OP_plus                      /* should be CS address */
++         CFA_exp_OP_deref                     /* get the CS */
++         CFA_exp_OP_const4s(VM_MASK|3)        /* prepare to mask it */
++         CFA_exp_OP_and                       /* mask it, zero means kernel */
++         CFA_exp_OP_bra(eip_user_rtn)         /* branch if user */
++         CFA_exp_OP_const4s(EIP-OLDESP)       /* offset to return address */
++         CFA_exp_OP_plus                      /* add that in */
++         CFA_exp_OP_skip(eip_end)             /* done if kernel, skip out */
++eip_user_rtn:
++         CFA_exp_OP_addr(end_of_stack_stop_unwind_function)/*dummy function */
++eip_end:
++         CFA_expression_end
++      CFA_define_offset(_EBX,EBX-OLDESP)
++      CFA_define_offset(_ECX,ECX-OLDESP)
++      CFA_define_offset(_EDX,EDX-OLDESP)
++      CFA_define_offset(_ESI,ESI-OLDESP)
++      CFA_define_offset(_EDI,EDI-OLDESP)
++      CFA_define_offset(_EBP,EBP-OLDESP)
++      CFA_define_offset(_EAX,EAX-OLDESP)
++      CFA_define_offset(_EFLAGS,EFLAGS-OLDESP)
++      CFI_postamble()
++
++/*
++ * This provides an unwind for our dummy end-of-unwind function.
++ * Current convention is to provide an undefined return address.
++ */
++      CFI_preamble(c2,_PC,1,1)
++      CFA_define_reference(_ESP,0)    /* Stack pointer */
++      CFA_undefine_reg(_EIP)
++      CFI_postamble()
++
++      FDE_preamble(c2,end_of_stack_stop_unwind_function,      \
++                      end_of_stack_stop_unwind_function+5)
++      FDE_postamble()
++      /*
++         * This is VERY sloppy.  At this point all we want to do is get
++         * the frame right for back tracing.  It will not be good if
++         * you try to single step.  We use already defined labels.
++         * We want to cover all call outs.
++         * We could also recode this as just one FDE, but this works and
++         * I want to get it out.
++       */
++      FDE_preamble(c1,ret_from_fork,ret_from_exception)
++      CFA_define_cfa_offset(4)                /* one extra word on stack */
++      FDE_postamble()
++
++      FDE_preamble(c1,ret_from_exception,device_not_available_emulate)
++      FDE_postamble()
++
++              FDE_preamble(c1,device_not_available_emulate,debug)
++      CFA_define_cfa_offset(4)                /* one extra word on stack */
++      FDE_postamble()
++
++      FDE_preamble(c1, debug,spurious_interrupt_bug)
++      FDE_postamble()
++
++#endif
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/head.S linux-2.6.18.kgdb/arch/i386/kernel/head.S
+--- linux-2.6.18/arch/i386/kernel/head.S       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/i386/kernel/head.S  2008-06-10 16:19:58.000000000 +0400
+@@ -10,6 +10,7 @@
+ .text
+ #include <linux/threads.h>
+ #include <linux/linkage.h>
++#include <asm/kgdb.h>
+ #include <asm/segment.h>
+ #include <asm/page.h>
+ #include <asm/pgtable.h>
+@@ -326,6 +327,10 @@ is386:    movl $2,%ecx            # set MP
+ #endif /* CONFIG_SMP */
+       jmp start_kernel
+ 
++      /* This dwarf code tells gdb that this is the end of the unwind */
++      /* This uses the CFA set up for pc=1 located in entry.S */
++      CFI_END_FRAME(is386)
++
+ /*
+  * We depend on ET to be correct. This checks for 287/387.
+  */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/kgdb-jmp.S linux-2.6.18.kgdb/arch/i386/kernel/kgdb-jmp.S
+--- linux-2.6.18/arch/i386/kernel/kgdb-jmp.S   1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/i386/kernel/kgdb-jmp.S      2008-06-10 16:19:17.000000000 +0400
+@@ -0,0 +1,74 @@
++/*
++ * arch/i386/kernel/kgdb-jmp.S
++ *
++ * Save and restore system registers so that within a limited frame we
++ * may have a fault and "jump back" to a known safe location.
++ *
++ * Author: George Anzinger <george@mvista.com>
++ *
++ * Cribbed from glibc, which carries the following:
++ * Copyright (C) 1996, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.
++ * Copyright (C) 2005 by MontaVista Software.
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of
++ * any kind, whether express or implied.
++ */
++
++#include <linux/linkage.h>
++
++#define PCOFF         0
++#define LINKAGE               4               /* just the return address */
++#define PTR_SIZE      4
++#define PARMS         LINKAGE         /* no space for saved regs */
++#define JMPBUF                PARMS
++#define VAL           JMPBUF+PTR_SIZE
++
++#define JB_BX         0
++#define JB_SI         1
++#define JB_DI         2
++#define JB_BP         3
++#define JB_SP         4
++#define JB_PC         5
++
++/* This must be called prior to kgdb_fault_longjmp and
++ * kgdb_fault_longjmp must not be called outside of the context of the
++ * last call to kgdb_fault_setjmp.
++ * kgdb_fault_setjmp(int *jmp_buf[6])
++ */
++ENTRY(kgdb_fault_setjmp)
++      movl JMPBUF(%esp), %eax
++
++      /* Save registers.  */
++      movl    %ebx, (JB_BX*4)(%eax)
++      movl    %esi, (JB_SI*4)(%eax)
++      movl    %edi, (JB_DI*4)(%eax)
++      /* Save SP as it will be after we return.  */
++      leal    JMPBUF(%esp), %ecx
++      movl    %ecx, (JB_SP*4)(%eax)
++      movl    PCOFF(%esp), %ecx       /* Save PC we are returning to now.  */
++      movl    %ecx, (JB_PC*4)(%eax)
++      movl    %ebp, (JB_BP*4)(%eax)   /* Save caller's frame pointer.  */
++
++      /* Restore state so we can now try the access. */
++      movl    JMPBUF(%esp), %ecx      /* User's jmp_buf in %ecx.  */
++      /* Save the return address now.  */
++      movl    (JB_PC*4)(%ecx), %edx
++      /* Restore registers.  */
++      movl    $0, %eax
++      movl    (JB_SP*4)(%ecx), %esp
++      jmp     *%edx           /* Jump to saved PC. */
++
++/* kgdb_fault_longjmp(int *jmp_buf[6]) */
++ENTRY(kgdb_fault_longjmp)
++      movl    JMPBUF(%esp), %ecx      /* User's jmp_buf in %ecx.  */
++      /* Save the return address now.  */
++      movl    (JB_PC*4)(%ecx), %edx
++      /* Restore registers.  */
++      movl    (JB_BX*4)(%ecx), %ebx
++      movl    (JB_SI*4)(%ecx), %esi
++      movl    (JB_DI*4)(%ecx), %edi
++      movl    (JB_BP*4)(%ecx), %ebp
++      movl    $1, %eax
++      movl    (JB_SP*4)(%ecx), %esp
++      jmp     *%edx           /* Jump to saved PC. */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/kgdb.c linux-2.6.18.kgdb/arch/i386/kernel/kgdb.c
+--- linux-2.6.18/arch/i386/kernel/kgdb.c       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/i386/kernel/kgdb.c  2008-06-10 16:20:15.000000000 +0400
+@@ -0,0 +1,363 @@
++/*
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License as published by the
++ * Free Software Foundation; either version 2, or (at your option) any
++ * later version.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * General Public License for more details.
++ *
++ */
++
++/*
++ * Copyright (C) 2000-2001 VERITAS Software Corporation.
++ */
++/*
++ *  Contributor:     Lake Stevens Instrument Division$
++ *  Written by:      Glenn Engel $
++ *  Updated by:            Amit Kale<akale@veritas.com>
++ *  Updated by:            Tom Rini <trini@kernel.crashing.org>
++ *  Modified for 386 by Jim Kingdon, Cygnus Support.
++ *  Original kgdb, compatibility with 2.1.xx kernel by
++ *  David Grothe <dave@gcom.com>
++ *  Additional support from Tigran Aivazian <tigran@sco.com>
++ */
++
++#include <linux/string.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/smp.h>
++#include <linux/spinlock.h>
++#include <linux/delay.h>
++#include <asm/vm86.h>
++#include <asm/system.h>
++#include <asm/ptrace.h>               /* for linux pt_regs struct */
++#include <linux/kgdb.h>
++#include <linux/init.h>
++#include <asm/apicdef.h>
++#include <asm/desc.h>
++#include <asm/kdebug.h>
++
++#include "mach_ipi.h"
++
++/* Put the error code here just in case the user cares.  */
++int gdb_i386errcode;
++/* Likewise, the vector number here (since GDB only gets the signal
++   number through the usual means, and that's not very specific).  */
++int gdb_i386vector = -1;
++
++extern atomic_t cpu_doing_single_step;
++
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      gdb_regs[_EAX] = regs->eax;
++      gdb_regs[_EBX] = regs->ebx;
++      gdb_regs[_ECX] = regs->ecx;
++      gdb_regs[_EDX] = regs->edx;
++      gdb_regs[_ESI] = regs->esi;
++      gdb_regs[_EDI] = regs->edi;
++      gdb_regs[_EBP] = regs->ebp;
++      gdb_regs[_DS] = regs->xds;
++      gdb_regs[_ES] = regs->xes;
++      gdb_regs[_PS] = regs->eflags;
++      gdb_regs[_CS] = regs->xcs;
++      gdb_regs[_PC] = regs->eip;
++      gdb_regs[_ESP] = (int)(&regs->esp);
++      gdb_regs[_SS] = __KERNEL_DS;
++      gdb_regs[_FS] = 0xFFFF;
++      gdb_regs[_GS] = 0xFFFF;
++}
++
++/*
++ * Extracts ebp, esp and eip values understandable by gdb from the values
++ * saved by switch_to.
++ * thread.esp points to ebp. flags and ebp are pushed in switch_to hence esp
++ * prior to entering switch_to is 8 greater than the value that is saved.
++ * If switch_to changes, change following code appropriately.
++ */
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
++{
++      gdb_regs[_EAX] = 0;
++      gdb_regs[_EBX] = 0;
++      gdb_regs[_ECX] = 0;
++      gdb_regs[_EDX] = 0;
++      gdb_regs[_ESI] = 0;
++      gdb_regs[_EDI] = 0;
++      gdb_regs[_EBP] = *(unsigned long *)p->thread.esp;
++      gdb_regs[_DS] = __KERNEL_DS;
++      gdb_regs[_ES] = __KERNEL_DS;
++      gdb_regs[_PS] = 0;
++      gdb_regs[_CS] = __KERNEL_CS;
++      gdb_regs[_PC] = p->thread.eip;
++      gdb_regs[_ESP] = p->thread.esp;
++      gdb_regs[_SS] = __KERNEL_DS;
++      gdb_regs[_FS] = 0xFFFF;
++      gdb_regs[_GS] = 0xFFFF;
++}
++
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      regs->eax = gdb_regs[_EAX];
++      regs->ebx = gdb_regs[_EBX];
++      regs->ecx = gdb_regs[_ECX];
++      regs->edx = gdb_regs[_EDX];
++      regs->esi = gdb_regs[_ESI];
++      regs->edi = gdb_regs[_EDI];
++      regs->ebp = gdb_regs[_EBP];
++      regs->xds = gdb_regs[_DS];
++      regs->xes = gdb_regs[_ES];
++      regs->eflags = gdb_regs[_PS];
++      regs->xcs = gdb_regs[_CS];
++      regs->eip = gdb_regs[_PC];
++}
++
++static struct hw_breakpoint {
++      unsigned enabled;
++      unsigned type;
++      unsigned len;
++      unsigned addr;
++} breakinfo[4] = {
++      { .enabled = 0 },
++      { .enabled = 0 },
++      { .enabled = 0 },
++      { .enabled = 0 },
++};
++
++/*
++ * Reconcile the hardware debug registers with breakinfo[]: arm every slot
++ * that is enabled in breakinfo[] but not yet set in DR7, and disarm every
++ * slot that is set in DR7 but no longer enabled.  DR7 is written back only
++ * when something changed.  All four slots (0-3) are examined.
++ */
++void kgdb_correct_hw_break(void)
++{
++      int breakno;
++      int correctit;
++      int breakbit;
++      unsigned dr7;
++
++      asm volatile ("movl %%db7, %0\n":"=r" (dr7)
++                    :);
++      correctit = 0;
++      for (breakno = 0; breakno < 4; breakno++) {
++              /* DR7 enable bit tested for this slot: bit 1, 3, 5 or 7. */
++              breakbit = 2 << (breakno << 1);
++              if (!(dr7 & breakbit) && breakinfo[breakno].enabled) {
++                      correctit = 1;
++                      dr7 |= breakbit;
++                      /* Clear the slot's 4-bit field, then store len/type. */
++                      dr7 &= ~(0xf0000U << (breakno << 2));
++                      dr7 |= (((breakinfo[breakno].len << 2) |
++                               breakinfo[breakno].type) << 16) <<
++                          (breakno << 2);
++                      switch (breakno) {
++                      case 0:
++                              asm volatile ("movl %0, %%dr0\n"::"r"
++                                            (breakinfo[breakno].addr));
++                              break;
++
++                      case 1:
++                              asm volatile ("movl %0, %%dr1\n"::"r"
++                                            (breakinfo[breakno].addr));
++                              break;
++
++                      case 2:
++                              asm volatile ("movl %0, %%dr2\n"::"r"
++                                            (breakinfo[breakno].addr));
++                              break;
++
++                      case 3:
++                              asm volatile ("movl %0, %%dr3\n"::"r"
++                                            (breakinfo[breakno].addr));
++                              break;
++                      }
++              } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) {
++                      correctit = 1;
++                      dr7 &= ~breakbit;
++                      dr7 &= ~(0xf0000U << (breakno << 2));
++              }
++      }
++      if (correctit)
++              asm volatile ("movl %0, %%db7\n"::"r" (dr7));
++}
++
++int kgdb_remove_hw_break(unsigned long addr)   /* 0 on success, -1 if no enabled slot holds addr */
++{
++      int i, idx = -1;
++      for (i = 0; i < 4; i++) {
++              if (breakinfo[i].addr == addr && breakinfo[i].enabled) {
++                      idx = i;
++                      break;
++              }
++      }
++      if (idx == -1)
++              return -1;
++
++      breakinfo[idx].enabled = 0;     /* only marks the slot free; no debug register is written here */
++      return 0;
++}
++
++void kgdb_remove_all_hw_break(void)    /* reset every breakinfo[] slot to all-zero */
++{
++      int i;
++
++      for (i = 0; i < 4; i++) {
++              if (breakinfo[i].enabled) {
++                      /* Do what? (no per-slot teardown is performed for enabled slots) */
++                      ;
++              }
++              memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); /* clears enabled, type, len, addr */
++      }
++}
++
++int kgdb_set_hw_break(unsigned long addr)      /* 0 on success, -1 if all four slots are in use */
++{
++      int i, idx = -1;
++      for (i = 0; i < 4; i++) {
++              if (!breakinfo[i].enabled) {    /* take the first free slot */
++                      idx = i;
++                      break;
++              }
++      }
++      if (idx == -1)
++              return -1;
++
++      breakinfo[idx].enabled = 1;
++      breakinfo[idx].type = 1;        /* NOTE(review): type and len are always 1; confirm intended dr7 encoding */
++      breakinfo[idx].len = 1;
++      breakinfo[idx].addr = addr;     /* unsigned long stored into an unsigned field */
++      return 0;
++}
++
++void kgdb_disable_hw_debug(struct pt_regs *regs)
++{
++      /* Disable hardware debugging while we are in kgdb */
++      asm volatile ("movl %0,%%db7": /* no output */ :"r" (0));
++}
++
++void kgdb_post_master_code(struct pt_regs *regs, int e_vector, int err_code)
++{
++      /* Master processor is completely in the debugger */
++      gdb_i386vector = e_vector;
++      gdb_i386errcode = err_code;
++}
++
++void kgdb_roundup_cpus(unsigned long flags)
++{
++      send_IPI_allbutself(APIC_DM_NMI);
++}
++
++int kgdb_arch_handle_exception(int e_vector, int signo,        /* handles 'c' and 's' commands: 0 if handled, else -1 */
++                             int err_code, char *remcom_in_buffer,
++                             char *remcom_out_buffer,
++                             struct pt_regs *linux_regs)
++{
++      long addr;
++      char *ptr;
++      int newPC, dr6;         /* newPC is assigned below but never read */
++
++      switch (remcom_in_buffer[0]) {
++      case 'c':
++      case 's':
++              /* try to read optional parameter, pc unchanged if no parm */
++              ptr = &remcom_in_buffer[1];
++              if (kgdb_hex2long(&ptr, &addr))
++                      linux_regs->eip = addr;
++              newPC = linux_regs->eip;
++
++              /* clear the trace bit */
++              linux_regs->eflags &= ~TF_MASK;
++              atomic_set(&cpu_doing_single_step, -1);
++
++              /* set the trace bit if we're stepping */
++              if (remcom_in_buffer[0] == 's') {
++                      linux_regs->eflags |= TF_MASK;
++                      debugger_step = 1;
++                      atomic_set(&cpu_doing_single_step,smp_processor_id());
++              }
++
++              asm volatile ("movl %%db6, %0\n":"=r" (dr6));
++              if (!(dr6 & 0x4000)) {  /* NOTE(review): 0x4000 looks like the DR6 single-step flag - confirm */
++                      long breakno;
++                      for (breakno = 0; breakno < 4; ++breakno) {
++                              if (dr6 & (1 << breakno) &&
++                                  breakinfo[breakno].type == 0) {
++                                      /* Set restore flag */
++                                      linux_regs->eflags |= X86_EFLAGS_RF;
++                                      break;
++                              }
++                      }
++              }
++              kgdb_correct_hw_break();        /* reprogram the debug registers from breakinfo[] */
++              asm volatile ("movl %0, %%db6\n"::"r" (0));     /* clear dr6 */
++
++              return (0);
++      }                       /* switch */
++      /* this means that we do not want to exit from the handler */
++      return -1;
++}
++
++/* Register KGDB with the i386die_chain so that we hook into all of the right
++ * spots. */
++static int kgdb_notify(struct notifier_block *self, unsigned long cmd,
++                     void *ptr)
++{
++      struct die_args *args = ptr;
++      struct pt_regs *regs = args->regs;
++
++      /* Bad memory access? */
++      if (cmd == DIE_PAGE_FAULT_NO_CONTEXT && atomic_read(&debugger_active)
++                      && kgdb_may_fault) {
++              kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++              return NOTIFY_STOP;
++      } else if (cmd == DIE_PAGE_FAULT)
++              /* A normal page fault, ignore. */
++              return NOTIFY_DONE;
++       else if ((cmd == DIE_NMI || cmd == DIE_NMI_IPI ||
++               cmd == DIE_NMIWATCHDOG) && atomic_read(&debugger_active)) {
++               /* CPU roundup */
++               kgdb_nmihook(smp_processor_id(), regs);
++               return NOTIFY_STOP;
++       } else if (cmd == DIE_NMI_IPI || cmd == DIE_NMI || user_mode(regs) ||
++                       (cmd == DIE_DEBUG && atomic_read(&debugger_active)))
++               /* Normal watchdog event or userspace debugging, or spurious
++                * debug exception, ignore. */
++               return NOTIFY_DONE;
++
++      kgdb_handle_exception(args->trapnr, args->signr, args->err, regs);
++
++      return NOTIFY_STOP;
++}
++
++static struct notifier_block kgdb_notifier = {
++      .notifier_call = kgdb_notify,
++};
++
++int kgdb_arch_init(void)
++{
++      atomic_notifier_chain_register(&i386die_chain, &kgdb_notifier);
++      return 0;
++}
++
++/*
++ * Skip an int3 exception when it occurs after a breakpoint has been
++ * removed. Backtrack eip by 1 since the int3 would have caused it to
++ * increment by 1.
++ */
++
++int kgdb_skipexception(int exception, struct pt_regs *regs)
++{
++      if (exception == 3 && kgdb_isremovedbreak(regs->eip - 1)) {
++              regs->eip -= 1;
++              return 1;
++      }
++      return 0;
++}
++
++struct kgdb_arch arch_kgdb_ops = {
++      .gdb_bpt_instr = {0xcc},
++      .flags = KGDB_HW_BREAKPOINT,
++};
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/process.c linux-2.6.18.kgdb/arch/i386/kernel/process.c
+--- linux-2.6.18/arch/i386/kernel/process.c    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/i386/kernel/process.c       2008-06-10 16:19:58.000000000 +0400
+@@ -328,7 +328,27 @@ __asm__(".section .text\n"
+       "call *%ebx\n\t"
+       "pushl %eax\n\t"
+       "call do_exit\n"
++      "kernel_thread_helper_end:\n\t"
+       ".previous");
++#ifdef CONFIG_KGDB
++#include <linux/dwarf2-lang.h>
++
++      /* This dwarf code tells gdb that this is the end of the unwind */
++      /* This uses the CFA set up for pc=1 located in entry.S */
++#define _ESP 4
++#define _PC  8
++#define _EIP 8
++__asm__(
++      QUOTE_THIS(
++              CFI_preamble(dwarf_4,_PC,1,1)
++              CFA_define_reference(_ESP,0)    /* Stack pointer */
++              CFA_undefine_reg(_EIP)
++              CFI_postamble()
++
++              FDE_preamble(dwarf_4,kernel_thread_helper,kernel_thread_helper_end)
++              FDE_postamble()
++              ));
++#endif
+ 
+ /*
+  * Create a kernel thread
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/setup.c linux-2.6.18.kgdb/arch/i386/kernel/setup.c
+--- linux-2.6.18/arch/i386/kernel/setup.c      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/i386/kernel/setup.c 2008-06-10 16:19:17.000000000 +0400
+@@ -148,6 +148,7 @@ EXPORT_SYMBOL(ist_info);
+ struct e820map e820;
+ 
+ extern void early_cpu_init(void);
++extern void early_trap_init(void);
+ extern void generic_apic_probe(char *);
+ extern int root_mountflags;
+ 
+@@ -1444,6 +1445,7 @@ void __init setup_arch(char **cmdline_p)
+       memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+       pre_setup_arch_hook();
+       early_cpu_init();
++      early_trap_init();
+ 
+       /*
+        * FIXME: This isn't an official loader_type right
+@@ -1500,6 +1502,7 @@ void __init setup_arch(char **cmdline_p)
+       data_resource.end = virt_to_phys(_edata)-1;
+ 
+       parse_cmdline_early(cmdline_p);
++      parse_early_param();
+ 
+ #ifdef CONFIG_EARLY_PRINTK
+       {
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/smpboot.c linux-2.6.18.kgdb/arch/i386/kernel/smpboot.c
+--- linux-2.6.18/arch/i386/kernel/smpboot.c    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/i386/kernel/smpboot.c       2008-06-10 16:19:58.000000000 +0400
+@@ -592,6 +592,9 @@ void __devinit initialize_secondary(void
+ 
+       asm volatile(
+               "movl %0,%%esp\n\t"
++#ifdef CONFIG_KGDB
++              "pushl end_of_stack_stop_unwind_function\n\t"
++#endif
+               "jmp *%1"
+               :
+               :"r" (current->thread.esp),"r" (current->thread.eip));
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/traps.c linux-2.6.18.kgdb/arch/i386/kernel/traps.c
+--- linux-2.6.18/arch/i386/kernel/traps.c      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/i386/kernel/traps.c 2008-06-10 16:19:17.000000000 +0400
+@@ -863,6 +863,7 @@ fastcall void __kprobes do_debug(struct 
+        */
+ clear_dr7:
+       set_debugreg(0, 7);
++      notify_die(DIE_DEBUG, "debug2", regs, condition, error_code, SIGTRAP);
+       return;
+ 
+ debug_vm86:
+@@ -1167,6 +1168,12 @@ static void __init set_task_gate(unsigne
+       _set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
+ }
+ 
++/* Some traps need to be set early. */
++void __init early_trap_init(void) {
++      set_intr_gate(1,&debug);
++      set_system_intr_gate(3, &int3); /* int3 can be called from all */
++      set_intr_gate(14,&page_fault);
++}
+ 
+ void __init trap_init(void)
+ {
+@@ -1183,10 +1190,8 @@ void __init trap_init(void)
+ #endif
+ 
+       set_trap_gate(0,&divide_error);
+-      set_intr_gate(1,&debug);
+       set_intr_gate(2,&nmi);
+-      set_system_intr_gate(3, &int3); /* int3/4 can be called from all */
+-      set_system_gate(4,&overflow);
++      set_system_gate(4,&overflow); /* int4/5 can be called from all */
+       set_trap_gate(5,&bounds);
+       set_trap_gate(6,&invalid_op);
+       set_trap_gate(7,&device_not_available);
+@@ -1196,7 +1201,6 @@ void __init trap_init(void)
+       set_trap_gate(11,&segment_not_present);
+       set_trap_gate(12,&stack_segment);
+       set_trap_gate(13,&general_protection);
+-      set_intr_gate(14,&page_fault);
+       set_trap_gate(15,&spurious_interrupt_bug);
+       set_trap_gate(16,&coprocessor_error);
+       set_trap_gate(17,&alignment_check);
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/mm/fault.c linux-2.6.18.kgdb/arch/i386/mm/fault.c
+--- linux-2.6.18/arch/i386/mm/fault.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/i386/mm/fault.c     2008-06-10 16:19:17.000000000 +0400
+@@ -539,6 +539,10 @@ no_context:
+       if (is_prefetch(regs, address, error_code))
+               return;
+ 
++      if (notify_die(DIE_PAGE_FAULT_NO_CONTEXT, "no context", regs,
++                              error_code, 14, SIGSEGV) == NOTIFY_STOP)
++              return;
++
+ /*
+  * Oops. The kernel tried to access some bad page. We'll have to
+  * terminate things with extreme prejudice.
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/Makefile linux-2.6.18.kgdb/arch/ia64/kernel/Makefile
+--- linux-2.6.18/arch/ia64/kernel/Makefile     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ia64/kernel/Makefile        2008-06-10 16:19:32.000000000 +0400
+@@ -31,6 +31,7 @@ obj-$(CONFIG_KPROBES)                += kprobes.o jpro
+ obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o
+ obj-$(CONFIG_AUDIT)           += audit.o
+ mca_recovery-y                        += mca_drv.o mca_drv_asm.o
++obj-$(CONFIG_KGDB)            += kgdb.o kgdb-jmp.o
+ 
+ # The gate DSO image is built using a special linker script.
+ targets += gate.so gate-syms.o
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/entry.S linux-2.6.18.kgdb/arch/ia64/kernel/entry.S
+--- linux-2.6.18/arch/ia64/kernel/entry.S      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ia64/kernel/entry.S 2008-06-10 16:20:23.000000000 +0400
+@@ -953,9 +953,9 @@ GLOBAL_ENTRY(ia64_leave_kernel)
+       shr.u r18=r19,16        // get byte size of existing "dirty" partition
+       ;;
+       mov r16=ar.bsp          // get existing backing store pointer
+-      addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
++(pUStk)       addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
+       ;;
+-      ld4 r17=[r17]           // r17 = cpu_data->phys_stacked_size_p8
++(pUStk)       ld4 r17=[r17]           // r17 = cpu_data->phys_stacked_size_p8
+ (pKStk)       br.cond.dpnt skip_rbs_switch
+ 
+       /*
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/ivt.S linux-2.6.18.kgdb/arch/ia64/kernel/ivt.S
+--- linux-2.6.18/arch/ia64/kernel/ivt.S        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ia64/kernel/ivt.S   2008-06-10 16:20:23.000000000 +0400
+@@ -52,6 +52,14 @@
+ #include <asm/unistd.h>
+ #include <asm/errno.h>
+ 
++#ifdef CONFIG_KGDB
++#define KGDB_ENABLE_PSR_DB mov r31=psr;; movl r30=IA64_PSR_DB;;       \
++      or r31=r31,r30;;                                        \
++      mov psr.l=r31;; srlz.i;;
++#else
++#define KGDB_ENABLE_PSR_DB
++#endif
++
+ #if 1
+ # define PSR_DEFAULT_BITS     psr.ac
+ #else
+@@ -519,6 +527,7 @@ ENTRY(page_fault)
+       movl r14=ia64_leave_kernel
+       ;;
+       SAVE_REST
++      KGDB_ENABLE_PSR_DB
+       mov rp=r14
+       ;;
+       adds out2=16,r12                        // out2 = pointer to pt_regs
+@@ -863,6 +872,7 @@ ENTRY(interrupt)
+       srlz.i                  // ensure everybody knows psr.ic is back on
+       ;;
+       SAVE_REST
++      KGDB_ENABLE_PSR_DB
+       ;;
+       MCA_RECOVER_RANGE(interrupt)
+       alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group
+@@ -1110,6 +1120,7 @@ ENTRY(non_syscall)
+       movl r15=ia64_leave_kernel
+       ;;
+       SAVE_REST
++      KGDB_ENABLE_PSR_DB
+       mov rp=r15
+       ;;
+       br.call.sptk.many b6=ia64_bad_break     // avoid WAW on CFM and ignore return addr
+@@ -1143,6 +1154,7 @@ ENTRY(dispatch_unaligned_handler)
+       adds r3=8,r2                            // set up second base pointer
+       ;;
+       SAVE_REST
++      KGDB_ENABLE_PSR_DB
+       movl r14=ia64_leave_kernel
+       ;;
+       mov rp=r14
+@@ -1185,6 +1197,10 @@ ENTRY(dispatch_to_fault_handler)
+       adds r3=8,r2                            // set up second base pointer for SAVE_REST
+       ;;
+       SAVE_REST
++      cmp.eq p6,p0=29,out0
++(p6)  br.cond.spnt 1f;;                       // debug_vector
++      KGDB_ENABLE_PSR_DB
++1:
+       movl r14=ia64_leave_kernel
+       ;;
+       mov rp=r14
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/kgdb-jmp.S linux-2.6.18.kgdb/arch/ia64/kernel/kgdb-jmp.S
+--- linux-2.6.18/arch/ia64/kernel/kgdb-jmp.S   1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/ia64/kernel/kgdb-jmp.S      2008-06-10 16:19:32.000000000 +0400
+@@ -0,0 +1,238 @@
++/* setjmp() and longjmp() assembler support for kdb on ia64.
++
++   This code was copied from glibc CVS as of 2001-06-27 and modified where
++   necessary to fit the kernel.
++   Keith Owens <kaos@melbourne.sgi.com> 2001-06-27
++ */
++
++/* Copyright (C) 1999, 2000, 2001 Free Software Foundation, Inc.
++   Contributed by David Mosberger-Tang <davidm@hpl.hp.com>.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Library General Public License as
++   published by the Free Software Foundation; either version 2 of the
++   License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Library General Public License for more details.
++
++   You should have received a copy of the GNU Library General Public
++   License along with the GNU C Library; see the file COPYING.LIB.  If
++   not, write to the Free Software Foundation, Inc.,
++   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
++*/
++
++#include <asm/asmmacro.h>
++GLOBAL_ENTRY(kgdb_fault_setjmp)
++      .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2)
++      alloc loc1=ar.pfs,2,2,2,0
++      mov r16=ar.unat
++      ;;
++      mov r17=ar.fpsr
++      mov r2=in0
++      add r3=8,in0
++      ;;
++.mem.offset 0,0;
++      st8.spill.nta [r2]=sp,16        // r12 (sp)
++.mem.offset 8,0;
++      st8.spill.nta [r3]=gp,16        // r1 (gp)
++      ;;
++      st8.nta [r2]=r16,16             // save caller's unat
++      st8.nta [r3]=r17,16             // save fpsr
++      add r8=0xa0,in0
++      ;;
++.mem.offset 160,0;
++      st8.spill.nta [r2]=r4,16        // r4
++.mem.offset 168,0;
++      st8.spill.nta [r3]=r5,16        // r5
++      add r9=0xb0,in0
++      ;;
++      stf.spill.nta [r8]=f2,32
++      stf.spill.nta [r9]=f3,32
++      mov loc0=rp
++      .body
++      ;;
++      stf.spill.nta [r8]=f4,32
++      stf.spill.nta [r9]=f5,32
++      mov r17=b1
++      ;;
++      stf.spill.nta [r8]=f16,32
++      stf.spill.nta [r9]=f17,32
++      mov r18=b2
++      ;;
++      stf.spill.nta [r8]=f18,32
++      stf.spill.nta [r9]=f19,32
++      mov r19=b3
++      ;;
++      stf.spill.nta [r8]=f20,32
++      stf.spill.nta [r9]=f21,32
++      mov r20=b4
++      ;;
++      stf.spill.nta [r8]=f22,32
++      stf.spill.nta [r9]=f23,32
++      mov r21=b5
++      ;;
++      stf.spill.nta [r8]=f24,32
++      stf.spill.nta [r9]=f25,32
++      mov r22=ar.lc
++      ;;
++      stf.spill.nta [r8]=f26,32
++      stf.spill.nta [r9]=f27,32
++      mov r24=pr
++      ;;
++      stf.spill.nta [r8]=f28,32
++      stf.spill.nta [r9]=f29,32
++      ;;
++      stf.spill.nta [r8]=f30
++      stf.spill.nta [r9]=f31
++
++.mem.offset 0,0;
++      st8.spill.nta [r2]=r6,16        // r6
++.mem.offset 8,0;
++      st8.spill.nta [r3]=r7,16        // r7
++      ;;
++      mov r23=ar.bsp
++      mov r25=ar.unat
++      st8.nta [r2]=loc0,16            // b0
++      st8.nta [r3]=r17,16             // b1
++      ;;
++      st8.nta [r2]=r18,16             // b2
++      st8.nta [r3]=r19,16             // b3
++      ;;
++      st8.nta [r2]=r20,16             // b4
++      st8.nta [r3]=r21,16             // b5
++      ;;
++      st8.nta [r2]=loc1,16            // ar.pfs
++      st8.nta [r3]=r22,16             // ar.lc
++      ;;
++      st8.nta [r2]=r24,16             // pr
++      st8.nta [r3]=r23,16             // ar.bsp
++      ;;
++      st8.nta [r2]=r25                // ar.unat
++      st8.nta [r3]=in0                // &__jmp_buf
++      mov r8=0                        // r8 <- 0 (presumably the setjmp return value)
++      mov rp=loc0
++      mov ar.pfs=loc1
++      br.ret.sptk.few rp
++END(kgdb_fault_setjmp)
++#define       pPos    p6      /* is rotate count positive? */
++#define       pNeg    p7      /* is rotate count negative? */
++GLOBAL_ENTRY(kgdb_fault_longjmp)
++      alloc r8=ar.pfs,2,1,0,0
++      mov r27=ar.rsc
++      add r2=0x98,in0         // r2 <- &jmpbuf.orig_jmp_buf_addr
++      ;;
++      ld8 r8=[r2],-16         // r8 <- orig_jmp_buf_addr
++      mov r10=ar.bsp
++      and r11=~0x3,r27        // clear ar.rsc.mode
++      ;;
++      flushrs                 // flush dirty regs to backing store (must be first in insn grp)
++      ld8 r23=[r2],8          // r23 <- jmpbuf.ar_bsp
++      sub r8=r8,in0           // r8 <- &orig_jmpbuf - &jmpbuf
++      ;;
++      ld8 r25=[r2]            // r25 <- jmpbuf.ar_unat
++      extr.u r8=r8,3,6        // r8 <- (&orig_jmpbuf - &jmpbuf)/8 & 0x3f
++      ;;
++      cmp.lt pNeg,pPos=r8,r0
++      mov r2=in0
++      ;;
++(pPos)        mov r16=r8
++(pNeg)        add r16=64,r8
++(pPos)        sub r17=64,r8
++(pNeg)        sub r17=r0,r8
++      ;;
++      mov ar.rsc=r11          // put RSE in enforced lazy mode
++      shr.u r8=r25,r16
++      add r3=8,in0            // r3 <- &jmpbuf.r1
++      shl r9=r25,r17
++      ;;
++      or r25=r8,r9
++      ;;
++      mov r26=ar.rnat
++      mov ar.unat=r25         // setup ar.unat (NaT bits for r1, r4-r7, and r12)
++      ;;
++      ld8.fill.nta sp=[r2],16 // r12 (sp)
++      ld8.fill.nta gp=[r3],16         // r1 (gp)
++      dep r11=-1,r23,3,6      // r11 <- ia64_rse_rnat_addr(jmpbuf.ar_bsp)
++      ;;
++      ld8.nta r16=[r2],16             // caller's unat
++      ld8.nta r17=[r3],16             // fpsr
++      ;;
++      ld8.fill.nta r4=[r2],16 // r4
++      ld8.fill.nta r5=[r3],16         // r5 (gp)
++      cmp.geu p8,p0=r10,r11   // p8 <- (ar.bsp >= jmpbuf.ar_bsp)
++      ;;
++      ld8.fill.nta r6=[r2],16 // r6
++      ld8.fill.nta r7=[r3],16         // r7
++      ;;
++      mov ar.unat=r16                 // restore caller's unat
++      mov ar.fpsr=r17                 // restore fpsr
++      ;;
++      ld8.nta r16=[r2],16             // b0
++      ld8.nta r17=[r3],16             // b1
++      ;;
++(p8)  ld8 r26=[r11]           // r26 <- *ia64_rse_rnat_addr(jmpbuf.ar_bsp)
++      mov ar.bspstore=r23     // restore ar.bspstore
++      ;;
++      ld8.nta r18=[r2],16             // b2
++      ld8.nta r19=[r3],16             // b3
++      ;;
++      ld8.nta r20=[r2],16             // b4
++      ld8.nta r21=[r3],16             // b5
++      ;;
++      ld8.nta r11=[r2],16             // ar.pfs
++      ld8.nta r22=[r3],56             // ar.lc
++      ;;
++      ld8.nta r24=[r2],32             // pr
++      mov b0=r16
++      ;;
++      ldf.fill.nta f2=[r2],32
++      ldf.fill.nta f3=[r3],32
++      mov b1=r17
++      ;;
++      ldf.fill.nta f4=[r2],32
++      ldf.fill.nta f5=[r3],32
++      mov b2=r18
++      ;;
++      ldf.fill.nta f16=[r2],32
++      ldf.fill.nta f17=[r3],32
++      mov b3=r19
++      ;;
++      ldf.fill.nta f18=[r2],32
++      ldf.fill.nta f19=[r3],32
++      mov b4=r20
++      ;;
++      ldf.fill.nta f20=[r2],32
++      ldf.fill.nta f21=[r3],32
++      mov b5=r21
++      ;;
++      ldf.fill.nta f22=[r2],32
++      ldf.fill.nta f23=[r3],32
++      mov ar.lc=r22
++      ;;
++      ldf.fill.nta f24=[r2],32
++      ldf.fill.nta f25=[r3],32
++      cmp.eq p8,p9=0,in1
++      ;;
++      ldf.fill.nta f26=[r2],32
++      ldf.fill.nta f27=[r3],32
++      mov ar.pfs=r11
++      ;;
++      ldf.fill.nta f28=[r2],32
++      ldf.fill.nta f29=[r3],32
++      ;;
++      ldf.fill.nta f30=[r2]
++      ldf.fill.nta f31=[r3]
++(p8)  mov r8=1
++
++      mov ar.rnat=r26         // restore ar.rnat
++      ;;
++      mov ar.rsc=r27          // restore ar.rsc
++(p9)  mov r8=in1
++
++      invala                  // virt. -> phys. regnum mapping may change
++      mov pr=r24,-1
++      br.ret.sptk.few rp
++END(kgdb_fault_longjmp)
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/kgdb.c linux-2.6.18.kgdb/arch/ia64/kernel/kgdb.c
+--- linux-2.6.18/arch/ia64/kernel/kgdb.c       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/ia64/kernel/kgdb.c  2008-06-10 16:19:32.000000000 +0400
+@@ -0,0 +1,1131 @@
++/*
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License as published by the
++ * Free Software Foundation; either version 2, or (at your option) any
++ * later version.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * General Public License for more details.
++ *
++ */
++
++/*
++ * Copyright (C) 2000-2001 VERITAS Software Corporation.
++ * (c) Copyright 2005 Hewlett-Packard Development Company, L.P.
++ *     Bob Picco <bob.picco@hp.com>
++ */
++/*
++ *  Contributor:     Lake Stevens Instrument Division$
++ *  Written by:      Glenn Engel $
++ *  Updated by:            Amit Kale<akale@veritas.com>
++ *  Modified for 386 by Jim Kingdon, Cygnus Support.
++ *  Original kgdb, compatibility with 2.1.xx kernel by David Grothe <dave@gcom.com>
++ */
++
++#include <linux/string.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/smp.h>
++#include <linux/spinlock.h>
++#include <linux/delay.h>
++#include <asm/system.h>
++#include <asm/ptrace.h>               /* for linux pt_regs struct */
++#include <asm/unwind.h>
++#include <asm/rse.h>
++#include <linux/kgdb.h>
++#include <linux/init.h>
++#include <asm/cacheflush.h>
++#include <asm/kdebug.h>
++
++#define NUM_REGS 590
++#define REGISTER_BYTES (NUM_REGS*8+128*8)
++#define REGISTER_BYTE(N) (((N) * 8)                                    \
++      + ((N) <= IA64_FR0_REGNUM ?                                     \
++      0 : 8 * (((N) > IA64_FR127_REGNUM) ? 128 : (N) - IA64_FR0_REGNUM)))
++#define REGISTER_SIZE(N)                                               \
++      (((N) >= IA64_FR0_REGNUM && (N) <= IA64_FR127_REGNUM) ? 16 : 8)
++#define IA64_GR0_REGNUM         0
++#define IA64_FR0_REGNUM         128
++#define IA64_FR127_REGNUM       (IA64_FR0_REGNUM+127)
++#define IA64_PR0_REGNUM         256
++#define IA64_BR0_REGNUM         320
++#define IA64_VFP_REGNUM         328
++#define IA64_PR_REGNUM          330
++#define IA64_IP_REGNUM          331
++#define IA64_PSR_REGNUM         332
++#define IA64_CFM_REGNUM         333
++#define IA64_AR0_REGNUM         334
++#define IA64_NAT0_REGNUM        462
++#define IA64_NAT31_REGNUM       (IA64_NAT0_REGNUM+31)
++#define IA64_NAT32_REGNUM       (IA64_NAT0_REGNUM+32)
++#define IA64_RSC_REGNUM               (IA64_AR0_REGNUM+16)
++#define IA64_BSP_REGNUM               (IA64_AR0_REGNUM+17)
++#define IA64_BSPSTORE_REGNUM  (IA64_AR0_REGNUM+18)
++#define IA64_RNAT_REGNUM      (IA64_AR0_REGNUM+19)
++#define IA64_FCR_REGNUM               (IA64_AR0_REGNUM+21)
++#define IA64_EFLAG_REGNUM     (IA64_AR0_REGNUM+24)
++#define IA64_CSD_REGNUM               (IA64_AR0_REGNUM+25)
++#define IA64_SSD_REGNUM               (IA64_AR0_REGNUM+26)
++#define IA64_CFLG_REGNUM      (IA64_AR0_REGNUM+27)
++#define IA64_FSR_REGNUM               (IA64_AR0_REGNUM+28)
++#define IA64_FIR_REGNUM               (IA64_AR0_REGNUM+29)
++#define IA64_FDR_REGNUM               (IA64_AR0_REGNUM+30)
++#define IA64_CCV_REGNUM               (IA64_AR0_REGNUM+32)
++#define IA64_UNAT_REGNUM      (IA64_AR0_REGNUM+36)
++#define IA64_FPSR_REGNUM      (IA64_AR0_REGNUM+40)
++#define IA64_ITC_REGNUM               (IA64_AR0_REGNUM+44)
++#define IA64_PFS_REGNUM               (IA64_AR0_REGNUM+64)
++#define IA64_LC_REGNUM                (IA64_AR0_REGNUM+65)
++#define IA64_EC_REGNUM                (IA64_AR0_REGNUM+66)
++
++#define       REGISTER_INDEX(N)       (REGISTER_BYTE(N) / sizeof (unsigned long))
++#define BREAK_INSTR_ALIGN     (~0xfULL)
++
++#define       ptoff(V)        ((unsigned int) offsetof(struct pt_regs, V))    /* byte offset of field V in struct pt_regs */
++struct reg_to_ptreg_index {
++      unsigned int reg;
++      unsigned int ptregoff;
++};
++
++static struct reg_to_ptreg_index gr_reg_to_ptreg_index[] = {
++      {IA64_GR0_REGNUM + 1, ptoff(r1)},
++      {IA64_GR0_REGNUM + 2, ptoff(r2)},
++      {IA64_GR0_REGNUM + 3, ptoff(r3)},
++      {IA64_GR0_REGNUM + 8, ptoff(r8)},
++      {IA64_GR0_REGNUM + 9, ptoff(r9)},
++      {IA64_GR0_REGNUM + 10, ptoff(r10)},
++      {IA64_GR0_REGNUM + 11, ptoff(r11)},
++      {IA64_GR0_REGNUM + 12, ptoff(r12)},
++      {IA64_GR0_REGNUM + 13, ptoff(r13)},
++      {IA64_GR0_REGNUM + 14, ptoff(r14)},
++      {IA64_GR0_REGNUM + 15, ptoff(r15)},
++      {IA64_GR0_REGNUM + 16, ptoff(r16)},
++      {IA64_GR0_REGNUM + 17, ptoff(r17)},
++      {IA64_GR0_REGNUM + 18, ptoff(r18)},
++      {IA64_GR0_REGNUM + 19, ptoff(r19)},
++      {IA64_GR0_REGNUM + 20, ptoff(r20)},
++      {IA64_GR0_REGNUM + 21, ptoff(r21)},
++      {IA64_GR0_REGNUM + 22, ptoff(r22)},
++      {IA64_GR0_REGNUM + 23, ptoff(r23)},
++      {IA64_GR0_REGNUM + 24, ptoff(r24)},
++      {IA64_GR0_REGNUM + 25, ptoff(r25)},
++      {IA64_GR0_REGNUM + 26, ptoff(r26)},
++      {IA64_GR0_REGNUM + 27, ptoff(r27)},
++      {IA64_GR0_REGNUM + 28, ptoff(r28)},
++      {IA64_GR0_REGNUM + 29, ptoff(r29)},
++      {IA64_GR0_REGNUM + 30, ptoff(r30)},
++      {IA64_GR0_REGNUM + 31, ptoff(r31)},
++};
++
++static struct reg_to_ptreg_index br_reg_to_ptreg_index[] = {
++      {IA64_BR0_REGNUM, ptoff(b0)},
++      {IA64_BR0_REGNUM + 6, ptoff(b6)},
++      {IA64_BR0_REGNUM + 7, ptoff(b7)},
++};
++
++static struct reg_to_ptreg_index ar_reg_to_ptreg_index[] = {
++      {IA64_PFS_REGNUM, ptoff(ar_pfs)},
++      {IA64_UNAT_REGNUM, ptoff(ar_unat)},
++      {IA64_RNAT_REGNUM, ptoff(ar_rnat)},
++      {IA64_BSPSTORE_REGNUM, ptoff(ar_bspstore)},
++      {IA64_RSC_REGNUM, ptoff(ar_rsc)},
++      {IA64_CSD_REGNUM, ptoff(ar_csd)},
++      {IA64_SSD_REGNUM, ptoff(ar_ssd)},
++      {IA64_FPSR_REGNUM, ptoff(ar_fpsr)},
++      {IA64_CCV_REGNUM, ptoff(ar_ccv)},
++};
++
++extern atomic_t cpu_doing_single_step;
++
++static int kgdb_gr_reg(int regnum, struct unw_frame_info *info,
++      unsigned long *reg, int rw)
++{
++      char nat;
++
++      if ((regnum >= IA64_GR0_REGNUM && regnum <= (IA64_GR0_REGNUM + 1)) ||
++              (regnum >= (IA64_GR0_REGNUM + 4) &&
++              regnum <= (IA64_GR0_REGNUM + 7)))
++              return !unw_access_gr(info, regnum - IA64_GR0_REGNUM,
++              reg, &nat, rw);
++      else
++              return 0;
++}
++static int kgdb_gr_ptreg(int regnum, struct pt_regs * ptregs,
++      struct unw_frame_info *info, unsigned long *reg, int rw)
++{
++      int i, result = 1;
++      char nat;
++
++      if (!((regnum >= (IA64_GR0_REGNUM + 2) &&
++              regnum <= (IA64_GR0_REGNUM + 3)) ||
++              (regnum >= (IA64_GR0_REGNUM + 8) &&
++              regnum <= (IA64_GR0_REGNUM + 15)) ||
++              (regnum >= (IA64_GR0_REGNUM + 16) &&
++              regnum <= (IA64_GR0_REGNUM + 31))))
++              return 0;
++      else if (rw && ptregs) {
++              for (i = 0; i < ARRAY_SIZE(gr_reg_to_ptreg_index); i++)
++                      if (gr_reg_to_ptreg_index[i].reg == regnum) {
++                              *((unsigned long *)(((void *)ptregs) +
++                              gr_reg_to_ptreg_index[i].ptregoff)) = *reg;
++                              break;
++                      }
++      } else if (!rw && ptregs) {
++              for (i = 0; i < ARRAY_SIZE(gr_reg_to_ptreg_index); i++)
++                      if (gr_reg_to_ptreg_index[i].reg == regnum) {
++                              *reg = *((unsigned long *)
++                              (((void *)ptregs) +
++                               gr_reg_to_ptreg_index[i].ptregoff));
++                              break;
++                      }
++      } else
++              result = !unw_access_gr(info, regnum - IA64_GR0_REGNUM,
++                                      reg, &nat, rw);
++      return result;
++}
++
++/*
++ * Read (rw == 0) or write (rw != 0) a branch register b0-b7.
++ *
++ * b0, b6 and b7 are taken from the saved pt_regs frame via
++ * br_reg_to_ptreg_index[] when a frame is available.  b1-b5, and
++ * b0/b6/b7 when ptregs is NULL, go through unw_access_br().
++ *
++ * Previously the b0/b6/b7 path dereferenced ptregs unconditionally,
++ * although kgdb_get_reg() is called with a NULL ptregs when a thread
++ * other than the current one is inspected.  The NULL case now falls back
++ * to the unwinder, as kgdb_gr_ptreg() already does for general
++ * registers.
++ *
++ * Returns 0 if regnum is not a branch register, 1 after a pt_regs access,
++ * otherwise the negated unw_access_br() result.
++ */
++static int kgdb_br_reg(int regnum, struct pt_regs * ptregs,
++      struct unw_frame_info *info, unsigned long *reg, int rw)
++{
++      unsigned long *slot;
++      int i;
++
++      if (!(regnum >= IA64_BR0_REGNUM && regnum <= (IA64_BR0_REGNUM + 7)))
++              return 0;
++
++      switch (regnum) {
++      case IA64_BR0_REGNUM:
++      case IA64_BR0_REGNUM + 6:
++      case IA64_BR0_REGNUM + 7:
++              if (!ptregs)
++                      break;  /* no saved frame: use the unwinder below */
++
++              for (i = 0; i < ARRAY_SIZE(br_reg_to_ptreg_index); i++) {
++                      if (br_reg_to_ptreg_index[i].reg != regnum)
++                              continue;
++                      slot = (unsigned long *)(((void *)ptregs) +
++                              br_reg_to_ptreg_index[i].ptregoff);
++                      if (rw)
++                              *slot = *reg;
++                      else
++                              *reg = *slot;
++                      break;
++              }
++              return 1;
++      default:
++              break;
++      }
++
++      return !unw_access_br(info, regnum - IA64_BR0_REGNUM, reg, rw);
++}
++
++/*
++ * Read (rw == 0) or write (rw != 0) a floating-point register f0-f127.
++ *
++ * f6-f11 are read from / written to the saved pt_regs frame, indexed
++ * relative to ptregs->f6; reads without a frame go through
++ * unw_access_fr().  Every other register is read through the unwinder
++ * and cannot be written (returns 0).
++ *
++ * On a write the low 64 bits come from *reg and the high bits are parsed
++ * from inbuffer with kgdb_hex2long().
++ *
++ * Changes from the previous version:
++ *  - a write with ptregs == NULL returned through a NULL dereference;
++ *    it now reports "not handled" (0);
++ *  - f12 was routed through &ptregs->f6 + 6.  NOTE(review): pt_regs in
++ *    this kernel appears to end at f11, which would make that access run
++ *    past the structure, so f12 now takes the unwinder path like
++ *    f13-f15 - confirm against asm-ia64/ptrace.h.
++ *
++ * Returns 0 if regnum is not a floating-point register or the access is
++ * unsupported, non-zero on success.
++ */
++static int kgdb_fr_reg(int regnum, char *inbuffer, struct pt_regs * ptregs,
++      struct unw_frame_info *info, unsigned long *reg,
++      struct ia64_fpreg *freg, int rw)
++{
++      int result = 1;
++
++      if (!(regnum >= IA64_FR0_REGNUM && regnum <= (IA64_FR0_REGNUM + 127)))
++              return 0;
++
++      switch (regnum) {
++      case IA64_FR0_REGNUM + 6:
++      case IA64_FR0_REGNUM + 7:
++      case IA64_FR0_REGNUM + 8:
++      case IA64_FR0_REGNUM + 9:
++      case IA64_FR0_REGNUM + 10:
++      case IA64_FR0_REGNUM + 11:
++              if (rw) {
++                      char *ptr = inbuffer;
++
++                      if (!ptregs)
++                              return 0;
++                      freg->u.bits[0] = *reg;
++                      kgdb_hex2long(&ptr, &freg->u.bits[1]);
++                      *(&ptregs->f6 + (regnum - (IA64_FR0_REGNUM + 6))) =
++                              *freg;
++              } else if (!ptregs)
++                      result = !unw_access_fr(info, regnum - IA64_FR0_REGNUM,
++                              freg, rw);
++              else
++                      *freg =
++                      *(&ptregs->f6 + (regnum - (IA64_FR0_REGNUM + 6)));
++              break;
++      default:
++              if (!rw)
++                      result = !unw_access_fr(info, regnum - IA64_FR0_REGNUM,
++                              freg, rw);
++              else
++                      result = 0;
++              break;
++      }
++
++      return result;
++}
++
++/*
++ * Read (rw == 0) or write (rw != 0) an application register in the range
++ * IA64_AR0_REGNUM..IA64_EC_REGNUM.
++ *
++ * If a pt_regs frame is supplied and ar_reg_to_ptreg_index[] lists the
++ * register, the saved copy is used and 1 is returned.  Otherwise the
++ * register is accessed through unw_access_ar() and the negated unwinder
++ * result is returned.  Registers known to neither path yield 0.
++ *
++ * Fix: UNAT, BSPSTORE and PFS used to be accessed as UNW_AR_RNAT (a
++ * copy-and-paste slip), so the debugger saw RNAT's value for all three
++ * and a write clobbered RNAT.  Each now maps to its own unwinder id.
++ */
++static int kgdb_ar_reg(int regnum, struct pt_regs * ptregs,
++      struct unw_frame_info *info, unsigned long *reg, int rw)
++{
++      unsigned long *slot;
++      int unw_ar, i;
++
++      if (!(regnum >= IA64_AR0_REGNUM && regnum <= IA64_EC_REGNUM))
++              return 0;
++
++      if (ptregs) {
++              for (i = 0; i < ARRAY_SIZE(ar_reg_to_ptreg_index); i++) {
++                      if (ar_reg_to_ptreg_index[i].reg != regnum)
++                              continue;
++                      slot = (unsigned long *)(((void *)ptregs) +
++                              ar_reg_to_ptreg_index[i].ptregoff);
++                      if (rw)
++                              *slot = *reg;
++                      else
++                              *reg = *slot;
++                      return 1;
++              }
++      }
++
++      /* Not in the saved frame: translate to the unwinder's register id. */
++      switch (regnum) {
++      case IA64_CSD_REGNUM:
++              unw_ar = UNW_AR_CSD;
++              break;
++      case IA64_SSD_REGNUM:
++              unw_ar = UNW_AR_SSD;
++              break;
++      case IA64_UNAT_REGNUM:
++              unw_ar = UNW_AR_UNAT;
++              break;
++      case IA64_RNAT_REGNUM:
++              unw_ar = UNW_AR_RNAT;
++              break;
++      case IA64_BSPSTORE_REGNUM:
++              unw_ar = UNW_AR_BSPSTORE;
++              break;
++      case IA64_PFS_REGNUM:
++              unw_ar = UNW_AR_PFS;
++              break;
++      case IA64_LC_REGNUM:
++              unw_ar = UNW_AR_LC;
++              break;
++      case IA64_EC_REGNUM:
++              unw_ar = UNW_AR_EC;
++              break;
++      case IA64_FPSR_REGNUM:
++              unw_ar = UNW_AR_FPSR;
++              break;
++      case IA64_RSC_REGNUM:
++              unw_ar = UNW_AR_RSC;
++              break;
++      case IA64_CCV_REGNUM:
++              unw_ar = UNW_AR_CCV;
++              break;
++      default:
++              return 0;
++      }
++
++      return !unw_access_ar(info, unw_ar, reg, rw);
++}
++
++/*
++ * Format the value of register regnum as hex into outbuffer.
++ *
++ * The per-class helpers (general, branch, application, floating point)
++ * are tried in turn; IP, CFM, PSR, PR and BSP are handled here.  Values
++ * come from ptregs when it is non-NULL and from the unwinder state in
++ * info otherwise.  If no handler accepts regnum the reply is "E0".
++ *
++ * reg is zero-initialised: the PSR branch sets size even when neither
++ * ptregs nor kgdb_usethread is available, which used to send whatever
++ * happened to be on the stack back to the debugger.
++ */
++void kgdb_get_reg(char *outbuffer, int regnum, struct unw_frame_info *info,
++      struct pt_regs *ptregs)
++{
++      unsigned long reg = 0, size = 0, *mem = &reg;
++      struct ia64_fpreg freg;
++
++      if (kgdb_gr_reg(regnum, info, &reg, 0) ||
++              kgdb_gr_ptreg(regnum, ptregs, info, &reg, 0) ||
++              kgdb_br_reg(regnum, ptregs, info, &reg, 0) ||
++              kgdb_ar_reg(regnum, ptregs, info, &reg, 0))
++                      size = sizeof(reg);
++      else if (kgdb_fr_reg(regnum, NULL, ptregs, info, &reg, &freg, 0)) {
++              size = sizeof(freg);
++              mem = (unsigned long *)&freg;
++      } else if (regnum == IA64_IP_REGNUM) {
++              if (!ptregs)
++                      unw_get_ip(info, &reg);
++              else
++                      reg = ptregs->cr_iip;
++              size = sizeof(reg);
++      } else if (regnum == IA64_CFM_REGNUM) {
++              if (!ptregs)
++                      unw_get_cfm(info, &reg);
++              else
++                      reg = ptregs->cr_ifs;
++              size = sizeof(reg);
++      } else if (regnum == IA64_PSR_REGNUM) {
++              /*
++               * Without a frame, use the pt_regs located just below
++               * kgdb_usethread + IA64_STK_OFFSET.
++               */
++              if (!ptregs && kgdb_usethread)
++                      ptregs = (struct pt_regs *)
++                      ((unsigned long)kgdb_usethread +
++                      IA64_STK_OFFSET) - 1;
++              if (ptregs)
++                      reg = ptregs->cr_ipsr;
++              size = sizeof(reg);
++      } else if (regnum == IA64_PR_REGNUM) {
++              if (ptregs)
++                      reg = ptregs->pr;
++              else
++                      unw_access_pr(info, &reg, 0);
++              size = sizeof(reg);
++      } else if (regnum == IA64_BSP_REGNUM) {
++              unw_get_bsp(info, &reg);
++              size = sizeof(reg);
++      }
++
++      if (size) {
++              kgdb_mem2hex((char *) mem, outbuffer, size);
++              outbuffer[size*2] = 0;
++      } else
++              strcpy(outbuffer, "E0");
++}
++
++/*
++ * Store the hex value in inbuffer into register regnum and write the
++ * reply ("OK" or "E01") to outbuffer.
++ *
++ * The per-class helpers are tried first.  IP, CFM, PSR and PR live only
++ * in the saved pt_regs frame; a request for one of them without a frame
++ * is now answered with "E01" instead of dereferencing a NULL ptregs.
++ */
++void kgdb_put_reg(char *inbuffer, char *outbuffer, int regnum,
++                struct unw_frame_info *info, struct pt_regs *ptregs)
++{
++      unsigned long reg;
++      struct ia64_fpreg freg;
++      char *ptr = inbuffer;
++
++      kgdb_hex2long(&ptr, &reg);
++      strcpy(outbuffer, "OK");
++
++      if (kgdb_gr_reg(regnum, info, &reg, 1) ||
++              kgdb_gr_ptreg(regnum, ptregs, info, &reg, 1) ||
++              kgdb_br_reg(regnum, ptregs, info, &reg, 1) ||
++              kgdb_fr_reg(regnum, inbuffer, ptregs, info, &reg, &freg, 1) ||
++              kgdb_ar_reg(regnum, ptregs, info, &reg, 1))
++              return;
++
++      if (!ptregs) {
++              /* the remaining registers exist only in the saved frame */
++              strcpy(outbuffer, "E01");
++              return;
++      }
++
++      if (regnum == IA64_IP_REGNUM)
++              ptregs->cr_iip = reg;
++      else if (regnum == IA64_CFM_REGNUM)
++              ptregs->cr_ifs = reg;
++      else if (regnum == IA64_PSR_REGNUM)
++              ptregs->cr_ipsr = reg;
++      else if (regnum == IA64_PR_REGNUM)
++              ptregs->pr = reg;
++      else
++              strcpy(outbuffer, "E01");
++}
++
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{      /* No-op: neither argument is touched; this file serves registers one at a time via kgdb_get_reg(). */
++}
++
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
++{      /* No-op: neither argument is touched; blocked tasks are unwound in kgdb_arch_handle_exception(). */
++}
++
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      /* No-op: neither argument is touched; register writes go through kgdb_put_reg(). */
++}
++
++#define       MAX_HW_BREAKPOINT       (20)    /* capacity of hwbreaks[] */
++long hw_break_total_dbr, hw_break_total_ibr;  /* debug register counts, filled in by do_init_hw_break() */
++#define       HW_BREAKPOINT   (hw_break_total_dbr + hw_break_total_ibr)       /* number of breakinfo[] slots in use */
++#define       WATCH_INSTRUCTION       0x0     /* type codes accepted by hardware_breakpoint() */
++#define WATCH_WRITE           0x1
++#define       WATCH_READ              0x2
++#define       WATCH_ACCESS            0x3
++
++#define       HWCAP_DBR       ((1 << WATCH_WRITE) | (1 << WATCH_READ))        /* capability bits of a data (DBR) slot */
++#define       HWCAP_IBR       (1 << WATCH_INSTRUCTION)        /* capability bit of an instruction (IBR) slot */
++struct hw_breakpoint {
++      unsigned enabled;       /* non-zero while the slot is armed */
++      unsigned long capable;  /* HWCAP_DBR or HWCAP_IBR, assigned in do_init_hw_break() */
++      unsigned long type;     /* WATCH_* code the slot was armed with */
++      unsigned long mask;     /* length - 1 as passed to hardware_breakpoint() */
++      unsigned long addr;     /* watched address */
++} *breakinfo; /* NULL until do_init_hw_break() succeeds, then points at hwbreaks[] */
++
++static struct hw_breakpoint hwbreaks[MAX_HW_BREAKPOINT];      /* backing storage for breakinfo */
++
++enum instruction_type { A, I, M, F, B, L, X, u };     /* slot types; 'u' marks the rows with no assigned types (presumably reserved templates - confirm) */
++
++static enum instruction_type bundle_encoding[32][3] = {       /* row: 5-bit bundle template; column: slot 0..2 */
++      {M, I, I},              /* 00 */
++      {M, I, I},              /* 01 */
++      {M, I, I},              /* 02 */
++      {M, I, I},              /* 03 */
++      {M, L, X},              /* 04 */
++      {M, L, X},              /* 05 */
++      {u, u, u},              /* 06 */
++      {u, u, u},              /* 07 */
++      {M, M, I},              /* 08 */
++      {M, M, I},              /* 09 */
++      {M, M, I},              /* 0A */
++      {M, M, I},              /* 0B */
++      {M, F, I},              /* 0C */
++      {M, F, I},              /* 0D */
++      {M, M, F},              /* 0E */
++      {M, M, F},              /* 0F */
++      {M, I, B},              /* 10 */
++      {M, I, B},              /* 11 */
++      {M, B, B},              /* 12 */
++      {M, B, B},              /* 13 */
++      {u, u, u},              /* 14 */
++      {u, u, u},              /* 15 */
++      {B, B, B},              /* 16 */
++      {B, B, B},              /* 17 */
++      {M, M, B},              /* 18 */
++      {M, M, B},              /* 19 */
++      {u, u, u},              /* 1A */
++      {u, u, u},              /* 1B */
++      {M, F, B},              /* 1C */
++      {M, F, B},              /* 1D */
++      {u, u, u},              /* 1E */
++      {u, u, u},              /* 1F */
++};
++
++/*
++ * Check that a breakpoint address is readable: attempt to fetch
++ * BREAK_INSTR_SIZE bytes from addr masked with BREAK_INSTR_ALIGN and
++ * hand back kgdb_get_mem()'s result.  The bytes read are discarded.
++ */
++int kgdb_validate_break_address(unsigned long addr)
++{
++      char scratch[BREAK_INSTR_SIZE];
++      char *aligned = (char *)(addr & BREAK_INSTR_ALIGN);
++
++      return kgdb_get_mem(aligned, scratch, BREAK_INSTR_SIZE);
++}
++
++int kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
++{      /*
++       * Plant a software breakpoint: read the 16-byte bundle containing
++       * addr, save it in saved_instr, overwrite one slot with BREAKNUM
++       * and write the bundle back.  Returns 0 without touching memory
++       * for the bundle at _start, a negative kgdb_get_mem() error, or
++       * the result of kgdb_set_mem().
++       */
++      extern unsigned long _start[];
++      unsigned long slot = addr & BREAK_INSTR_ALIGN, bundle_addr;     /* NOTE(review): kgdb_validate_break_address() uses this same mask to form an address; if it clears the low bits, slot is > 2 for any real address and is forced to 0 below - confirm ~BREAK_INSTR_ALIGN was not intended */
++      unsigned long template;
++      struct bundle { /* bitfield view of a bundle: 5-bit template + three 41-bit slots, slot 1 split across the two 64-bit halves */
++              struct {
++                      unsigned long long template:5;
++                      unsigned long long slot0:41;
++                      unsigned long long slot1_p0:64 - 46;
++              } quad0;
++              struct {
++                      unsigned long long slot1_p1:41 - (64 - 46);
++                      unsigned long long slot2:41;
++              } quad1;
++      } bundle;
++      int ret;
++
++      bundle_addr = addr & ~0xFULL;
++
++      if (bundle_addr == (unsigned long)_start)
++              return 0;
++
++      ret = kgdb_get_mem((char *)bundle_addr, (char *)&bundle,
++                         BREAK_INSTR_SIZE);
++      if (ret < 0)
++              return ret;
++
++      if (slot > 2)
++              slot = 0;
++
++      memcpy(saved_instr, &bundle, BREAK_INSTR_SIZE); /* caller keeps the original bundle for kgdb_arch_remove_breakpoint() */
++      template = bundle.quad0.template;
++
++      if (slot == 1 && bundle_encoding[template][1] == L)     /* slot 1 is type L (paired with X in the table): patch slot 2 instead */
++              slot = 2;
++
++      switch (slot) {
++      case 0:
++              bundle.quad0.slot0 = BREAKNUM;
++              break;
++      case 1:
++              bundle.quad0.slot1_p0 = BREAKNUM;       /* low 18 bits of the slot */
++              bundle.quad1.slot1_p1 = (BREAKNUM >> (64 - 46));        /* remaining high bits */
++              break;
++      case 2:
++              bundle.quad1.slot2 = BREAKNUM;
++              break;
++      }
++
++      return kgdb_set_mem((char *)bundle_addr, (char *)&bundle,
++                          BREAK_INSTR_SIZE);
++}
++
++/*
++ * Remove a software breakpoint by writing the saved bundle back to
++ * addr masked with BREAK_INSTR_ALIGN.  The bundle at _start is never
++ * patched, so 0 is returned for it without writing anything; otherwise
++ * the result of kgdb_set_mem() is returned.
++ */
++int kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
++{
++      extern unsigned long _start[];
++      unsigned long bundle_addr = addr & BREAK_INSTR_ALIGN;
++
++      if (bundle_addr == (unsigned long)_start)
++              return 0;
++
++      return kgdb_set_mem((char *)bundle_addr, (char *)bundle,
++                          BREAK_INSTR_SIZE);
++}
++
++static int hw_breakpoint_init;
++
++/*
++ * One-time setup of the hardware breakpoint table.
++ *
++ * Queries the number of instruction and data debug registers (fixed at
++ * 8 + 8 under CONFIG_IA64_HP_SIM, ia64_pal_debug_info() otherwise),
++ * trims the totals so that they fit in hwbreaks[], points breakinfo at
++ * the table and marks the first hw_break_total_dbr slots as data slots
++ * and the rest as instruction slots.
++ *
++ * If the PAL call fails, breakinfo stays NULL.  The counts are now reset
++ * to zero in that case: the PAL wrapper may have stored values through
++ * the pointers, and loops bounded by HW_BREAKPOINT (for example
++ * kgdb_remove_all_hw_break()) would otherwise index a NULL breakinfo.
++ */
++void do_init_hw_break(void)
++{
++      s64 status;
++      int i;
++
++      hw_breakpoint_init = 1;
++
++#ifdef        CONFIG_IA64_HP_SIM
++      hw_break_total_ibr = 8;
++      hw_break_total_dbr = 8;
++      status = 0;
++#else
++      status = ia64_pal_debug_info(&hw_break_total_ibr, &hw_break_total_dbr);
++#endif
++
++      if (status) {
++              printk(KERN_INFO "do_init_hw_break: pal call failed %d\n",
++                     (int)status);
++              hw_break_total_ibr = 0;
++              hw_break_total_dbr = 0;
++              return;
++      }
++
++      if (HW_BREAKPOINT > MAX_HW_BREAKPOINT) {
++              printk(KERN_INFO "do_init_hw_break: %d exceeds max %d\n",
++                     (int)HW_BREAKPOINT, (int)MAX_HW_BREAKPOINT);
++
++              /* give up instruction slots first, but keep at least one */
++              while ((HW_BREAKPOINT > MAX_HW_BREAKPOINT)
++                     && hw_break_total_ibr != 1)
++                      hw_break_total_ibr--;
++              while (HW_BREAKPOINT > MAX_HW_BREAKPOINT)
++                      hw_break_total_dbr--;
++      }
++
++      breakinfo = hwbreaks;
++
++      memset(breakinfo, 0, HW_BREAKPOINT * sizeof(struct hw_breakpoint));
++
++      for (i = 0; i < hw_break_total_dbr; i++)
++              breakinfo[i].capable = HWCAP_DBR;
++
++      for (; i < HW_BREAKPOINT; i++)
++              breakinfo[i].capable = HWCAP_IBR;
++}
++
++void kgdb_correct_hw_break(void)
++{      /*
++       * Push the breakinfo[] table into the debug registers.  For slot n
++       * an enabled entry writes its address to register 2n and a control
++       * word to register 2n + 1; a disabled entry writes 0 to register
++       * 2n + 1.  Instruction slots use ia64_set_ibr() and are numbered
++       * relative to hw_break_total_dbr; data slots use ia64_set_dbr().
++       * Does nothing until do_init_hw_break() has set breakinfo.
++       */
++      int breakno;
++
++      if (!breakinfo)
++              return;
++
++      for (breakno = 0; breakno < HW_BREAKPOINT; breakno++) {
++              if (breakinfo[breakno].enabled) {
++                      if (breakinfo[breakno].capable & HWCAP_IBR) {
++                              int ibreakno = breakno - hw_break_total_dbr;    /* index among the instruction registers */
++                              ia64_set_ibr(ibreakno << 1,
++                                           breakinfo[breakno].addr);
++                              ia64_set_ibr((ibreakno << 1) + 1,       /* control word: inverted mask in the low 56 bits, plus bits 56 and 63 (bit meanings per the Itanium SDM - confirm) */
++                                           (~breakinfo[breakno].mask &
++                                            ((1UL << 56UL) - 1)) |
++                                            (1UL << 56UL) | (1UL << 63UL));
++                      } else {
++                              ia64_set_dbr(breakno << 1,
++                                           breakinfo[breakno].addr);
++                              ia64_set_dbr((breakno << 1) + 1,        /* control word: inverted mask, bit 56, and the WATCH_* type placed at bit 62 */
++                                           (~breakinfo[breakno].
++                                            mask & ((1UL << 56UL) - 1)) |
++                                           (1UL << 56UL) |
++                                           (breakinfo[breakno].type << 62UL));
++                      }
++              } else {
++                      if (breakinfo[breakno].capable & HWCAP_IBR)
++                              ia64_set_ibr(((breakno -
++                                             hw_break_total_dbr) << 1) + 1,
++                                           0);
++                      else
++                              ia64_set_dbr((breakno << 1) + 1, 0);
++              }
++      }
++
++      return;
++}
++
++/*
++ * Arm (action != 0) or disarm (action == 0) a hardware breakpoint or
++ * watchpoint in the breakinfo[] table.  The debug registers themselves
++ * are only written later by kgdb_correct_hw_break().
++ *
++ * addr/length describe the watched range; the stored mask is length - 1.
++ * type is one of the WATCH_* codes.  For WATCH_ACCESS two slots are
++ * claimed (the capability mask switches to HWCAP_IBR after the first
++ * hit); if only one could be found, the slot recorded in 'watch' is
++ * disabled again and 0 is returned.
++ *
++ * Returns the number of slots changed, 0 if the table is unavailable, or
++ * 1 without doing anything when addr is _start.
++ *
++ * A type outside WATCH_INSTRUCTION..WATCH_ACCESS is now rejected with 0:
++ * the callers pass "type - '1'", which can be negative, and shifting by
++ * a negative or oversized count is undefined.
++ */
++int hardware_breakpoint(unsigned long addr, int length, int type, int action)
++{
++      int breakno, found, watch;
++      unsigned long mask;
++      extern unsigned long _start[];
++
++      if (!hw_breakpoint_init)
++              do_init_hw_break();
++
++      if (!breakinfo)
++              return 0;
++      else if (addr == (unsigned long)_start)
++              return 1;
++
++      if (type < WATCH_INSTRUCTION || type > WATCH_ACCESS)
++              return 0;
++
++      if (type == WATCH_ACCESS)
++              mask = HWCAP_DBR;
++      else
++              mask = 1UL << type;
++
++      for (watch = 0, found = 0, breakno = 0; breakno < HW_BREAKPOINT;
++           breakno++) {
++              if (action) {
++                      /* need a free slot of the right kind */
++                      if (breakinfo[breakno].enabled
++                          || !(breakinfo[breakno].capable & mask))
++                              continue;
++                      breakinfo[breakno].enabled = 1;
++                      breakinfo[breakno].type = type;
++                      breakinfo[breakno].mask = length - 1;
++                      breakinfo[breakno].addr = addr;
++                      watch = breakno;
++              } else if (breakinfo[breakno].enabled &&
++                         ((length < 0 && breakinfo[breakno].addr == addr) ||
++                          ((breakinfo[breakno].capable & mask) &&
++                           (breakinfo[breakno].mask == (length - 1)) &&
++                           (breakinfo[breakno].addr == addr)))) {
++                      /* negative length: match on address alone */
++                      breakinfo[breakno].enabled = 0;
++                      breakinfo[breakno].type = 0UL;
++              } else
++                      continue;
++              found++;
++              if (type != WATCH_ACCESS)
++                      break;
++              else if (found == 2)
++                      break;
++              else
++                      mask = HWCAP_IBR;
++      }
++
++      if (type == WATCH_ACCESS && found == 1) {
++              breakinfo[watch].enabled = 0;
++              found = 0;
++      }
++
++      mb();
++      return found;
++}
++
++/*
++ * kgdb_arch hook: arm a hardware breakpoint.  The kgdb breakpoint type
++ * is turned into a WATCH_* code by subtracting '1' before it is passed
++ * to hardware_breakpoint().
++ */
++int kgdb_arch_set_hw_breakpoint(unsigned long addr, int len,
++                              enum kgdb_bptype type)
++{
++      int watch_type = type - '1';
++
++      return hardware_breakpoint(addr, len, watch_type, 1);
++}
++
++/*
++ * kgdb_arch hook: disarm a hardware breakpoint.  Uses the same
++ * "type - '1'" translation as kgdb_arch_set_hw_breakpoint().
++ */
++int kgdb_arch_remove_hw_breakpoint(unsigned long addr, int len,
++                                 enum kgdb_bptype type)
++{
++      int watch_type = type - '1';
++
++      return hardware_breakpoint(addr, len, watch_type, 0);
++}
++
++/* Disarm the instruction breakpoint of length 8 registered at addr. */
++int kgdb_remove_hw_break(unsigned long addr)
++{
++      const int length = 8;
++
++      return hardware_breakpoint(addr, length, WATCH_INSTRUCTION, 0);
++}
++
++/*
++ * Clear every slot of the hardware breakpoint table.
++ *
++ * Returns early while breakinfo is NULL (table never initialised), the
++ * same guard kgdb_correct_hw_break() uses, so a non-zero HW_BREAKPOINT
++ * count can never lead to writes through a NULL table pointer.
++ * Note that the whole entry is zeroed, including its 'capable' field.
++ */
++void kgdb_remove_all_hw_break(void)
++{
++      int i;
++
++      if (!breakinfo)
++              return;
++
++      for (i = 0; i < HW_BREAKPOINT; i++)
++              memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint));
++}
++
++/* Arm an instruction breakpoint of length 8 at addr. */
++int kgdb_set_hw_break(unsigned long addr)
++{
++      const int length = 8;
++
++      return hardware_breakpoint(addr, length, WATCH_INSTRUCTION, 1);
++}
++
++/*
++ * Turn off the PSR.db bit on the current CPU if it is set.  The regs
++ * argument is not used.
++ */
++void kgdb_disable_hw_debug(struct pt_regs *regs)
++{
++      unsigned long psr = ia64_getreg(_IA64_REG_PSR);
++
++      if (!(psr & IA64_PSR_DB))
++              return;
++
++      /* the bit is known to be set here, so the XOR clears it */
++      ia64_setreg(_IA64_REG_PSR_L, psr ^ IA64_PSR_DB);
++}
++
++volatile static struct smp_unw {      /* per-CPU record published by kgdb_wait_ipi() while that CPU is parked */
++      struct unw_frame_info *unw;     /* unwind state; NULL when not parked, (struct unw_frame_info *)1 when none is available */
++      struct task_struct *task;       /* task that was current on the CPU */
++} smp_unw[NR_CPUS];
++
++/*
++ * Initialise unw for the blocked task p and unwind it until the frame's
++ * ip is no longer inside a scheduler function, giving up after 17
++ * unwind steps.
++ *
++ * Returns -1 if unwinding fails or the last ip read is 0, else 0.
++ */
++static int inline kgdb_get_blocked_state(struct task_struct *p,
++                                       struct unw_frame_info *unw)
++{
++      unsigned long ip = 0UL;
++      int step;
++
++      unw_init_from_blocked_task(unw, p);
++
++      for (step = 0; step <= 16; step++) {
++              if (unw_unwind(unw) < 0)
++                      return -1;
++              unw_get_ip(unw, &ip);
++              if (!in_sched_functions(ip))
++                      break;
++      }
++
++      return ip ? 0 : -1;
++}
++
++/*
++ * Call kgdb_nmihook() for this CPU with PSR.db cleared, then restore the
++ * PSR value read on entry if the bit had been set.
++ */
++static void inline kgdb_wait(struct pt_regs *regs)
++{
++      unsigned long saved_psr = ia64_getreg(_IA64_REG_PSR);
++      unsigned long db_was_set = saved_psr & IA64_PSR_DB;
++
++      if (db_was_set)
++              ia64_setreg(_IA64_REG_PSR_L, saved_psr ^ IA64_PSR_DB);
++
++      kgdb_nmihook(smp_processor_id(), regs);
++
++      if (db_was_set)
++              ia64_setreg(_IA64_REG_PSR_L, saved_psr);
++}
++
++/*
++ * Unwind 'running' until the frame's sp + 0x10 reaches or passes the
++ * address of regs, or until unw_unwind() reports an error.
++ */
++static void inline normalize(struct unw_frame_info *running,
++                           struct pt_regs *regs)
++{
++      unsigned long frame_sp;
++
++      for (;;) {
++              unw_get_sp(running, &frame_sp);
++              if ((frame_sp + 0x10) >= (unsigned long)regs)
++                      break;
++              if (unw_unwind(running) < 0)
++                      break;
++      }
++}
++
++/*
++ * unw_init_running() callback used by kgdb_wait_ipi(): positions the
++ * unwind state relative to the pt_regs passed in data, publishes it in
++ * this CPU's smp_unw[] entry and parks the CPU in kgdb_wait().
++ */
++static void kgdb_init_running(struct unw_frame_info *unw, void *data)
++{
++      struct pt_regs *regs = data;
++
++      normalize(unw, regs);
++      smp_unw[smp_processor_id()].unw = unw;
++      kgdb_wait(regs);
++}
++
++/*
++ * Park this CPU for the debugger.
++ *
++ * Records current in smp_unw[] and publishes an unwind state for it:
++ *  - interrupted kernel code in a TASK_RUNNING task is unwound in place
++ *    via unw_init_running(), whose callback does the waiting;
++ *  - otherwise the blocked-task unwinder is tried;
++ *  - user-mode interruptions, and blocked-task failures, publish the
++ *    sentinel (struct unw_frame_info *)1.
++ * The unw pointer is reset to NULL once kgdb_wait() has returned.
++ */
++void kgdb_wait_ipi(struct pt_regs *regs)
++{
++      struct unw_frame_info unw;
++
++      smp_unw[smp_processor_id()].task = current;
++
++      if (!user_mode(regs) && current->state == TASK_RUNNING) {
++              unw_init_running(kgdb_init_running, regs);
++      } else {
++              if (user_mode(regs) || kgdb_get_blocked_state(current, &unw))
++                      smp_unw[smp_processor_id()].unw =
++                          (struct unw_frame_info *)1;
++              else
++                      smp_unw[smp_processor_id()].unw = &unw;
++              kgdb_wait(regs);
++      }
++
++      smp_unw[smp_processor_id()].unw = NULL;
++}
++
++/*
++ * Send an NMI to every other CPU when more than one CPU is online.
++ * The flags argument is not used.
++ */
++void kgdb_roundup_cpus(unsigned long flags)
++{
++      if (num_online_cpus() <= 1)
++              return;
++
++      smp_send_nmi_allbutself();
++}
++
++static volatile int kgdb_hwbreak_sstep[NR_CPUS];
++
++/*
++ * Die-chain notifier: decides whether an event enters the debugger.
++ *
++ *  - DIE_PAGE_FAULT_NO_CONTEXT: while the debugger is active and a fault
++ *    is expected (kgdb_may_fault), jump back through
++ *    kgdb_fault_longjmp();
++ *  - DIE_BREAK: ignored for user mode and for error code 0x80001;
++ *  - DIE_FAULT: ignored for user mode; error code 36 with this CPU's
++ *    kgdb_hwbreak_sstep flag set clears the flag and PSR.ss and stops
++ *    the chain without entering the debugger;
++ *  - DIE_MCA_MONARCH_PROCESS / DIE_INIT_MONARCH_PROCESS: always enter;
++ *  - any other command: NOTIFY_DONE.
++ *
++ * Every event that is not filtered out is handed to
++ * kgdb_handle_exception() and NOTIFY_STOP is returned.
++ */
++static int kgdb_notify(struct notifier_block *self, unsigned long cmd,
++      void *ptr)
++{
++      struct die_args *args = ptr;
++      struct pt_regs *regs = args->regs;
++      unsigned long fault_code = args->err;
++
++      switch (cmd) {
++      case DIE_PAGE_FAULT_NO_CONTEXT:
++              if (atomic_read(&debugger_active) && kgdb_may_fault) {
++                      kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++                      return NOTIFY_STOP;
++              }
++              break;
++      case DIE_BREAK:
++              if (user_mode(regs) || fault_code == 0x80001)
++                      return NOTIFY_DONE;
++              break;
++      case DIE_FAULT:
++              if (user_mode(regs))
++                      return NOTIFY_DONE;
++              if (fault_code == 36 &&
++                  kgdb_hwbreak_sstep[smp_processor_id()]) {
++                      kgdb_hwbreak_sstep[smp_processor_id()] = 0;
++                      regs->cr_ipsr &= ~IA64_PSR_SS;
++                      return NOTIFY_STOP;
++              }
++              break;
++      case DIE_MCA_MONARCH_PROCESS:
++      case DIE_INIT_MONARCH_PROCESS:
++              break;
++      default:
++              return NOTIFY_DONE;
++      }
++
++      kgdb_handle_exception(args->trapnr, args->signr, args->err, regs);
++      return NOTIFY_STOP;
++}
++
++static struct notifier_block kgdb_notifier = {        /* registered on ia64die_chain by kgdb_arch_init() */
++      .notifier_call = kgdb_notify,
++};
++
++int kgdb_arch_init(void)
++{      /* Hook kgdb_notify() into the ia64 die chain; the registration result is ignored and 0 is always returned. */
++      atomic_notifier_chain_register(&ia64die_chain, &kgdb_notifier);
++      return 0;
++}
++
++static void do_kgdb_handle_exception(struct unw_frame_info *, void *data);
++
++struct kgdb_state {   /* arguments and results passed from kgdb_arch_handle_exception() to do_kgdb_handle_exception() */
++      int e_vector;           /* exception vector from the caller */
++      int signo;              /* signal number from the caller */
++      unsigned long err_code; /* error code from the caller */
++      struct pt_regs *regs;   /* saved registers, or NULL when inspecting another thread */
++      struct unw_frame_info *unw;     /* unwind state, filled in by do_kgdb_handle_exception() */
++      char *inbuf;            /* packet received from the debugger */
++      char *outbuf;           /* buffer for the reply */
++      int unwind;             /* non-zero: normalize() the unwind state first */
++      int ret;                /* starts at -1; 0 after a 'c'/'s' packet, 1 when no unwind state was found */
++};
++
++/*
++ * Set the resume address in regs: cr_iip receives pc with its low four
++ * bits cleared and the PSR 'ri' field receives the low two bits of pc.
++ */
++static void inline kgdb_pc(struct pt_regs *regs, unsigned long pc)
++{
++      unsigned long bundle_addr = pc & ~0xf;
++      unsigned long slot = pc & 0x3;
++
++      regs->cr_iip = bundle_addr;
++      ia64_psr(regs)->ri = slot;
++}
++
++/*
++ * Arch hook for remote packets: chooses the register context for the
++ * thread the debugger selected and passes the packet to
++ * do_kgdb_handle_exception().
++ *
++ *  - 'c' / 's' always act on the interrupted context (linux_regs);
++ *  - the current thread is unwound in place via unw_init_running();
++ *  - a thread that is not TASK_RUNNING is unwound as a blocked task;
++ *  - a thread running on another CPU uses the unwind state that CPU
++ *    published in smp_unw[].
++ *
++ * Returns info.ret: 0 after 'c'/'s', -1 otherwise.  When no unwind state
++ * is available the internal result is 1; for a 'p' packet that case is
++ * answered with a recognisable dummy value and -1 is returned.
++ *
++ * Fixes:
++ *  - the smp_unw[] search gave up on the first CPU that did not match,
++ *    so only a thread on CPU 0 could ever be found; all CPUs are now
++ *    scanned before failure is reported;
++ *  - a NULL kgdb_usethread was dereferenced; it is now treated as the
++ *    current thread, matching the 'P' handler's
++ *    "!kgdb_usethread || kgdb_usethread == current" test.
++ */
++int kgdb_arch_handle_exception(int e_vector, int signo,
++                             int err_code, char *remcom_in_buffer,
++                             char *remcom_out_buffer,
++                             struct pt_regs *linux_regs)
++{
++      struct kgdb_state info;
++
++      info.e_vector = e_vector;
++      info.signo = signo;
++      info.err_code = err_code;
++      info.regs = NULL;
++      info.unw = (void *)0;
++      info.inbuf = remcom_in_buffer;
++      info.outbuf = remcom_out_buffer;
++      info.unwind = 0;
++      info.ret = -1;
++
++      if (remcom_in_buffer[0] == 'c' || remcom_in_buffer[0] == 's') {
++              info.regs = linux_regs;
++              do_kgdb_handle_exception(NULL, &info);
++      } else if (!kgdb_usethread || kgdb_usethread == current) {
++              info.regs = linux_regs;
++              info.unwind = 1;
++              unw_init_running(do_kgdb_handle_exception, &info);
++      } else if (kgdb_usethread->state != TASK_RUNNING) {
++              struct unw_frame_info unw_info;
++
++              if (kgdb_get_blocked_state(kgdb_usethread, &unw_info))
++                      info.ret = 1;
++              else
++                      do_kgdb_handle_exception(&unw_info, &info);
++      } else {
++              int i;
++
++              for (i = 0; i < NR_CPUS; i++) {
++                      if (smp_unw[i].task != kgdb_usethread ||
++                          !smp_unw[i].unw ||
++                          smp_unw[i].unw == (struct unw_frame_info *)1)
++                              continue;
++                      do_kgdb_handle_exception(smp_unw[i].unw, &info);
++                      break;
++              }
++              if (i == NR_CPUS)
++                      info.ret = 1;   /* no CPU holds usable state */
++      }
++
++      if (info.ret != -1 && remcom_in_buffer[0] == 'p') {
++              unsigned long dummy = 0xbad4badbadbadbadUL;
++
++              printk("kgdb_arch_handle_exception: p packet bad (%s)\n",
++                     remcom_in_buffer);
++              kgdb_mem2hex((char *)&dummy, remcom_out_buffer, sizeof(dummy));
++              remcom_out_buffer[sizeof(dummy) * 2] = 0;
++              info.ret = -1;
++      }
++      return info.ret;
++}
++
++/*
++ * Compatibility shim for the 'p' packet.  An early version of the gdb
++ * patch encoded the register number incorrectly; the encoding was later
++ * corrected.  This helper accepts both forms.  It should eventually be
++ * removed, with kgdb_hex2long() decoding the register number directly.
++ *
++ * The first packet seen decides the encoding for all later ones: if the
++ * caller's decoded regnum is greater than NUM_REGS, the number is
++ * re-parsed from the packet text with kgdb_hex2long() on this and every
++ * subsequent call.  Returns the register number to use.
++ */
++static inline int check_packet(unsigned int regnum, char *packet)
++{
++      static int check_done, swap;
++      unsigned long reglong;
++
++      if (unlikely(!check_done)) {
++              swap = (regnum > NUM_REGS);
++              check_done = 1;
++      }
++
++      if (swap) {
++              kgdb_hex2long(&packet, &reglong);
++              regnum = (int) reglong;
++      }
++
++      return regnum;
++}
++
++/*
++ * Handle one remote packet with the unwind state in unw_info (may be
++ * NULL for 'c'/'s').  data points at the struct kgdb_state prepared by
++ * kgdb_arch_handle_exception(); this signature lets the function double
++ * as an unw_init_running() callback.
++ *
++ *  'p'  read one register into the reply buffer;
++ *  'P'  write one register (only for the current thread);
++ *  'c'  continue, 's' single-step: optionally set a new pc, program
++ *       PSR.ss / PSR.db and reload the hardware breakpoints; sets
++ *       info->ret to 0.
++ * Other packets are left untouched.
++ *
++ * Changes from the previous version: a 'P' packet naming a register
++ * number >= NUM_REGS is answered with "E01" directly instead of being
++ * redirected to register 0, and the unused newPC/signo locals and the
++ * shadowing inner 'ptr' are gone.
++ */
++static void do_kgdb_handle_exception(struct unw_frame_info *unw_info,
++      void *data)
++{
++      long addr;
++      char *ptr;
++      int e_vector;
++      unsigned long err_code;
++      struct pt_regs *linux_regs;
++      struct kgdb_state *info;
++      char *remcom_in_buffer, *remcom_out_buffer;
++
++      info = data;
++      info->unw = unw_info;
++      e_vector = info->e_vector;
++      err_code = info->err_code;
++      remcom_in_buffer = info->inbuf;
++      remcom_out_buffer = info->outbuf;
++      linux_regs = info->regs;
++
++      if (info->unwind)
++              normalize(unw_info, linux_regs);
++
++      switch (remcom_in_buffer[0]) {
++      case 'p':
++              {
++                      unsigned int regnum;
++
++                      kgdb_hex2mem(&remcom_in_buffer[1], (char *)&regnum,
++                                   sizeof(regnum));
++                      regnum = check_packet(regnum, &remcom_in_buffer[1]);
++                      if (regnum >= NUM_REGS) {
++                              remcom_out_buffer[0] = 'E';
++                              remcom_out_buffer[1] = 0;
++                      } else
++                              kgdb_get_reg(remcom_out_buffer, regnum,
++                                           unw_info, linux_regs);
++                      break;
++              }
++      case 'P':
++              {
++                      long v;
++
++                      /* expected form: P<regno>=<value> */
++                      ptr = &remcom_in_buffer[1];
++                      if ((!kgdb_usethread || kgdb_usethread == current) &&
++                          kgdb_hex2long(&ptr, &v) &&
++                          *ptr++ == '=' && (v >= 0) && (v < NUM_REGS))
++                              kgdb_put_reg(ptr, remcom_out_buffer, (int)v,
++                                           unw_info, linux_regs);
++                      else
++                              strcpy(remcom_out_buffer, "E01");
++                      break;
++              }
++      case 'c':
++      case 's':
++              /* step over a compiled-in kgdb break instruction */
++              if (e_vector == TRAP_BRKPT && err_code == KGDBBREAKNUM) {
++                      if (ia64_psr(linux_regs)->ri < 2)
++                              kgdb_pc(linux_regs, linux_regs->cr_iip +
++                                      ia64_psr(linux_regs)->ri + 1);
++                      else
++                              kgdb_pc(linux_regs, linux_regs->cr_iip + 16);
++              }
++
++              /* try to read optional parameter, pc unchanged if no parm */
++              ptr = &remcom_in_buffer[1];
++              if (kgdb_hex2long(&ptr, &addr))
++                      linux_regs->cr_iip = addr;
++
++              /* clear the trace bit */
++              linux_regs->cr_ipsr &= ~IA64_PSR_SS;
++
++              atomic_set(&cpu_doing_single_step, -1);
++
++              /* set the trace bit if we're stepping or took a hardware break */
++              if (remcom_in_buffer[0] == 's' || e_vector == TRAP_HWBKPT) {
++                      linux_regs->cr_ipsr |= IA64_PSR_SS;
++                      debugger_step = 1;
++                      if (kgdb_contthread)
++                              atomic_set(&cpu_doing_single_step,
++                                         smp_processor_id());
++              }
++
++              kgdb_correct_hw_break();
++
++              /* if not hardware breakpoint, then reenable them */
++              if (e_vector != TRAP_HWBKPT)
++                      linux_regs->cr_ipsr |= IA64_PSR_DB;
++              else {
++                      /*
++                       * Step past the hardware break with PSR.db off;
++                       * kgdb_notify() consumes this flag on the
++                       * following fault.
++                       */
++                      kgdb_hwbreak_sstep[smp_processor_id()] = 1;
++                      linux_regs->cr_ipsr &= ~IA64_PSR_DB;
++              }
++
++              info->ret = 0;
++              break;
++      default:
++              break;
++      }
++}
++
++struct kgdb_arch arch_kgdb_ops = {    /* ia64 kgdb descriptor: hw breakpoint callbacks, breakpoint bytes, flags */
++      .set_hw_breakpoint = kgdb_arch_set_hw_breakpoint,
++      .remove_hw_breakpoint = kgdb_arch_remove_hw_breakpoint,
++      .gdb_bpt_instr = {0xcc},        /* NOTE(review): 0xcc is the x86 int3 opcode - confirm it is intended on ia64 */
++      .flags = KGDB_HW_BREAKPOINT,
++};
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/process.c linux-2.6.18.kgdb/arch/ia64/kernel/process.c
+--- linux-2.6.18/arch/ia64/kernel/process.c    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ia64/kernel/process.c       2008-06-10 16:20:23.000000000 +0400
+@@ -458,6 +458,9 @@ copy_thread (int nr, unsigned long clone
+        */
+       child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET)
+                                & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP));
++#ifdef        CONFIG_KGDB
++      child_ptregs->cr_ipsr |= IA64_PSR_DB;
++#endif
+ 
+       /*
+        * NOTE: The calling convention considers all floating point
+@@ -686,6 +689,9 @@ kernel_thread (int (*fn)(void *), void *
+       regs.pt.r11 = (unsigned long) arg;      /* 2nd argument */
+       /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read.  */
+       regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN;
++#ifdef        CONFIG_KGDB
++      regs.pt.cr_ipsr |= IA64_PSR_DB;
++#endif
+       regs.pt.cr_ifs = 1UL << 63;             /* mark as valid, empty frame */
+       regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR);
+       regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET;
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/smp.c linux-2.6.18.kgdb/arch/ia64/kernel/smp.c
+--- linux-2.6.18/arch/ia64/kernel/smp.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ia64/kernel/smp.c   2008-06-10 16:19:32.000000000 +0400
+@@ -47,6 +47,7 @@
+ #include <asm/tlbflush.h>
+ #include <asm/unistd.h>
+ #include <asm/mca.h>
++#include <linux/kgdb.h>
+ 
+ /*
+  * Structure and data for smp_call_function(). This is designed to minimise static memory
+@@ -66,6 +67,9 @@ static volatile struct call_data_struct 
+ 
+ #define IPI_CALL_FUNC         0
+ #define IPI_CPU_STOP          1
++#ifdef        CONFIG_KGDB
++#define       IPI_KGDB_INTERRUPT      2
++#endif
+ 
+ /* This needs to be cacheline aligned because it is written to by *other* CPUs.  */
+ static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned;
+@@ -155,6 +159,11 @@ handle_IPI (int irq, void *dev_id, struc
+                             case IPI_CPU_STOP:
+                               stop_this_cpu();
+                               break;
++#ifdef        CONFIG_KGDB
++                            case IPI_KGDB_INTERRUPT:
++                              kgdb_wait_ipi(regs);
++                              break;
++#endif
+ 
+                             default:
+                               printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n", this_cpu, which);
+@@ -305,6 +314,14 @@ smp_call_function_single (int cpuid, voi
+ }
+ EXPORT_SYMBOL(smp_call_function_single);
+ 
++#ifdef        CONFIG_KGDB
++void
++smp_send_nmi_allbutself(void)   /* despite the name this sends a regular IPI, not an NMI */
++{
++      send_IPI_allbutself(IPI_KGDB_INTERRUPT);        /* handle_IPI() dispatches this to kgdb_wait_ipi() */
++}
++#endif
++
+ /*
+  * this function sends a 'generic call function' IPI to all other CPUs
+  * in the system.
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/traps.c linux-2.6.18.kgdb/arch/ia64/kernel/traps.c
+--- linux-2.6.18/arch/ia64/kernel/traps.c      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ia64/kernel/traps.c 2008-06-10 16:19:32.000000000 +0400
+@@ -200,8 +200,12 @@ __kprobes ia64_bad_break (unsigned long 
+               break;
+ 
+             default:
+-              if (break_num < 0x40000 || break_num > 0x100000)
++              if (break_num < 0x40000 || break_num > 0x100000) {
++                      if (notify_die(DIE_BREAK, "bad break", regs,
++                              break_num, TRAP_BRKPT, SIGTRAP) == NOTIFY_STOP)
++                              return;
+                       die_if_kernel("Bad break", regs, break_num);
++              }
+ 
+               if (break_num < 0x80000) {
+                       sig = SIGILL; code = __ILL_BREAK;
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/unwind.c linux-2.6.18.kgdb/arch/ia64/kernel/unwind.c
+--- linux-2.6.18/arch/ia64/kernel/unwind.c     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ia64/kernel/unwind.c        2008-06-10 16:20:23.000000000 +0400
+@@ -72,10 +72,68 @@
+ # define STAT(x...)
+ #endif
+ 
++#ifdef        CONFIG_KGDB
++#define       KGDB_EARLY_SIZE 100
++static struct unw_reg_state __initdata kgdb_reg_state[KGDB_EARLY_SIZE];
++static struct unw_labeled_state __initdata kgdb_labeled_state[KGDB_EARLY_SIZE];
++void __initdata *kgdb_reg_state_free, __initdata *kgdb_labeled_state_free;
++
++static void __init
++kgdb_malloc_init(void)  /* thread both static early pools into singly linked free lists */
++{
++      int i;
++
++      kgdb_reg_state_free = kgdb_reg_state;   /* entry 0 ends the list; its link word is not written here (presumably zero from static init - confirm) */
++      for (i = 1; i < KGDB_EARLY_SIZE; i++) { /* link entry i to the current head, then make it the head */
++              *((unsigned long *) &kgdb_reg_state[i]) = (unsigned long) kgdb_reg_state_free;
++              kgdb_reg_state_free = &kgdb_reg_state[i];
++      }
++
++      kgdb_labeled_state_free = kgdb_labeled_state;   /* same construction for the labeled-state pool */
++      for (i = 1; i < KGDB_EARLY_SIZE; i++) {
++              *((unsigned long *) &kgdb_labeled_state[i]) =
++                      (unsigned long) kgdb_labeled_state_free;
++              kgdb_labeled_state_free = &kgdb_labeled_state[i];
++      }
++
++}
++
++static void * __init
++kgdb_malloc(void **mem)         /* pop and return the head of the free list *mem, NULL if empty */
++{
++      void *p = *mem;
++
++      if (p)  /* exhausted pool: return NULL, as the kmalloc() arm of alloc_*() would, rather than dereference it */
++              *mem = *((void **) p);  /* first word of a free entry holds the next entry */
++      return p;
++}
++
++static void __init
++kgdb_free(void **mem, void *p)  /* push entry p back onto the free list *mem */
++{
++      *((void **)p) = *mem;   /* store the old head in the first word of p */
++      *mem = p;               /* p becomes the new head */
++}
++
++#define alloc_reg_state()     (!malloc_sizes[0].cs_cachep ?           \
++              kgdb_malloc(&kgdb_reg_state_free) :                     \
++              kmalloc(sizeof(struct unw_reg_state), GFP_ATOMIC))
++#define free_reg_state(usr)   (!malloc_sizes[0].cs_cachep ?           \
++              kgdb_free(&kgdb_reg_state_free, usr) :                  \
++              kfree(usr))
++#define alloc_labeled_state() (!malloc_sizes[0].cs_cachep ?           \
++              kgdb_malloc(&kgdb_labeled_state_free) :                 \
++              kmalloc(sizeof(struct unw_labeled_state), GFP_ATOMIC))
++#define free_labeled_state(usr)       (!malloc_sizes[0].cs_cachep ?           \
++              kgdb_free(&kgdb_labeled_state_free, usr) :              \
++              kfree(usr))
++
++#else
+ #define alloc_reg_state()     kmalloc(sizeof(struct unw_reg_state), GFP_ATOMIC)
+ #define free_reg_state(usr)   kfree(usr)
+ #define alloc_labeled_state() kmalloc(sizeof(struct unw_labeled_state), GFP_ATOMIC)
+ #define free_labeled_state(usr)       kfree(usr)
++#endif
+ 
+ typedef unsigned long unw_word;
+ typedef unsigned char unw_hash_index_t;
+@@ -238,6 +296,24 @@ static struct {
+ #endif
+ };
+ 
++#ifdef        CONFIG_KGDB
++/*
++ * This makes it safe to call breakpoint() very early
++ * in setup_arch providing:
++ *    1) breakpoint isn't called between lines in cpu_init
++ *       where init_mm.mm_count is incremented and ia64_mmu_init
++ *       is called.  Otherwise the test below is invalid.
++ *    2) the memory examined doesn't result in tlbmiss.
++ */
++static inline unsigned long kgdb_unimpl_va_mask(void)
++{
++      /* Per the conditions listed above: until init_mm.mm_count exceeds 1, apply no mask. */
++      if (atomic_read(&init_mm.mm_count) <= 1)
++              return 0UL;
++      return local_cpu_data->unimpl_va_mask;  /* otherwise use this cpu's unimplemented-VA mask */
++}
++#endif
++
+ static inline int
+ read_only (void *addr)
+ {
+@@ -1786,7 +1862,11 @@ run_script (struct unw_script *script, s
+ 
+                     case UNW_INSN_LOAD:
+ #ifdef UNW_DEBUG
++#ifdef        CONFIG_KGDB
++                      if ((s[val] & (kgdb_unimpl_va_mask() | 0x7)) != 0
++#else
+                       if ((s[val] & (local_cpu_data->unimpl_va_mask | 0x7)) != 0
++#endif
+                           || s[val] < TASK_SIZE)
+                       {
+                               UNW_DPRINT(0, "unwind.%s: rejecting bad psp=0x%lx\n",
+@@ -1821,7 +1901,11 @@ find_save_locs (struct unw_frame_info *i
+       struct unw_script *scr;
+       unsigned long flags = 0;
+ 
++#ifdef        CONFIG_KGDB
++      if ((info->ip & (kgdb_unimpl_va_mask() | 0xf)) || info->ip < TASK_SIZE) {
++#else
+       if ((info->ip & (local_cpu_data->unimpl_va_mask | 0xf)) || info->ip < TASK_SIZE) {
++#endif
+               /* don't let obviously bad addresses pollute the cache */
+               /* FIXME: should really be level 0 but it occurs too often. KAO */
+               UNW_DPRINT(1, "unwind.%s: rejecting bad ip=0x%lx\n", __FUNCTION__, info->ip);
+@@ -2249,6 +2333,9 @@ unw_init (void)
+ 
+       init_unwind_table(&unw.kernel_table, "kernel", KERNEL_START, (unsigned long) __gp,
+                         __start_unwind, __end_unwind);
++#ifdef        CONFIG_KGDB
++      kgdb_malloc_init();
++#endif
+ }
+ 
+ /*
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/mm/extable.c linux-2.6.18.kgdb/arch/ia64/mm/extable.c
+--- linux-2.6.18/arch/ia64/mm/extable.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ia64/mm/extable.c   2008-06-10 16:19:32.000000000 +0400
+@@ -6,6 +6,7 @@
+  */
+ 
+ #include <linux/sort.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/uaccess.h>
+ #include <asm/module.h>
+@@ -73,6 +74,11 @@ search_extable (const struct exception_t
+                 else
+                         last = mid - 1;
+         }
++#ifdef CONFIG_KGDB
++      if (atomic_read(&debugger_active) && kgdb_may_fault)
++              kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++              /* Not reached. */
++#endif
+         return NULL;
+ }
+ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/mm/fault.c linux-2.6.18.kgdb/arch/ia64/mm/fault.c
+--- linux-2.6.18/arch/ia64/mm/fault.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ia64/mm/fault.c     2008-06-10 16:19:32.000000000 +0400
+@@ -266,6 +266,10 @@ ia64_do_page_fault (unsigned long addres
+        */
+       bust_spinlocks(1);
+ 
++      if (notify_die(DIE_PAGE_FAULT_NO_CONTEXT, "no context", regs,
++                      isr, 14, SIGSEGV) == NOTIFY_STOP)
++              return;
++
+       if (address < PAGE_SIZE)
+               printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference (address %016lx)\n", address);
+       else
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/Kconfig.debug linux-2.6.18.kgdb/arch/mips/Kconfig.debug
+--- linux-2.6.18/arch/mips/Kconfig.debug       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/mips/Kconfig.debug  2008-06-10 16:19:28.000000000 +0400
+@@ -37,25 +37,6 @@ config DEBUG_STACK_USAGE
+ 
+         This option will slow down process creation somewhat.
+ 
+-config KGDB
+-      bool "Remote GDB kernel debugging"
+-      depends on DEBUG_KERNEL
+-      select DEBUG_INFO
+-      help
+-        If you say Y here, it will be possible to remotely debug the MIPS
+-        kernel using gdb. This enlarges your kernel image disk size by
+-        several megabytes and requires a machine with more than 16 MB,
+-        better 32 MB RAM to avoid excessive linking time. This is only
+-        useful for kernel hackers. If unsure, say N.
+-
+-config GDB_CONSOLE
+-      bool "Console output to GDB"
+-      depends on KGDB
+-      help
+-        If you are using GDB for remote debugging over a serial port and
+-        would like kernel messages to be formatted into GDB $O packets so
+-        that GDB prints them as program output, say 'Y'.
+-
+ config SB1XXX_CORELIS
+       bool "Corelis Debugger"
+       depends on SIBYTE_SB1xxx_SOC
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/Makefile linux-2.6.18.kgdb/arch/mips/kernel/Makefile
+--- linux-2.6.18/arch/mips/kernel/Makefile     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/mips/kernel/Makefile        2008-06-10 16:19:28.000000000 +0400
+@@ -59,7 +59,8 @@ obj-$(CONFIG_MIPS32_COMPAT)  += linux32.o
+ obj-$(CONFIG_MIPS32_N32)      += binfmt_elfn32.o scall64-n32.o signal_n32.o
+ obj-$(CONFIG_MIPS32_O32)      += binfmt_elfo32.o scall64-o32.o ptrace32.o
+ 
+-obj-$(CONFIG_KGDB)            += gdb-low.o gdb-stub.o
++obj-$(CONFIG_KGDB)            += kgdb_handler.o kgdb.o kgdb-jmp.o     \
++                                      kgdb-setjmp.o
+ obj-$(CONFIG_PROC_FS)         += proc.o
+ 
+ obj-$(CONFIG_64BIT)           += cpu-bugs64.o
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/gdb-low.S linux-2.6.18.kgdb/arch/mips/kernel/gdb-low.S
+--- linux-2.6.18/arch/mips/kernel/gdb-low.S    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/mips/kernel/gdb-low.S       1970-01-01 03:00:00.000000000 +0300
+@@ -1,394 +0,0 @@
+-/*
+- * gdb-low.S contains the low-level trap handler for the GDB stub.
+- *
+- * Copyright (C) 1995 Andreas Busse
+- */
+-#include <linux/sys.h>
+-
+-#include <asm/asm.h>
+-#include <asm/errno.h>
+-#include <asm/irqflags.h>
+-#include <asm/mipsregs.h>
+-#include <asm/regdef.h>
+-#include <asm/stackframe.h>
+-#include <asm/gdb-stub.h>
+-
+-#ifdef CONFIG_32BIT
+-#define DMFC0 mfc0
+-#define DMTC0 mtc0
+-#define LDC1  lwc1
+-#define SDC1  lwc1
+-#endif
+-#ifdef CONFIG_64BIT
+-#define DMFC0 dmfc0
+-#define DMTC0 dmtc0
+-#define LDC1  ldc1
+-#define SDC1  ldc1
+-#endif
+-
+-/*
+- * [jsun] We reserves about 2x GDB_FR_SIZE in stack.  The lower (addressed)
+- * part is used to store registers and passed to exception handler.
+- * The upper part is reserved for "call func" feature where gdb client
+- * saves some of the regs, setups call frame and passes args.
+- *
+- * A trace shows about 200 bytes are used to store about half of all regs.
+- * The rest should be big enough for frame setup and passing args.
+- */
+-
+-/*
+- * The low level trap handler
+- */
+-              .align  5
+-              NESTED(trap_low, GDB_FR_SIZE, sp)
+-              .set    noat
+-              .set    noreorder
+-
+-              mfc0    k0, CP0_STATUS
+-              sll     k0, 3                   /* extract cu0 bit */
+-              bltz    k0, 1f
+-              move    k1, sp
+-
+-              /*
+-               * Called from user mode, go somewhere else.
+-               */
+-              mfc0    k0, CP0_CAUSE
+-              andi    k0, k0, 0x7c
+-#ifdef CONFIG_64BIT
+-              dsll    k0, k0, 1
+-#endif
+-              PTR_L   k1, saved_vectors(k0)
+-              jr      k1
+-              nop
+-1:
+-              move    k0, sp
+-              PTR_SUBU sp, k1, GDB_FR_SIZE*2  # see comment above
+-              LONG_S  k0, GDB_FR_REG29(sp)
+-              LONG_S  $2, GDB_FR_REG2(sp)
+-
+-/*
+- * First save the CP0 and special registers
+- */
+-
+-              mfc0    v0, CP0_STATUS
+-              LONG_S  v0, GDB_FR_STATUS(sp)
+-              mfc0    v0, CP0_CAUSE
+-              LONG_S  v0, GDB_FR_CAUSE(sp)
+-              DMFC0   v0, CP0_EPC
+-              LONG_S  v0, GDB_FR_EPC(sp)
+-              DMFC0   v0, CP0_BADVADDR
+-              LONG_S  v0, GDB_FR_BADVADDR(sp)
+-              mfhi    v0
+-              LONG_S  v0, GDB_FR_HI(sp)
+-              mflo    v0
+-              LONG_S  v0, GDB_FR_LO(sp)
+-
+-/*
+- * Now the integer registers
+- */
+-
+-              LONG_S  zero, GDB_FR_REG0(sp)           /* I know... */
+-              LONG_S  $1, GDB_FR_REG1(sp)
+-              /* v0 already saved */
+-              LONG_S  $3, GDB_FR_REG3(sp)
+-              LONG_S  $4, GDB_FR_REG4(sp)
+-              LONG_S  $5, GDB_FR_REG5(sp)
+-              LONG_S  $6, GDB_FR_REG6(sp)
+-              LONG_S  $7, GDB_FR_REG7(sp)
+-              LONG_S  $8, GDB_FR_REG8(sp)
+-              LONG_S  $9, GDB_FR_REG9(sp)
+-              LONG_S  $10, GDB_FR_REG10(sp)
+-              LONG_S  $11, GDB_FR_REG11(sp)
+-              LONG_S  $12, GDB_FR_REG12(sp)
+-              LONG_S  $13, GDB_FR_REG13(sp)
+-              LONG_S  $14, GDB_FR_REG14(sp)
+-              LONG_S  $15, GDB_FR_REG15(sp)
+-              LONG_S  $16, GDB_FR_REG16(sp)
+-              LONG_S  $17, GDB_FR_REG17(sp)
+-              LONG_S  $18, GDB_FR_REG18(sp)
+-              LONG_S  $19, GDB_FR_REG19(sp)
+-              LONG_S  $20, GDB_FR_REG20(sp)
+-              LONG_S  $21, GDB_FR_REG21(sp)
+-              LONG_S  $22, GDB_FR_REG22(sp)
+-              LONG_S  $23, GDB_FR_REG23(sp)
+-              LONG_S  $24, GDB_FR_REG24(sp)
+-              LONG_S  $25, GDB_FR_REG25(sp)
+-              LONG_S  $26, GDB_FR_REG26(sp)
+-              LONG_S  $27, GDB_FR_REG27(sp)
+-              LONG_S  $28, GDB_FR_REG28(sp)
+-              /* sp already saved */
+-              LONG_S  $30, GDB_FR_REG30(sp)
+-              LONG_S  $31, GDB_FR_REG31(sp)
+-
+-              CLI                             /* disable interrupts */
+-              TRACE_IRQS_OFF
+-
+-/*
+- * Followed by the floating point registers
+- */
+-              mfc0    v0, CP0_STATUS          /* FPU enabled? */
+-              srl     v0, v0, 16
+-              andi    v0, v0, (ST0_CU1 >> 16)
+-
+-              beqz    v0,2f                   /* disabled, skip */
+-               nop
+-
+-              SDC1    $0, GDB_FR_FPR0(sp)
+-              SDC1    $1, GDB_FR_FPR1(sp)
+-              SDC1    $2, GDB_FR_FPR2(sp)
+-              SDC1    $3, GDB_FR_FPR3(sp)
+-              SDC1    $4, GDB_FR_FPR4(sp)
+-              SDC1    $5, GDB_FR_FPR5(sp)
+-              SDC1    $6, GDB_FR_FPR6(sp)
+-              SDC1    $7, GDB_FR_FPR7(sp)
+-              SDC1    $8, GDB_FR_FPR8(sp)
+-              SDC1    $9, GDB_FR_FPR9(sp)
+-              SDC1    $10, GDB_FR_FPR10(sp)
+-              SDC1    $11, GDB_FR_FPR11(sp)
+-              SDC1    $12, GDB_FR_FPR12(sp)
+-              SDC1    $13, GDB_FR_FPR13(sp)
+-              SDC1    $14, GDB_FR_FPR14(sp)
+-              SDC1    $15, GDB_FR_FPR15(sp)
+-              SDC1    $16, GDB_FR_FPR16(sp)
+-              SDC1    $17, GDB_FR_FPR17(sp)
+-              SDC1    $18, GDB_FR_FPR18(sp)
+-              SDC1    $19, GDB_FR_FPR19(sp)
+-              SDC1    $20, GDB_FR_FPR20(sp)
+-              SDC1    $21, GDB_FR_FPR21(sp)
+-              SDC1    $22, GDB_FR_FPR22(sp)
+-              SDC1    $23, GDB_FR_FPR23(sp)
+-              SDC1    $24, GDB_FR_FPR24(sp)
+-              SDC1    $25, GDB_FR_FPR25(sp)
+-              SDC1    $26, GDB_FR_FPR26(sp)
+-              SDC1    $27, GDB_FR_FPR27(sp)
+-              SDC1    $28, GDB_FR_FPR28(sp)
+-              SDC1    $29, GDB_FR_FPR29(sp)
+-              SDC1    $30, GDB_FR_FPR30(sp)
+-              SDC1    $31, GDB_FR_FPR31(sp)
+-
+-/*
+- * FPU control registers
+- */
+-
+-              cfc1    v0, CP1_STATUS
+-              LONG_S  v0, GDB_FR_FSR(sp)
+-              cfc1    v0, CP1_REVISION
+-              LONG_S  v0, GDB_FR_FIR(sp)
+-
+-/*
+- * Current stack frame ptr
+- */
+-
+-2:
+-              LONG_S  sp, GDB_FR_FRP(sp)
+-
+-/*
+- * CP0 registers (R4000/R4400 unused registers skipped)
+- */
+-
+-              mfc0    v0, CP0_INDEX
+-              LONG_S  v0, GDB_FR_CP0_INDEX(sp)
+-              mfc0    v0, CP0_RANDOM
+-              LONG_S  v0, GDB_FR_CP0_RANDOM(sp)
+-              DMFC0   v0, CP0_ENTRYLO0
+-              LONG_S  v0, GDB_FR_CP0_ENTRYLO0(sp)
+-              DMFC0   v0, CP0_ENTRYLO1
+-              LONG_S  v0, GDB_FR_CP0_ENTRYLO1(sp)
+-              DMFC0   v0, CP0_CONTEXT
+-              LONG_S  v0, GDB_FR_CP0_CONTEXT(sp)
+-              mfc0    v0, CP0_PAGEMASK
+-              LONG_S  v0, GDB_FR_CP0_PAGEMASK(sp)
+-              mfc0    v0, CP0_WIRED
+-              LONG_S  v0, GDB_FR_CP0_WIRED(sp)
+-              DMFC0   v0, CP0_ENTRYHI
+-              LONG_S  v0, GDB_FR_CP0_ENTRYHI(sp)
+-              mfc0    v0, CP0_PRID
+-              LONG_S  v0, GDB_FR_CP0_PRID(sp)
+-
+-              .set    at
+-
+-/*
+- * Continue with the higher level handler
+- */
+-
+-              move    a0,sp
+-
+-              jal     handle_exception
+-               nop
+-
+-/*
+- * Restore all writable registers, in reverse order
+- */
+-
+-              .set    noat
+-
+-              LONG_L  v0, GDB_FR_CP0_ENTRYHI(sp)
+-              LONG_L  v1, GDB_FR_CP0_WIRED(sp)
+-              DMTC0   v0, CP0_ENTRYHI
+-              mtc0    v1, CP0_WIRED
+-              LONG_L  v0, GDB_FR_CP0_PAGEMASK(sp)
+-              LONG_L  v1, GDB_FR_CP0_ENTRYLO1(sp)
+-              mtc0    v0, CP0_PAGEMASK
+-              DMTC0   v1, CP0_ENTRYLO1
+-              LONG_L  v0, GDB_FR_CP0_ENTRYLO0(sp)
+-              LONG_L  v1, GDB_FR_CP0_INDEX(sp)
+-              DMTC0   v0, CP0_ENTRYLO0
+-              LONG_L  v0, GDB_FR_CP0_CONTEXT(sp)
+-              mtc0    v1, CP0_INDEX
+-              DMTC0   v0, CP0_CONTEXT
+-
+-
+-/*
+- * Next, the floating point registers
+- */
+-              mfc0    v0, CP0_STATUS          /* check if the FPU is enabled */
+-              srl     v0, v0, 16
+-              andi    v0, v0, (ST0_CU1 >> 16)
+-
+-              beqz    v0, 3f                  /* disabled, skip */
+-               nop
+-
+-              LDC1    $31, GDB_FR_FPR31(sp)
+-              LDC1    $30, GDB_FR_FPR30(sp)
+-              LDC1    $29, GDB_FR_FPR29(sp)
+-              LDC1    $28, GDB_FR_FPR28(sp)
+-              LDC1    $27, GDB_FR_FPR27(sp)
+-              LDC1    $26, GDB_FR_FPR26(sp)
+-              LDC1    $25, GDB_FR_FPR25(sp)
+-              LDC1    $24, GDB_FR_FPR24(sp)
+-              LDC1    $23, GDB_FR_FPR23(sp)
+-              LDC1    $22, GDB_FR_FPR22(sp)
+-              LDC1    $21, GDB_FR_FPR21(sp)
+-              LDC1    $20, GDB_FR_FPR20(sp)
+-              LDC1    $19, GDB_FR_FPR19(sp)
+-              LDC1    $18, GDB_FR_FPR18(sp)
+-              LDC1    $17, GDB_FR_FPR17(sp)
+-              LDC1    $16, GDB_FR_FPR16(sp)
+-              LDC1    $15, GDB_FR_FPR15(sp)
+-              LDC1    $14, GDB_FR_FPR14(sp)
+-              LDC1    $13, GDB_FR_FPR13(sp)
+-              LDC1    $12, GDB_FR_FPR12(sp)
+-              LDC1    $11, GDB_FR_FPR11(sp)
+-              LDC1    $10, GDB_FR_FPR10(sp)
+-              LDC1    $9, GDB_FR_FPR9(sp)
+-              LDC1    $8, GDB_FR_FPR8(sp)
+-              LDC1    $7, GDB_FR_FPR7(sp)
+-              LDC1    $6, GDB_FR_FPR6(sp)
+-              LDC1    $5, GDB_FR_FPR5(sp)
+-              LDC1    $4, GDB_FR_FPR4(sp)
+-              LDC1    $3, GDB_FR_FPR3(sp)
+-              LDC1    $2, GDB_FR_FPR2(sp)
+-              LDC1    $1, GDB_FR_FPR1(sp)
+-              LDC1    $0, GDB_FR_FPR0(sp)
+-
+-/*
+- * Now the CP0 and integer registers
+- */
+-
+-3:
+-#ifdef CONFIG_MIPS_MT_SMTC
+-              /* Read-modify write of Status must be atomic */
+-              mfc0    t2, CP0_TCSTATUS
+-              ori     t1, t2, TCSTATUS_IXMT
+-              mtc0    t1, CP0_TCSTATUS
+-              andi    t2, t2, TCSTATUS_IXMT
+-              _ehb
+-              DMT     9                               # dmt   t1
+-              jal     mips_ihb
+-              nop
+-#endif /* CONFIG_MIPS_MT_SMTC */
+-              mfc0    t0, CP0_STATUS
+-              ori     t0, 0x1f
+-              xori    t0, 0x1f
+-              mtc0    t0, CP0_STATUS
+-#ifdef CONFIG_MIPS_MT_SMTC
+-              andi    t1, t1, VPECONTROL_TE
+-              beqz    t1, 9f
+-              nop
+-              EMT                                     # emt
+-9:
+-              mfc0    t1, CP0_TCSTATUS
+-              xori    t1, t1, TCSTATUS_IXMT
+-              or      t1, t1, t2
+-              mtc0    t1, CP0_TCSTATUS
+-              _ehb
+-#endif /* CONFIG_MIPS_MT_SMTC */
+-              LONG_L  v0, GDB_FR_STATUS(sp)
+-              LONG_L  v1, GDB_FR_EPC(sp)
+-              mtc0    v0, CP0_STATUS
+-              DMTC0   v1, CP0_EPC
+-              LONG_L  v0, GDB_FR_HI(sp)
+-              LONG_L  v1, GDB_FR_LO(sp)
+-              mthi    v0
+-              mtlo    v1
+-              LONG_L  $31, GDB_FR_REG31(sp)
+-              LONG_L  $30, GDB_FR_REG30(sp)
+-              LONG_L  $28, GDB_FR_REG28(sp)
+-              LONG_L  $27, GDB_FR_REG27(sp)
+-              LONG_L  $26, GDB_FR_REG26(sp)
+-              LONG_L  $25, GDB_FR_REG25(sp)
+-              LONG_L  $24, GDB_FR_REG24(sp)
+-              LONG_L  $23, GDB_FR_REG23(sp)
+-              LONG_L  $22, GDB_FR_REG22(sp)
+-              LONG_L  $21, GDB_FR_REG21(sp)
+-              LONG_L  $20, GDB_FR_REG20(sp)
+-              LONG_L  $19, GDB_FR_REG19(sp)
+-              LONG_L  $18, GDB_FR_REG18(sp)
+-              LONG_L  $17, GDB_FR_REG17(sp)
+-              LONG_L  $16, GDB_FR_REG16(sp)
+-              LONG_L  $15, GDB_FR_REG15(sp)
+-              LONG_L  $14, GDB_FR_REG14(sp)
+-              LONG_L  $13, GDB_FR_REG13(sp)
+-              LONG_L  $12, GDB_FR_REG12(sp)
+-              LONG_L  $11, GDB_FR_REG11(sp)
+-              LONG_L  $10, GDB_FR_REG10(sp)
+-              LONG_L  $9, GDB_FR_REG9(sp)
+-              LONG_L  $8, GDB_FR_REG8(sp)
+-              LONG_L  $7, GDB_FR_REG7(sp)
+-              LONG_L  $6, GDB_FR_REG6(sp)
+-              LONG_L  $5, GDB_FR_REG5(sp)
+-              LONG_L  $4, GDB_FR_REG4(sp)
+-              LONG_L  $3, GDB_FR_REG3(sp)
+-              LONG_L  $2, GDB_FR_REG2(sp)
+-              LONG_L  $1, GDB_FR_REG1(sp)
+-#if defined(CONFIG_CPU_R3000) || defined(CONFIG_CPU_TX39XX)
+-              LONG_L  k0, GDB_FR_EPC(sp)
+-              LONG_L  $29, GDB_FR_REG29(sp)           /* Deallocate stack */
+-              jr      k0
+-              rfe
+-#else
+-              LONG_L  sp, GDB_FR_REG29(sp)            /* Deallocate stack */
+-
+-              .set    mips3
+-              eret
+-              .set    mips0
+-#endif
+-              .set    at
+-              .set    reorder
+-              END(trap_low)
+-
+-LEAF(kgdb_read_byte)
+-4:            lb      t0, (a0)
+-              sb      t0, (a1)
+-              li      v0, 0
+-              jr      ra
+-              .section __ex_table,"a"
+-              PTR     4b, kgdbfault
+-              .previous
+-              END(kgdb_read_byte)
+-
+-LEAF(kgdb_write_byte)
+-5:            sb      a0, (a1)
+-              li      v0, 0
+-              jr      ra
+-              .section __ex_table,"a"
+-              PTR     5b, kgdbfault
+-              .previous
+-              END(kgdb_write_byte)
+-
+-              .type   kgdbfault@function
+-              .ent    kgdbfault
+-
+-kgdbfault:    li      v0, -EFAULT
+-              jr      ra
+-              .end    kgdbfault
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/gdb-stub.c linux-2.6.18.kgdb/arch/mips/kernel/gdb-stub.c
+--- linux-2.6.18/arch/mips/kernel/gdb-stub.c   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/mips/kernel/gdb-stub.c      1970-01-01 03:00:00.000000000 +0300
+@@ -1,1154 +0,0 @@
+-/*
+- *  arch/mips/kernel/gdb-stub.c
+- *
+- *  Originally written by Glenn Engel, Lake Stevens Instrument Division
+- *
+- *  Contributed by HP Systems
+- *
+- *  Modified for SPARC by Stu Grossman, Cygnus Support.
+- *
+- *  Modified for Linux/MIPS (and MIPS in general) by Andreas Busse
+- *  Send complaints, suggestions etc. to <andy@waldorf-gmbh.de>
+- *
+- *  Copyright (C) 1995 Andreas Busse
+- *
+- *  Copyright (C) 2003 MontaVista Software Inc.
+- *  Author: Jun Sun, jsun@mvista.com or jsun@junsun.net
+- */
+-
+-/*
+- *  To enable debugger support, two things need to happen.  One, a
+- *  call to set_debug_traps() is necessary in order to allow any breakpoints
+- *  or error conditions to be properly intercepted and reported to gdb.
+- *  Two, a breakpoint needs to be generated to begin communication.  This
+- *  is most easily accomplished by a call to breakpoint().  Breakpoint()
+- *  simulates a breakpoint by executing a BREAK instruction.
+- *
+- *
+- *    The following gdb commands are supported:
+- *
+- * command          function                               Return value
+- *
+- *    g             return the value of the CPU registers  hex data or ENN
+- *    G             set the value of the CPU registers     OK or ENN
+- *
+- *    mAA..AA,LLLL  Read LLLL bytes at address AA..AA      hex data or ENN
+- *    MAA..AA,LLLL: Write LLLL bytes at address AA.AA      OK or ENN
+- *
+- *    c             Resume at current address              SNN   ( signal NN)
+- *    cAA..AA       Continue at address AA..AA             SNN
+- *
+- *    s             Step one instruction                   SNN
+- *    sAA..AA       Step one instruction from AA..AA       SNN
+- *
+- *    k             kill
+- *
+- *    ?             What was the last sigval ?             SNN   (signal NN)
+- *
+- *    bBB..BB     Set baud rate to BB..BB                OK or BNN, then sets
+- *                                                       baud rate
+- *
+- * All commands and responses are sent with a packet which includes a
+- * checksum.  A packet consists of
+- *
+- * $<packet info>#<checksum>.
+- *
+- * where
+- * <packet info> :: <characters representing the command or response>
+- * <checksum>    :: < two hex digits computed as modulo 256 sum of <packetinfo>>
+- *
+- * When a packet is received, it is first acknowledged with either '+' or '-'.
+- * '+' indicates a successful transfer.  '-' indicates a failed transfer.
+- *
+- * Example:
+- *
+- * Host:                  Reply:
+- * $m0,10#2a               +$00010203040506070809101112131415#42
+- *
+- *
+- *  ==============
+- *  MORE EXAMPLES:
+- *  ==============
+- *
+- *  For reference -- the following are the steps that one
+- *  company took (RidgeRun Inc) to get remote gdb debugging
+- *  going. In this scenario the host machine was a PC and the
+- *  target platform was a Galileo EVB64120A MIPS evaluation
+- *  board.
+- *
+- *  Step 1:
+- *  First download gdb-5.0.tar.gz from the internet.
+- *  and then build/install the package.
+- *
+- *  Example:
+- *    $ tar zxf gdb-5.0.tar.gz
+- *    $ cd gdb-5.0
+- *    $ ./configure --target=mips-linux-elf
+- *    $ make
+- *    $ install
+- *    $ which mips-linux-elf-gdb
+- *    /usr/local/bin/mips-linux-elf-gdb
+- *
+- *  Step 2:
+- *  Configure linux for remote debugging and build it.
+- *
+- *  Example:
+- *    $ cd ~/linux
+- *    $ make menuconfig <go to "Kernel Hacking" and turn on remote debugging>
+- *    $ make
+- *
+- *  Step 3:
+- *  Download the kernel to the remote target and start
+- *  the kernel running. It will promptly halt and wait
+- *  for the host gdb session to connect. It does this
+- *  since the "Kernel Hacking" option has defined
+- *  CONFIG_KGDB which in turn enables your calls
+- *  to:
+- *     set_debug_traps();
+- *     breakpoint();
+- *
+- *  Step 4:
+- *  Start the gdb session on the host.
+- *
+- *  Example:
+- *    $ mips-linux-elf-gdb vmlinux
+- *    (gdb) set remotebaud 115200
+- *    (gdb) target remote /dev/ttyS1
+- *    ...at this point you are connected to
+- *       the remote target and can use gdb
+- *       in the normal fasion. Setting
+- *       breakpoints, single stepping,
+- *       printing variables, etc.
+- */
+-#include <linux/string.h>
+-#include <linux/kernel.h>
+-#include <linux/signal.h>
+-#include <linux/sched.h>
+-#include <linux/mm.h>
+-#include <linux/console.h>
+-#include <linux/init.h>
+-#include <linux/smp.h>
+-#include <linux/spinlock.h>
+-#include <linux/slab.h>
+-#include <linux/reboot.h>
+-
+-#include <asm/asm.h>
+-#include <asm/cacheflush.h>
+-#include <asm/mipsregs.h>
+-#include <asm/pgtable.h>
+-#include <asm/system.h>
+-#include <asm/gdb-stub.h>
+-#include <asm/inst.h>
+-#include <asm/smp.h>
+-
+-/*
+- * external low-level support routines
+- */
+-
+-extern int putDebugChar(char c);    /* write a single character      */
+-extern char getDebugChar(void);     /* read and return a single char */
+-extern void trap_low(void);
+-
+-/*
+- * breakpoint and test functions
+- */
+-extern void breakpoint(void);
+-extern void breakinst(void);
+-extern void async_breakpoint(void);
+-extern void async_breakinst(void);
+-extern void adel(void);
+-
+-/*
+- * local prototypes
+- */
+-
+-static void getpacket(char *buffer);
+-static void putpacket(char *buffer);
+-static int computeSignal(int tt);
+-static int hex(unsigned char ch);
+-static int hexToInt(char **ptr, int *intValue);
+-static int hexToLong(char **ptr, long *longValue);
+-static unsigned char *mem2hex(char *mem, char *buf, int count, int may_fault);
+-void handle_exception(struct gdb_regs *regs);
+-
+-int kgdb_enabled;
+-
+-/*
+- * spin locks for smp case
+- */
+-static DEFINE_SPINLOCK(kgdb_lock);
+-static raw_spinlock_t kgdb_cpulock[NR_CPUS] = {
+-      [0 ... NR_CPUS-1] = __RAW_SPIN_LOCK_UNLOCKED,
+-};
+-
+-/*
+- * BUFMAX defines the maximum number of characters in inbound/outbound buffers
+- * at least NUMREGBYTES*2 are needed for register packets
+- */
+-#define BUFMAX 2048
+-
+-static char input_buffer[BUFMAX];
+-static char output_buffer[BUFMAX];
+-static int initialized;       /* !0 means we've been initialized */
+-static int kgdb_started;
+-static const char hexchars[]="0123456789abcdef";
+-
+-/* Used to prevent crashes in memory access.  Note that they'll crash anyway if
+-   we haven't set up fault handlers yet... */
+-int kgdb_read_byte(unsigned char *address, unsigned char *dest);
+-int kgdb_write_byte(unsigned char val, unsigned char *dest);
+-
+-/*
+- * Convert ch from a hex digit to an int
+- */
+-static int hex(unsigned char ch)
+-{
+-      if (ch >= 'a' && ch <= 'f')
+-              return ch-'a'+10;
+-      if (ch >= '0' && ch <= '9')
+-              return ch-'0';
+-      if (ch >= 'A' && ch <= 'F')
+-              return ch-'A'+10;
+-      return -1;
+-}
+-
+-/*
+- * scan for the sequence $<data>#<checksum>
+- */
+-static void getpacket(char *buffer)
+-{
+-      unsigned char checksum;
+-      unsigned char xmitcsum;
+-      int i;
+-      int count;
+-      unsigned char ch;
+-
+-      do {
+-              /*
+-               * wait around for the start character,
+-               * ignore all other characters
+-               */
+-              while ((ch = (getDebugChar() & 0x7f)) != '$') ;
+-
+-              checksum = 0;
+-              xmitcsum = -1;
+-              count = 0;
+-
+-              /*
+-               * now, read until a # or end of buffer is found
+-               */
+-              while (count < BUFMAX) {
+-                      ch = getDebugChar();
+-                      if (ch == '#')
+-                              break;
+-                      checksum = checksum + ch;
+-                      buffer[count] = ch;
+-                      count = count + 1;
+-              }
+-
+-              if (count >= BUFMAX)
+-                      continue;
+-
+-              buffer[count] = 0;
+-
+-              if (ch == '#') {
+-                      xmitcsum = hex(getDebugChar() & 0x7f) << 4;
+-                      xmitcsum |= hex(getDebugChar() & 0x7f);
+-
+-                      if (checksum != xmitcsum)
+-                              putDebugChar('-');      /* failed checksum */
+-                      else {
+-                              putDebugChar('+'); /* successful transfer */
+-
+-                              /*
+-                               * if a sequence char is present,
+-                               * reply the sequence ID
+-                               */
+-                              if (buffer[2] == ':') {
+-                                      putDebugChar(buffer[0]);
+-                                      putDebugChar(buffer[1]);
+-
+-                                      /*
+-                                       * remove sequence chars from buffer
+-                                       */
+-                                      count = strlen(buffer);
+-                                      for (i=3; i <= count; i++)
+-                                              buffer[i-3] = buffer[i];
+-                              }
+-                      }
+-              }
+-      }
+-      while (checksum != xmitcsum);
+-}
+-
+-/*
+- * send the packet in buffer.
+- */
+-static void putpacket(char *buffer)
+-{
+-      unsigned char checksum;
+-      int count;
+-      unsigned char ch;
+-
+-      /*
+-       * $<packet info>#<checksum>.
+-       */
+-
+-      do {
+-              putDebugChar('$');
+-              checksum = 0;
+-              count = 0;
+-
+-              while ((ch = buffer[count]) != 0) {
+-                      if (!(putDebugChar(ch)))
+-                              return;
+-                      checksum += ch;
+-                      count += 1;
+-              }
+-
+-              putDebugChar('#');
+-              putDebugChar(hexchars[checksum >> 4]);
+-              putDebugChar(hexchars[checksum & 0xf]);
+-
+-      }
+-      while ((getDebugChar() & 0x7f) != '+');
+-}
+-
+-
+-/*
+- * Convert the memory pointed to by mem into hex, placing result in buf.
+- * Return a pointer to the last char put in buf (null), in case of mem fault,
+- * return 0.
+- * may_fault is non-zero if we are reading from arbitrary memory, but is currently
+- * not used.
+- */
+-static unsigned char *mem2hex(char *mem, char *buf, int count, int may_fault)
+-{
+-      unsigned char ch;
+-
+-      while (count-- > 0) {
+-              if (kgdb_read_byte(mem++, &ch) != 0)
+-                      return 0;
+-              *buf++ = hexchars[ch >> 4];
+-              *buf++ = hexchars[ch & 0xf];
+-      }
+-
+-      *buf = 0;
+-
+-      return buf;
+-}
+-
+-/*
+- * convert the hex array pointed to by buf into binary to be placed in mem
+- * return a pointer to the character AFTER the last byte written
+- * may_fault is non-zero if we are reading from arbitrary memory, but is currently
+- * not used.
+- */
+-static char *hex2mem(char *buf, char *mem, int count, int binary, int may_fault)
+-{
+-      int i;
+-      unsigned char ch;
+-
+-      for (i=0; i<count; i++)
+-      {
+-              if (binary) {
+-                      ch = *buf++;
+-                      if (ch == 0x7d)
+-                              ch = 0x20 ^ *buf++;
+-              }
+-              else {
+-                      ch = hex(*buf++) << 4;
+-                      ch |= hex(*buf++);
+-              }
+-              if (kgdb_write_byte(ch, mem++) != 0)
+-                      return 0;
+-      }
+-
+-      return mem;
+-}
+-
+-/*
+- * This table contains the mapping between SPARC hardware trap types, and
+- * signals, which are primarily what GDB understands.  It also indicates
+- * which hardware traps we need to commandeer when initializing the stub.
+- */
+-static struct hard_trap_info {
+-      unsigned char tt;               /* Trap type code for MIPS R3xxx and R4xxx */
+-      unsigned char signo;            /* Signal that we map this trap into */
+-} hard_trap_info[] = {
+-      { 6, SIGBUS },                  /* instruction bus error */
+-      { 7, SIGBUS },                  /* data bus error */
+-      { 9, SIGTRAP },                 /* break */
+-      { 10, SIGILL },                 /* reserved instruction */
+-/*    { 11, SIGILL },         */      /* CPU unusable */
+-      { 12, SIGFPE },                 /* overflow */
+-      { 13, SIGTRAP },                /* trap */
+-      { 14, SIGSEGV },                /* virtual instruction cache coherency */
+-      { 15, SIGFPE },                 /* floating point exception */
+-      { 23, SIGSEGV },                /* watch */
+-      { 31, SIGSEGV },                /* virtual data cache coherency */
+-      { 0, 0}                         /* Must be last */
+-};
+-
+-/* Save the normal trap handlers for user-mode traps. */
+-void *saved_vectors[32];
+-
+-/*
+- * Set up exception handlers for tracing and breakpoints
+- */
+-void set_debug_traps(void)
+-{
+-      struct hard_trap_info *ht;
+-      unsigned long flags;
+-      unsigned char c;
+-
+-      local_irq_save(flags);
+-      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
+-              saved_vectors[ht->tt] = set_except_vector(ht->tt, trap_low);
+-
+-      putDebugChar('+'); /* 'hello world' */
+-      /*
+-       * In case GDB is started before us, ack any packets
+-       * (presumably "$?#xx") sitting there.
+-       */
+-      while((c = getDebugChar()) != '$');
+-      while((c = getDebugChar()) != '#');
+-      c = getDebugChar(); /* eat first csum byte */
+-      c = getDebugChar(); /* eat second csum byte */
+-      putDebugChar('+'); /* ack it */
+-
+-      initialized = 1;
+-      local_irq_restore(flags);
+-}
+-
+-void restore_debug_traps(void)
+-{
+-      struct hard_trap_info *ht;
+-      unsigned long flags;
+-
+-      local_irq_save(flags);
+-      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
+-              set_except_vector(ht->tt, saved_vectors[ht->tt]);
+-      local_irq_restore(flags);
+-}
+-
+-/*
+- * Convert the MIPS hardware trap type code to a Unix signal number.
+- */
+-static int computeSignal(int tt)
+-{
+-      struct hard_trap_info *ht;
+-
+-      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
+-              if (ht->tt == tt)
+-                      return ht->signo;
+-
+-      return SIGHUP;          /* default for things we don't know about */
+-}
+-
+-/*
+- * While we find nice hex chars, build an int.
+- * Return number of chars processed.
+- */
+-static int hexToInt(char **ptr, int *intValue)
+-{
+-      int numChars = 0;
+-      int hexValue;
+-
+-      *intValue = 0;
+-
+-      while (**ptr) {
+-              hexValue = hex(**ptr);
+-              if (hexValue < 0)
+-                      break;
+-
+-              *intValue = (*intValue << 4) | hexValue;
+-              numChars ++;
+-
+-              (*ptr)++;
+-      }
+-
+-      return (numChars);
+-}
+-
+-static int hexToLong(char **ptr, long *longValue)
+-{
+-      int numChars = 0;
+-      int hexValue;
+-
+-      *longValue = 0;
+-
+-      while (**ptr) {
+-              hexValue = hex(**ptr);
+-              if (hexValue < 0)
+-                      break;
+-
+-              *longValue = (*longValue << 4) | hexValue;
+-              numChars ++;
+-
+-              (*ptr)++;
+-      }
+-
+-      return numChars;
+-}
+-
+-
+-#if 0
+-/*
+- * Print registers (on target console)
+- * Used only to debug the stub...
+- */
+-void show_gdbregs(struct gdb_regs * regs)
+-{
+-      /*
+-       * Saved main processor registers
+-       */
+-      printk("$0 : %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n",
+-             regs->reg0, regs->reg1, regs->reg2, regs->reg3,
+-               regs->reg4, regs->reg5, regs->reg6, regs->reg7);
+-      printk("$8 : %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n",
+-             regs->reg8, regs->reg9, regs->reg10, regs->reg11,
+-               regs->reg12, regs->reg13, regs->reg14, regs->reg15);
+-      printk("$16: %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n",
+-             regs->reg16, regs->reg17, regs->reg18, regs->reg19,
+-               regs->reg20, regs->reg21, regs->reg22, regs->reg23);
+-      printk("$24: %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n",
+-             regs->reg24, regs->reg25, regs->reg26, regs->reg27,
+-             regs->reg28, regs->reg29, regs->reg30, regs->reg31);
+-
+-      /*
+-       * Saved cp0 registers
+-       */
+-      printk("epc  : %08lx\nStatus: %08lx\nCause : %08lx\n",
+-             regs->cp0_epc, regs->cp0_status, regs->cp0_cause);
+-}
+-#endif /* dead code */
+-
+-/*
+- * We single-step by setting breakpoints. When an exception
+- * is handled, we need to restore the instructions hoisted
+- * when the breakpoints were set.
+- *
+- * This is where we save the original instructions.
+- */
+-static struct gdb_bp_save {
+-      unsigned long addr;
+-      unsigned int val;
+-} step_bp[2];
+-
+-#define BP 0x0000000d  /* break opcode */
+-
+-/*
+- * Set breakpoint instructions for single stepping.
+- */
+-static void single_step(struct gdb_regs *regs)
+-{
+-      union mips_instruction insn;
+-      unsigned long targ;
+-      int is_branch, is_cond, i;
+-
+-      targ = regs->cp0_epc;
+-      insn.word = *(unsigned int *)targ;
+-      is_branch = is_cond = 0;
+-
+-      switch (insn.i_format.opcode) {
+-      /*
+-       * jr and jalr are in r_format format.
+-       */
+-      case spec_op:
+-              switch (insn.r_format.func) {
+-              case jalr_op:
+-              case jr_op:
+-                      targ = *(&regs->reg0 + insn.r_format.rs);
+-                      is_branch = 1;
+-                      break;
+-              }
+-              break;
+-
+-      /*
+-       * This group contains:
+-       * bltz_op, bgez_op, bltzl_op, bgezl_op,
+-       * bltzal_op, bgezal_op, bltzall_op, bgezall_op.
+-       */
+-      case bcond_op:
+-              is_branch = is_cond = 1;
+-              targ += 4 + (insn.i_format.simmediate << 2);
+-              break;
+-
+-      /*
+-       * These are unconditional and in j_format.
+-       */
+-      case jal_op:
+-      case j_op:
+-              is_branch = 1;
+-              targ += 4;
+-              targ >>= 28;
+-              targ <<= 28;
+-              targ |= (insn.j_format.target << 2);
+-              break;
+-
+-      /*
+-       * These are conditional.
+-       */
+-      case beq_op:
+-      case beql_op:
+-      case bne_op:
+-      case bnel_op:
+-      case blez_op:
+-      case blezl_op:
+-      case bgtz_op:
+-      case bgtzl_op:
+-      case cop0_op:
+-      case cop1_op:
+-      case cop2_op:
+-      case cop1x_op:
+-              is_branch = is_cond = 1;
+-              targ += 4 + (insn.i_format.simmediate << 2);
+-              break;
+-      }
+-
+-      if (is_branch) {
+-              i = 0;
+-              if (is_cond && targ != (regs->cp0_epc + 8)) {
+-                      step_bp[i].addr = regs->cp0_epc + 8;
+-                      step_bp[i++].val = *(unsigned *)(regs->cp0_epc + 8);
+-                      *(unsigned *)(regs->cp0_epc + 8) = BP;
+-              }
+-              step_bp[i].addr = targ;
+-              step_bp[i].val  = *(unsigned *)targ;
+-              *(unsigned *)targ = BP;
+-      } else {
+-              step_bp[0].addr = regs->cp0_epc + 4;
+-              step_bp[0].val  = *(unsigned *)(regs->cp0_epc + 4);
+-              *(unsigned *)(regs->cp0_epc + 4) = BP;
+-      }
+-}
+-
+-/*
+- *  If asynchronously interrupted by gdb, then we need to set a breakpoint
+- *  at the interrupted instruction so that we wind up stopped with a
+- *  reasonable stack frame.
+- */
+-static struct gdb_bp_save async_bp;
+-
+-/*
+- * Swap the interrupted EPC with our asynchronous breakpoint routine.
+- * This is safer than stuffing the breakpoint in-place, since no cache
+- * flushes (or resulting smp_call_functions) are required.  The
+- * assumption is that only one CPU will be handling asynchronous bp's,
+- * and only one can be active at a time.
+- */
+-extern spinlock_t smp_call_lock;
+-
+-void set_async_breakpoint(unsigned long *epc)
+-{
+-      /* skip breaking into userland */
+-      if ((*epc & 0x80000000) == 0)
+-              return;
+-
+-#ifdef CONFIG_SMP
+-      /* avoid deadlock if someone is make IPC */
+-      if (spin_is_locked(&smp_call_lock))
+-              return;
+-#endif
+-
+-      async_bp.addr = *epc;
+-      *epc = (unsigned long)async_breakpoint;
+-}
+-
+-static void kgdb_wait(void *arg)
+-{
+-      unsigned flags;
+-      int cpu = smp_processor_id();
+-
+-      local_irq_save(flags);
+-
+-      __raw_spin_lock(&kgdb_cpulock[cpu]);
+-      __raw_spin_unlock(&kgdb_cpulock[cpu]);
+-
+-      local_irq_restore(flags);
+-}
+-
+-/*
+- * GDB stub needs to call kgdb_wait on all processor with interrupts
+- * disabled, so it uses it's own special variant.
+- */
+-static int kgdb_smp_call_kgdb_wait(void)
+-{
+-#ifdef CONFIG_SMP
+-      struct call_data_struct data;
+-      int i, cpus = num_online_cpus() - 1;
+-      int cpu = smp_processor_id();
+-
+-      /*
+-       * Can die spectacularly if this CPU isn't yet marked online
+-       */
+-      BUG_ON(!cpu_online(cpu));
+-
+-      if (!cpus)
+-              return 0;
+-
+-      if (spin_is_locked(&smp_call_lock)) {
+-              /*
+-               * Some other processor is trying to make us do something
+-               * but we're not going to respond... give up
+-               */
+-              return -1;
+-              }
+-
+-      /*
+-       * We will continue here, accepting the fact that
+-       * the kernel may deadlock if another CPU attempts
+-       * to call smp_call_function now...
+-       */
+-
+-      data.func = kgdb_wait;
+-      data.info = NULL;
+-      atomic_set(&data.started, 0);
+-      data.wait = 0;
+-
+-      spin_lock(&smp_call_lock);
+-      call_data = &data;
+-      mb();
+-
+-      /* Send a message to all other CPUs and wait for them to respond */
+-      for (i = 0; i < NR_CPUS; i++)
+-              if (cpu_online(i) && i != cpu)
+-                      core_send_ipi(i, SMP_CALL_FUNCTION);
+-
+-      /* Wait for response */
+-      /* FIXME: lock-up detection, backtrace on lock-up */
+-      while (atomic_read(&data.started) != cpus)
+-              barrier();
+-
+-      call_data = NULL;
+-      spin_unlock(&smp_call_lock);
+-#endif
+-
+-      return 0;
+-}
+-
+-/*
+- * This function does all command processing for interfacing to gdb.  It
+- * returns 1 if you should skip the instruction at the trap address, 0
+- * otherwise.
+- */
+-void handle_exception (struct gdb_regs *regs)
+-{
+-      int trap;                       /* Trap type */
+-      int sigval;
+-      long addr;
+-      int length;
+-      char *ptr;
+-      unsigned long *stack;
+-      int i;
+-      int bflag = 0;
+-
+-      kgdb_started = 1;
+-
+-      /*
+-       * acquire the big kgdb spinlock
+-       */
+-      if (!spin_trylock(&kgdb_lock)) {
+-              /*
+-               * some other CPU has the lock, we should go back to
+-               * receive the gdb_wait IPC
+-               */
+-              return;
+-      }
+-
+-      /*
+-       * If we're in async_breakpoint(), restore the real EPC from
+-       * the breakpoint.
+-       */
+-      if (regs->cp0_epc == (unsigned long)async_breakinst) {
+-              regs->cp0_epc = async_bp.addr;
+-              async_bp.addr = 0;
+-      }
+-
+-      /*
+-       * acquire the CPU spinlocks
+-       */
+-      for (i = num_online_cpus()-1; i >= 0; i--)
+-              if (__raw_spin_trylock(&kgdb_cpulock[i]) == 0)
+-                      panic("kgdb: couldn't get cpulock %d\n", i);
+-
+-      /*
+-       * force other cpus to enter kgdb
+-       */
+-      kgdb_smp_call_kgdb_wait();
+-
+-      /*
+-       * If we're in breakpoint() increment the PC
+-       */
+-      trap = (regs->cp0_cause & 0x7c) >> 2;
+-      if (trap == 9 && regs->cp0_epc == (unsigned long)breakinst)
+-              regs->cp0_epc += 4;
+-
+-      /*
+-       * If we were single_stepping, restore the opcodes hoisted
+-       * for the breakpoint[s].
+-       */
+-      if (step_bp[0].addr) {
+-              *(unsigned *)step_bp[0].addr = step_bp[0].val;
+-              step_bp[0].addr = 0;
+-
+-              if (step_bp[1].addr) {
+-                      *(unsigned *)step_bp[1].addr = step_bp[1].val;
+-                      step_bp[1].addr = 0;
+-              }
+-      }
+-
+-      stack = (long *)regs->reg29;                    /* stack ptr */
+-      sigval = computeSignal(trap);
+-
+-      /*
+-       * reply to host that an exception has occurred
+-       */
+-      ptr = output_buffer;
+-
+-      /*
+-       * Send trap type (converted to signal)
+-       */
+-      *ptr++ = 'T';
+-      *ptr++ = hexchars[sigval >> 4];
+-      *ptr++ = hexchars[sigval & 0xf];
+-
+-      /*
+-       * Send Error PC
+-       */
+-      *ptr++ = hexchars[REG_EPC >> 4];
+-      *ptr++ = hexchars[REG_EPC & 0xf];
+-      *ptr++ = ':';
+-      ptr = mem2hex((char *)&regs->cp0_epc, ptr, sizeof(long), 0);
+-      *ptr++ = ';';
+-
+-      /*
+-       * Send frame pointer
+-       */
+-      *ptr++ = hexchars[REG_FP >> 4];
+-      *ptr++ = hexchars[REG_FP & 0xf];
+-      *ptr++ = ':';
+-      ptr = mem2hex((char *)&regs->reg30, ptr, sizeof(long), 0);
+-      *ptr++ = ';';
+-
+-      /*
+-       * Send stack pointer
+-       */
+-      *ptr++ = hexchars[REG_SP >> 4];
+-      *ptr++ = hexchars[REG_SP & 0xf];
+-      *ptr++ = ':';
+-      ptr = mem2hex((char *)&regs->reg29, ptr, sizeof(long), 0);
+-      *ptr++ = ';';
+-
+-      *ptr++ = 0;
+-      putpacket(output_buffer);       /* send it off... */
+-
+-      /*
+-       * Wait for input from remote GDB
+-       */
+-      while (1) {
+-              output_buffer[0] = 0;
+-              getpacket(input_buffer);
+-
+-              switch (input_buffer[0])
+-              {
+-              case '?':
+-                      output_buffer[0] = 'S';
+-                      output_buffer[1] = hexchars[sigval >> 4];
+-                      output_buffer[2] = hexchars[sigval & 0xf];
+-                      output_buffer[3] = 0;
+-                      break;
+-
+-              /*
+-               * Detach debugger; let CPU run
+-               */
+-              case 'D':
+-                      putpacket(output_buffer);
+-                      goto finish_kgdb;
+-                      break;
+-
+-              case 'd':
+-                      /* toggle debug flag */
+-                      break;
+-
+-              /*
+-               * Return the value of the CPU registers
+-               */
+-              case 'g':
+-                      ptr = output_buffer;
+-                      ptr = mem2hex((char *)&regs->reg0, ptr, 32*sizeof(long), 0); /* r0...r31 */
+-                      ptr = mem2hex((char *)&regs->cp0_status, ptr, 6*sizeof(long), 0); /* cp0 */
+-                      ptr = mem2hex((char *)&regs->fpr0, ptr, 32*sizeof(long), 0); /* f0...31 */
+-                      ptr = mem2hex((char *)&regs->cp1_fsr, ptr, 2*sizeof(long), 0); /* cp1 */
+-                      ptr = mem2hex((char *)&regs->frame_ptr, ptr, 2*sizeof(long), 0); /* frp */
+-                      ptr = mem2hex((char *)&regs->cp0_index, ptr, 16*sizeof(long), 0); /* cp0 */
+-                      break;
+-
+-              /*
+-               * set the value of the CPU registers - return OK
+-               */
+-              case 'G':
+-              {
+-                      ptr = &input_buffer[1];
+-                      hex2mem(ptr, (char *)&regs->reg0, 32*sizeof(long), 0, 0);
+-                      ptr += 32*(2*sizeof(long));
+-                      hex2mem(ptr, (char *)&regs->cp0_status, 6*sizeof(long), 0, 0);
+-                      ptr += 6*(2*sizeof(long));
+-                      hex2mem(ptr, (char *)&regs->fpr0, 32*sizeof(long), 0, 0);
+-                      ptr += 32*(2*sizeof(long));
+-                      hex2mem(ptr, (char *)&regs->cp1_fsr, 2*sizeof(long), 0, 0);
+-                      ptr += 2*(2*sizeof(long));
+-                      hex2mem(ptr, (char *)&regs->frame_ptr, 2*sizeof(long), 0, 0);
+-                      ptr += 2*(2*sizeof(long));
+-                      hex2mem(ptr, (char *)&regs->cp0_index, 16*sizeof(long), 0, 0);
+-                      strcpy(output_buffer,"OK");
+-               }
+-              break;
+-
+-              /*
+-               * mAA..AA,LLLL  Read LLLL bytes at address AA..AA
+-               */
+-              case 'm':
+-                      ptr = &input_buffer[1];
+-
+-                      if (hexToLong(&ptr, &addr)
+-                              && *ptr++ == ','
+-                              && hexToInt(&ptr, &length)) {
+-                              if (mem2hex((char *)addr, output_buffer, length, 1))
+-                                      break;
+-                              strcpy (output_buffer, "E03");
+-                      } else
+-                              strcpy(output_buffer,"E01");
+-                      break;
+-
+-              /*
+-               * XAA..AA,LLLL: Write LLLL escaped binary bytes at address AA.AA
+-               */
+-              case 'X':
+-                      bflag = 1;
+-                      /* fall through */
+-
+-              /*
+-               * MAA..AA,LLLL: Write LLLL bytes at address AA.AA return OK
+-               */
+-              case 'M':
+-                      ptr = &input_buffer[1];
+-
+-                      if (hexToLong(&ptr, &addr)
+-                              && *ptr++ == ','
+-                              && hexToInt(&ptr, &length)
+-                              && *ptr++ == ':') {
+-                              if (hex2mem(ptr, (char *)addr, length, bflag, 1))
+-                                      strcpy(output_buffer, "OK");
+-                              else
+-                                      strcpy(output_buffer, "E03");
+-                      }
+-                      else
+-                              strcpy(output_buffer, "E02");
+-                      break;
+-
+-              /*
+-               * cAA..AA    Continue at address AA..AA(optional)
+-               */
+-              case 'c':
+-                      /* try to read optional parameter, pc unchanged if no parm */
+-
+-                      ptr = &input_buffer[1];
+-                      if (hexToLong(&ptr, &addr))
+-                              regs->cp0_epc = addr;
+-
+-                      goto exit_kgdb_exception;
+-                      break;
+-
+-              /*
+-               * kill the program; let us try to restart the machine
+-               * Reset the whole machine.
+-               */
+-              case 'k':
+-              case 'r':
+-                      machine_restart("kgdb restarts machine");
+-                      break;
+-
+-              /*
+-               * Step to next instruction
+-               */
+-              case 's':
+-                      /*
+-                       * There is no single step insn in the MIPS ISA, so we
+-                       * use breakpoints and continue, instead.
+-                       */
+-                      single_step(regs);
+-                      goto exit_kgdb_exception;
+-                      /* NOTREACHED */
+-                      break;
+-
+-              /*
+-               * Set baud rate (bBB)
+-               * FIXME: Needs to be written
+-               */
+-              case 'b':
+-              {
+-#if 0
+-                      int baudrate;
+-                      extern void set_timer_3();
+-
+-                      ptr = &input_buffer[1];
+-                      if (!hexToInt(&ptr, &baudrate))
+-                      {
+-                              strcpy(output_buffer,"B01");
+-                              break;
+-                      }
+-
+-                      /* Convert baud rate to uart clock divider */
+-
+-                      switch (baudrate)
+-                      {
+-                              case 38400:
+-                                      baudrate = 16;
+-                                      break;
+-                              case 19200:
+-                                      baudrate = 33;
+-                                      break;
+-                              case 9600:
+-                                      baudrate = 65;
+-                                      break;
+-                              default:
+-                                      baudrate = 0;
+-                                      strcpy(output_buffer,"B02");
+-                                      goto x1;
+-                      }
+-
+-                      if (baudrate) {
+-                              putpacket("OK");        /* Ack before changing speed */
+-                              set_timer_3(baudrate); /* Set it */
+-                      }
+-#endif
+-              }
+-              break;
+-
+-              }                       /* switch */
+-
+-              /*
+-               * reply to the request
+-               */
+-
+-              putpacket(output_buffer);
+-
+-      } /* while */
+-
+-      return;
+-
+-finish_kgdb:
+-      restore_debug_traps();
+-
+-exit_kgdb_exception:
+-      /* release locks so other CPUs can go */
+-      for (i = num_online_cpus()-1; i >= 0; i--)
+-              __raw_spin_unlock(&kgdb_cpulock[i]);
+-      spin_unlock(&kgdb_lock);
+-
+-      __flush_cache_all();
+-      return;
+-}
+-
+-/*
+- * This function will generate a breakpoint exception.  It is used at the
+- * beginning of a program to sync up with a debugger and can be used
+- * otherwise as a quick means to stop program execution and "break" into
+- * the debugger.
+- */
+-void breakpoint(void)
+-{
+-      if (!initialized)
+-              return;
+-
+-      __asm__ __volatile__(
+-                      ".globl breakinst\n\t"
+-                      ".set\tnoreorder\n\t"
+-                      "nop\n"
+-                      "breakinst:\tbreak\n\t"
+-                      "nop\n\t"
+-                      ".set\treorder"
+-                      );
+-}
+-
+-/* Nothing but the break; don't pollute any registers */
+-void async_breakpoint(void)
+-{
+-      __asm__ __volatile__(
+-                      ".globl async_breakinst\n\t"
+-                      ".set\tnoreorder\n\t"
+-                      "nop\n"
+-                      "async_breakinst:\tbreak\n\t"
+-                      "nop\n\t"
+-                      ".set\treorder"
+-                      );
+-}
+-
+-void adel(void)
+-{
+-      __asm__ __volatile__(
+-                      ".globl\tadel\n\t"
+-                      "lui\t$8,0x8000\n\t"
+-                      "lw\t$9,1($8)\n\t"
+-                      );
+-}
+-
+-/*
+- * malloc is needed by gdb client in "call func()", even a private one
+- * will make gdb happy
+- */
+-static void * __attribute_used__ malloc(size_t size)
+-{
+-      return kmalloc(size, GFP_ATOMIC);
+-}
+-
+-static void __attribute_used__ free (void *where)
+-{
+-      kfree(where);
+-}
+-
+-#ifdef CONFIG_GDB_CONSOLE
+-
+-void gdb_putsn(const char *str, int l)
+-{
+-      char outbuf[18];
+-
+-      if (!kgdb_started)
+-              return;
+-
+-      outbuf[0]='O';
+-
+-      while(l) {
+-              int i = (l>8)?8:l;
+-              mem2hex((char *)str, &outbuf[1], i, 0);
+-              outbuf[(i*2)+1]=0;
+-              putpacket(outbuf);
+-              str += i;
+-              l -= i;
+-      }
+-}
+-
+-static void gdb_console_write(struct console *con, const char *s, unsigned n)
+-{
+-      gdb_putsn(s, n);
+-}
+-
+-static struct console gdb_console = {
+-      .name   = "gdb",
+-      .write  = gdb_console_write,
+-      .flags  = CON_PRINTBUFFER,
+-      .index  = -1
+-};
+-
+-static int __init register_gdb_console(void)
+-{
+-      register_console(&gdb_console);
+-
+-      return 0;
+-}
+-
+-console_initcall(register_gdb_console);
+-
+-#endif
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/irq.c linux-2.6.18.kgdb/arch/mips/kernel/irq.c
+--- linux-2.6.18/arch/mips/kernel/irq.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/mips/kernel/irq.c   2008-06-10 16:19:28.000000000 +0400
+@@ -25,6 +25,10 @@
+ #include <asm/atomic.h>
+ #include <asm/system.h>
+ #include <asm/uaccess.h>
++#include <asm/kgdb.h>
++
++/* Keep track of whether certain early initialization has already been done. */
++int kgdb_early_setup;
+ 
+ /*
+  * 'what should we do if we get a hw irq event on an illegal vector'.
+@@ -115,23 +119,13 @@ asmlinkage void spurious_interrupt(struc
+       atomic_inc(&irq_err_count);
+ }
+ 
+-#ifdef CONFIG_KGDB
+-extern void breakpoint(void);
+-extern void set_debug_traps(void);
+-
+-static int kgdb_flag = 1;
+-static int __init nokgdb(char *str)
+-{
+-      kgdb_flag = 0;
+-      return 1;
+-}
+-__setup("nokgdb", nokgdb);
+-#endif
+-
+ void __init init_IRQ(void)
+ {
+       int i;
+ 
++      if (kgdb_early_setup)
++              return;
++
+       for (i = 0; i < NR_IRQS; i++) {
+               irq_desc[i].status  = IRQ_DISABLED;
+               irq_desc[i].action  = NULL;
+@@ -144,12 +138,12 @@ void __init init_IRQ(void)
+       }
+ 
+       arch_init_irq();
+-
+ #ifdef CONFIG_KGDB
+-      if (kgdb_flag) {
+-              printk("Wait for gdb client connection ...\n");
+-              set_debug_traps();
+-              breakpoint();
+-      }
++      /*
++       * We have been called before kgdb_arch_init(). Hence,
++       * we don't want the traps to be reinitialized
++       */
++      if (kgdb_early_setup == 0)
++              kgdb_early_setup = 1;
+ #endif
+ }
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/kgdb-jmp.c linux-2.6.18.kgdb/arch/mips/kernel/kgdb-jmp.c
+--- linux-2.6.18/arch/mips/kernel/kgdb-jmp.c   1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/mips/kernel/kgdb-jmp.c      2008-06-10 16:19:28.000000000 +0400
+@@ -0,0 +1,116 @@
++/*
++ * arch/mips/kernel/kgdb-jmp.c
++ *
++ * Save and restore system registers so that within a limited frame we
++ * may have a fault and "jump back" to a known safe location.
++ *
++ * Author: Tom Rini <trini@kernel.crashing.org>
++ * Author: Manish Lachwani <mlachwani@mvista.com>
++ *
++ * Cribbed from glibc, which carries the following:
++ * Copyright (C) 1996, 1997, 2000, 2002, 2003 Free Software Foundation, Inc.
++ * Copyright (C) 2005 by MontaVista Software.
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of
++ * any kind, whether express or implied.
++ */
++
++#include <linux/kgdb.h>
++#include <asm/interrupt.h>
++
++#ifdef CONFIG_MIPS64
++/*
++ * MIPS 64-bit
++ */
++
++int kgdb_fault_setjmp_aux(unsigned long *curr_context, int sp, int fp)
++{
++      __asm__ __volatile__ ("sd $gp, %0" : : "m" (curr_context[0]));
++      __asm__ __volatile__ ("sd $16, %0" : : "m" (curr_context[1]));
++      __asm__ __volatile__ ("sd $17, %0" : : "m" (curr_context[2]));
++      __asm__ __volatile__ ("sd $18, %0" : : "m" (curr_context[3]));
++      __asm__ __volatile__ ("sd $19, %0" : : "m" (curr_context[4]));
++      __asm__ __volatile__ ("sd $20, %0" : : "m" (curr_context[5]));
++      __asm__ __volatile__ ("sd $21, %0" : : "m" (curr_context[6]));
++      __asm__ __volatile__ ("sd $22, %0" : : "m" (curr_context[7]));
++      __asm__ __volatile__ ("sd $23, %0" : : "m" (curr_context[8]));
++      __asm__ __volatile__ ("sd $31, %0" : : "m" (curr_context[9])); /* reloaded into $25 by kgdb_fault_longjmp() */
++      curr_context[10] = (unsigned long)sp;   /* sp handed over by kgdb_fault_setjmp */
++      curr_context[11] = (unsigned long)fp;   /* fp handed over by kgdb_fault_setjmp */
++
++      return 0;
++}
++
++void kgdb_fault_longjmp(unsigned long *curr_context)
++{
++      unsigned long sp_val, fp_val;
++
++      __asm__ __volatile__ ("ld $gp, %0" : : "m" (curr_context[0]));
++      __asm__ __volatile__ ("ld $16, %0" : : "m" (curr_context[1]));
++      __asm__ __volatile__ ("ld $17, %0" : : "m" (curr_context[2]));
++      __asm__ __volatile__ ("ld $18, %0" : : "m" (curr_context[3]));
++      __asm__ __volatile__ ("ld $19, %0" : : "m" (curr_context[4]));
++      __asm__ __volatile__ ("ld $20, %0" : : "m" (curr_context[5]));
++      __asm__ __volatile__ ("ld $21, %0" : : "m" (curr_context[6]));
++      __asm__ __volatile__ ("ld $22, %0" : : "m" (curr_context[7]));
++      __asm__ __volatile__ ("ld $23, %0" : : "m" (curr_context[8]));
++      __asm__ __volatile__ ("ld $25, %0" : : "m" (curr_context[9]));
++      sp_val = curr_context[10];
++      fp_val = curr_context[11];
++      __asm__ __volatile__ ("ld $29, %0\n\t"
++                            "ld $30, %1\n\t" : : "m" (sp_val), "m" (fp_val));
++
++      __asm__ __volatile__ ("dli $2, 1");
++      __asm__ __volatile__ ("j $25");
++
++      for (;;);
++}
++#else
++/*
++ * MIPS 32-bit
++ */
++
++int kgdb_fault_setjmp_aux(unsigned long *curr_context, int sp, int fp)
++{
++      __asm__ __volatile__("sw $gp, %0" : : "m" (curr_context[0]));
++      __asm__ __volatile__("sw $16, %0" : : "m" (curr_context[1]));
++      __asm__ __volatile__("sw $17, %0" : : "m" (curr_context[2]));
++      __asm__ __volatile__("sw $18, %0" : : "m" (curr_context[3]));
++      __asm__ __volatile__("sw $19, %0" : : "m" (curr_context[4]));
++      __asm__ __volatile__("sw $20, %0" : : "m" (curr_context[5]));
++      __asm__ __volatile__("sw $21, %0" : : "m" (curr_context[6]));
++      __asm__ __volatile__("sw $22, %0" : : "m" (curr_context[7]));
++      __asm__ __volatile__("sw $23, %0" : : "m" (curr_context[8]));
++      __asm__ __volatile__("sw $31, %0" : : "m" (curr_context[9])); /* reloaded into $25 by kgdb_fault_longjmp() */
++      curr_context[10] = (unsigned long)sp;   /* sp handed over by kgdb_fault_setjmp */
++      curr_context[11] = (unsigned long)fp;   /* fp handed over by kgdb_fault_setjmp */
++
++      return 0;
++}
++
++void kgdb_fault_longjmp(unsigned long *curr_context)
++{
++      unsigned long sp_val, fp_val;
++
++      __asm__ __volatile__("lw $gp, %0" : : "m" (curr_context[0]));
++      __asm__ __volatile__("lw $16, %0" : : "m" (curr_context[1]));
++      __asm__ __volatile__("lw $17, %0" : : "m" (curr_context[2]));
++      __asm__ __volatile__("lw $18, %0" : : "m" (curr_context[3]));
++      __asm__ __volatile__("lw $19, %0" : : "m" (curr_context[4]));
++      __asm__ __volatile__("lw $20, %0" : : "m" (curr_context[5]));
++      __asm__ __volatile__("lw $21, %0" : : "m" (curr_context[6]));
++      __asm__ __volatile__("lw $22, %0" : : "m" (curr_context[7]));
++      __asm__ __volatile__("lw $23, %0" : : "m" (curr_context[8]));
++      __asm__ __volatile__("lw $25, %0" : : "m" (curr_context[9]));
++      sp_val = curr_context[10];
++      fp_val = curr_context[11];
++      __asm__ __volatile__("lw $29, %0\n\t"
++                            "lw $30, %1\n\t" : : "m" (sp_val), "m" (fp_val));
++
++      __asm__ __volatile__("li $2, 1");
++      __asm__ __volatile__("jr $25");
++
++      for (;;);
++}
++#endif
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/kgdb-setjmp.S linux-2.6.18.kgdb/arch/mips/kernel/kgdb-setjmp.S
+--- linux-2.6.18/arch/mips/kernel/kgdb-setjmp.S        1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/mips/kernel/kgdb-setjmp.S   2008-06-10 16:19:28.000000000 +0400
+@@ -0,0 +1,28 @@
++/*
++ * arch/mips/kernel/kgdb-setjmp.S
++ *
++ * Save and restore system registers so that within a limited frame we
++ * may have a fault and "jump back" to a known safe location.
++ *
++ * Copyright (C) 2005 by MontaVista Software.
++ * Author: Manish Lachwani (mlachwani@mvista.com)
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of
++ * any kind, whether express or implied.
++ */
++
++#include <asm/asm.h>
++#include <asm/mipsregs.h>
++#include <asm/regdef.h>
++#include <asm/stackframe.h>
++
++      .ent    kgdb_fault_setjmp,0
++ENTRY (kgdb_fault_setjmp)
++      move    a1, sp
++      move    a2, fp
++#ifdef CONFIG_MIPS64
++      nop
++#endif
++      j       kgdb_fault_setjmp_aux
++      .end    kgdb_fault_setjmp
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/kgdb.c linux-2.6.18.kgdb/arch/mips/kernel/kgdb.c
+--- linux-2.6.18/arch/mips/kernel/kgdb.c       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/mips/kernel/kgdb.c  2008-06-10 16:19:28.000000000 +0400
+@@ -0,0 +1,297 @@
++/*
++ * arch/mips/kernel/kgdb.c
++ *
++ *  Originally written by Glenn Engel, Lake Stevens Instrument Division
++ *
++ *  Contributed by HP Systems
++ *
++ *  Modified for SPARC by Stu Grossman, Cygnus Support.
++ *
++ *  Modified for Linux/MIPS (and MIPS in general) by Andreas Busse
++ *  Send complaints, suggestions etc. to <andy@waldorf-gmbh.de>
++ *
++ *  Copyright (C) 1995 Andreas Busse
++ *
++ *  Copyright (C) 2003 MontaVista Software Inc.
++ *  Author: Jun Sun, jsun@mvista.com or jsun@junsun.net
++ *
++ *  Copyright (C) 2004-2005 MontaVista Software Inc.
++ *  Author: Manish Lachwani, mlachwani@mvista.com or manish@koffee-break.com
++ *
++ *  This file is licensed under the terms of the GNU General Public License
++ *  version 2. This program is licensed "as is" without any warranty of any
++ *  kind, whether express or implied.
++ */
++
++#include <linux/string.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/smp.h>
++#include <linux/spinlock.h>
++#include <linux/delay.h>
++#include <asm/system.h>
++#include <asm/ptrace.h>               /* for linux pt_regs struct */
++#include <linux/kgdb.h>
++#include <linux/init.h>
++#include <asm/inst.h>
++#include <asm/gdb-stub.h>
++#include <asm/cacheflush.h>
++#include <asm/kdebug.h>
++
++static struct hard_trap_info {
++      unsigned char tt;       /* Trap type code for MIPS R3xxx and R4xxx */
++      unsigned char signo;    /* Signal that we map this trap into */
++} hard_trap_info[] = {
++      { 6, SIGBUS },          /* instruction bus error */
++      { 7, SIGBUS },          /* data bus error */
++      { 9, SIGTRAP },         /* break */
++/*    { 11, SIGILL }, */      /* CPU unusable */
++      { 12, SIGFPE },         /* overflow */
++      { 13, SIGTRAP },        /* trap */
++      { 14, SIGSEGV },        /* virtual instruction cache coherency */
++      { 15, SIGFPE },         /* floating point exception */
++      { 23, SIGSEGV },        /* watch */
++      { 31, SIGSEGV },        /* virtual data cache coherency */
++      { 0, 0}                 /* Must be last */
++};
++
++/* Save the normal trap handlers for user-mode traps. */
++void *saved_vectors[32];
++
++extern void trap_low(void);
++extern void breakinst(void);
++extern void init_IRQ(void);
++
++void kgdb_call_nmi_hook(void *ignored)
++{
++      kgdb_nmihook(smp_processor_id(), (void *)0);
++}
++
++void kgdb_roundup_cpus(unsigned long flags)
++{
++      local_irq_restore(flags);
++      smp_call_function(kgdb_call_nmi_hook, 0, 0, 0);
++      local_irq_save(flags);
++}
++
++static int compute_signal(int tt)
++{
++      struct hard_trap_info *ht;
++
++      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
++              if (ht->tt == tt)
++                      return ht->signo;
++
++      return SIGHUP;          /* default for things we don't know about */
++}
++
++/*
++ * Called from trap_low with the saved registers; hands the trap to KGDB
++ */
++void handle_exception(struct pt_regs *regs)
++{
++      int trap = (regs->cp0_cause & 0x7c) >> 2;       /* bits 2-6 of cp0_cause */
++
++      if (fixup_exception(regs)) {
++              return;         /* fixup_exception() returned non-zero */
++      }
++
++      if (atomic_read(&debugger_active))
++              kgdb_nmihook(smp_processor_id(), regs);
++
++      if (atomic_read(&kgdb_setting_breakpoint))
++              if ((trap == 9) && (regs->cp0_epc == (unsigned long)breakinst))
++                      regs->cp0_epc += 4;     /* move epc 4 bytes past breakinst */
++
++      kgdb_handle_exception(0, compute_signal(trap), 0, regs);
++
++      /* In SMP mode, __flush_cache_all does IPI */
++      __flush_cache_all();
++}
++
++void set_debug_traps(void)
++{
++      struct hard_trap_info *ht;
++      unsigned long flags;
++
++      local_irq_save(flags);
++
++      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
++              saved_vectors[ht->tt] = set_except_vector(ht->tt, trap_low);
++
++      local_irq_restore(flags);
++}
++
++#if 0
++/* This should be called before we exit kgdb_handle_exception() I believe.
++ * -- Tom
++ */
++void restore_debug_traps(void)
++{
++      struct hard_trap_info *ht;
++      unsigned long flags;
++
++      local_irq_save(flags);
++      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
++              set_except_vector(ht->tt, saved_vectors[ht->tt]);
++      local_irq_restore(flags);
++}
++#endif
++
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      int reg;
++      gdb_reg_t *ptr = (gdb_reg_t*)gdb_regs;
++
++      for (reg = 0; reg < 32; reg++)
++              *(ptr++) = regs->regs[reg];
++
++      *(ptr++) = regs->cp0_status;
++      *(ptr++) = regs->lo;
++      *(ptr++) = regs->hi;
++      *(ptr++) = regs->cp0_badvaddr;
++      *(ptr++) = regs->cp0_cause;
++      *(ptr++) = regs->cp0_epc;
++
++      return;
++}
++
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++
++      int reg;
++      const gdb_reg_t *ptr = (gdb_reg_t*)gdb_regs;
++
++      for (reg = 0; reg < 32; reg++)
++              regs->regs[reg] = *(ptr++);
++
++      regs->cp0_status = *(ptr++);
++      regs->lo = *(ptr++);
++      regs->hi = *(ptr++);
++      regs->cp0_badvaddr = *(ptr++);
++      regs->cp0_cause = *(ptr++);
++      regs->cp0_epc = *(ptr++);
++
++      return;
++}
++
++/*
++ * Similar to regs_to_gdb_regs() except that process is sleeping and so
++ * we may not be able to get all the info.
++ */
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
++{
++      int reg;
++      struct thread_info *ti = p->thread_info;
++      unsigned long ksp = (unsigned long)ti + THREAD_SIZE - 32;
++      struct pt_regs *regs = (struct pt_regs *)ksp - 1;
++      gdb_reg_t *ptr = (gdb_reg_t*)gdb_regs;
++
++      for (reg = 0; reg < 16; reg++)
++              *(ptr++) = regs->regs[reg];
++
++      /* S0 - S7 */
++      for (reg = 16; reg < 24; reg++)
++              *(ptr++) = regs->regs[reg];
++
++      for (reg = 24; reg < 28; reg++)
++              *(ptr++) = 0;
++
++      /* GP, SP, FP, RA */
++      for (reg = 28; reg < 32; reg++)
++              *(ptr++) = regs->regs[reg];
++
++      *(ptr++) = regs->cp0_status;
++      *(ptr++) = regs->lo;
++      *(ptr++) = regs->hi;
++      *(ptr++) = regs->cp0_badvaddr;
++      *(ptr++) = regs->cp0_cause;
++      *(ptr++) = regs->cp0_epc;
++
++      return;
++}
++
++/*
++ * Calls linux_debug_hook before the kernel dies. If KGDB is enabled,
++ * then try to fall into the debugger
++ */
++static int kgdb_mips_notify(struct notifier_block *self, unsigned long cmd,
++                          void *ptr)
++{
++      struct die_args *args = (struct die_args *)ptr;
++      struct pt_regs *regs = args->regs;
++      int trap = (regs->cp0_cause & 0x7c) >> 2;       /* bits 2-6 of cp0_cause */
++
++      /* See if KGDB is interested. */
++      if (user_mode(regs))
++              /* Userspace events, ignore. */
++              return NOTIFY_DONE;
++
++      kgdb_handle_exception(trap, compute_signal(trap), 0, regs);
++      return NOTIFY_OK;
++}
++
++static struct notifier_block kgdb_notifier = {
++      .notifier_call = kgdb_mips_notify,
++};
++
++/*
++ * Handle the 's' and 'c' commands
++ */
++int kgdb_arch_handle_exception(int vector, int signo, int err_code,
++                             char *remcom_in_buffer, char *remcom_out_buffer,
++                             struct pt_regs *regs)
++{
++      char *ptr;
++      unsigned long address;
++      int cpu = smp_processor_id();
++
++      switch (remcom_in_buffer[0]) {
++      case 's':
++      case 'c':
++              /* handle the optional parameter */
++              ptr = &remcom_in_buffer[1];
++              if (kgdb_hex2long(&ptr, &address))
++                      regs->cp0_epc = address;
++
++              atomic_set(&cpu_doing_single_step, -1);
++              if (remcom_in_buffer[0] == 's')
++                      if (kgdb_contthread)
++                              atomic_set(&cpu_doing_single_step, cpu);
++
++              return 0;
++      }
++
++      return -1;
++}
++
++struct kgdb_arch arch_kgdb_ops = {
++#ifdef CONFIG_CPU_LITTLE_ENDIAN
++      .gdb_bpt_instr = {0xd},
++#else
++      .gdb_bpt_instr = {0x00, 0x00, 0x00, 0x0d},
++#endif
++};
++
++/*
++ * We use kgdb_early_setup so that functions we need to call now don't
++ * cause trouble when called again later.
++ */
++int kgdb_arch_init(void)
++{
++      /* Board-specifics. */
++      /* Force some calls to happen earlier. */
++      if (kgdb_early_setup == 0) {
++              trap_init();
++              init_IRQ();
++              kgdb_early_setup = 1;
++      }
++
++      /* Set our traps. */
++      /* This needs to be done more finely grained again, paired in
++       * a before/after in kgdb_handle_exception(...) -- Tom */
++      set_debug_traps();
++      register_die_notifier(&kgdb_notifier);  /* registers under die_notifier_lock */
++
++      return 0;
++}
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/kgdb_handler.S linux-2.6.18.kgdb/arch/mips/kernel/kgdb_handler.S
+--- linux-2.6.18/arch/mips/kernel/kgdb_handler.S       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/mips/kernel/kgdb_handler.S  2008-06-10 16:19:28.000000000 +0400
+@@ -0,0 +1,57 @@
++/*
++ * arch/mips/kernel/kgdb_handler.S
++ *
++ * Copyright (C) 2004-2005 MontaVista Software Inc.
++ * Author: Manish Lachwani, mlachwani@mvista.com or manish@koffee-break.com
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of any
++ * kind, whether express or implied.
++ */
++
++/*
++ * Trap Handler for the new KGDB framework. The main KGDB handler is
++ * handle_exception that will be called from here
++ *
++ */
++
++#include <linux/config.h>
++#include <linux/sys.h>
++
++#include <asm/asm.h>
++#include <asm/errno.h>
++#include <asm/mipsregs.h>
++#include <asm/regdef.h>
++#include <asm/stackframe.h>
++
++      .align  5
++      NESTED(trap_low, PT_SIZE, sp)
++              .set    noat
++              .set    noreorder
++
++              /*
++               * Check for privileged instructions in user mode. For
++               * this, check the cu0 bit in the CPU status register.
++               */
++              mfc0    k0, CP0_STATUS
++              sll     k0, 3           /* status bit 28 becomes the sign bit */
++              bltz    k0, 1f          /* bit set: take the KGDB path at 1: */
++              move    k1, sp          /* placed right after the branch under noreorder */
++
++              /*
++               * GDB userland from within KGDB. If a user mode address
++               * then jump to the saved exception handler
++               */
++              mfc0    k1, CP0_CAUSE
++              andi    k1, k1, 0x7c    /* same field handle_exception() shifts right by 2 */
++              PTR_L   k0, saved_vectors(k1)   /* NOTE(review): offset is code*4; confirm for 8-byte pointers */
++              jr      k0
++              nop
++1:
++              SAVE_ALL
++              .set    at
++              .set    reorder
++              move    a0, sp          /* argument for handle_exception() */
++              jal     handle_exception
++              j       ret_from_exception
++      END(trap_low)
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/traps.c linux-2.6.18.kgdb/arch/mips/kernel/traps.c
+--- linux-2.6.18/arch/mips/kernel/traps.c      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/mips/kernel/traps.c 2008-06-10 16:19:28.000000000 +0400
+@@ -10,6 +10,8 @@
+  * Kevin D. Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com
+  * Copyright (C) 2000, 01 MIPS Technologies, Inc.
+  * Copyright (C) 2002, 2003, 2004, 2005  Maciej W. Rozycki
++ *
++ * KGDB specific changes - Manish Lachwani (mlachwani@mvista.com)
+  */
+ #include <linux/init.h>
+ #include <linux/mm.h>
+@@ -20,6 +22,7 @@
+ #include <linux/spinlock.h>
+ #include <linux/kallsyms.h>
+ #include <linux/bootmem.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/bootinfo.h>
+ #include <asm/branch.h>
+@@ -40,6 +43,7 @@
+ #include <asm/mmu_context.h>
+ #include <asm/watch.h>
+ #include <asm/types.h>
++#include <asm/kdebug.h>
+ 
+ extern asmlinkage void handle_int(void);
+ extern asmlinkage void handle_tlbm(void);
+@@ -78,6 +82,21 @@ void (*board_bind_eic_interrupt)(int irq
+  */
+ #define MODULE_RANGE (8*1024*1024)
+ 
++struct notifier_block *mips_die_chain;
++static spinlock_t die_notifier_lock = SPIN_LOCK_UNLOCKED;
++
++int register_die_notifier(struct notifier_block *nb)
++{
++      int err = 0;
++      unsigned long flags;
++
++      spin_lock_irqsave(&die_notifier_lock, flags);
++      err = notifier_chain_register(&mips_die_chain, nb);
++      spin_unlock_irqrestore(&die_notifier_lock, flags);
++
++      return err;
++}
++
+ /*
+  * This routine abuses get_user()/put_user() to reference pointers
+  * with at least a bit of error checking ...
+@@ -1387,6 +1406,11 @@ void __init trap_init(void)
+       extern char except_vec4;
+       unsigned long i;
+ 
++#if defined(CONFIG_KGDB)
++      if (kgdb_early_setup)
++              return; /* Already done */
++#endif
++
+       if (cpu_has_veic || cpu_has_vint)
+               ebase = (unsigned long) alloc_bootmem_low_pages (0x200 + VECTORSPACING*64);
+       else
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/mips-boards/generic/Makefile linux-2.6.18.kgdb/arch/mips/mips-boards/generic/Makefile
+--- linux-2.6.18/arch/mips/mips-boards/generic/Makefile        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/mips/mips-boards/generic/Makefile   2008-06-10 16:19:28.000000000 +0400
+@@ -21,6 +21,5 @@
+ obj-y                         := reset.o display.o init.o memory.o printf.o \
+                                  cmdline.o time.o
+ obj-$(CONFIG_PCI)             += pci.o
+-obj-$(CONFIG_KGDB)            += gdb_hook.o
+ 
+ EXTRA_AFLAGS := $(CFLAGS)
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/mips-boards/generic/init.c linux-2.6.18.kgdb/arch/mips/mips-boards/generic/init.c
+--- linux-2.6.18/arch/mips/mips-boards/generic/init.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/mips/mips-boards/generic/init.c     2008-06-10 16:19:28.000000000 +0400
+@@ -37,15 +37,6 @@
+ 
+ #include <asm/mips-boards/malta.h>
+ 
+-#ifdef CONFIG_KGDB
+-extern int rs_kgdb_hook(int, int);
+-extern int rs_putDebugChar(char);
+-extern char rs_getDebugChar(void);
+-extern int saa9730_kgdb_hook(int);
+-extern int saa9730_putDebugChar(char);
+-extern char saa9730_getDebugChar(void);
+-#endif
+-
+ int prom_argc;
+ int *_prom_argv, *_prom_envp;
+ 
+@@ -172,58 +163,6 @@ static void __init console_config(void)
+ }
+ #endif
+ 
+-#ifdef CONFIG_KGDB
+-void __init kgdb_config (void)
+-{
+-      extern int (*generic_putDebugChar)(char);
+-      extern char (*generic_getDebugChar)(void);
+-      char *argptr;
+-      int line, speed;
+-
+-      argptr = prom_getcmdline();
+-      if ((argptr = strstr(argptr, "kgdb=ttyS")) != NULL) {
+-              argptr += strlen("kgdb=ttyS");
+-              if (*argptr != '0' && *argptr != '1')
+-                      printk("KGDB: Unknown serial line /dev/ttyS%c, "
+-                             "falling back to /dev/ttyS1\n", *argptr);
+-              line = *argptr == '0' ? 0 : 1;
+-              printk("KGDB: Using serial line /dev/ttyS%d for session\n", line);
+-
+-              speed = 0;
+-              if (*++argptr == ',')
+-              {
+-                      int c;
+-                      while ((c = *++argptr) && ('0' <= c && c <= '9'))
+-                              speed = speed * 10 + c - '0';
+-              }
+-#ifdef CONFIG_MIPS_ATLAS
+-              if (line == 1) {
+-                      speed = saa9730_kgdb_hook(speed);
+-                      generic_putDebugChar = saa9730_putDebugChar;
+-                      generic_getDebugChar = saa9730_getDebugChar;
+-              }
+-              else
+-#endif
+-              {
+-                      speed = rs_kgdb_hook(line, speed);
+-                      generic_putDebugChar = rs_putDebugChar;
+-                      generic_getDebugChar = rs_getDebugChar;
+-              }
+-
+-              prom_printf("KGDB: Using serial line /dev/ttyS%d at %d for session, "
+-                          "please connect your debugger\n", line ? 1 : 0, speed);
+-
+-              {
+-                      char *s;
+-                      for (s = "Please connect GDB to this port\r\n"; *s; )
+-                              generic_putDebugChar (*s++);
+-              }
+-
+-              /* Breakpoint is invoked after interrupts are initialised */
+-      }
+-}
+-#endif
+-
+ void __init mips_nmi_setup (void)
+ {
+       void *base;
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/mips-boards/malta/malta_setup.c linux-2.6.18.kgdb/arch/mips/mips-boards/malta/malta_setup.c
+--- linux-2.6.18/arch/mips/mips-boards/malta/malta_setup.c     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/mips/mips-boards/malta/malta_setup.c        2008-06-10 16:19:28.000000000 +0400
+@@ -46,10 +46,6 @@ extern void mips_reboot_setup(void);
+ extern void mips_time_init(void);
+ extern unsigned long mips_rtc_get_time(void);
+ 
+-#ifdef CONFIG_KGDB
+-extern void kgdb_config(void);
+-#endif
+-
+ struct resource standard_io_resources[] = {
+       { .name = "dma1", .start = 0x00, .end = 0x1f, .flags = IORESOURCE_BUSY },
+       { .name = "timer", .start = 0x40, .end = 0x5f, .flags = IORESOURCE_BUSY },
+@@ -124,10 +120,6 @@ void __init plat_mem_setup(void)
+        */
+       enable_dma(4);
+ 
+-#ifdef CONFIG_KGDB
+-      kgdb_config ();
+-#endif
+-
+       if ((mips_revision_corid == MIPS_REVISION_CORID_BONITO64) ||
+           (mips_revision_corid == MIPS_REVISION_CORID_CORE_20K) ||
+           (mips_revision_corid == MIPS_REVISION_CORID_CORE_EMUL_BON)) {
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/mm/extable.c linux-2.6.18.kgdb/arch/mips/mm/extable.c
+--- linux-2.6.18/arch/mips/mm/extable.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/mips/mm/extable.c   2008-06-10 16:19:28.000000000 +0400
+@@ -3,6 +3,7 @@
+  */
+ #include <linux/module.h>
+ #include <linux/spinlock.h>
++#include <linux/kgdb.h>
+ #include <asm/branch.h>
+ #include <asm/uaccess.h>
+ 
+@@ -16,6 +17,12 @@ int fixup_exception(struct pt_regs *regs
+ 
+               return 1;
+       }
++#ifdef CONFIG_KGDB
++      if (atomic_read(&debugger_active) && kgdb_may_fault)
++              /* Restore our previous state. */
++              kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++              /* Not reached. */
++#endif
+ 
+       return 0;
+ }
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/sibyte/cfe/setup.c linux-2.6.18.kgdb/arch/mips/sibyte/cfe/setup.c
+--- linux-2.6.18/arch/mips/sibyte/cfe/setup.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/mips/sibyte/cfe/setup.c     2008-06-10 16:19:28.000000000 +0400
+@@ -58,10 +58,6 @@ int cfe_cons_handle;
+ extern unsigned long initrd_start, initrd_end;
+ #endif
+ 
+-#ifdef CONFIG_KGDB
+-extern int kgdb_port;
+-#endif
+-
+ static void ATTRIB_NORET cfe_linux_exit(void *arg)
+ {
+       int warm = *(int *)arg;
+@@ -242,9 +238,6 @@ void __init prom_init(void)
+       int argc = fw_arg0;
+       char **envp = (char **) fw_arg2;
+       int *prom_vec = (int *) fw_arg3;
+-#ifdef CONFIG_KGDB
+-      char *arg;
+-#endif
+ 
+       _machine_restart   = cfe_linux_restart;
+       _machine_halt      = cfe_linux_halt;
+@@ -308,13 +301,6 @@ void __init prom_init(void)
+               }
+       }
+ 
+-#ifdef CONFIG_KGDB
+-      if ((arg = strstr(arcs_cmdline,"kgdb=duart")) != NULL)
+-              kgdb_port = (arg[10] == '0') ? 0 : 1;
+-      else
+-              kgdb_port = 1;
+-#endif
+-
+ #ifdef CONFIG_BLK_DEV_INITRD
+       {
+               char *ptr;
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/sibyte/sb1250/Makefile linux-2.6.18.kgdb/arch/mips/sibyte/sb1250/Makefile
+--- linux-2.6.18/arch/mips/sibyte/sb1250/Makefile      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/mips/sibyte/sb1250/Makefile 2008-06-10 16:19:28.000000000 +0400
+@@ -4,5 +4,6 @@ obj-$(CONFIG_SMP)                      += smp.o
+ obj-$(CONFIG_SIBYTE_TBPROF)           += bcm1250_tbprof.o
+ obj-$(CONFIG_SIBYTE_STANDALONE)               += prom.o
+ obj-$(CONFIG_SIBYTE_BUS_WATCHER)      += bus_watcher.o
++obj-$(CONFIG_KGDB_SIBYTE)             += kgdb_sibyte.o
+ 
+ EXTRA_AFLAGS := $(CFLAGS)
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/sibyte/sb1250/irq.c linux-2.6.18.kgdb/arch/mips/sibyte/sb1250/irq.c
+--- linux-2.6.18/arch/mips/sibyte/sb1250/irq.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/mips/sibyte/sb1250/irq.c    2008-06-10 16:19:28.000000000 +0400
+@@ -30,6 +30,7 @@
+ #include <asm/system.h>
+ #include <asm/ptrace.h>
+ #include <asm/io.h>
++#include <asm/kgdb.h>
+ 
+ #include <asm/sibyte/sb1250_regs.h>
+ #include <asm/sibyte/sb1250_int.h>
+@@ -59,16 +60,6 @@ static void sb1250_set_affinity(unsigned
+ extern unsigned long ldt_eoi_space;
+ #endif
+ 
+-#ifdef CONFIG_KGDB
+-static int kgdb_irq;
+-
+-/* Default to UART1 */
+-int kgdb_port = 1;
+-#ifdef CONFIG_SIBYTE_SB1250_DUART
+-extern char sb1250_duart_present[];
+-#endif
+-#endif
+-
+ static struct irq_chip sb1250_irq_type = {
+       .typename = "SB1250-IMR",
+       .startup = startup_sb1250_irq,
+@@ -324,6 +315,11 @@ void __init arch_init_irq(void)
+       unsigned int imask = STATUSF_IP4 | STATUSF_IP3 | STATUSF_IP2 |
+               STATUSF_IP1 | STATUSF_IP0;
+ 
++#ifdef CONFIG_KGDB
++      if (kgdb_early_setup)
++              return;
++#endif
++
+       /* Default everything to IP2 */
+       for (i = 0; i < SB1250_NR_IRQS; i++) {  /* was I0 */
+               __raw_writeq(IMR_IP2_VAL,
+@@ -375,50 +371,6 @@ void __init arch_init_irq(void)
+       /* Enable necessary IPs, disable the rest */
+       change_c0_status(ST0_IM, imask);
+ 
+-#ifdef CONFIG_KGDB
+-      if (kgdb_flag) {
+-              kgdb_irq = K_INT_UART_0 + kgdb_port;
+-
+-#ifdef CONFIG_SIBYTE_SB1250_DUART
+-              sb1250_duart_present[kgdb_port] = 0;
+-#endif
+-              /* Setup uart 1 settings, mapper */
+-              __raw_writeq(M_DUART_IMR_BRK,
+-                           IOADDR(A_DUART_IMRREG(kgdb_port)));
+-
+-              sb1250_steal_irq(kgdb_irq);
+-              __raw_writeq(IMR_IP6_VAL,
+-                           IOADDR(A_IMR_REGISTER(0,
+-                                                 R_IMR_INTERRUPT_MAP_BASE) +
+-                                  (kgdb_irq << 3)));
+-              sb1250_unmask_irq(0, kgdb_irq);
+-      }
+-#endif
+-}
+-
+-#ifdef CONFIG_KGDB
+-
+-#include <linux/delay.h>
+-
+-#define duart_out(reg, val)     csr_out32(val, IOADDR(A_DUART_CHANREG(kgdb_port,reg)))
+-#define duart_in(reg)           csr_in32(IOADDR(A_DUART_CHANREG(kgdb_port,reg)))
+-
+-static void sb1250_kgdb_interrupt(struct pt_regs *regs)
+-{
+-      /*
+-       * Clear break-change status (allow some time for the remote
+-       * host to stop the break, since we would see another
+-       * interrupt on the end-of-break too)
+-       */
+-      kstat_this_cpu.irqs[kgdb_irq]++;
+-      mdelay(500);
+-      duart_out(R_DUART_CMD, V_DUART_MISC_CMD_RESET_BREAK_INT |
+-                              M_DUART_RX_EN | M_DUART_TX_EN);
+-      set_async_breakpoint(&regs->cp0_epc);
+-}
+-
+-#endif        /* CONFIG_KGDB */
+-
+ static inline int dclz(unsigned long long x)
+ {
+       int lz;
+@@ -473,7 +425,7 @@ asmlinkage void plat_irq_dispatch(struct
+               sb1250_mailbox_interrupt(regs);
+ #endif
+ 
+-#ifdef CONFIG_KGDB
++#ifdef CONFIG_KGDB_SIBYTE
+       else if (pending & CAUSEF_IP6)                  /* KGDB (uart 1) */
+               sb1250_kgdb_interrupt(regs);
+ #endif
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/sibyte/sb1250/kgdb_sibyte.c linux-2.6.18.kgdb/arch/mips/sibyte/sb1250/kgdb_sibyte.c
+--- linux-2.6.18/arch/mips/sibyte/sb1250/kgdb_sibyte.c 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/mips/sibyte/sb1250/kgdb_sibyte.c    2008-06-10 16:19:28.000000000 +0400
+@@ -0,0 +1,164 @@
++/*
++ * arch/mips/sibyte/sb1250/kgdb_sibyte.c
++ *
++ * Author: Manish Lachwani, mlachwani@mvista.com or manish@koffee-break.com
++ *
++ * 2004 (c) MontaVista Software, Inc. This file is licensed under
++ * the terms of the GNU General Public License version 2. This program
++ * is licensed "as is" without any warranty of any kind, whether express
++ * or implied.
++ */
++
++/*
++ * Support for KGDB on the Broadcom Sibyte. The SWARM board
++ * for example does not have a 8250/16550 compatible serial
++ * port. Hence, we need to have a driver for the serial
++ * ports to handle KGDB.  This board needs nothing in addition
++ * to what is normally provided by the gdb portion of the stub.
++ */
++
++#include <linux/delay.h>
++#include <linux/kernel_stat.h>
++#include <linux/init.h>
++#include <linux/kgdb.h>
++
++#include <asm/io.h>
++#include <asm/sibyte/sb1250.h>
++#include <asm/sibyte/sb1250_regs.h>
++#include <asm/sibyte/sb1250_uart.h>
++#include <asm/sibyte/sb1250_int.h>
++#include <asm/addrspace.h>
++
++int kgdb_port = 1;
++static int kgdb_irq;
++
++extern char sb1250_duart_present[];
++extern int sb1250_steal_irq(int irq);
++
++/* Forward declarations. */
++static void kgdbsibyte_init_duart(void);
++static int kgdb_init_io(void);
++
++#define IMR_IP6_VAL   K_INT_MAP_I4
++#define       duart_out(reg, val)     csr_out32(val, IOADDR(A_DUART_CHANREG(kgdb_port,reg)))
++#define duart_in(reg)         csr_in32(IOADDR(A_DUART_CHANREG(kgdb_port,reg)))
++
++static void kgdb_swarm_write_char(int c)
++{
++      while ((duart_in(R_DUART_STATUS) & M_DUART_TX_RDY) == 0) ;
++      duart_out(R_DUART_TX_HOLD, c);
++}
++
++static int kgdb_swarm_read_char(void)
++{
++      int ret_char;
++      unsigned int status;
++
++      status = duart_in(R_DUART_STATUS);
++      while ((status & M_DUART_RX_RDY) == 0) {
++              status = duart_in(R_DUART_STATUS);
++      }
++
++      /*
++       * Check for framing error
++       */
++      if (status & M_DUART_FRM_ERR) {
++              kgdbsibyte_init_duart();
++              kgdb_swarm_write_char('-');
++              return '-';
++      }
++
++      ret_char = duart_in(R_DUART_RX_HOLD);
++
++      return ret_char;
++}
++
++void sb1250_kgdb_interrupt(struct pt_regs *regs)
++{
++      int kgdb_irq = K_INT_UART_0 + kgdb_port;
++      /*
++       * Clear break-change status (allow some time for the remote
++       * host to stop the break, since we would see another
++       * interrupt on the end-of-break too)
++       */
++      kstat_this_cpu.irqs[kgdb_irq]++;
++      mdelay(500);
++      duart_out(R_DUART_CMD, V_DUART_MISC_CMD_RESET_BREAK_INT |
++                M_DUART_RX_EN | M_DUART_TX_EN);
++      if (kgdb_io_ops.init != kgdb_init_io) {
++              /* Throw away the data if another I/O routine is
++               * active.
++               */
++              unsigned int status;
++
++              status = duart_in(R_DUART_STATUS);
++              while ((status & M_DUART_RX_RDY) == 0) {
++                      status = duart_in(R_DUART_STATUS);
++              }
++              /*
++               * Check for framing error
++               */
++              if (status & M_DUART_FRM_ERR) {
++                      kgdbsibyte_init_duart();
++              }
++              duart_in(R_DUART_RX_HOLD);
++      } else
++              breakpoint();
++
++}
++
++/*
++ * We use port #1 and we set it for 115200 BAUD, 8n1.
++ */
++static void kgdbsibyte_init_duart(void)
++{
++      /* Set 8n1. */
++      duart_out(R_DUART_MODE_REG_1,
++                V_DUART_BITS_PER_CHAR_8 | V_DUART_PARITY_MODE_NONE);
++      duart_out(R_DUART_MODE_REG_2, M_DUART_STOP_BIT_LEN_1);
++      /* Set baud rate of 115200. */
++      duart_out(R_DUART_CLK_SEL, V_DUART_BAUD_RATE(115200));
++      /* Enable rx and tx */
++      duart_out(R_DUART_CMD, M_DUART_RX_EN | M_DUART_TX_EN);
++}
++
++static int kgdb_init_io(void)
++{
++#ifdef CONFIG_SIBYTE_SB1250_DUART
++      sb1250_duart_present[kgdb_port] = 0;
++#endif
++
++      kgdbsibyte_init_duart();
++
++      return 0;
++}
++
++/*
++ * Hookup our IRQ line.  We will already have been initialized at
++ * this point.
++ */
++static void __init kgdbsibyte_hookup_irq(void)
++{
++      /* Steal the IRQ. */
++      kgdb_irq = K_INT_UART_0 + kgdb_port;
++
++      /* Setup uart 1 settings, mapper */
++      __raw_writeq(M_DUART_IMR_BRK, IOADDR(A_DUART_IMRREG(kgdb_port)));
++
++      sb1250_steal_irq(kgdb_irq);
++
++      __raw_writeq(IMR_IP6_VAL,
++                   IOADDR(A_IMR_REGISTER(0, R_IMR_INTERRUPT_MAP_BASE) +
++                          (kgdb_irq << 3)));
++
++      sb1250_unmask_irq(0, kgdb_irq);
++}
++
++struct kgdb_io kgdb_io_ops = {
++      .read_char = kgdb_swarm_read_char,
++      .write_char = kgdb_swarm_write_char,
++      .init = kgdb_init_io,
++      .late_init = kgdbsibyte_hookup_irq,
++      .pre_exception = NULL,
++      .post_exception = NULL
++};
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/sibyte/swarm/Makefile linux-2.6.18.kgdb/arch/mips/sibyte/swarm/Makefile
+--- linux-2.6.18/arch/mips/sibyte/swarm/Makefile       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/mips/sibyte/swarm/Makefile  2008-06-10 16:19:28.000000000 +0400
+@@ -1,3 +1 @@
+ lib-y                         = setup.o rtc_xicor1241.o rtc_m41t81.o
+-
+-lib-$(CONFIG_KGDB)            += dbg_io.o
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/sibyte/swarm/dbg_io.c linux-2.6.18.kgdb/arch/mips/sibyte/swarm/dbg_io.c
+--- linux-2.6.18/arch/mips/sibyte/swarm/dbg_io.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/mips/sibyte/swarm/dbg_io.c  1970-01-01 03:00:00.000000000 +0300
+@@ -1,76 +0,0 @@
+-/*
+- * kgdb debug routines for SiByte boards.
+- *
+- * Copyright (C) 2001 MontaVista Software Inc.
+- * Author: Jun Sun, jsun@mvista.com or jsun@junsun.net
+- *
+- * This program is free software; you can redistribute  it and/or modify it
+- * under  the terms of  the GNU General  Public License as published by the
+- * Free Software Foundation;  either version 2 of the  License, or (at your
+- * option) any later version.
+- *
+- */
+-
+-/* -------------------- BEGINNING OF CONFIG --------------------- */
+-
+-#include <linux/delay.h>
+-#include <asm/io.h>
+-#include <asm/sibyte/sb1250.h>
+-#include <asm/sibyte/sb1250_regs.h>
+-#include <asm/sibyte/sb1250_uart.h>
+-#include <asm/sibyte/sb1250_int.h>
+-#include <asm/addrspace.h>
+-
+-/*
+- * We use the second serial port for kgdb traffic.
+- *    115200, 8, N, 1.
+- */
+-
+-#define       BAUD_RATE               115200
+-#define       CLK_DIVISOR             V_DUART_BAUD_RATE(BAUD_RATE)
+-#define       DATA_BITS               V_DUART_BITS_PER_CHAR_8         /* or 7    */
+-#define       PARITY                  V_DUART_PARITY_MODE_NONE        /* or even */
+-#define       STOP_BITS               M_DUART_STOP_BIT_LEN_1          /* or 2    */
+-
+-static int duart_initialized = 0;     /* 0: need to be init'ed by kgdb */
+-
+-/* -------------------- END OF CONFIG --------------------- */
+-extern int kgdb_port;
+-
+-#define       duart_out(reg, val)     csr_out32(val, IOADDR(A_DUART_CHANREG(kgdb_port,reg)))
+-#define duart_in(reg)         csr_in32(IOADDR(A_DUART_CHANREG(kgdb_port,reg)))
+-
+-void putDebugChar(unsigned char c);
+-unsigned char getDebugChar(void);
+-static void
+-duart_init(int clk_divisor, int data, int parity, int stop)
+-{
+-      duart_out(R_DUART_MODE_REG_1, data | parity);
+-      duart_out(R_DUART_MODE_REG_2, stop);
+-      duart_out(R_DUART_CLK_SEL, clk_divisor);
+-
+-      duart_out(R_DUART_CMD, M_DUART_RX_EN | M_DUART_TX_EN);  /* enable rx and tx */
+-}
+-
+-void
+-putDebugChar(unsigned char c)
+-{
+-      if (!duart_initialized) {
+-              duart_initialized = 1;
+-              duart_init(CLK_DIVISOR, DATA_BITS, PARITY, STOP_BITS);
+-      }
+-      while ((duart_in(R_DUART_STATUS) & M_DUART_TX_RDY) == 0);
+-      duart_out(R_DUART_TX_HOLD, c);
+-}
+-
+-unsigned char
+-getDebugChar(void)
+-{
+-      if (!duart_initialized) {
+-              duart_initialized = 1;
+-              duart_init(CLK_DIVISOR, DATA_BITS, PARITY, STOP_BITS);
+-      }
+-      while ((duart_in(R_DUART_STATUS) & M_DUART_RX_RDY) == 0) ;
+-      return duart_in(R_DUART_RX_HOLD);
+-}
+-
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/tx4938/common/Makefile linux-2.6.18.kgdb/arch/mips/tx4938/common/Makefile
+--- linux-2.6.18/arch/mips/tx4938/common/Makefile      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/mips/tx4938/common/Makefile 2008-06-10 16:19:28.000000000 +0400
+@@ -7,5 +7,5 @@
+ #
+ 
+ obj-y += prom.o setup.o irq.o rtc_rx5c348.o
+-obj-$(CONFIG_KGDB) += dbgio.o
++obj-$(CONFIG_KGDB_8250) += dbgio.o
+ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/powerpc/Kconfig.debug linux-2.6.18.kgdb/arch/powerpc/Kconfig.debug
+--- linux-2.6.18/arch/powerpc/Kconfig.debug    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/powerpc/Kconfig.debug       2008-06-10 16:19:22.000000000 +0400
+@@ -18,52 +18,9 @@ config DEBUG_STACK_USAGE
+ 
+         This option will slow down process creation somewhat.
+ 
+-config DEBUGGER
+-      bool "Enable debugger hooks"
+-      depends on DEBUG_KERNEL
+-      help
+-        Include in-kernel hooks for kernel debuggers. Unless you are
+-        intending to debug the kernel, say N here.
+-
+-config KGDB
+-      bool "Include kgdb kernel debugger"
+-      depends on DEBUGGER && (BROKEN || PPC_GEN550 || 4xx)
+-      select DEBUG_INFO
+-      help
+-        Include in-kernel hooks for kgdb, the Linux kernel source level
+-        debugger.  See <http://kgdb.sourceforge.net/> for more information.
+-        Unless you are intending to debug the kernel, say N here.
+-
+-choice
+-      prompt "Serial Port"
+-      depends on KGDB
+-      default KGDB_TTYS1
+-
+-config KGDB_TTYS0
+-      bool "ttyS0"
+-
+-config KGDB_TTYS1
+-      bool "ttyS1"
+-
+-config KGDB_TTYS2
+-      bool "ttyS2"
+-
+-config KGDB_TTYS3
+-      bool "ttyS3"
+-
+-endchoice
+-
+-config KGDB_CONSOLE
+-      bool "Enable serial console thru kgdb port"
+-      depends on KGDB && 8xx || CPM2
+-      help
+-        If you enable this, all serial console messages will be sent
+-        over the gdb stub.
+-        If unsure, say N.
+-
+ config XMON
+       bool "Include xmon kernel debugger"
+-      depends on DEBUGGER && !PPC_ISERIES
++      depends on DEBUG_KERNEL && !PPC_ISERIES
+       help
+         Include in-kernel hooks for the xmon kernel monitor/debugger.
+         Unless you are intending to debug the kernel, say N here.
+@@ -82,6 +39,11 @@ config XMON_DEFAULT
+         xmon is normally disabled unless booted with 'xmon=on'.
+         Use 'xmon=off' to disable xmon init during runtime.
+ 
++config DEBUGGER
++      bool
++      depends on KGDB || XMON
++      default y
++
+ config IRQSTACKS
+       bool "Use separate kernel stacks when processing interrupts"
+       depends on PPC64
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/powerpc/kernel/Makefile linux-2.6.18.kgdb/arch/powerpc/kernel/Makefile
+--- linux-2.6.18/arch/powerpc/kernel/Makefile  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/powerpc/kernel/Makefile     2008-06-10 16:19:22.000000000 +0400
+@@ -60,6 +60,7 @@ obj-$(CONFIG_BOOTX_TEXT)     += btext.o
+ obj-$(CONFIG_SMP)             += smp.o
+ obj-$(CONFIG_KPROBES)         += kprobes.o
+ obj-$(CONFIG_PPC_UDBG_16550)  += legacy_serial.o udbg_16550.o
++obj-$(CONFIG_KGDB)            += kgdb.o
+ module-$(CONFIG_PPC64)                += module_64.o
+ obj-$(CONFIG_MODULES)         += $(module-y)
+ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/powerpc/kernel/kgdb.c linux-2.6.18.kgdb/arch/powerpc/kernel/kgdb.c
+--- linux-2.6.18/arch/powerpc/kernel/kgdb.c    1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/powerpc/kernel/kgdb.c       2008-06-10 16:19:22.000000000 +0400
+@@ -0,0 +1,568 @@
++/*
++ * arch/powerpc/kernel/kgdb.c
++ *
++ * PowerPC backend to the KGDB stub.
++ *
++ * Maintainer: Tom Rini <trini@kernel.crashing.org>
++ *
++ * Copied from arch/ppc/kernel/kgdb.c, updated for ppc64
++ *
++ * Copyright (C) 1996 Paul Mackerras (setjmp/longjmp)
++ * 1998 (c) Michael AK Tesch (tesch@cs.wisc.edu)
++ * Copyright (C) 2003 Timesys Corporation.
++ * Copyright (C) 2004-2006 MontaVista Software, Inc.
++ * PPC64 Mods (C) 2005 Frank Rowand (frowand@mvista.com)
++ * PPC32 support restored by Vitaly Wool <vwool@ru.mvista.com> and
++ * Sergei Shtylyov <sshtylyov@ru.mvista.com>
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of any
++ * kind, whether express or implied.
++ */
++
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/kgdb.h>
++#include <linux/smp.h>
++#include <linux/signal.h>
++#include <linux/ptrace.h>
++#include <asm/current.h>
++#include <asm/ptrace.h>
++#include <asm/processor.h>
++#include <asm/machdep.h>
++
++/*
++ * This table contains the mapping between PowerPC hardware trap types, and
++ * signals, which are primarily what GDB understands.  GDB and the kernel
++ * don't always agree on values, so we use constants taken from gdb-6.2.
++ */
++static struct hard_trap_info
++{
++      unsigned int tt;                /* Trap type code for powerpc */
++      unsigned char signo;            /* Signal that we map this trap into */
++} hard_trap_info[] = {
++      { 0x0100, 0x02 /* SIGINT */  },         /* system reset */
++      { 0x0200, 0x0b /* SIGSEGV */ },         /* machine check */
++      { 0x0300, 0x0b /* SIGSEGV */ },         /* data access */
++      { 0x0400, 0x0b /* SIGSEGV */ },  /* instruction access */
++      { 0x0500, 0x02 /* SIGINT */  },  /* external interrupt */
++      { 0x0600, 0x0a /* SIGBUS */  },         /* alignment */
++      { 0x0700, 0x05 /* SIGTRAP */ },  /* program check */
++      { 0x0800, 0x08 /* SIGFPE */  },  /* fp unavailable */
++      { 0x0900, 0x0e /* SIGALRM */ },  /* decrementer */
++      { 0x0c00, 0x14 /* SIGCHLD */ },  /* system call */
++#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
++      { 0x2002, 0x05 /* SIGTRAP */ },  /* debug */
++#if defined(CONFIG_FSL_BOOKE)
++      { 0x2010, 0x08 /* SIGFPE */  },  /* spe unavailable */
++      { 0x2020, 0x08 /* SIGFPE */  },  /* spe unavailable */
++      { 0x2030, 0x08 /* SIGFPE */  },  /* spe fp data */
++      { 0x2040, 0x08 /* SIGFPE */  },  /* spe fp data */
++      { 0x2050, 0x08 /* SIGFPE */  },  /* spe fp round */
++      { 0x2060, 0x04 /* SIGILL */  },  /* performance monitor */
++      { 0x2900, 0x08 /* SIGFPE */  },  /* apu unavailable */
++      { 0x3100, 0x0e /* SIGALRM */ },  /* fixed interval timer */
++      { 0x3200, 0x02 /* SIGINT */  },  /* watchdog */
++#else
++      { 0x1000, 0x0e /* SIGALRM */ },  /* programmable interval timer */
++      { 0x1010, 0x0e /* SIGALRM */ },  /* fixed interval timer */
++      { 0x1020, 0x02 /* SIGINT */  },  /* watchdog */
++      { 0x2010, 0x08 /* SIGFPE */  },  /* fp unavailable */
++      { 0x2020, 0x08 /* SIGFPE */  },  /* ap unavailable */
++#endif
++#else
++      { 0x0d00, 0x05 /* SIGTRAP */ },  /* single-step */
++#if defined(CONFIG_8xx)
++      { 0x1000, 0x04 /* SIGILL */  },  /* software emulation */
++#else
++      { 0x0f00, 0x04 /* SIGILL */  },         /* performance monitor */
++      { 0x0f20, 0x08 /* SIGFPE */  },         /* altivec unavailable */
++      { 0x1300, 0x05 /* SIGTRAP */ },         /* instruction address break */
++#if defined(CONFIG_PPC64)
++      { 0x1200, 0x04 /* SIGILL */  },         /* system error */
++      { 0x1500, 0x04 /* SIGILL */  },         /* soft patch */
++      { 0x1600, 0x04 /* SIGILL */  },         /* maintenance */
++      { 0x1700, 0x08 /* SIGFPE */  },  /* altivec assist */
++      { 0x1800, 0x04 /* SIGILL */  },         /* thermal */
++#else
++      { 0x1400, 0x02 /* SIGINT */  },  /* SMI */
++      { 0x1600, 0x08 /* SIGFPE */  },  /* altivec assist */
++      { 0x1700, 0x04 /* SIGILL */  },  /* TAU */
++      { 0x2000, 0x05 /* SIGTRAP */ },  /* run mode */
++#endif
++#endif
++#endif
++      { 0x0000, 0x00 }                        /* Must be last */
++};
++
++extern atomic_t cpu_doing_single_step;
++
++static int computeSignal(unsigned int tt)
++{
++      struct hard_trap_info *ht;
++
++      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
++              if (ht->tt == tt)
++                      return ht->signo;
++
++      return SIGHUP;          /* default for things we don't know about */
++}
++
++static int kgdb_call_nmi_hook(struct pt_regs *regs)
++{
++      kgdb_nmihook(smp_processor_id(), regs);
++      return 0;
++}
++
++#ifdef CONFIG_SMP
++void kgdb_roundup_cpus(unsigned long flags)
++{
++      smp_send_debugger_break(MSG_ALL_BUT_SELF);
++}
++#endif
++
++/* KGDB functions to use existing PowerPC64 hooks. */
++static int kgdb_debugger(struct pt_regs *regs)
++{
++      return kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
++}
++
++static int kgdb_breakpoint(struct pt_regs *regs)
++{
++      if (user_mode(regs))
++              return 0;
++
++      kgdb_handle_exception(0, SIGTRAP, 0, regs);
++
++      if (*(u32 *) (regs->nip) == *(u32 *) (&arch_kgdb_ops.gdb_bpt_instr))
++              regs->nip += 4;
++
++      return 1;
++}
++
++static int kgdb_singlestep(struct pt_regs *regs)
++{
++      struct thread_info *thread_info, *exception_thread_info;
++      if (user_mode(regs))
++              return 0;
++      /*
++      * On Book E and perhaps other processors, singlestep is handled on
++      * the critical exception stack.  This causes current_thread_info()
++      * to fail, since it locates the thread_info by masking off
++      * the low bits of the current stack pointer.  We work around
++      * this issue by copying the thread_info from the kernel stack
++      * before calling kgdb_handle_exception, and copying it back
++      * afterwards.  On most processors the copy is avoided since
++      * exception_thread_info == thread_info.
++      */
++      thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1));
++      exception_thread_info = current_thread_info();
++
++      if (thread_info != exception_thread_info)
++              memcpy(exception_thread_info, thread_info, sizeof *thread_info);
++
++      kgdb_handle_exception(0, SIGTRAP, 0, regs);
++
++      if (thread_info != exception_thread_info)
++              memcpy(thread_info, exception_thread_info, sizeof *thread_info);
++
++      return 1;
++}
++
++int kgdb_iabr_match(struct pt_regs *regs)
++{
++      if (user_mode(regs))
++              return 0;
++
++      kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
++      return 1;
++}
++
++int kgdb_dabr_match(struct pt_regs *regs)
++{
++      if (user_mode(regs))
++              return 0;
++
++      kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
++      return 1;
++}
++
++#define PACK64(ptr,src) do { *(ptr++) = (src); } while(0)
++
++#define PACK32(ptr,src) do {          \
++      u32 *ptr32;                   \
++      ptr32 = (u32 *)ptr;           \
++      *(ptr32++) = (src);           \
++      ptr = (unsigned long *)ptr32; \
++      } while(0)
++
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      unsigned long *ptr = gdb_regs;
++      int reg;
++
++      memset(gdb_regs, 0, NUMREGBYTES);
++
++      for (reg = 0; reg < 32; reg++)
++              PACK64(ptr, regs->gpr[reg]);
++
++#ifdef CONFIG_FSL_BOOKE
++#ifdef CONFIG_SPE
++      for (reg = 0; reg < 32; reg++)
++              PACK64(ptr, current->thread.evr[reg]);
++#else
++      ptr += 32;
++#endif
++#else
++      /* fp registers not used by kernel, leave zero */
++      ptr += 32 * 8 / sizeof(long);
++#endif
++
++      PACK64(ptr, regs->nip);
++      PACK64(ptr, regs->msr);
++      PACK32(ptr, regs->ccr);
++      PACK64(ptr, regs->link);
++      PACK64(ptr, regs->ctr);
++      PACK32(ptr, regs->xer);
++
++#if 0
++      Following are in struct thread_struct, not struct pt_regs,
++      ignoring for now since kernel does not use them.  Would it
++      make sense to get them from the thread that kgdb is set to?
++
++      If this code is enabled, update the definition of NUMREGBYTES to
++      include the vector registers and vector state registers.
++
++      PACK32(ptr, current->thread->fpscr);
++
++      /* vr registers not used by kernel, leave zero */
++      ptr += 32 * 16 / sizeof(long);
++
++#ifdef CONFIG_ALTIVEC
++      PACK32(ptr, current->thread->vscr);
++      PACK32(ptr, current->thread->vrsave);
++#else
++      ptr += 2 * 4 / sizeof(long);
++#endif
++#else
++#ifdef CONFIG_FSL_BOOKE
++#ifdef CONFIG_SPE
++      /* u64 acc */
++      PACK32(ptr, current->thread.acc >> 32);
++      PACK32(ptr, current->thread.acc & 0xffffffff);
++      PACK64(ptr, current->thread.spefscr);
++#else
++      ptr += 2 + 1;
++#endif
++#else
++      /* fpscr not used by kernel, leave zero */
++      PACK32(ptr, 0);
++#endif
++#endif
++
++      BUG_ON((unsigned long)ptr >
++             (unsigned long)(((void *)gdb_regs) + NUMREGBYTES));
++}
++
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
++{
++      struct pt_regs *regs = (struct pt_regs *)(p->thread.ksp +
++                                                STACK_FRAME_OVERHEAD);
++      unsigned long *ptr = gdb_regs;
++      int reg;
++
++      memset(gdb_regs, 0, NUMREGBYTES);
++
++      /* Regs GPR0-2 */
++      for (reg = 0; reg < 3; reg++)
++              PACK64(ptr, regs->gpr[reg]);
++
++      /* Regs GPR3-13 are caller saved, not in regs->gpr[] */
++      ptr += 11;
++
++      /* Regs GPR14-31 */
++      for (reg = 14; reg < 32; reg++)
++              PACK64(ptr, regs->gpr[reg]);
++
++#ifdef CONFIG_FSL_BOOKE
++#ifdef CONFIG_SPE
++      for (reg = 0; reg < 32; reg++)
++              PACK64(ptr, p->thread.evr[reg]);
++#else
++      ptr += 32;
++#endif
++#else
++      /* fp registers not used by kernel, leave zero */
++      ptr += 32 * 8 / sizeof(long);
++#endif
++      PACK64(ptr, regs->nip);
++      PACK64(ptr, regs->msr);
++      PACK32(ptr, regs->ccr);
++      PACK64(ptr, regs->link);
++      PACK64(ptr, regs->ctr);
++      PACK32(ptr, regs->xer);
++
++#if 0
++      Following are in struct thread_struct, not struct pt_regs,
++      ignoring for now since kernel does not use them.  Would it
++      make sense to get them from the thread that kgdb is set to?
++
++      If this code is enabled, update the definition of NUMREGBYTES to
++      include the vector registers and vector state registers.
++
++      PACK32(ptr, p->thread->fpscr);
++
++      /* vr registers not used by kernel, leave zero */
++      ptr += 32 * 16 / sizeof(long);
++
++#ifdef CONFIG_ALTIVEC
++      PACK32(ptr, p->thread->vscr);
++      PACK32(ptr, p->thread->vrsave);
++#else
++      ptr += 2 * 4 / sizeof(long);
++#endif
++#else
++#ifdef CONFIG_FSL_BOOKE
++#ifdef CONFIG_SPE
++      /* u64 acc */
++      PACK32(ptr, p->thread.acc >> 32);
++      PACK32(ptr, p->thread.acc & 0xffffffff);
++      PACK64(ptr, p->thread.spefscr);
++#else
++      ptr += 2 + 1;
++#endif
++#else
++      /* fpscr not used by kernel, leave zero */
++      PACK32(ptr, 0);
++#endif
++#endif
++
++      BUG_ON((unsigned long)ptr >
++             (unsigned long)(((void *)gdb_regs) + NUMREGBYTES));
++}
++
++#define UNPACK64(dest,ptr) do { dest = *(ptr++); } while(0)
++
++#define UNPACK32(dest,ptr) do {       \
++      u32 *ptr32;                   \
++      ptr32 = (u32 *)ptr;           \
++      dest = *(ptr32++);            \
++      ptr = (unsigned long *)ptr32; \
++      } while(0)
++
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      unsigned long *ptr = gdb_regs;
++      int reg;
++
++#ifdef CONFIG_SPE
++      union {
++              u32 v32[2];
++              u64 v64;
++      } acc;
++#endif
++      for (reg = 0; reg < 32; reg++)
++              UNPACK64(regs->gpr[reg], ptr);
++
++#ifdef CONFIG_FSL_BOOKE
++#ifdef CONFIG_SPE
++      for (reg = 0; reg < 32; reg++)
++              UNPACK64(current->thread.evr[reg], ptr);
++#else
++      ptr += 32;
++#endif
++#else
++      /* skip the 32 8-byte fp register slots; ptr counts in longs */
++      ptr += 32 * 8 / sizeof(long);
++#endif
++      UNPACK64(regs->nip, ptr);
++      UNPACK64(regs->msr, ptr);
++      UNPACK32(regs->ccr, ptr);
++      UNPACK64(regs->link, ptr);
++      UNPACK64(regs->ctr, ptr);
++      UNPACK32(regs->xer, ptr);
++
++#if 0
++      Following are in struct thread_struct, not struct pt_regs,
++      ignoring for now since kernel does not use them.  Would it
++      make sense to get them from the thread that kgdb is set to?
++
++      If this code is enabled, update the definition of NUMREGBYTES to
++      include the vector registers and vector state registers.
++
++      /* fpscr, vscr, vrsave not used by kernel, leave unchanged */
++
++      UNPACK32(current->thread->fpscr, ptr);
++
++      /* vr registers not used by kernel, leave zero */
++      ptr += 32 * 16 / sizeof(long);
++
++      #ifdef CONFIG_ALTIVEC
++      UNPACK32(current->thread->vscr, ptr);
++      UNPACK32(current->thread->vrsave, ptr);
++#else
++      ptr += 2 * 4 / sizeof(long);
++#endif
++#else
++#ifdef CONFIG_FSL_BOOKE
++#ifdef CONFIG_SPE
++      /* u64 acc */
++      UNPACK32(acc.v32[0], ptr);
++      UNPACK32(acc.v32[1], ptr);
++      current->thread.acc = acc.v64;
++      UNPACK64(current->thread.spefscr, ptr);
++#else
++      ptr += 2 + 1;
++#endif
++#endif
++#endif
++
++      BUG_ON((unsigned long)ptr >
++             (unsigned long)(((void *)gdb_regs) + NUMREGBYTES));
++}
++
++/*
++ * This function does PowerPC specific processing for interfacing to gdb.
++ */
++int kgdb_arch_handle_exception(int vector, int signo, int err_code,
++                             char *remcom_in_buffer, char *remcom_out_buffer,
++                             struct pt_regs *linux_regs)
++{
++      char *ptr = &remcom_in_buffer[1];
++      unsigned long addr;
++
++      switch (remcom_in_buffer[0]) {
++              /*
++               * sAA..AA   Step one instruction from AA..AA
++               * This will return an error to gdb ..
++               */
++      case 's':
++      case 'c':
++              /* handle the optional parameter */
++              if (kgdb_hex2long(&ptr, &addr))
++                      linux_regs->nip = addr;
++
++              atomic_set(&cpu_doing_single_step, -1);
++              /* set the trace bit if we're stepping */
++              if (remcom_in_buffer[0] == 's') {
++#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
++                      mtspr(SPRN_DBCR0,
++                              mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM);
++                      linux_regs->msr |= MSR_DE;
++#else
++                      linux_regs->msr |= MSR_SE;
++#endif
++                      debugger_step = 1;
++                      if (kgdb_contthread)
++                              atomic_set(&cpu_doing_single_step,
++                                         smp_processor_id());
++              }
++              return 0;
++      }
++
++      return -1;
++}
++
++int kgdb_fault_setjmp(unsigned long *curr_context)
++{
++#ifdef CONFIG_PPC32
++      __asm__ __volatile__("mflr 0; stw 0,0(%0);\n\
++                              stw 1,4(%0); stw 2,8(%0);\n\
++                              mfcr 0; stw 0,12(%0);\n\
++                              stmw 13,16(%0)\n" : : "r" (curr_context));
++#else
++      __asm__ __volatile__("mflr 0; std 0,0(%0)\n\
++                            std       1,8(%0)\n\
++                            std       2,16(%0)\n\
++                            mfcr 0; std 0,24(%0)\n\
++                            std       13,32(%0)\n\
++                            std       14,40(%0)\n\
++                            std       15,48(%0)\n\
++                            std       16,56(%0)\n\
++                            std       17,64(%0)\n\
++                            std       18,72(%0)\n\
++                            std       19,80(%0)\n\
++                            std       20,88(%0)\n\
++                            std       21,96(%0)\n\
++                            std       22,104(%0)\n\
++                            std       23,112(%0)\n\
++                            std       24,120(%0)\n\
++                            std       25,128(%0)\n\
++                            std       26,136(%0)\n\
++                            std       27,144(%0)\n\
++                            std       28,152(%0)\n\
++                            std       29,160(%0)\n\
++                            std       30,168(%0)\n\
++                            std       31,176(%0)\n" : : "r" (curr_context));
++#endif
++      return 0;
++}
++
++void kgdb_fault_longjmp(unsigned long *curr_context)
++{
++#ifdef CONFIG_PPC32
++      __asm__ __volatile__("lmw 13,16(%0);\n\
++                            lwz 0,12(%0); mtcrf 0x38,0;\n\
++                            lwz 0,0(%0); lwz 1,4(%0); lwz 2,8(%0);\n\
++                            mtlr 0; mr 3,1\n" : : "r" (curr_context));
++#else
++      __asm__ __volatile__("ld        13,32(%0)\n\
++                            ld        14,40(%0)\n\
++                            ld        15,48(%0)\n\
++                            ld        16,56(%0)\n\
++                            ld        17,64(%0)\n\
++                            ld        18,72(%0)\n\
++                            ld        19,80(%0)\n\
++                            ld        20,88(%0)\n\
++                            ld        21,96(%0)\n\
++                            ld        22,104(%0)\n\
++                            ld        23,112(%0)\n\
++                            ld        24,120(%0)\n\
++                            ld        25,128(%0)\n\
++                            ld        26,136(%0)\n\
++                            ld        27,144(%0)\n\
++                            ld        28,152(%0)\n\
++                            ld        29,160(%0)\n\
++                            ld        30,168(%0)\n\
++                            ld        31,176(%0)\n\
++                            ld        0,24(%0)\n\
++                            mtcrf     0x38,0\n\
++                            ld        0,0(%0)\n\
++                            ld        1,8(%0)\n\
++                            ld        2,16(%0)\n\
++                            mtlr      0\n\
++                            mr        3,1\n" : : "r" (curr_context));
++#endif
++}
++
++/*
++ * Global data
++ */
++struct kgdb_arch arch_kgdb_ops = {
++      .gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08},
++};
++
++int kgdb_not_implemented(struct pt_regs *regs)
++{
++      return 0;
++}
++
++int kgdb_arch_init(void)
++{
++#ifdef CONFIG_XMON
++#error Both XMON and KGDB selected in .config.  Unselect one of them.
++#endif
++
++      __debugger_ipi = kgdb_call_nmi_hook;
++      __debugger = kgdb_debugger;
++      __debugger_bpt = kgdb_breakpoint;
++      __debugger_sstep = kgdb_singlestep;
++      __debugger_iabr_match = kgdb_iabr_match;
++      __debugger_dabr_match = kgdb_dabr_match;
++      __debugger_fault_handler = kgdb_not_implemented;
++
++      return 0;
++}
++
++arch_initcall(kgdb_arch_init);
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/powerpc/kernel/legacy_serial.c linux-2.6.18.kgdb/arch/powerpc/kernel/legacy_serial.c
+--- linux-2.6.18/arch/powerpc/kernel/legacy_serial.c   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/powerpc/kernel/legacy_serial.c      2008-06-10 16:19:22.000000000 +0400
+@@ -11,6 +11,9 @@
+ #include <asm/udbg.h>
+ #include <asm/pci-bridge.h>
+ #include <asm/ppc-pci.h>
++#ifdef CONFIG_KGDB_8250
++#include <linux/kgdb.h>
++#endif
+ 
+ #undef DEBUG
+ 
+@@ -470,6 +473,9 @@ static int __init serial_dev_init(void)
+                       fixup_port_pio(i, np, port);
+               if ((port->iotype == UPIO_MEM) || (port->iotype == UPIO_TSI))
+                       fixup_port_mmio(i, np, port);
++#ifdef CONFIG_KGDB_8250
++              kgdb8250_add_platform_port(i, port);
++#endif
+       }
+ 
+       DBG("Registering platform serial ports\n");
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/powerpc/kernel/setup_32.c linux-2.6.18.kgdb/arch/powerpc/kernel/setup_32.c
+--- linux-2.6.18/arch/powerpc/kernel/setup_32.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/powerpc/kernel/setup_32.c   2008-06-10 16:19:22.000000000 +0400
+@@ -45,10 +45,6 @@
+ 
+ #define DBG(fmt...)
+ 
+-#if defined CONFIG_KGDB
+-#include <asm/kgdb.h>
+-#endif
+-
+ extern void bootx_init(unsigned long r4, unsigned long phys);
+ 
+ struct ide_machdep_calls ppc_ide_md;
+@@ -248,18 +244,6 @@ void __init setup_arch(char **cmdline_p)
+       /* Register early console */
+       register_early_udbg_console();
+ 
+-#if defined(CONFIG_KGDB)
+-      if (ppc_md.kgdb_map_scc)
+-              ppc_md.kgdb_map_scc();
+-      set_debug_traps();
+-      if (strstr(cmd_line, "gdb")) {
+-              if (ppc_md.progress)
+-                      ppc_md.progress("setup_arch: kgdb breakpoint", 0x4000);
+-              printk("kgdb breakpoint activated\n");
+-              breakpoint();
+-      }
+-#endif
+-
+       /*
+        * Set cache line size based on type of cpu as a default.
+        * Systems with OF can look in the properties on the cpu node(s)
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/powerpc/mm/fault.c linux-2.6.18.kgdb/arch/powerpc/mm/fault.c
+--- linux-2.6.18/arch/powerpc/mm/fault.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/powerpc/mm/fault.c  2008-06-10 16:19:22.000000000 +0400
+@@ -28,6 +28,7 @@
+ #include <linux/highmem.h>
+ #include <linux/module.h>
+ #include <linux/kprobes.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/page.h>
+ #include <asm/pgtable.h>
+@@ -424,6 +425,13 @@ void bad_page_fault(struct pt_regs *regs
+               return;
+       }
+ 
++#ifdef CONFIG_KGDB
++      if (atomic_read(&debugger_active) && kgdb_may_fault)
++              /* Restore our previous state. */
++              kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++              /* Not reached. */
++#endif
++
+       /* kernel has accessed a bad area */
+ 
+       printk(KERN_ALERT "Unable to handle kernel paging request for ");
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/powerpc/platforms/powermac/setup.c linux-2.6.18.kgdb/arch/powerpc/platforms/powermac/setup.c
+--- linux-2.6.18/arch/powerpc/platforms/powermac/setup.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/powerpc/platforms/powermac/setup.c  2008-06-10 16:19:22.000000000 +0400
+@@ -98,8 +98,6 @@ extern struct machdep_calls pmac_md;
+ int sccdbg;
+ #endif
+ 
+-extern void zs_kgdb_hook(int tty_num);
+-
+ sys_ctrler_t sys_ctrler = SYS_CTRLER_UNKNOWN;
+ EXPORT_SYMBOL(sys_ctrler);
+ 
+@@ -319,10 +317,6 @@ static void __init pmac_setup_arch(void)
+       l2cr_init();
+ #endif /* CONFIG_PPC32 */
+ 
+-#ifdef CONFIG_KGDB
+-      zs_kgdb_hook(0);
+-#endif
+-
+       find_via_cuda();
+       find_via_pmu();
+       smu_init();
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/Kconfig.debug linux-2.6.18.kgdb/arch/ppc/Kconfig.debug
+--- linux-2.6.18/arch/ppc/Kconfig.debug        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/Kconfig.debug   2008-06-10 16:19:22.000000000 +0400
+@@ -2,42 +2,6 @@ menu "Kernel hacking"
+ 
+ source "lib/Kconfig.debug"
+ 
+-config KGDB
+-      bool "Include kgdb kernel debugger"
+-      depends on DEBUG_KERNEL && (BROKEN || PPC_GEN550 || 4xx)
+-      select DEBUG_INFO
+-      help
+-        Include in-kernel hooks for kgdb, the Linux kernel source level
+-        debugger.  See <http://kgdb.sourceforge.net/> for more information.
+-        Unless you are intending to debug the kernel, say N here.
+-
+-choice
+-      prompt "Serial Port"
+-      depends on KGDB
+-      default KGDB_TTYS1
+-
+-config KGDB_TTYS0
+-      bool "ttyS0"
+-
+-config KGDB_TTYS1
+-      bool "ttyS1"
+-
+-config KGDB_TTYS2
+-      bool "ttyS2"
+-
+-config KGDB_TTYS3
+-      bool "ttyS3"
+-
+-endchoice
+-
+-config KGDB_CONSOLE
+-      bool "Enable serial console thru kgdb port"
+-      depends on KGDB && 8xx || CPM2
+-      help
+-        If you enable this, all serial console messages will be sent
+-        over the gdb stub.
+-        If unsure, say N.
+-
+ config XMON
+       bool "Include xmon kernel debugger"
+       depends on DEBUG_KERNEL
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/kernel/kgdb.c linux-2.6.18.kgdb/arch/ppc/kernel/kgdb.c
+--- linux-2.6.18/arch/ppc/kernel/kgdb.c        1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/ppc/kernel/kgdb.c   2008-06-10 16:20:19.000000000 +0400
+@@ -0,0 +1,350 @@
++/*
++ * arch/ppc/kernel/kgdb.c
++ *
++ * PowerPC backend to the KGDB stub.
++ *
++ * Maintainer: Tom Rini <trini@kernel.crashing.org>
++ *
++ * 1998 (c) Michael AK Tesch (tesch@cs.wisc.edu)
++ * Copyright (C) 2003 Timesys Corporation.
++ * 2004 (c) MontaVista Software, Inc.
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program as licensed "as is" without any warranty of any
++ * kind, whether express or implied.
++ */
++
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/kgdb.h>
++#include <linux/smp.h>
++#include <linux/signal.h>
++#include <linux/ptrace.h>
++#include <asm/current.h>
++#include <asm/ptrace.h>
++#include <asm/processor.h>
++#include <asm/machdep.h>
++
++/*
++ * This table contains the mapping between PowerPC hardware trap types, and
++ * signals, which are primarily what GDB understands.  GDB and the kernel
++ * don't always agree on values, so we use constants taken from gdb-6.2.
++ */
++static struct hard_trap_info
++{
++      unsigned int tt;                /* Trap type code for powerpc */
++      unsigned char signo;            /* Signal that we map this trap into */
++} hard_trap_info[] = {
++#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
++      { 0x0100, 0x02 /* SIGINT */  },         /* critical input interrupt */
++      { 0x0200, 0x0b /* SIGSEGV */ },         /* machine check */
++      { 0x0300, 0x0b /* SIGSEGV */ },         /* data storage */
++      { 0x0400, 0x0a /* SIGBUS */  },         /* instruction storage */
++      { 0x0500, 0x02 /* SIGINT */  },         /* interrupt */
++      { 0x0600, 0x0a /* SIGBUS */  },         /* alignment */
++      { 0x0700, 0x04 /* SIGILL */  },         /* program */
++      { 0x0800, 0x04 /* SIGILL */  },         /* reserved */
++      { 0x0900, 0x04 /* SIGILL */  },         /* reserved */
++      { 0x0a00, 0x04 /* SIGILL */  },         /* reserved */
++      { 0x0b00, 0x04 /* SIGILL */  },         /* reserved */
++      { 0x0c00, 0x14 /* SIGCHLD */ },         /* syscall */
++      { 0x0d00, 0x04 /* SIGILL */  },         /* reserved */
++      { 0x0e00, 0x04 /* SIGILL */  },         /* reserved */
++      { 0x0f00, 0x04 /* SIGILL */  },         /* reserved */
++      { 0x2002, 0x05 /* SIGTRAP */},          /* debug */
++#else
++      { 0x0200, 0x0b /* SIGSEGV */ },         /* machine check */
++      { 0x0300, 0x0b /* SIGSEGV */ },         /* address error (store) */
++      { 0x0400, 0x0a /* SIGBUS */ },          /* instruction bus error */
++      { 0x0500, 0x02 /* SIGINT */ },          /* interrupt */
++      { 0x0600, 0x0a /* SIGBUS */ },          /* alignment */
++      { 0x0700, 0x05 /* SIGTRAP */ },         /* breakpoint trap */
++      { 0x0800, 0x08 /* SIGFPE */},           /* fpu unavail */
++      { 0x0900, 0x0e /* SIGALRM */ },         /* decrementer */
++      { 0x0a00, 0x04 /* SIGILL */ },          /* reserved */
++      { 0x0b00, 0x04 /* SIGILL */ },          /* reserved */
++      { 0x0c00, 0x14 /* SIGCHLD */ },         /* syscall */
++      { 0x0d00, 0x05 /* SIGTRAP */ },         /* single-step/watch */
++      { 0x0e00, 0x08 /* SIGFPE */ },          /* fp assist */
++#endif
++      { 0x0000, 0x000 }                       /* Must be last */
++};
++
++extern atomic_t cpu_doing_single_step;
++
++static int computeSignal(unsigned int tt)
++{
++      struct hard_trap_info *ht;
++
++      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
++              if (ht->tt == tt)
++                      return ht->signo;
++
++      return SIGHUP;          /* default for things we don't know about */
++}
++
++/* KGDB functions to use existing PowerPC hooks. */
++static void kgdb_debugger(struct pt_regs *regs)
++{
++      kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
++}
++
++static int kgdb_breakpoint(struct pt_regs *regs)
++{
++      if (user_mode(regs))
++              return 0;
++
++      kgdb_handle_exception(0, SIGTRAP, 0, regs);
++
++      if (*(u32 *) (regs->nip) == *(u32 *) (&arch_kgdb_ops.gdb_bpt_instr))
++              regs->nip += 4;
++
++      return 1;
++}
++
++static int kgdb_singlestep(struct pt_regs *regs)
++{
++      struct thread_info *thread_info, *exception_thread_info;
++
++      if (user_mode(regs))
++              return 0;
++      /*
++      * On Book E and perhaps other processors, singlestep is handled on
++      * the critical exception stack.  This causes current_thread_info()
++      * to fail, since it locates the thread_info by masking off
++      * the low bits of the current stack pointer.  We work around
++      * this issue by copying the thread_info from the kernel stack
++      * before calling kgdb_handle_exception, and copying it back
++      * afterwards.  On most processors the copy is avoided since
++      * exception_thread_info == thread_info.
++      */
++      thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1));
++      exception_thread_info = current_thread_info();
++
++      if (thread_info != exception_thread_info)
++              memcpy(exception_thread_info, thread_info, sizeof *thread_info);
++
++      kgdb_handle_exception(0, SIGTRAP, 0, regs);
++
++      if (thread_info != exception_thread_info)
++              memcpy(thread_info, exception_thread_info, sizeof *thread_info);
++
++      return 1;
++}
++
++int kgdb_iabr_match(struct pt_regs *regs)
++{
++      if (user_mode(regs))
++              return 0;
++
++      kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
++      return 1;
++}
++
++int kgdb_dabr_match(struct pt_regs *regs)
++{
++      if (user_mode(regs))
++              return 0;
++
++      kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
++      return 1;
++}
++
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      int reg;
++      unsigned long *ptr = gdb_regs;
++
++      memset(gdb_regs, 0, MAXREG * 4);
++
++      for (reg = 0; reg < 32; reg++)
++              *(ptr++) = regs->gpr[reg];
++
++#ifndef CONFIG_E500
++      for (reg = 0; reg < 64; reg++)
++              *(ptr++) = 0;
++#else
++      for (reg = 0; reg < 32; reg++)
++              *(ptr++) = current->thread.evr[reg];
++#endif
++
++      *(ptr++) = regs->nip;
++      *(ptr++) = regs->msr;
++      *(ptr++) = regs->ccr;
++      *(ptr++) = regs->link;
++      *(ptr++) = regs->ctr;
++      *(ptr++) = regs->xer;
++
++#ifdef CONFIG_SPE
++      /* u64 acc */
++      *(ptr++) = (current->thread.acc >> 32);
++      *(ptr++) = (current->thread.acc & 0xffffffff);
++      *(ptr++) = current->thread.spefscr;
++#endif
++}
++
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
++{
++      struct pt_regs *regs = (struct pt_regs *)(p->thread.ksp +
++                                                STACK_FRAME_OVERHEAD);
++      int reg;
++      unsigned long *ptr = gdb_regs;
++
++      memset(gdb_regs, 0, MAXREG * 4);
++
++      /* Regs GPR0-2, read from the frame found at p's saved kernel SP */
++      for (reg = 0; reg < 3; reg++)
++              *(ptr++) = regs->gpr[reg];
++
++      /* Regs GPR3-13 are not saved; report them as zero */
++      for (reg = 3; reg < 14; reg++)
++              *(ptr++) = 0;
++
++      /* Regs GPR14-31 */
++      for (reg = 14; reg < 32; reg++)
++              *(ptr++) = regs->gpr[reg];
++
++#ifndef CONFIG_E500
++      for (reg = 0; reg < 64; reg++)
++              *(ptr++) = 0;
++#else /* CONFIG_E500: report the sleeping task's evr[], not current's */
++      for (reg = 0; reg < 32; reg++)
++              *(ptr++) = p->thread.evr[reg];
++#endif
++
++      *(ptr++) = regs->nip;
++      *(ptr++) = regs->msr;
++      *(ptr++) = regs->ccr;
++      *(ptr++) = regs->link;
++      *(ptr++) = regs->ctr;
++      *(ptr++) = regs->xer;
++
++#ifdef CONFIG_SPE
++      /* u64 acc of the sleeping task p (high word, then low word) */
++      *(ptr++) = (p->thread.acc >> 32);
++      *(ptr++) = (p->thread.acc & 0xffffffff);
++      *(ptr++) = p->thread.spefscr;
++#endif
++}
++
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      int reg;
++      unsigned long *ptr = gdb_regs;
++#ifdef CONFIG_SPE
++      union {
++              u32 v32[2];
++              u64 v64;
++      } u;
++#endif
++
++      for (reg = 0; reg < 32; reg++)
++              regs->gpr[reg] = *(ptr++);
++
++#ifndef CONFIG_E500
++      for (reg = 0; reg < 64; reg++)
++              ptr++;
++#else
++      for (reg = 0; reg < 32; reg++)
++              current->thread.evr[reg] = *(ptr++);
++#endif
++
++      regs->nip = *(ptr++);
++      regs->msr = *(ptr++);
++      regs->ccr = *(ptr++);
++      regs->link = *(ptr++);
++      regs->ctr = *(ptr++);
++      regs->xer = *(ptr++);
++
++#ifdef CONFIG_SPE
++      /* u64 acc */
++      u.v32[0] = *(ptr++);
++      u.v32[1] = *(ptr++);
++      current->thread.acc = u.v64;
++      current->thread.spefscr = *(ptr++);
++#endif
++}
++
++/*
++ * Save/restore state in case a memory access causes a fault.
++ */
++int kgdb_fault_setjmp(unsigned long *curr_context)
++{
++      __asm__ __volatile__("mflr 0; stw 0,0(%0);"
++                           "stw 1,4(%0); stw 2,8(%0);"
++                           "mfcr 0; stw 0,12(%0);"
++                           "stmw 13,16(%0)"::"r"(curr_context));
++      return 0;
++}
++
++void kgdb_fault_longjmp(unsigned long *curr_context)
++{
++      __asm__ __volatile__("lmw 13,16(%0);"
++                           "lwz 0,12(%0); mtcrf 0x38,0;"
++                           "lwz 0,0(%0); lwz 1,4(%0); lwz 2,8(%0);"
++                           "mtlr 0; mr 3,1"::"r"(curr_context));
++}
++
++/*
++ * This function does PowerPC-specific processing for interfacing to gdb.
++ */
++int kgdb_arch_handle_exception(int vector, int signo, int err_code,
++                             char *remcom_in_buffer, char *remcom_out_buffer,
++                             struct pt_regs *linux_regs)
++{
++      char *ptr = &remcom_in_buffer[1];
++      unsigned long addr;
++
++      switch (remcom_in_buffer[0])
++              {
++              /*
++               * sAA..AA   Step one instruction from AA..AA
++               * This will return an error to gdb ..
++               */
++              case 's':
++              case 'c':
++                      /* handle the optional parameter */
++                      if (kgdb_hex2long (&ptr, &addr))
++                              linux_regs->nip = addr;
++
++                      atomic_set(&cpu_doing_single_step, -1);
++                      /* set the trace bit if we're stepping */
++                      if (remcom_in_buffer[0] == 's') {
++#if defined (CONFIG_40x) || defined(CONFIG_BOOKE)
++                              mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) |
++                                              DBCR0_IC | DBCR0_IDM);
++                              linux_regs->msr |= MSR_DE;
++#else
++                              linux_regs->msr |= MSR_SE;
++#endif
++                              debugger_step = 1;
++                              if (kgdb_contthread)
++                                      atomic_set(&cpu_doing_single_step,
++                                                      smp_processor_id());
++                      }
++                      return 0;
++      }
++
++      return -1;
++}
++
++/*
++ * Global data
++ */
++struct kgdb_arch arch_kgdb_ops = {
++      .gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08},
++};
++
++int kgdb_arch_init(void)
++{
++      debugger = kgdb_debugger;
++      debugger_bpt = kgdb_breakpoint;
++      debugger_sstep = kgdb_singlestep;
++      debugger_iabr_match = kgdb_iabr_match;
++      debugger_dabr_match = kgdb_dabr_match;
++
++      return 0;
++}
++
++arch_initcall(kgdb_arch_init);
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/kernel/ppc-stub.c linux-2.6.18.kgdb/arch/ppc/kernel/ppc-stub.c
+--- linux-2.6.18/arch/ppc/kernel/ppc-stub.c    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/kernel/ppc-stub.c       1970-01-01 03:00:00.000000000 +0300
+@@ -1,866 +0,0 @@
+-/*
+- * ppc-stub.c:  KGDB support for the Linux kernel.
+- *
+- * adapted from arch/sparc/kernel/sparc-stub.c for the PowerPC
+- * some stuff borrowed from Paul Mackerras' xmon
+- * Copyright (C) 1998 Michael AK Tesch (tesch@cs.wisc.edu)
+- *
+- * Modifications to run under Linux
+- * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
+- *
+- * This file originally came from the gdb sources, and the
+- * copyright notices have been retained below.
+- */
+-
+-/****************************************************************************
+-
+-              THIS SOFTWARE IS NOT COPYRIGHTED
+-
+-   HP offers the following for use in the public domain.  HP makes no
+-   warranty with regard to the software or its performance and the
+-   user accepts the software "AS IS" with all faults.
+-
+-   HP DISCLAIMS ANY WARRANTIES, EXPRESS OR IMPLIED, WITH REGARD
+-   TO THIS SOFTWARE INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+-   OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+-
+-****************************************************************************/
+-
+-/****************************************************************************
+- *  Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $
+- *
+- *  Module name: remcom.c $
+- *  Revision: 1.34 $
+- *  Date: 91/03/09 12:29:49 $
+- *  Contributor:     Lake Stevens Instrument Division$
+- *
+- *  Description:     low level support for gdb debugger. $
+- *
+- *  Considerations:  only works on target hardware $
+- *
+- *  Written by:      Glenn Engel $
+- *  ModuleState:     Experimental $
+- *
+- *  NOTES:           See Below $
+- *
+- *  Modified for SPARC by Stu Grossman, Cygnus Support.
+- *
+- *  This code has been extensively tested on the Fujitsu SPARClite demo board.
+- *
+- *  To enable debugger support, two things need to happen.  One, a
+- *  call to set_debug_traps() is necessary in order to allow any breakpoints
+- *  or error conditions to be properly intercepted and reported to gdb.
+- *  Two, a breakpoint needs to be generated to begin communication.  This
+- *  is most easily accomplished by a call to breakpoint().  Breakpoint()
+- *  simulates a breakpoint by executing a trap #1.
+- *
+- *************
+- *
+- *    The following gdb commands are supported:
+- *
+- * command          function                    Return value
+- *
+- *    g             return the value of the CPU registers  hex data or ENN
+- *    G             set the value of the CPU registers     OK or ENN
+- *    qOffsets      Get section offsets.  Reply is Text=xxx;Data=yyy;Bss=zzz
+- *
+- *    mAA..AA,LLLL  Read LLLL bytes at address AA..AA      hex data or ENN
+- *    MAA..AA,LLLL: Write LLLL bytes at address AA.AA      OK or ENN
+- *
+- *    c             Resume at current address              SNN   ( signal NN)
+- *    cAA..AA       Continue at address AA..AA             SNN
+- *
+- *    s             Step one instruction                   SNN
+- *    sAA..AA       Step one instruction from AA..AA       SNN
+- *
+- *    k             kill
+- *
+- *    ?             What was the last sigval ?             SNN   (signal NN)
+- *
+- *    bBB..BB     Set baud rate to BB..BB                OK or BNN, then sets
+- *                                                       baud rate
+- *
+- * All commands and responses are sent with a packet which includes a
+- * checksum.  A packet consists of
+- *
+- * $<packet info>#<checksum>.
+- *
+- * where
+- * <packet info> :: <characters representing the command or response>
+- * <checksum>    :: <two hex digits computed as modulo 256 sum of <packetinfo>>
+- *
+- * When a packet is received, it is first acknowledged with either '+' or '-'.
+- * '+' indicates a successful transfer.  '-' indicates a failed transfer.
+- *
+- * Example:
+- *
+- * Host:                  Reply:
+- * $m0,10#2a               +$00010203040506070809101112131415#42
+- *
+- ****************************************************************************/
+-
+-#include <linux/kernel.h>
+-#include <linux/string.h>
+-#include <linux/mm.h>
+-#include <linux/smp.h>
+-#include <linux/smp_lock.h>
+-#include <linux/init.h>
+-#include <linux/sysrq.h>
+-
+-#include <asm/cacheflush.h>
+-#include <asm/system.h>
+-#include <asm/signal.h>
+-#include <asm/kgdb.h>
+-#include <asm/pgtable.h>
+-#include <asm/ptrace.h>
+-
+-void breakinst(void);
+-
+-/*
+- * BUFMAX defines the maximum number of characters in inbound/outbound buffers
+- * at least NUMREGBYTES*2 are needed for register packets
+- */
+-#define BUFMAX 2048
+-static char remcomInBuffer[BUFMAX];
+-static char remcomOutBuffer[BUFMAX];
+-
+-static int initialized;
+-static int kgdb_active;
+-static int kgdb_started;
+-static u_int fault_jmp_buf[100];
+-static int kdebug;
+-
+-
+-static const char hexchars[]="0123456789abcdef";
+-
+-/* Place where we save old trap entries for restoration - sparc*/
+-/* struct tt_entry kgdb_savettable[256]; */
+-/* typedef void (*trapfunc_t)(void); */
+-
+-static void kgdb_fault_handler(struct pt_regs *regs);
+-static int handle_exception (struct pt_regs *regs);
+-
+-#if 0
+-/* Install an exception handler for kgdb */
+-static void exceptionHandler(int tnum, unsigned int *tfunc)
+-{
+-      /* We are dorking with a live trap table, all irqs off */
+-}
+-#endif
+-
+-int
+-kgdb_setjmp(long *buf)
+-{
+-      asm ("mflr 0; stw 0,0(%0);"
+-           "stw 1,4(%0); stw 2,8(%0);"
+-           "mfcr 0; stw 0,12(%0);"
+-           "stmw 13,16(%0)"
+-           : : "r" (buf));
+-      /* XXX should save fp regs as well */
+-      return 0;
+-}
+-void
+-kgdb_longjmp(long *buf, int val)
+-{
+-      if (val == 0)
+-              val = 1;
+-      asm ("lmw 13,16(%0);"
+-           "lwz 0,12(%0); mtcrf 0x38,0;"
+-           "lwz 0,0(%0); lwz 1,4(%0); lwz 2,8(%0);"
+-           "mtlr 0; mr 3,%1"
+-           : : "r" (buf), "r" (val));
+-}
+-/* Convert ch from a hex digit to an int */
+-static int
+-hex(unsigned char ch)
+-{
+-      if (ch >= 'a' && ch <= 'f')
+-              return ch-'a'+10;
+-      if (ch >= '0' && ch <= '9')
+-              return ch-'0';
+-      if (ch >= 'A' && ch <= 'F')
+-              return ch-'A'+10;
+-      return -1;
+-}
+-
+-/* Convert the memory pointed to by mem into hex, placing result in buf.
+- * Return a pointer to the last char put in buf (null), in case of mem fault,
+- * return 0.
+- */
+-static unsigned char *
+-mem2hex(const char *mem, char *buf, int count)
+-{
+-      unsigned char ch;
+-      unsigned short tmp_s;
+-      unsigned long tmp_l;
+-
+-      if (kgdb_setjmp((long*)fault_jmp_buf) == 0) {
+-              debugger_fault_handler = kgdb_fault_handler;
+-
+-              /* Accessing 16 bit and 32 bit objects in a single
+-              ** load instruction is required to avoid bad side
+-              ** effects for some IO registers.
+-              */
+-
+-              if ((count == 2) && (((long)mem & 1) == 0)) {
+-                      tmp_s = *(unsigned short *)mem;
+-                      mem += 2;
+-                      *buf++ = hexchars[(tmp_s >> 12) & 0xf];
+-                      *buf++ = hexchars[(tmp_s >> 8) & 0xf];
+-                      *buf++ = hexchars[(tmp_s >> 4) & 0xf];
+-                      *buf++ = hexchars[tmp_s & 0xf];
+-
+-              } else if ((count == 4) && (((long)mem & 3) == 0)) {
+-                      tmp_l = *(unsigned int *)mem;
+-                      mem += 4;
+-                      *buf++ = hexchars[(tmp_l >> 28) & 0xf];
+-                      *buf++ = hexchars[(tmp_l >> 24) & 0xf];
+-                      *buf++ = hexchars[(tmp_l >> 20) & 0xf];
+-                      *buf++ = hexchars[(tmp_l >> 16) & 0xf];
+-                      *buf++ = hexchars[(tmp_l >> 12) & 0xf];
+-                      *buf++ = hexchars[(tmp_l >> 8) & 0xf];
+-                      *buf++ = hexchars[(tmp_l >> 4) & 0xf];
+-                      *buf++ = hexchars[tmp_l & 0xf];
+-
+-              } else {
+-                      while (count-- > 0) {
+-                              ch = *mem++;
+-                              *buf++ = hexchars[ch >> 4];
+-                              *buf++ = hexchars[ch & 0xf];
+-                      }
+-              }
+-
+-      } else {
+-              /* error condition */
+-      }
+-      debugger_fault_handler = NULL;
+-      *buf = 0;
+-      return buf;
+-}
+-
+-/* convert the hex array pointed to by buf into binary to be placed in mem
+- * return a pointer to the character AFTER the last byte written.
+-*/
+-static char *
+-hex2mem(char *buf, char *mem, int count)
+-{
+-      unsigned char ch;
+-      int i;
+-      char *orig_mem;
+-      unsigned short tmp_s;
+-      unsigned long tmp_l;
+-
+-      orig_mem = mem;
+-
+-      if (kgdb_setjmp((long*)fault_jmp_buf) == 0) {
+-              debugger_fault_handler = kgdb_fault_handler;
+-
+-              /* Accessing 16 bit and 32 bit objects in a single
+-              ** store instruction is required to avoid bad side
+-              ** effects for some IO registers.
+-              */
+-
+-              if ((count == 2) && (((long)mem & 1) == 0)) {
+-                      tmp_s = hex(*buf++) << 12;
+-                      tmp_s |= hex(*buf++) << 8;
+-                      tmp_s |= hex(*buf++) << 4;
+-                      tmp_s |= hex(*buf++);
+-
+-                      *(unsigned short *)mem = tmp_s;
+-                      mem += 2;
+-
+-              } else if ((count == 4) && (((long)mem & 3) == 0)) {
+-                      tmp_l = hex(*buf++) << 28;
+-                      tmp_l |= hex(*buf++) << 24;
+-                      tmp_l |= hex(*buf++) << 20;
+-                      tmp_l |= hex(*buf++) << 16;
+-                      tmp_l |= hex(*buf++) << 12;
+-                      tmp_l |= hex(*buf++) << 8;
+-                      tmp_l |= hex(*buf++) << 4;
+-                      tmp_l |= hex(*buf++);
+-
+-                      *(unsigned long *)mem = tmp_l;
+-                      mem += 4;
+-
+-              } else {
+-                      for (i=0; i<count; i++) {
+-                              ch = hex(*buf++) << 4;
+-                              ch |= hex(*buf++);
+-                              *mem++ = ch;
+-                      }
+-              }
+-
+-
+-              /*
+-              ** Flush the data cache, invalidate the instruction cache.
+-              */
+-              flush_icache_range((int)orig_mem, (int)orig_mem + count - 1);
+-
+-      } else {
+-              /* error condition */
+-      }
+-      debugger_fault_handler = NULL;
+-      return mem;
+-}
+-
+-/*
+- * While we find nice hex chars, build an int.
+- * Return number of chars processed.
+- */
+-static int
+-hexToInt(char **ptr, int *intValue)
+-{
+-      int numChars = 0;
+-      int hexValue;
+-
+-      *intValue = 0;
+-
+-      if (kgdb_setjmp((long*)fault_jmp_buf) == 0) {
+-              debugger_fault_handler = kgdb_fault_handler;
+-              while (**ptr) {
+-                      hexValue = hex(**ptr);
+-                      if (hexValue < 0)
+-                              break;
+-
+-                      *intValue = (*intValue << 4) | hexValue;
+-                      numChars ++;
+-
+-                      (*ptr)++;
+-              }
+-      } else {
+-              /* error condition */
+-      }
+-      debugger_fault_handler = NULL;
+-
+-      return (numChars);
+-}
+-
+-/* scan for the sequence $<data>#<checksum> */
+-static void
+-getpacket(char *buffer)
+-{
+-      unsigned char checksum;
+-      unsigned char xmitcsum;
+-      int i;
+-      int count;
+-      unsigned char ch;
+-
+-      do {
+-              /* wait around for the start character, ignore all other
+-               * characters */
+-              while ((ch = (getDebugChar() & 0x7f)) != '$') ;
+-
+-              checksum = 0;
+-              xmitcsum = -1;
+-
+-              count = 0;
+-
+-              /* now, read until a # or end of buffer is found */
+-              while (count < BUFMAX) {
+-                      ch = getDebugChar() & 0x7f;
+-                      if (ch == '#')
+-                              break;
+-                      checksum = checksum + ch;
+-                      buffer[count] = ch;
+-                      count = count + 1;
+-              }
+-
+-              if (count >= BUFMAX)
+-                      continue;
+-
+-              buffer[count] = 0;
+-
+-              if (ch == '#') {
+-                      xmitcsum = hex(getDebugChar() & 0x7f) << 4;
+-                      xmitcsum |= hex(getDebugChar() & 0x7f);
+-                      if (checksum != xmitcsum)
+-                              putDebugChar('-');      /* failed checksum */
+-                      else {
+-                              putDebugChar('+'); /* successful transfer */
+-                              /* if a sequence char is present, reply the ID */
+-                              if (buffer[2] == ':') {
+-                                      putDebugChar(buffer[0]);
+-                                      putDebugChar(buffer[1]);
+-                                      /* remove sequence chars from buffer */
+-                                      count = strlen(buffer);
+-                                      for (i=3; i <= count; i++)
+-                                              buffer[i-3] = buffer[i];
+-                              }
+-                      }
+-              }
+-      } while (checksum != xmitcsum);
+-}
+-
+-/* send the packet in buffer. */
+-static void putpacket(unsigned char *buffer)
+-{
+-      unsigned char checksum;
+-      int count;
+-      unsigned char ch, recv;
+-
+-      /* $<packet info>#<checksum>. */
+-      do {
+-              putDebugChar('$');
+-              checksum = 0;
+-              count = 0;
+-
+-              while ((ch = buffer[count])) {
+-                      putDebugChar(ch);
+-                      checksum += ch;
+-                      count += 1;
+-              }
+-
+-              putDebugChar('#');
+-              putDebugChar(hexchars[checksum >> 4]);
+-              putDebugChar(hexchars[checksum & 0xf]);
+-              recv = getDebugChar();
+-      } while ((recv & 0x7f) != '+');
+-}
+-
+-static void kgdb_flush_cache_all(void)
+-{
+-      flush_instruction_cache();
+-}
+-
+-/* Set up exception handlers for tracing and breakpoints
+- * [could be called kgdb_init()]
+- */
+-void set_debug_traps(void)
+-{
+-#if 0
+-      unsigned char c;
+-
+-      save_and_cli(flags);
+-
+-      /* In case GDB is started before us, ack any packets (presumably
+-       * "$?#xx") sitting there.
+-       *
+-       * I've found this code causes more problems than it solves,
+-       * so that's why it's commented out.  GDB seems to work fine
+-       * now starting either before or after the kernel   -bwb
+-       */
+-
+-      while((c = getDebugChar()) != '$');
+-      while((c = getDebugChar()) != '#');
+-      c = getDebugChar(); /* eat first csum byte */
+-      c = getDebugChar(); /* eat second csum byte */
+-      putDebugChar('+'); /* ack it */
+-#endif
+-      debugger = kgdb;
+-      debugger_bpt = kgdb_bpt;
+-      debugger_sstep = kgdb_sstep;
+-      debugger_iabr_match = kgdb_iabr_match;
+-      debugger_dabr_match = kgdb_dabr_match;
+-
+-      initialized = 1;
+-}
+-
+-static void kgdb_fault_handler(struct pt_regs *regs)
+-{
+-      kgdb_longjmp((long*)fault_jmp_buf, 1);
+-}
+-
+-int kgdb_bpt(struct pt_regs *regs)
+-{
+-      return handle_exception(regs);
+-}
+-
+-int kgdb_sstep(struct pt_regs *regs)
+-{
+-      return handle_exception(regs);
+-}
+-
+-void kgdb(struct pt_regs *regs)
+-{
+-      handle_exception(regs);
+-}
+-
+-int kgdb_iabr_match(struct pt_regs *regs)
+-{
+-      printk(KERN_ERR "kgdb doesn't support iabr, what?!?\n");
+-      return handle_exception(regs);
+-}
+-
+-int kgdb_dabr_match(struct pt_regs *regs)
+-{
+-      printk(KERN_ERR "kgdb doesn't support dabr, what?!?\n");
+-      return handle_exception(regs);
+-}
+-
+-/* Convert the hardware trap type code to a unix signal number. */
+-/*
+- * This table contains the mapping between PowerPC hardware trap types, and
+- * signals, which are primarily what GDB understands.
+- */
+-static struct hard_trap_info
+-{
+-      unsigned int tt;                /* Trap type code for powerpc */
+-      unsigned char signo;            /* Signal that we map this trap into */
+-} hard_trap_info[] = {
+-#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
+-      { 0x100, SIGINT  },             /* critical input interrupt */
+-      { 0x200, SIGSEGV },             /* machine check */
+-      { 0x300, SIGSEGV },             /* data storage */
+-      { 0x400, SIGBUS  },             /* instruction storage */
+-      { 0x500, SIGINT  },             /* interrupt */
+-      { 0x600, SIGBUS  },             /* alignment */
+-      { 0x700, SIGILL  },             /* program */
+-      { 0x800, SIGILL  },             /* reserved */
+-      { 0x900, SIGILL  },             /* reserved */
+-      { 0xa00, SIGILL  },             /* reserved */
+-      { 0xb00, SIGILL  },             /* reserved */
+-      { 0xc00, SIGCHLD },             /* syscall */
+-      { 0xd00, SIGILL  },             /* reserved */
+-      { 0xe00, SIGILL  },             /* reserved */
+-      { 0xf00, SIGILL  },             /* reserved */
+-      /*
+-      ** 0x1000  PIT
+-      ** 0x1010  FIT
+-      ** 0x1020  watchdog
+-      ** 0x1100  data TLB miss
+-      ** 0x1200  instruction TLB miss
+-      */
+-      { 0x2002, SIGTRAP},             /* debug */
+-#else
+-      { 0x200, SIGSEGV },             /* machine check */
+-      { 0x300, SIGSEGV },             /* address error (store) */
+-      { 0x400, SIGBUS },              /* instruction bus error */
+-      { 0x500, SIGINT },              /* interrupt */
+-      { 0x600, SIGBUS },              /* alingment */
+-      { 0x700, SIGTRAP },             /* breakpoint trap */
+-      { 0x800, SIGFPE },              /* fpu unavail */
+-      { 0x900, SIGALRM },             /* decrementer */
+-      { 0xa00, SIGILL },              /* reserved */
+-      { 0xb00, SIGILL },              /* reserved */
+-      { 0xc00, SIGCHLD },             /* syscall */
+-      { 0xd00, SIGTRAP },             /* single-step/watch */
+-      { 0xe00, SIGFPE },              /* fp assist */
+-#endif
+-      { 0, 0}                         /* Must be last */
+-
+-};
+-
+-static int computeSignal(unsigned int tt)
+-{
+-      struct hard_trap_info *ht;
+-
+-      for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
+-              if (ht->tt == tt)
+-                      return ht->signo;
+-
+-      return SIGHUP; /* default for things we don't know about */
+-}
+-
+-#define PC_REGNUM 64
+-#define SP_REGNUM 1
+-
+-/*
+- * This function does all command processing for interfacing to gdb.
+- */
+-static int
+-handle_exception (struct pt_regs *regs)
+-{
+-      int sigval;
+-      int addr;
+-      int length;
+-      char *ptr;
+-      unsigned int msr;
+-
+-      /* We don't handle user-mode breakpoints. */
+-      if (user_mode(regs))
+-              return 0;
+-
+-      if (debugger_fault_handler) {
+-              debugger_fault_handler(regs);
+-              panic("kgdb longjump failed!\n");
+-      }
+-      if (kgdb_active) {
+-              printk(KERN_ERR "interrupt while in kgdb, returning\n");
+-              return 0;
+-      }
+-
+-      kgdb_active = 1;
+-      kgdb_started = 1;
+-
+-#ifdef KGDB_DEBUG
+-      printk("kgdb: entering handle_exception; trap [0x%x]\n",
+-                      (unsigned int)regs->trap);
+-#endif
+-
+-      kgdb_interruptible(0);
+-      lock_kernel();
+-      msr = mfmsr();
+-      mtmsr(msr & ~MSR_EE);   /* disable interrupts */
+-
+-      if (regs->nip == (unsigned long)breakinst) {
+-              /* Skip over breakpoint trap insn */
+-              regs->nip += 4;
+-      }
+-
+-      /* reply to host that an exception has occurred */
+-      sigval = computeSignal(regs->trap);
+-      ptr = remcomOutBuffer;
+-
+-      *ptr++ = 'T';
+-      *ptr++ = hexchars[sigval >> 4];
+-      *ptr++ = hexchars[sigval & 0xf];
+-      *ptr++ = hexchars[PC_REGNUM >> 4];
+-      *ptr++ = hexchars[PC_REGNUM & 0xf];
+-      *ptr++ = ':';
+-      ptr = mem2hex((char *)&regs->nip, ptr, 4);
+-      *ptr++ = ';';
+-      *ptr++ = hexchars[SP_REGNUM >> 4];
+-      *ptr++ = hexchars[SP_REGNUM & 0xf];
+-      *ptr++ = ':';
+-      ptr = mem2hex(((char *)regs) + SP_REGNUM*4, ptr, 4);
+-      *ptr++ = ';';
+-      *ptr++ = 0;
+-
+-      putpacket(remcomOutBuffer);
+-      if (kdebug)
+-              printk("remcomOutBuffer: %s\n", remcomOutBuffer);
+-
+-      /* XXX We may want to add some features dealing with poking the
+-       * XXX page tables, ... (look at sparc-stub.c for more info)
+-       * XXX also required hacking to the gdb sources directly...
+-       */
+-
+-      while (1) {
+-              remcomOutBuffer[0] = 0;
+-
+-              getpacket(remcomInBuffer);
+-              switch (remcomInBuffer[0]) {
+-              case '?': /* report most recent signal */
+-                      remcomOutBuffer[0] = 'S';
+-                      remcomOutBuffer[1] = hexchars[sigval >> 4];
+-                      remcomOutBuffer[2] = hexchars[sigval & 0xf];
+-                      remcomOutBuffer[3] = 0;
+-                      break;
+-#if 0
+-              case 'q': /* this screws up gdb for some reason...*/
+-              {
+-                      extern long _start, sdata, __bss_start;
+-
+-                      ptr = &remcomInBuffer[1];
+-                      if (strncmp(ptr, "Offsets", 7) != 0)
+-                              break;
+-
+-                      ptr = remcomOutBuffer;
+-                      sprintf(ptr, "Text=%8.8x;Data=%8.8x;Bss=%8.8x",
+-                              &_start, &sdata, &__bss_start);
+-                      break;
+-              }
+-#endif
+-              case 'd':
+-                      /* toggle debug flag */
+-                      kdebug ^= 1;
+-                      break;
+-
+-              case 'g':       /* return the value of the CPU registers.
+-                               * some of them are non-PowerPC names :(
+-                               * they are stored in gdb like:
+-                               * struct {
+-                               *     u32 gpr[32];
+-                               *     f64 fpr[32];
+-                               *     u32 pc, ps, cnd, lr; (ps=msr)
+-                               *     u32 cnt, xer, mq;
+-                               * }
+-                               */
+-              {
+-                      int i;
+-                      ptr = remcomOutBuffer;
+-                      /* General Purpose Regs */
+-                      ptr = mem2hex((char *)regs, ptr, 32 * 4);
+-                      /* Floating Point Regs - FIXME */
+-                      /*ptr = mem2hex((char *), ptr, 32 * 8);*/
+-                      for(i=0; i<(32*8*2); i++) { /* 2chars/byte */
+-                              ptr[i] = '0';
+-                      }
+-                      ptr += 32*8*2;
+-                      /* pc, msr, cr, lr, ctr, xer, (mq is unused) */
+-                      ptr = mem2hex((char *)&regs->nip, ptr, 4);
+-                      ptr = mem2hex((char *)&regs->msr, ptr, 4);
+-                      ptr = mem2hex((char *)&regs->ccr, ptr, 4);
+-                      ptr = mem2hex((char *)&regs->link, ptr, 4);
+-                      ptr = mem2hex((char *)&regs->ctr, ptr, 4);
+-                      ptr = mem2hex((char *)&regs->xer, ptr, 4);
+-              }
+-                      break;
+-
+-              case 'G': /* set the value of the CPU registers */
+-              {
+-                      ptr = &remcomInBuffer[1];
+-
+-                      /*
+-                       * If the stack pointer has moved, you should pray.
+-                       * (cause only god can help you).
+-                       */
+-
+-                      /* General Purpose Regs */
+-                      hex2mem(ptr, (char *)regs, 32 * 4);
+-
+-                      /* Floating Point Regs - FIXME?? */
+-                      /*ptr = hex2mem(ptr, ??, 32 * 8);*/
+-                      ptr += 32*8*2;
+-
+-                      /* pc, msr, cr, lr, ctr, xer, (mq is unused) */
+-                      ptr = hex2mem(ptr, (char *)&regs->nip, 4);
+-                      ptr = hex2mem(ptr, (char *)&regs->msr, 4);
+-                      ptr = hex2mem(ptr, (char *)&regs->ccr, 4);
+-                      ptr = hex2mem(ptr, (char *)&regs->link, 4);
+-                      ptr = hex2mem(ptr, (char *)&regs->ctr, 4);
+-                      ptr = hex2mem(ptr, (char *)&regs->xer, 4);
+-
+-                      strcpy(remcomOutBuffer,"OK");
+-              }
+-                      break;
+-              case 'H':
+-                      /* don't do anything, yet, just acknowledge */
+-                      hexToInt(&ptr, &addr);
+-                      strcpy(remcomOutBuffer,"OK");
+-                      break;
+-
+-              case 'm':       /* mAA..AA,LLLL  Read LLLL bytes at address AA..AA */
+-                              /* Try to read %x,%x.  */
+-
+-                      ptr = &remcomInBuffer[1];
+-
+-                      if (hexToInt(&ptr, &addr) && *ptr++ == ','
+-                                      && hexToInt(&ptr, &length)) {
+-                              if (mem2hex((char *)addr, remcomOutBuffer,
+-                                                      length))
+-                                      break;
+-                              strcpy(remcomOutBuffer, "E03");
+-                      } else
+-                              strcpy(remcomOutBuffer, "E01");
+-                      break;
+-
+-              case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA.AA return OK */
+-                      /* Try to read '%x,%x:'.  */
+-
+-                      ptr = &remcomInBuffer[1];
+-
+-                      if (hexToInt(&ptr, &addr) && *ptr++ == ','
+-                                      && hexToInt(&ptr, &length)
+-                                      && *ptr++ == ':') {
+-                              if (hex2mem(ptr, (char *)addr, length))
+-                                      strcpy(remcomOutBuffer, "OK");
+-                              else
+-                                      strcpy(remcomOutBuffer, "E03");
+-                              flush_icache_range(addr, addr+length);
+-                      } else
+-                              strcpy(remcomOutBuffer, "E02");
+-                      break;
+-
+-
+-              case 'k': /* kill the program, actually just continue */
+-              case 'c': /* cAA..AA  Continue; address AA..AA optional */
+-                      /* try to read optional parameter, pc unchanged if no parm */
+-
+-                      ptr = &remcomInBuffer[1];
+-                      if (hexToInt(&ptr, &addr))
+-                              regs->nip = addr;
+-
+-/* Need to flush the instruction cache here, as we may have deposited a
+- * breakpoint, and the icache probably has no way of knowing that a data ref to
+- * some location may have changed something that is in the instruction cache.
+- */
+-                      kgdb_flush_cache_all();
+-                      mtmsr(msr);
+-
+-                      kgdb_interruptible(1);
+-                      unlock_kernel();
+-                      kgdb_active = 0;
+-                      if (kdebug) {
+-                              printk("remcomInBuffer: %s\n", remcomInBuffer);
+-                              printk("remcomOutBuffer: %s\n", remcomOutBuffer);
+-                      }
+-                      return 1;
+-
+-              case 's':
+-                      kgdb_flush_cache_all();
+-#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
+-                      mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC);
+-                      regs->msr |= MSR_DE;
+-#else
+-                      regs->msr |= MSR_SE;
+-#endif
+-                      unlock_kernel();
+-                      kgdb_active = 0;
+-                      if (kdebug) {
+-                              printk("remcomInBuffer: %s\n", remcomInBuffer);
+-                              printk("remcomOutBuffer: %s\n", remcomOutBuffer);
+-                      }
+-                      return 1;
+-
+-              case 'r':               /* Reset (if user process..exit ???)*/
+-                      panic("kgdb reset.");
+-                      break;
+-              }                       /* switch */
+-              if (remcomOutBuffer[0] && kdebug) {
+-                      printk("remcomInBuffer: %s\n", remcomInBuffer);
+-                      printk("remcomOutBuffer: %s\n", remcomOutBuffer);
+-              }
+-              /* reply to the request */
+-              putpacket(remcomOutBuffer);
+-      } /* while(1) */
+-}
+-
+-/* This function will generate a breakpoint exception.  It is used at the
+-   beginning of a program to sync up with a debugger and can be used
+-   otherwise as a quick means to stop program execution and "break" into
+-   the debugger. */
+-
+-void
+-breakpoint(void)
+-{
+-      if (!initialized) {
+-              printk("breakpoint() called b4 kgdb init\n");
+-              return;
+-      }
+-
+-      asm("   .globl breakinst        \n\
+-           breakinst: .long 0x7d821008");
+-}
+-
+-#ifdef CONFIG_KGDB_CONSOLE
+-/* Output string in GDB O-packet format if GDB has connected. If nothing
+-   output, returns 0 (caller must then handle output). */
+-int
+-kgdb_output_string (const char* s, unsigned int count)
+-{
+-      char buffer[512];
+-
+-      if (!kgdb_started)
+-              return 0;
+-
+-      count = (count <= (sizeof(buffer) / 2 - 2))
+-              ? count : (sizeof(buffer) / 2 - 2);
+-
+-      buffer[0] = 'O';
+-      mem2hex (s, &buffer[1], count);
+-      putpacket(buffer);
+-
+-      return 1;
+-}
+-#endif
+-
+-static void sysrq_handle_gdb(int key, struct pt_regs *pt_regs,
+-                           struct tty_struct *tty)
+-{
+-      printk("Entering GDB stub\n");
+-      breakpoint();
+-}
+-static struct sysrq_key_op sysrq_gdb_op = {
+-        .handler        = sysrq_handle_gdb,
+-        .help_msg       = "Gdb",
+-        .action_msg     = "GDB",
+-};
+-
+-static int gdb_register_sysrq(void)
+-{
+-      printk("Registering GDB sysrq handler\n");
+-      register_sysrq_key('g', &sysrq_gdb_op);
+-      return 0;
+-}
+-module_init(gdb_register_sysrq);
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/kernel/setup.c linux-2.6.18.kgdb/arch/ppc/kernel/setup.c
+--- linux-2.6.18/arch/ppc/kernel/setup.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/kernel/setup.c  2008-06-10 16:19:22.000000000 +0400
+@@ -47,10 +47,6 @@
+ #include <asm/ppc_sys.h>
+ #endif
+ 
+-#if defined CONFIG_KGDB
+-#include <asm/kgdb.h>
+-#endif
+-
+ extern void platform_init(unsigned long r3, unsigned long r4,
+               unsigned long r5, unsigned long r6, unsigned long r7);
+ extern void identify_cpu(unsigned long offset, unsigned long cpu);
+@@ -504,18 +500,6 @@ void __init setup_arch(char **cmdline_p)
+ #endif /* CONFIG_XMON */
+       if ( ppc_md.progress ) ppc_md.progress("setup_arch: enter", 0x3eab);
+ 
+-#if defined(CONFIG_KGDB)
+-      if (ppc_md.kgdb_map_scc)
+-              ppc_md.kgdb_map_scc();
+-      set_debug_traps();
+-      if (strstr(cmd_line, "gdb")) {
+-              if (ppc_md.progress)
+-                      ppc_md.progress("setup_arch: kgdb breakpoint", 0x4000);
+-              printk("kgdb breakpoint activated\n");
+-              breakpoint();
+-      }
+-#endif
+-
+       /*
+        * Set cache line size based on type of cpu as a default.
+        * Systems with OF can look in the properties on the cpu node(s)
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/mm/fault.c linux-2.6.18.kgdb/arch/ppc/mm/fault.c
+--- linux-2.6.18/arch/ppc/mm/fault.c   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/mm/fault.c      2008-06-10 16:19:22.000000000 +0400
+@@ -25,6 +25,7 @@
+ #include <linux/interrupt.h>
+ #include <linux/highmem.h>
+ #include <linux/module.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/page.h>
+ #include <asm/pgtable.h>
+@@ -329,6 +330,14 @@ bad_page_fault(struct pt_regs *regs, uns
+               return;
+       }
+ 
++#ifdef CONFIG_KGDB
++      if (atomic_read(&debugger_active) && kgdb_may_fault) {
++              /* Restore our previous state. */
++              kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++              /* Not reached. */
++      }
++#endif
++
+       /* kernel has accessed a bad area */
+ #if defined(CONFIG_XMON) || defined(CONFIG_KGDB)
+       if (debugger_kernel_faults)
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/4xx/bubinga.c linux-2.6.18.kgdb/arch/ppc/platforms/4xx/bubinga.c
+--- linux-2.6.18/arch/ppc/platforms/4xx/bubinga.c      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/platforms/4xx/bubinga.c 2008-06-10 16:19:22.000000000 +0400
+@@ -4,7 +4,7 @@
+  * Author: SAW (IBM), derived from walnut.c.
+  *         Maintained by MontaVista Software <source@mvista.com>
+  *
+- * 2003 (c) MontaVista Softare Inc.  This file is licensed under the
++ * 2003-2004 (c) MontaVista Softare Inc.  This file is licensed under the
+  * terms of the GNU General Public License version 2. This program is
+  * licensed "as is" without any warranty of any kind, whether express
+  * or implied.
+@@ -100,17 +100,26 @@ bubinga_early_serial_map(void)
+       port.flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST;
+       port.line = 0;
+ 
+-      if (early_serial_setup(&port) != 0) {
++#ifdef CONFIG_SERIAL_8250
++      if (early_serial_setup(&port) != 0)
+               printk("Early serial init of port 0 failed\n");
+-      }
++#endif
++
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &port);
++#endif
+ 
+       port.membase = (void*)ACTING_UART1_IO_BASE;
+       port.irq = ACTING_UART1_INT;
+       port.line = 1;
+ 
+-      if (early_serial_setup(&port) != 0) {
++#ifdef CONFIG_SERIAL_8250
++      if (early_serial_setup(&port) != 0)
+               printk("Early serial init of port 1 failed\n");
+-      }
++#endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(1, &port);
++#endif
+ }
+ 
+ void __init
+@@ -255,8 +264,4 @@ platform_init(unsigned long r3, unsigned
+       ppc_md.nvram_read_val = todc_direct_read_val;
+       ppc_md.nvram_write_val = todc_direct_write_val;
+ #endif
+-#ifdef CONFIG_KGDB
+-      ppc_md.early_serial_map = bubinga_early_serial_map;
+-#endif
+ }
+-
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/4xx/ebony.c linux-2.6.18.kgdb/arch/ppc/platforms/4xx/ebony.c
+--- linux-2.6.18/arch/ppc/platforms/4xx/ebony.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/platforms/4xx/ebony.c   2008-06-10 16:19:22.000000000 +0400
+@@ -32,6 +32,7 @@
+ #include <linux/tty.h>
+ #include <linux/serial.h>
+ #include <linux/serial_core.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/system.h>
+ #include <asm/pgtable.h>
+@@ -226,14 +227,20 @@ ebony_early_serial_map(void)
+       port.flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST;
+       port.line = 0;
+ 
+-      if (early_serial_setup(&port) != 0) {
++#ifdef CONFIG_SERIAL_8250
++      if (early_serial_setup(&port) != 0)
+               printk("Early serial init of port 0 failed\n");
+-      }
++#endif
+ 
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
++#ifdef CONFIG_SERIAL_TEXT_DEBUG
+       /* Configure debug serial access */
+       gen550_init(0, &port);
++#endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &port);
++#endif
+ 
++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250)
+       /* Purge TLB entry added in head_44x.S for early serial access */
+       _tlbie(UART0_IO_BASE);
+ #endif
+@@ -243,14 +250,18 @@ ebony_early_serial_map(void)
+       port.uartclk = clocks.uart1;
+       port.line = 1;
+ 
+-      if (early_serial_setup(&port) != 0) {
++#ifdef CONFIG_SERIAL_8250
++      if (early_serial_setup(&port) != 0)
+               printk("Early serial init of port 1 failed\n");
+-      }
++#endif
+ 
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
++#ifdef CONFIG_SERIAL_TEXT_DEBUG
+       /* Configure debug serial access */
+       gen550_init(1, &port);
+ #endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(1, &port);
++#endif
+ }
+ 
+ static void __init
+@@ -327,8 +338,4 @@ void __init platform_init(unsigned long 
+ 
+       ppc_md.nvram_read_val = todc_direct_read_val;
+       ppc_md.nvram_write_val = todc_direct_write_val;
+-#ifdef CONFIG_KGDB
+-      ppc_md.early_serial_map = ebony_early_serial_map;
+-#endif
+ }
+-
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/4xx/ocotea.c linux-2.6.18.kgdb/arch/ppc/platforms/4xx/ocotea.c
+--- linux-2.6.18/arch/ppc/platforms/4xx/ocotea.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/platforms/4xx/ocotea.c  2008-06-10 16:19:22.000000000 +0400
+@@ -30,6 +30,7 @@
+ #include <linux/tty.h>
+ #include <linux/serial.h>
+ #include <linux/serial_core.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/system.h>
+ #include <asm/pgtable.h>
+@@ -249,14 +250,20 @@ ocotea_early_serial_map(void)
+       port.flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST;
+       port.line = 0;
+ 
+-      if (early_serial_setup(&port) != 0) {
++#ifdef CONFIG_SERIAL_8250
++      if (early_serial_setup(&port) != 0)
+               printk("Early serial init of port 0 failed\n");
+-      }
++#endif
+ 
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
++#ifdef CONFIG_SERIAL_TEXT_DEBUG
+       /* Configure debug serial access */
+       gen550_init(0, &port);
++#endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &port);
++#endif
+ 
++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250)
+       /* Purge TLB entry added in head_44x.S for early serial access */
+       _tlbie(UART0_IO_BASE);
+ #endif
+@@ -266,14 +273,18 @@ ocotea_early_serial_map(void)
+       port.uartclk = clocks.uart1;
+       port.line = 1;
+ 
+-      if (early_serial_setup(&port) != 0) {
++#ifdef CONFIG_SERIAL_8250
++      if (early_serial_setup(&port) != 0)
+               printk("Early serial init of port 1 failed\n");
+-      }
++#endif
+ 
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
++#ifdef CONFIG_SERIAL_TEXT_DEBUG
+       /* Configure debug serial access */
+       gen550_init(1, &port);
+ #endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(1, &port);
++#endif
+ }
+ 
+ static void __init
+@@ -343,8 +354,5 @@ void __init platform_init(unsigned long 
+ 
+       ppc_md.nvram_read_val = todc_direct_read_val;
+       ppc_md.nvram_write_val = todc_direct_write_val;
+-#ifdef CONFIG_KGDB
+-      ppc_md.early_serial_map = ocotea_early_serial_map;
+-#endif
+       ppc_md.init = ocotea_init;
+ }
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/4xx/xilinx_ml300.c linux-2.6.18.kgdb/arch/ppc/platforms/4xx/xilinx_ml300.c
+--- linux-2.6.18/arch/ppc/platforms/4xx/xilinx_ml300.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/platforms/4xx/xilinx_ml300.c    2008-06-10 16:19:22.000000000 +0400
+@@ -41,9 +41,6 @@
+  *      ppc4xx_map_io                         arch/ppc/syslib/ppc4xx_setup.c
+  *  start_kernel                              init/main.c
+  *    setup_arch                              arch/ppc/kernel/setup.c
+- * #if defined(CONFIG_KGDB)
+- *      *ppc_md.kgdb_map_scc() == gen550_kgdb_map_scc
+- * #endif
+  *      *ppc_md.setup_arch == ml300_setup_arch        this file
+  *        ppc4xx_setup_arch                   arch/ppc/syslib/ppc4xx_setup.c
+  *          ppc4xx_find_bridges                       arch/ppc/syslib/ppc405_pci.c
+@@ -117,7 +114,6 @@ ml300_early_serial_init(int num, struct 
+ void __init
+ ml300_early_serial_map(void)
+ {
+-#ifdef CONFIG_SERIAL_8250
+       struct plat_serial8250_port *pdata;
+       int i = 0;
+ 
+@@ -129,7 +125,14 @@ ml300_early_serial_map(void)
+               pdata++;
+               i++;
+       }
+-#endif /* CONFIG_SERIAL_8250 */
++#ifdef CONFIG_SERIAL_8250
++                if (early_serial_setup(&port) != 0)
++                        printk("Early serial init of port %d failed\n", i);
++#endif
++
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(i, &port);
++#endif
+ }
+ 
+ void __init
+@@ -165,9 +168,4 @@ platform_init(unsigned long r3, unsigned
+ #if defined(XPAR_POWER_0_POWERDOWN_BASEADDR)
+       ppc_md.power_off = xilinx_power_off;
+ #endif
+-
+-#ifdef CONFIG_KGDB
+-      ppc_md.early_serial_map = ml300_early_serial_map;
+-#endif
+ }
+-
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/85xx/sbc8560.c linux-2.6.18.kgdb/arch/ppc/platforms/85xx/sbc8560.c
+--- linux-2.6.18/arch/ppc/platforms/85xx/sbc8560.c     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/platforms/85xx/sbc8560.c        2008-06-10 16:19:22.000000000 +0400
+@@ -50,7 +50,6 @@
+ #include <syslib/ppc85xx_common.h>
+ #include <syslib/ppc85xx_setup.h>
+ 
+-#ifdef CONFIG_SERIAL_8250
+ static void __init
+ sbc8560_early_serial_map(void)
+ {
+@@ -66,12 +65,16 @@ sbc8560_early_serial_map(void)
+         uart_req.membase = ioremap(uart_req.mapbase, MPC85xx_UART0_SIZE);
+       uart_req.type = PORT_16650;
+ 
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
+-        gen550_init(0, &uart_req);
++#ifdef CONFIG_SERIAL_8250
++      if (early_serial_setup(&uart_req) != 0)
++              printk("Early serial init of port 0 failed\n");
++#endif
++#ifdef CONFIG_SERIAL_TEXT_DEBUG
++      gen550_init(0, &uart_req);
++#endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &uart_req);
+ #endif
+- 
+-        if (early_serial_setup(&uart_req) != 0)
+-                printk("Early serial init of port 0 failed\n");
+  
+         /* Assume early_serial_setup() doesn't modify uart_req */
+       uart_req.line = 1;
+@@ -79,14 +82,17 @@ sbc8560_early_serial_map(void)
+         uart_req.membase = ioremap(uart_req.mapbase, MPC85xx_UART1_SIZE);
+       uart_req.irq = MPC85xx_IRQ_EXT10;
+  
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
+-        gen550_init(1, &uart_req);
++#ifdef CONFIG_SERIAL_8250
++      if (early_serial_setup(&uart_req) != 0)
++              printk("Early serial init of port 1 failed\n");
+ #endif
+- 
+-        if (early_serial_setup(&uart_req) != 0)
+-                printk("Early serial init of port 1 failed\n");
+-}
++#ifdef CONFIG_SERIAL_TEXT_DEBUG
++      gen550_init(1, &uart_req);
++#endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(1, &uart_req);
+ #endif
++}
+ 
+ /* ************************************************************************
+  *
+@@ -115,9 +121,7 @@ sbc8560_setup_arch(void)
+       /* setup PCI host bridges */
+       mpc85xx_setup_hose();
+ #endif
+-#ifdef CONFIG_SERIAL_8250
+       sbc8560_early_serial_map();
+-#endif
+ #ifdef CONFIG_SERIAL_TEXT_DEBUG
+       /* Invalidate the entry we stole earlier the serial ports
+        * should be properly mapped */ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/chestnut.c linux-2.6.18.kgdb/arch/ppc/platforms/chestnut.c
+--- linux-2.6.18/arch/ppc/platforms/chestnut.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/platforms/chestnut.c    2008-06-10 16:19:22.000000000 +0400
+@@ -492,7 +492,7 @@ chestnut_power_off(void)
+ static void __init
+ chestnut_map_io(void)
+ {
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250)
+       io_block_mapping(CHESTNUT_UART_BASE, CHESTNUT_UART_BASE, 0x100000,
+               _PAGE_IO);
+ #endif
+@@ -566,9 +566,6 @@ platform_init(unsigned long r3, unsigned
+ #if defined(CONFIG_SERIAL_TEXT_DEBUG)
+       ppc_md.progress = gen550_progress;
+ #endif
+-#if defined(CONFIG_KGDB)
+-      ppc_md.kgdb_map_scc = gen550_kgdb_map_scc;
+-#endif
+ 
+       if (ppc_md.progress)
+                 ppc_md.progress("chestnut_init(): exit", 0);
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/pplus.c linux-2.6.18.kgdb/arch/ppc/platforms/pplus.c
+--- linux-2.6.18/arch/ppc/platforms/pplus.c    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/platforms/pplus.c       2008-06-10 16:19:22.000000000 +0400
+@@ -893,9 +893,6 @@ platform_init(unsigned long r3, unsigned
+ #ifdef CONFIG_SERIAL_TEXT_DEBUG
+       ppc_md.progress = gen550_progress;
+ #endif                                /* CONFIG_SERIAL_TEXT_DEBUG */
+-#ifdef CONFIG_KGDB
+-      ppc_md.kgdb_map_scc = gen550_kgdb_map_scc;
+-#endif
+ #ifdef CONFIG_SMP
+       smp_ops = &pplus_smp_ops;
+ #endif                                /* CONFIG_SMP */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/sandpoint.c linux-2.6.18.kgdb/arch/ppc/platforms/sandpoint.c
+--- linux-2.6.18/arch/ppc/platforms/sandpoint.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/platforms/sandpoint.c   2008-06-10 16:19:22.000000000 +0400
+@@ -730,9 +730,6 @@ platform_init(unsigned long r3, unsigned
+       ppc_md.nvram_read_val = todc_mc146818_read_val;
+       ppc_md.nvram_write_val = todc_mc146818_write_val;
+ 
+-#ifdef CONFIG_KGDB
+-      ppc_md.kgdb_map_scc = gen550_kgdb_map_scc;
+-#endif
+ #ifdef CONFIG_SERIAL_TEXT_DEBUG
+       ppc_md.progress = gen550_progress;
+ #endif
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/spruce.c linux-2.6.18.kgdb/arch/ppc/platforms/spruce.c
+--- linux-2.6.18/arch/ppc/platforms/spruce.c   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/platforms/spruce.c      2008-06-10 16:19:22.000000000 +0400
+@@ -178,26 +178,32 @@ spruce_early_serial_map(void)
+       serial_req.membase = (u_char *)UART0_IO_BASE;
+       serial_req.regshift = 0;
+ 
+-#if defined(CONFIG_KGDB) || defined(CONFIG_SERIAL_TEXT_DEBUG)
+-      gen550_init(0, &serial_req);
+-#endif
+ #ifdef CONFIG_SERIAL_8250
+       if (early_serial_setup(&serial_req) != 0)
+               printk("Early serial init of port 0 failed\n");
+ #endif
++#ifdef CONFIG_SERIAL_TEXT_DEBUG
++      gen550_init(0, &serial_req);
++#endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &serial_req);
++#endif
+ 
+       /* Assume early_serial_setup() doesn't modify serial_req */
+       serial_req.line = 1;
+       serial_req.irq = UART1_INT;
+       serial_req.membase = (u_char *)UART1_IO_BASE;
+ 
+-#if defined(CONFIG_KGDB) || defined(CONFIG_SERIAL_TEXT_DEBUG)
+-      gen550_init(1, &serial_req);
+-#endif
+ #ifdef CONFIG_SERIAL_8250
+       if (early_serial_setup(&serial_req) != 0)
+               printk("Early serial init of port 1 failed\n");
+ #endif
++#ifdef CONFIG_SERIAL_TEXT_DEBUG
++      gen550_init(1, &serial_req);
++#endif
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(1, &serial_req);
++#endif
+ }
+ 
+ TODC_ALLOC();
+@@ -316,7 +322,4 @@ platform_init(unsigned long r3, unsigned
+ #ifdef CONFIG_SERIAL_TEXT_DEBUG
+       ppc_md.progress = gen550_progress;
+ #endif /* CONFIG_SERIAL_TEXT_DEBUG */
+-#ifdef CONFIG_KGDB
+-      ppc_md.kgdb_map_scc = gen550_kgdb_map_scc;
+-#endif
+ }
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/syslib/Makefile linux-2.6.18.kgdb/arch/ppc/syslib/Makefile
+--- linux-2.6.18/arch/ppc/syslib/Makefile      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/syslib/Makefile 2008-06-10 16:19:22.000000000 +0400
+@@ -76,7 +76,6 @@ obj-$(CONFIG_PCI_8260)               += m82xx_pci.o p
+ obj-$(CONFIG_8260_PCI9)               += m8260_pci_erratum9.o
+ obj-$(CONFIG_CPM2)            += cpm2_common.o cpm2_pic.o
+ ifeq ($(CONFIG_PPC_GEN550),y)
+-obj-$(CONFIG_KGDB)            += gen550_kgdb.o gen550_dbg.o
+ obj-$(CONFIG_SERIAL_TEXT_DEBUG)       += gen550_dbg.o
+ endif
+ ifeq ($(CONFIG_SERIAL_MPSC_CONSOLE),y)
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/syslib/gen550.h linux-2.6.18.kgdb/arch/ppc/syslib/gen550.h
+--- linux-2.6.18/arch/ppc/syslib/gen550.h      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/syslib/gen550.h 2008-06-10 16:19:22.000000000 +0400
+@@ -11,4 +11,3 @@
+ 
+ extern void gen550_progress(char *, unsigned short);
+ extern void gen550_init(int, struct uart_port *);
+-extern void gen550_kgdb_map_scc(void);
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/syslib/ibm44x_common.c linux-2.6.18.kgdb/arch/ppc/syslib/ibm44x_common.c
+--- linux-2.6.18/arch/ppc/syslib/ibm44x_common.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/syslib/ibm44x_common.c  2008-06-10 16:19:22.000000000 +0400
+@@ -192,9 +192,6 @@ void __init ibm44x_platform_init(unsigne
+ #ifdef CONFIG_SERIAL_TEXT_DEBUG
+       ppc_md.progress = gen550_progress;
+ #endif /* CONFIG_SERIAL_TEXT_DEBUG */
+-#ifdef CONFIG_KGDB
+-      ppc_md.kgdb_map_scc = gen550_kgdb_map_scc;
+-#endif
+ 
+       /*
+        * The Abatron BDI JTAG debugger does not tolerate others
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/syslib/mv64x60.c linux-2.6.18.kgdb/arch/ppc/syslib/mv64x60.c
+--- linux-2.6.18/arch/ppc/syslib/mv64x60.c     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/syslib/mv64x60.c        2008-06-10 16:19:22.000000000 +0400
+@@ -241,6 +241,12 @@ static struct resource mv64x60_mpsc0_res
+               .end    = MV64x60_IRQ_SDMA_0,
+               .flags  = IORESOURCE_IRQ,
+       },
++      [4] = {
++              .name   = "mpsc 0 irq",
++              .start  = MV64x60_IRQ_MPSC_0,
++              .end    = MV64x60_IRQ_MPSC_0,
++              .flags  = IORESOURCE_IRQ,
++      },
+ };
+ 
+ static struct platform_device mpsc0_device = {
+@@ -298,6 +304,12 @@ static struct resource mv64x60_mpsc1_res
+               .end    = MV64360_IRQ_SDMA_1,
+               .flags  = IORESOURCE_IRQ,
+       },
++      [4] = {
++              .name   = "mpsc 1 irq",
++              .start  = MV64360_IRQ_MPSC_1,
++              .end    = MV64360_IRQ_MPSC_1,
++              .flags  = IORESOURCE_IRQ,
++      },
+ };
+ 
+ static struct platform_device mpsc1_device = {
+@@ -1426,12 +1438,46 @@ mv64x60_pd_fixup(struct mv64x60_handle *
+ static int __init
+ mv64x60_add_pds(void)
+ {
+-      return platform_add_devices(mv64x60_pd_devs,
+-              ARRAY_SIZE(mv64x60_pd_devs));
++      int i, ret = 0;
++
++      for (i = 0; i < ARRAY_SIZE(mv64x60_pd_devs); i++) {
++              if (mv64x60_pd_devs[i]) {
++                      ret = platform_device_register(mv64x60_pd_devs[i]);
++              }
++              if (ret) {
++                      while (--i >= 0)
++                              platform_device_unregister(mv64x60_pd_devs[i]);
++                      break;
++              }
++      }
++      return ret;
+ }
+ arch_initcall(mv64x60_add_pds);
+ 
+ /*
++ * mv64x60_early_get_pdev_data()
++ *
++ * Get the data associated with a platform device by name and number.
++ */
++struct platform_device * __init
++mv64x60_early_get_pdev_data(const char *name, int id, int remove)
++{
++      int i;
++      struct platform_device *pdev;
++
++      for (i = 0; i <ARRAY_SIZE(mv64x60_pd_devs); i++) {
++              if ((pdev = mv64x60_pd_devs[i]) &&
++                      pdev->id == id &&
++                      !strcmp(pdev->name, name)) {
++                      if (remove)
++                              mv64x60_pd_devs[i] = NULL;
++                      return pdev;
++              }
++      }
++      return NULL;
++}
++
++/*
+  *****************************************************************************
+  *
+  *    GT64260-Specific Routines
+@@ -1764,6 +1810,11 @@ gt64260a_chip_specific_init(struct mv64x
+               r->start = MV64x60_IRQ_SDMA_0;
+               r->end = MV64x60_IRQ_SDMA_0;
+       }
++      if ((r = platform_get_resource(&mpsc1_device, IORESOURCE_IRQ, 1))
++                      != NULL) {
++              r->start = GT64260_IRQ_MPSC_1;
++              r->end = GT64260_IRQ_MPSC_1;
++      }
+ #endif
+ }
+ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/syslib/mv64x60_dbg.c linux-2.6.18.kgdb/arch/ppc/syslib/mv64x60_dbg.c
+--- linux-2.6.18/arch/ppc/syslib/mv64x60_dbg.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/syslib/mv64x60_dbg.c    2008-06-10 16:19:22.000000000 +0400
+@@ -34,7 +34,7 @@ static struct mv64x60_handle mv64x60_dbg
+ void
+ mv64x60_progress_init(u32 base)
+ {
+-      mv64x60_dbg_bh.v_base = base;
++      mv64x60_dbg_bh.v_base = (void*)base;
+       return;
+ }
+ 
+@@ -69,53 +69,3 @@ mv64x60_mpsc_progress(char *s, unsigned 
+       return;
+ }
+ #endif        /* CONFIG_SERIAL_TEXT_DEBUG */
+-
+-
+-#if defined(CONFIG_KGDB)
+-
+-#if defined(CONFIG_KGDB_TTYS0)
+-#define KGDB_PORT 0
+-#elif defined(CONFIG_KGDB_TTYS1)
+-#define KGDB_PORT 1
+-#else
+-#error "Invalid kgdb_tty port"
+-#endif
+-
+-void
+-putDebugChar(unsigned char c)
+-{
+-      mv64x60_polled_putc(KGDB_PORT, (char)c);
+-}
+-
+-int
+-getDebugChar(void)
+-{
+-      unsigned char   c;
+-
+-      while (!mv64x60_polled_getc(KGDB_PORT, &c));
+-      return (int)c;
+-}
+-
+-void
+-putDebugString(char* str)
+-{
+-      while (*str != '\0') {
+-              putDebugChar(*str);
+-              str++;
+-      }
+-      putDebugChar('\r');
+-      return;
+-}
+-
+-void
+-kgdb_interruptible(int enable)
+-{
+-}
+-
+-void
+-kgdb_map_scc(void)
+-{
+-      if (ppc_md.early_serial_map)
+-              ppc_md.early_serial_map();
+-}
+-#endif        /* CONFIG_KGDB */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/syslib/ppc85xx_setup.c linux-2.6.18.kgdb/arch/ppc/syslib/ppc85xx_setup.c
+--- linux-2.6.18/arch/ppc/syslib/ppc85xx_setup.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/ppc/syslib/ppc85xx_setup.c  2008-06-10 16:19:22.000000000 +0400
+@@ -69,7 +69,6 @@ mpc85xx_calibrate_decr(void)
+       mtspr(SPRN_TCR, TCR_DIE);
+ }
+ 
+-#ifdef CONFIG_SERIAL_8250
+ void __init
+ mpc85xx_early_serial_map(void)
+ {
+@@ -85,7 +84,7 @@ mpc85xx_early_serial_map(void)
+       pdata[0].mapbase += binfo->bi_immr_base;
+       pdata[0].membase = ioremap(pdata[0].mapbase, MPC85xx_UART0_SIZE);
+ 
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250)
+       memset(&serial_req, 0, sizeof (serial_req));
+       serial_req.iotype = UPIO_MEM;
+       serial_req.mapbase = pdata[0].mapbase;
+@@ -93,18 +92,24 @@ mpc85xx_early_serial_map(void)
+       serial_req.regshift = 0;
+ 
+       gen550_init(0, &serial_req);
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(0, &serial_req);
++#endif
+ #endif
+ 
+       pdata[1].uartclk = binfo->bi_busfreq;
+       pdata[1].mapbase += binfo->bi_immr_base;
+       pdata[1].membase = ioremap(pdata[1].mapbase, MPC85xx_UART0_SIZE);
+ 
+-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB)
++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250)
+       /* Assume gen550_init() doesn't modify serial_req */
+       serial_req.mapbase = pdata[1].mapbase;
+       serial_req.membase = pdata[1].membase;
+ 
+       gen550_init(1, &serial_req);
++#ifdef CONFIG_KGDB_8250
++      kgdb8250_add_port(1, &serial_req);
++#endif
+ #endif
+ }
+ #endif
+@@ -363,5 +368,3 @@ mpc85xx_setup_hose(void)
+       return;
+ }
+ #endif /* CONFIG_PCI */
+-
+-
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/Kconfig.debug linux-2.6.18.kgdb/arch/sh/Kconfig.debug
+--- linux-2.6.18/arch/sh/Kconfig.debug 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/sh/Kconfig.debug    2008-06-10 16:19:47.000000000 +0400
+@@ -29,96 +29,4 @@ config EARLY_PRINTK
+         This option is only useful porting the kernel to a new machine,
+         when the kernel may crash or hang before the serial console is
+         initialised. If unsure, say N.
+-
+-config KGDB
+-      bool "Include KGDB kernel debugger"
+-      help
+-        Include in-kernel hooks for kgdb, the Linux kernel source level
+-        debugger.  See <http://kgdb.sourceforge.net/> for more information.
+-        Unless you are intending to debug the kernel, say N here.
+-
+-menu "KGDB configuration options"
+-      depends on KGDB
+-
+-config MORE_COMPILE_OPTIONS
+-      bool "Add any additional compile options"
+-      help
+-        If you want to add additional CFLAGS to the kernel build, enable this
+-        option and then enter what you would like to add in the next question.
+-        Note however that -g is already appended with the selection of KGDB.
+-
+-config COMPILE_OPTIONS
+-      string "Additional compile arguments"
+-      depends on MORE_COMPILE_OPTIONS
+-
+-config KGDB_NMI
+-      bool "Enter KGDB on NMI"
+-      default n
+-
+-config KGDB_THREAD
+-      bool "Include KGDB thread support"
+-      default y
+-
+-config SH_KGDB_CONSOLE
+-      bool "Console messages through GDB"
+-      default n
+-
+-config KGDB_SYSRQ
+-      bool "Allow SysRq 'G' to enter KGDB"
+-      default y
+-
+-config KGDB_KERNEL_ASSERTS
+-      bool "Include KGDB kernel assertions"
+-      default n
+-
+-comment "Serial port setup"
+-
+-config KGDB_DEFPORT
+-      int "Port number (ttySCn)"
+-      default "1"
+-
+-config KGDB_DEFBAUD
+-      int "Baud rate"
+-      default "115200"
+-
+-choice
+-      prompt "Parity"
+-      depends on KGDB
+-      default KGDB_DEFPARITY_N
+-
+-config KGDB_DEFPARITY_N
+-      bool "None"
+-
+-config KGDB_DEFPARITY_E
+-      bool "Even"
+-
+-config KGDB_DEFPARITY_O
+-      bool "Odd"
+-
+-endchoice
+-
+-choice
+-      prompt "Data bits"
+-      depends on KGDB
+-      default KGDB_DEFBITS_8
+-
+-config KGDB_DEFBITS_8
+-      bool "8"
+-
+-config KGDB_DEFBITS_7
+-      bool "7"
+-
+-endchoice
+-
+-endmenu
+-
+-config FRAME_POINTER
+-      bool "Compile the kernel with frame pointers"
+-      default y if KGDB
+-      help
+-        If you say Y here the resulting kernel image will be slightly larger
+-        and slower, but it will give very useful debugging information.
+-        If you don't debug the kernel, you can say N, but we may not be able
+-        to solve problems without frame pointers.
+-
+ endmenu
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/Makefile linux-2.6.18.kgdb/arch/sh/Makefile
+--- linux-2.6.18/arch/sh/Makefile      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/sh/Makefile 2008-06-10 16:19:47.000000000 +0400
+@@ -43,7 +43,6 @@ cflags-$(CONFIG_CPU_SH4)             += -m4 \
+ cflags-$(CONFIG_CPU_SH4A)             += $(call cc-option,-m4a-nofpu,)
+ 
+ cflags-$(CONFIG_SH_DSP)                       += -Wa,-dsp
+-cflags-$(CONFIG_SH_KGDB)              += -g
+ 
+ cflags-$(CONFIG_MORE_COMPILE_OPTIONS) += \
+       $(shell echo $(CONFIG_COMPILE_OPTIONS) | sed -e 's/"//g')
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/boards/se/7751/setup.c linux-2.6.18.kgdb/arch/sh/boards/se/7751/setup.c
+--- linux-2.6.18/arch/sh/boards/se/7751/setup.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/sh/boards/se/7751/setup.c   2008-06-10 16:19:47.000000000 +0400
+@@ -17,10 +17,6 @@
+ #include <asm/io.h>
+ #include <asm/se7751/se7751.h>
+ 
+-#ifdef CONFIG_SH_KGDB
+-#include <asm/kgdb.h>
+-#endif
+-
+ /*
+  * Configure the Super I/O chip
+  */
+@@ -82,12 +78,6 @@ const char *get_system_type(void)
+       return "7751 SolutionEngine";
+ }
+ 
+-#ifdef CONFIG_SH_KGDB
+-static int kgdb_uart_setup(void);
+-static struct kgdb_sermap kgdb_uart_sermap = 
+-{ "ttyS", 0, kgdb_uart_setup, NULL };
+-#endif
+- 
+ /*
+  * Initialize the board
+  */
+@@ -95,133 +85,4 @@ void __init platform_setup(void)
+ {
+       /* Call init_smsc() replacement to set up SuperIO. */
+       /* XXX: RTC setting comes here */
+-#ifdef CONFIG_SH_KGDB
+-      kgdb_register_sermap(&kgdb_uart_sermap);
+-#endif
+-}
+-
+-/*********************************************************************
+- * Currently a hack (e.g. does not interact well w/serial.c, lots of *
+- * hardcoded stuff) but may be useful if SCI/F needs debugging.      *
+- * Mostly copied from x86 code (see files asm-i386/kgdb_local.h and  *
+- * arch/i386/lib/kgdb_serial.c).                                     *
+- *********************************************************************/
+-
+-#ifdef CONFIG_SH_KGDB
+-#include <linux/types.h>
+-#include <linux/serial.h>
+-#include <linux/serialP.h>
+-#include <linux/serial_reg.h>
+-
+-#define COM1_PORT 0x3f8  /* Base I/O address */
+-#define COM1_IRQ  4      /* IRQ not used yet */
+-#define COM2_PORT 0x2f8  /* Base I/O address */
+-#define COM2_IRQ  3      /* IRQ not used yet */
+-
+-#define SB_CLOCK 1843200 /* Serial baud clock */
+-#define SB_BASE (SB_CLOCK/16)
+-#define SB_MCR UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS
+-
+-struct uart_port {
+-      int base;
+-};
+-#define UART_NPORTS 2
+-struct uart_port uart_ports[] = {
+-      { COM1_PORT },
+-      { COM2_PORT },
+-};
+-struct uart_port *kgdb_uart_port;
+-
+-#define UART_IN(reg)  inb_p(kgdb_uart_port->base + reg)
+-#define UART_OUT(reg,v)       outb_p((v), kgdb_uart_port->base + reg)
+-
+-/* Basic read/write functions for the UART */
+-#define UART_LSR_RXCERR    (UART_LSR_BI | UART_LSR_FE | UART_LSR_PE)
+-static int kgdb_uart_getchar(void)
+-{
+-      int lsr;
+-      int c = -1;
+-
+-      while (c == -1) {
+-              lsr = UART_IN(UART_LSR);
+-              if (lsr & UART_LSR_DR) 
+-                      c = UART_IN(UART_RX);
+-              if ((lsr & UART_LSR_RXCERR))
+-                      c = -1;
+-      }
+-      return c;
+-}
+-
+-static void kgdb_uart_putchar(int c)
+-{
+-      while ((UART_IN(UART_LSR) & UART_LSR_THRE) == 0)
+-              ;
+-      UART_OUT(UART_TX, c);
+-}
+-
+-/*
+- * Initialize UART to configured/requested values.
+- * (But we don't interrupts yet, or interact w/serial.c)
+- */
+-static int kgdb_uart_setup(void)
+-{
+-      int port;
+-      int lcr = 0;
+-      int bdiv = 0;
+-
+-      if (kgdb_portnum >= UART_NPORTS) {
+-              KGDB_PRINTK("uart port %d invalid.\n", kgdb_portnum);
+-              return -1;
+-      }
+-
+-      kgdb_uart_port = &uart_ports[kgdb_portnum];
+-
+-      /* Init sequence from gdb_hook_interrupt */
+-      UART_IN(UART_RX);
+-      UART_OUT(UART_IER, 0);
+-
+-      UART_IN(UART_RX);       /* Serial driver comments say */
+-      UART_IN(UART_IIR);      /* this clears interrupt regs */
+-      UART_IN(UART_MSR);
+-
+-      /* Figure basic LCR values */
+-      switch (kgdb_bits) {
+-      case '7':
+-              lcr |= UART_LCR_WLEN7;
+-              break;
+-      default: case '8': 
+-              lcr |= UART_LCR_WLEN8;
+-              break;
+-      }
+-      switch (kgdb_parity) {
+-      case 'O':
+-              lcr |= UART_LCR_PARITY;
+-              break;
+-      case 'E':
+-              lcr |= (UART_LCR_PARITY | UART_LCR_EPAR);
+-              break;
+-      default: break;
+-      }
+-
+-      /* Figure the baud rate divisor */
+-      bdiv = (SB_BASE/kgdb_baud);
+-      
+-      /* Set the baud rate and LCR values */
+-      UART_OUT(UART_LCR, (lcr | UART_LCR_DLAB));
+-      UART_OUT(UART_DLL, (bdiv & 0xff));
+-      UART_OUT(UART_DLM, ((bdiv >> 8) & 0xff));
+-      UART_OUT(UART_LCR, lcr);
+-
+-      /* Set the MCR */
+-      UART_OUT(UART_MCR, SB_MCR);
+-
+-      /* Turn off FIFOs for now */
+-      UART_OUT(UART_FCR, 0);
+-
+-      /* Setup complete: initialize function pointers */
+-      kgdb_getchar = kgdb_uart_getchar;
+-      kgdb_putchar = kgdb_uart_putchar;
+-
+-      return 0;
+ }
+-#endif /* CONFIG_SH_KGDB */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/Makefile linux-2.6.18.kgdb/arch/sh/kernel/Makefile
+--- linux-2.6.18/arch/sh/kernel/Makefile       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/sh/kernel/Makefile  2008-06-10 16:19:47.000000000 +0400
+@@ -13,7 +13,7 @@ obj-y                                += cpu/ timers/
+ obj-$(CONFIG_SMP)             += smp.o
+ obj-$(CONFIG_CF_ENABLER)      += cf-enabler.o
+ obj-$(CONFIG_SH_STANDARD_BIOS)        += sh_bios.o
+-obj-$(CONFIG_SH_KGDB)         += kgdb_stub.o kgdb_jmp.o
++obj-$(CONFIG_KGDB)            += kgdb.o kgdb-jmp.o
+ obj-$(CONFIG_SH_CPU_FREQ)     += cpufreq.o
+ obj-$(CONFIG_MODULES)         += module.o
+ obj-$(CONFIG_EARLY_PRINTK)    += early_printk.o
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/cpu/sh3/ex.S linux-2.6.18.kgdb/arch/sh/kernel/cpu/sh3/ex.S
+--- linux-2.6.18/arch/sh/kernel/cpu/sh3/ex.S   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/sh/kernel/cpu/sh3/ex.S      2008-06-10 16:19:47.000000000 +0400
+@@ -42,7 +42,7 @@ ENTRY(exception_handling_table)
+       .long   exception_error ! reserved_instruction (filled by trap_init) /* 180 */
+       .long   exception_error ! illegal_slot_instruction (filled by trap_init) /*1A0*/
+ ENTRY(nmi_slot)
+-#if defined (CONFIG_KGDB_NMI)
++#if defined (CONFIG_KGDB)
+       .long   debug_enter     /* 1C0 */       ! Allow trap to debugger
+ #else
+       .long   exception_none  /* 1C0 */       ! Not implemented yet
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/cpu/sh4/ex.S linux-2.6.18.kgdb/arch/sh/kernel/cpu/sh4/ex.S
+--- linux-2.6.18/arch/sh/kernel/cpu/sh4/ex.S   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/sh/kernel/cpu/sh4/ex.S      2008-06-10 16:19:47.000000000 +0400
+@@ -46,7 +46,7 @@ ENTRY(exception_handling_table)
+       .long   exception_error ! reserved_instruction (filled by trap_init) /* 180 */
+       .long   exception_error ! illegal_slot_instruction (filled by trap_init) /*1A0*/
+ ENTRY(nmi_slot)
+-#if defined (CONFIG_KGDB_NMI)
++#if defined (CONFIG_KGDB)
+       .long   debug_enter     /* 1C0 */       ! Allow trap to debugger
+ #else
+       .long   exception_none  /* 1C0 */       ! Not implemented yet
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/entry.S linux-2.6.18.kgdb/arch/sh/kernel/entry.S
+--- linux-2.6.18/arch/sh/kernel/entry.S        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/sh/kernel/entry.S   2008-06-10 16:19:47.000000000 +0400
+@@ -75,7 +75,7 @@
+ ENOSYS = 38
+ EINVAL = 22
+ 
+-#if defined(CONFIG_KGDB_NMI)
++#if defined(CONFIG_KGDB)
+ NMI_VEC = 0x1c0                       ! Must catch early for debounce
+ #endif
+ 
+@@ -227,31 +227,33 @@ call_dae:
+ 2:    .long   do_address_error
+ #endif /* CONFIG_MMU */
+ 
+-#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_SH_KGDB)
++#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_KGDB)
+ ! Handle kernel debug if either kgdb (SW) or gdb-stub (FW) is present.
+ ! If both are configured, handle the debug traps (breakpoints) in SW,
+ ! but still allow BIOS traps to FW.
+ 
+       .align  2
+ debug_kernel:
+-#if defined(CONFIG_SH_STANDARD_BIOS) && defined(CONFIG_SH_KGDB)
++#if defined(CONFIG_SH_STANDARD_BIOS) && defined(CONFIG_KGDB)
+       /* Force BIOS call to FW (debug_trap put TRA in r8) */
+       mov     r8,r0
+       shlr2   r0
+       cmp/eq  #0x3f,r0
+       bt      debug_kernel_fw
+-#endif /* CONFIG_SH_STANDARD_BIOS && CONFIG_SH_KGDB */
++#endif /* CONFIG_SH_STANDARD_BIOS && CONFIG_KGDB */
+ 
+-debug_enter:          
+-#if defined(CONFIG_SH_KGDB)
++      .align 2
++      .globl debug_enter
++debug_enter:
++#if defined(CONFIG_KGDB)
+       /* Jump to kgdb, pass stacked regs as arg */
+ debug_kernel_sw:
+       mov.l   3f, r0
+       jmp     @r0
+        mov    r15, r4
+       .align  2
+-3:    .long   kgdb_handle_exception
+-#endif /* CONFIG_SH_KGDB */
++3:    .long   kgdb_exception_handler
++#endif /* CONFIG_KGDB */
+ 
+ #if defined(CONFIG_SH_STANDARD_BIOS)
+       /* Unwind the stack and jmp to the debug entry */
+@@ -293,12 +295,12 @@ debug_kernel_fw:
+ 2:    .long   gdb_vbr_vector
+ #endif /* CONFIG_SH_STANDARD_BIOS */
+ 
+-#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_SH_KGDB */
++#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_KGDB */
+ 
+ 
+       .align  2
+-debug_trap:   
+-#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_SH_KGDB)
++debug_trap:
++#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_KGDB)
+       mov     #OFF_SR, r0
+       mov.l   @(r0,r15), r0           ! get status register
+       shll    r0
+@@ -642,7 +644,7 @@ skip_restore:
+ 6:    or      k0, k2                  ! Set the IMASK-bits
+       ldc     k2, ssr
+       !
+-#if defined(CONFIG_KGDB_NMI)
++#if defined(CONFIG_KGDB)
+       ! Clear in_nmi
+       mov.l   4f, k0
+       mov     #0, k1
+@@ -694,7 +696,7 @@ tlb_miss:
+ interrupt:
+       mov.l   2f, k2
+       mov.l   3f, k3
+-#if defined(CONFIG_KGDB_NMI)
++#if defined(CONFIG_KGDB)
+       ! Debounce (filter nested NMI)
+       mov.l   @k2, k0
+       mov.l   5f, k1
+@@ -709,7 +711,7 @@ interrupt:
+ 5:    .long   NMI_VEC
+ 6:    .long   in_nmi
+ 0:
+-#endif /* defined(CONFIG_KGDB_NMI) */
++#endif /* defined(CONFIG_KGDB) */
+       bra     handle_exception
+        mov.l  @k2, k2
+ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/kgdb-jmp.S linux-2.6.18.kgdb/arch/sh/kernel/kgdb-jmp.S
+--- linux-2.6.18/arch/sh/kernel/kgdb-jmp.S     1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/sh/kernel/kgdb-jmp.S        2008-06-10 16:19:47.000000000 +0400
+@@ -0,0 +1,32 @@
++#include <linux/linkage.h>
++
++ENTRY(kgdb_fault_setjmp)
++      add     #(9*4), r4
++      sts.l   pr, @-r4
++      mov.l   r15, @-r4
++      mov.l   r14, @-r4
++      mov.l   r13, @-r4
++      mov.l   r12, @-r4
++      mov.l   r11, @-r4
++      mov.l   r10, @-r4
++      mov.l   r9, @-r4
++      mov.l   r8, @-r4
++      rts
++       mov    #0, r0
++
++ENTRY(kgdb_fault_longjmp)
++      mov.l   @r4+, r8
++      mov.l   @r4+, r9
++      mov.l   @r4+, r10
++      mov.l   @r4+, r11
++      mov.l   @r4+, r12
++      mov.l   @r4+, r13
++      mov.l   @r4+, r14
++      mov.l   @r4+, r15
++      lds.l   @r4+, pr
++      mov     r5, r0
++      tst     r0, r0
++      bf      1f
++      mov     #1, r0
++1:    rts
++       nop
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/kgdb.c linux-2.6.18.kgdb/arch/sh/kernel/kgdb.c
+--- linux-2.6.18/arch/sh/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/sh/kernel/kgdb.c    2008-06-10 16:19:47.000000000 +0400
+@@ -0,0 +1,363 @@
++/*
++ * arch/sh/kernel/kgdb.c
++ *
++ * Contains SH-specific low-level support for KGDB.
++ *
++ * Contains extracts from code by Glenn Engel, Jim Kingdon,
++ * David Grothe <dave@gcom.com>, Tigran Aivazian <tigran@sco.com>,
++ * Amit S. Kale <akale@veritas.com>,  William Gatliff <bgat@open-widgets.com>,
++ * Ben Lee, Steve Chamberlain and Benoit Miller <fulg@iname.com>,
++ * Henry Bell <henry.bell@st.com> and Jeremy Siegel <jsiegel@mvista.com>
++ *
++ * Maintainer: Tom Rini <trini@kernel.crashing.org>
++ *
++ * 2004 (c) MontaVista Software, Inc. This file is licensed under
++ * the terms of the GNU General Public License version 2. This program
++ * is licensed "as is" without any warranty of any kind, whether express
++ * or implied.
++ */
++
++#include <linux/string.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/smp.h>
++#include <linux/spinlock.h>
++#include <linux/delay.h>
++#include <linux/linkage.h>
++#include <linux/init.h>
++#include <linux/kgdb.h>
++
++#include <asm/system.h>
++#include <asm/current.h>
++#include <asm/signal.h>
++#include <asm/pgtable.h>
++#include <asm/ptrace.h>
++
++extern void per_cpu_trap_init(void);
++extern atomic_t cpu_doing_single_step;
++
++/* Register state captured at exception entry and exchanged with GDB */
++static struct kgdb_regs trap_registers;
++
++/* Globals. */
++char in_nmi;                  /* Set during NMI to prevent reentry */
++
++/* TRA differs sh3/4 */
++#if defined(CONFIG_CPU_SH3)
++#define TRA 0xffffffd0
++#elif defined(CONFIG_CPU_SH4)
++#define TRA 0xff000020
++#endif
++
++/* Macros for single step instruction identification */
++#define OPCODE_BT(op)         (((op) & 0xff00) == 0x8900)
++#define OPCODE_BF(op)         (((op) & 0xff00) == 0x8b00)
++#define OPCODE_BTF_DISP(op)   (((op) & 0x80) ? (((op) | 0xffffff80) << 1) : \
++                            (((op) & 0x7f ) << 1))
++#define OPCODE_BFS(op)        (((op) & 0xff00) == 0x8f00)
++#define OPCODE_BTS(op)        (((op) & 0xff00) == 0x8d00)
++#define OPCODE_BRA(op)        (((op) & 0xf000) == 0xa000)
++#define OPCODE_BRA_DISP(op)   (((op) & 0x800) ? (((op) | 0xfffff800) << 1) : \
++                            (((op) & 0x7ff) << 1))
++#define OPCODE_BRAF(op)       (((op) & 0xf0ff) == 0x0023)
++#define OPCODE_BRAF_REG(op)   (((op) & 0x0f00) >> 8)
++#define OPCODE_BSR(op)        (((op) & 0xf000) == 0xb000)
++#define OPCODE_BSR_DISP(op)   (((op) & 0x800) ? (((op) | 0xfffff800) << 1) : \
++                            (((op) & 0x7ff) << 1))
++#define OPCODE_BSRF(op)       (((op) & 0xf0ff) == 0x0003)
++#define OPCODE_BSRF_REG(op)   (((op) >> 8) & 0xf)
++#define OPCODE_JMP(op)        (((op) & 0xf0ff) == 0x402b)
++#define OPCODE_JMP_REG(op)    (((op) >> 8) & 0xf)
++#define OPCODE_JSR(op)        (((op) & 0xf0ff) == 0x400b)
++#define OPCODE_JSR_REG(op)    (((op) >> 8) & 0xf)
++#define OPCODE_RTS(op)        ((op) == 0xb)
++#define OPCODE_RTE(op)        ((op) == 0x2b)
++
++#define SR_T_BIT_MASK           0x1
++#define STEP_OPCODE             0xc320
++#define BIOS_CALL_TRAP          0x3f
++
++/* Exception codes as per SH-4 core manual */
++#define ADDRESS_ERROR_LOAD_VEC   7
++#define ADDRESS_ERROR_STORE_VEC  8
++#define TRAP_VEC                 11
++#define INVALID_INSN_VEC         12
++#define INVALID_SLOT_VEC         13
++#define NMI_VEC                  14
++#define SERIAL_BREAK_VEC         58
++
++/* Misc static */
++static int stepped_address;
++static short stepped_opcode;
++
++/* Translate SH-3/4 exception numbers to unix-like signal values */
++static int compute_signal(const int excep_code)
++{
++      switch (excep_code) {
++      case INVALID_INSN_VEC:
++      case INVALID_SLOT_VEC:
++              return SIGILL;
++      case ADDRESS_ERROR_LOAD_VEC:
++      case ADDRESS_ERROR_STORE_VEC:
++              return SIGSEGV;
++      case SERIAL_BREAK_VEC:
++      case NMI_VEC:
++              return SIGINT;
++      default:
++              /* Act like it was a break/trap. */
++              return SIGTRAP;
++      }
++}
++
++/*
++ * Translate the registers of the system into the format that GDB wants.  Since
++ * we use a local structure to store things, instead of getting them out
++ * of pt_regs, we can just do a memcpy.
++ */
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *ign)
++{
++      memcpy(gdb_regs, &trap_registers, sizeof(trap_registers));
++}
++
++/*
++ * On SH we save: r1 (prev->thread.sp) r2 (prev->thread.pc) r4 (prev) r5 (next)
++ * r6 (next->thread.sp) r7 (next->thread.pc)
++ */
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
++{
++      int count;
++
++      for (count = 0; count < 16; count++)
++              *(gdb_regs++) = 0;
++      *(gdb_regs++) = p->thread.pc;
++      *(gdb_regs++) = 0;
++      *(gdb_regs++) = 0;
++      *(gdb_regs++) = 0;
++      *(gdb_regs++) = 0;
++      *(gdb_regs++) = 0;
++      *(gdb_regs++) = 0;
++}
++
++/*
++ * Translate the register values that GDB has given us back into the
++ * format of the system.  See the comment above about memcpy.
++ */
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *ign)
++{
++      memcpy(&trap_registers, gdb_regs, sizeof(trap_registers));
++}
++
++/* Calculate the new address for after a step */
++static short *get_step_address(void)
++{
++      short op = *(short *)trap_registers.pc;
++      long addr;
++
++      /* BT */
++      if (OPCODE_BT(op)) {
++              if (trap_registers.sr & SR_T_BIT_MASK)
++                      addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op);
++              else
++                      addr = trap_registers.pc + 2;
++      }
++
++      /* BTS */
++      else if (OPCODE_BTS(op)) {
++              if (trap_registers.sr & SR_T_BIT_MASK)
++                      addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op);
++              else
++                      addr = trap_registers.pc + 4;   /* Not in delay slot */
++      }
++
++      /* BF */
++      else if (OPCODE_BF(op)) {
++              if (!(trap_registers.sr & SR_T_BIT_MASK))
++                      addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op);
++              else
++                      addr = trap_registers.pc + 2;
++      }
++
++      /* BFS */
++      else if (OPCODE_BFS(op)) {
++              if (!(trap_registers.sr & SR_T_BIT_MASK))
++                      addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op);
++              else
++                      addr = trap_registers.pc + 4;   /* Not in delay slot */
++      }
++
++      /* BRA */
++      else if (OPCODE_BRA(op))
++              addr = trap_registers.pc + 4 + OPCODE_BRA_DISP(op);
++
++      /* BRAF */
++      else if (OPCODE_BRAF(op))
++              addr = trap_registers.pc + 4
++                  + trap_registers.regs[OPCODE_BRAF_REG(op)];
++
++      /* BSR */
++      else if (OPCODE_BSR(op))
++              addr = trap_registers.pc + 4 + OPCODE_BSR_DISP(op);
++
++      /* BSRF */
++      else if (OPCODE_BSRF(op))
++              addr = trap_registers.pc + 4
++                  + trap_registers.regs[OPCODE_BSRF_REG(op)];
++
++      /* JMP */
++      else if (OPCODE_JMP(op))
++              addr = trap_registers.regs[OPCODE_JMP_REG(op)];
++
++      /* JSR */
++      else if (OPCODE_JSR(op))
++              addr = trap_registers.regs[OPCODE_JSR_REG(op)];
++
++      /* RTS */
++      else if (OPCODE_RTS(op))
++              addr = trap_registers.pr;
++
++      /* RTE */
++      else if (OPCODE_RTE(op))
++              addr = trap_registers.regs[15];
++
++      /* Other */
++      else
++              addr = trap_registers.pc + 2;
++
++      kgdb_flush_icache_range(addr, addr + 2);
++      return (short *)addr;
++}
++
++/* The command loop, read and act on requests */
++int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
++                             char *remcom_in_buffer, char *remcom_out_buffer,
++                             struct pt_regs *ign)
++{
++      unsigned long addr;
++      char *ptr = &remcom_in_buffer[1];
++
++      /* Examine first char of buffer to see what we need to do */
++      switch (remcom_in_buffer[0]) {
++      case 'c':               /* Continue at address AA..AA (optional) */
++      case 's':               /* Step one instruction from AA..AA */
++              /* Try to read optional parameter, PC unchanged if none */
++              if (kgdb_hex2long(&ptr, &addr))
++                      trap_registers.pc = addr;
++
++              atomic_set(&cpu_doing_single_step, -1);
++              if (remcom_in_buffer[0] == 's') {
++                      /* Replace the instruction immediately after the
++                       * current instruction (i.e. next in the expected
++                       * flow of control) with a trap instruction, so that
++                       * returning will cause only a single instruction to
++                       * be executed. Note that this model is slightly
++                       * broken for instructions with delay slots
++                       * (e.g. B[TF]S, BSR, BRA etc), where both the branch
++                       * and the instruction in the delay slot will be
++                       * executed.
++                       */
++                      /* Determine where the target instruction will send
++                       * us to */
++                      unsigned short *next_addr = get_step_address();
++                      stepped_address = (int)next_addr;
++
++                      /* Replace it */
++                      stepped_opcode = *(short *)next_addr;
++                      *next_addr = STEP_OPCODE;
++
++                      /* Flush and return */
++                      kgdb_flush_icache_range((long)next_addr,
++                                              (long)next_addr + 2);
++                      if (kgdb_contthread)
++                              atomic_set(&cpu_doing_single_step,
++                                         smp_processor_id());
++              }
++              return 0;
++      }
++      return -1;
++}
++
++/*
++ * When an exception has occurred, we are called.  We need to set things
++ * up so that we can call kgdb_handle_exception to handle requests from
++ * the remote GDB.
++ */
++void kgdb_exception_handler(struct pt_regs *regs)
++{
++      int excep_code, vbr_val;
++      int count;
++
++      /* Copy kernel regs (from stack) */
++      for (count = 0; count < 16; count++)
++              trap_registers.regs[count] = regs->regs[count];
++      trap_registers.pc = regs->pc;
++      trap_registers.pr = regs->pr;
++      trap_registers.sr = regs->sr;
++      trap_registers.gbr = regs->gbr;
++      trap_registers.mach = regs->mach;
++      trap_registers.macl = regs->macl;
++
++      __asm__ __volatile__("stc vbr, %0":"=r"(vbr_val));
++      trap_registers.vbr = vbr_val;
++
++      /* Get the exception code. */
++      __asm__ __volatile__("stc r2_bank, %0":"=r"(excep_code));
++
++      excep_code >>= 5;       /* scale to the vector numbers (NMI_VEC, TRAP_VEC) tested below */
++
++      /* If we got an NMI, and KGDB is not yet initialized, call
++       * breakpoint() to try and initialize everything for us. */
++      if (excep_code == NMI_VEC && !kgdb_initialized) {
++              breakpoint();
++              return;
++      }
++
++      /* TRAP_VEC exception indicates a software trap inserted in place of
++       * code by GDB so back up PC by one instruction, as this instruction
++       * will later be replaced by its original one.  Do NOT do this for
++       * trap 0xff, since that indicates a compiled-in breakpoint which
++       * will not be replaced (and we would retake the trap forever) */
++      if (excep_code == TRAP_VEC &&
++          (*(volatile unsigned long *)TRA != (0xff << 2)))
++              trap_registers.pc -= 2;
++
++      /* If we have been single-stepping, put back the old instruction.
++       * We use stepped_address in case we have stopped more than one
++       * instruction away. */
++      if (stepped_opcode != 0) {
++              *(short *)stepped_address = stepped_opcode;
++              kgdb_flush_icache_range(stepped_address, stepped_address + 2);
++      }
++      stepped_opcode = 0;
++
++      /* Call the stub to do the processing.  Note that not everything we
++       * need to send back and forth lives in pt_regs. */
++      kgdb_handle_exception(excep_code, compute_signal(excep_code), 0, regs);
++
++      /* Copy back the (maybe modified) registers */
++      for (count = 0; count < 16; count++)
++              regs->regs[count] = trap_registers.regs[count];
++      regs->pc = trap_registers.pc;
++      regs->pr = trap_registers.pr;
++      regs->sr = trap_registers.sr;
++      regs->gbr = trap_registers.gbr;
++      regs->mach = trap_registers.mach;
++      regs->macl = trap_registers.macl;
++
++      vbr_val = trap_registers.vbr;
++      __asm__ __volatile__("ldc %0, vbr": :"r"(vbr_val));
++}
++
++int __init kgdb_arch_init(void)        /* arch-specific KGDB setup; always returns 0 */
++{
++      per_cpu_trap_init();    /* NOTE(review): presumably (re)installs the trap vectors - confirm */
++
++      return 0;
++}
++
++struct kgdb_arch arch_kgdb_ops = {     /* breakpoint opcode bytes, laid out in CPU byte order */
++#ifdef CONFIG_CPU_LITTLE_ENDIAN
++      .gdb_bpt_instr = {0xff, 0xc3},  /* 16-bit value 0xc3ff, low byte first */
++#else
++      .gdb_bpt_instr = {0xc3, 0xff},  /* same 16-bit value 0xc3ff, high byte first */
++#endif
++};
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/kgdb_jmp.S linux-2.6.18.kgdb/arch/sh/kernel/kgdb_jmp.S
+--- linux-2.6.18/arch/sh/kernel/kgdb_jmp.S     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/sh/kernel/kgdb_jmp.S        1970-01-01 03:00:00.000000000 +0300
+@@ -1,33 +0,0 @@
+-#include <linux/linkage.h>
+-
+-ENTRY(setjmp)
+-      add     #(9*4), r4
+-      sts.l   pr, @-r4
+-      mov.l   r15, @-r4
+-      mov.l   r14, @-r4
+-      mov.l   r13, @-r4
+-      mov.l   r12, @-r4
+-      mov.l   r11, @-r4
+-      mov.l   r10, @-r4
+-      mov.l   r9, @-r4
+-      mov.l   r8, @-r4
+-      rts
+-       mov    #0, r0
+-
+-ENTRY(longjmp)
+-      mov.l   @r4+, r8
+-      mov.l   @r4+, r9
+-      mov.l   @r4+, r10
+-      mov.l   @r4+, r11
+-      mov.l   @r4+, r12
+-      mov.l   @r4+, r13
+-      mov.l   @r4+, r14
+-      mov.l   @r4+, r15
+-      lds.l   @r4+, pr
+-      mov     r5, r0
+-      tst     r0, r0
+-      bf      1f
+-      mov     #1, r0  ! in case val==0
+-1:    rts
+-       nop
+-
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/kgdb_stub.c linux-2.6.18.kgdb/arch/sh/kernel/kgdb_stub.c
+--- linux-2.6.18/arch/sh/kernel/kgdb_stub.c    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/sh/kernel/kgdb_stub.c       1970-01-01 03:00:00.000000000 +0300
+@@ -1,1491 +0,0 @@
+-/*
+- * May be copied or modified under the terms of the GNU General Public
+- * License.  See linux/COPYING for more information.
+- *
+- * Containes extracts from code by Glenn Engel, Jim Kingdon,
+- * David Grothe <dave@gcom.com>, Tigran Aivazian <tigran@sco.com>,
+- * Amit S. Kale <akale@veritas.com>,  William Gatliff <bgat@open-widgets.com>,
+- * Ben Lee, Steve Chamberlain and Benoit Miller <fulg@iname.com>.
+- * 
+- * This version by Henry Bell <henry.bell@st.com>
+- * Minor modifications by Jeremy Siegel <jsiegel@mvista.com>
+- * 
+- * Contains low-level support for remote debug using GDB. 
+- *
+- * To enable debugger support, two things need to happen. A call to
+- * set_debug_traps() is necessary in order to allow any breakpoints
+- * or error conditions to be properly intercepted and reported to gdb.
+- * A breakpoint also needs to be generated to begin communication.  This
+- * is most easily accomplished by a call to breakpoint() which does
+- * a trapa if the initialisation phase has been successfully completed.
+- *
+- * In this case, set_debug_traps() is not used to "take over" exceptions;
+- * other kernel code is modified instead to enter the kgdb functions here
+- * when appropriate (see entry.S for breakpoint traps and NMI interrupts,
+- * see traps.c for kernel error exceptions).
+- *
+- * The following gdb commands are supported:
+- *
+- *    Command       Function                               Return value
+- *
+- *    g             return the value of the CPU registers  hex data or ENN
+- *    G             set the value of the CPU registers     OK or ENN
+- *
+- *    mAA..AA,LLLL  Read LLLL bytes at address AA..AA      hex data or ENN
+- *    MAA..AA,LLLL: Write LLLL bytes at address AA.AA      OK or ENN
+- *    XAA..AA,LLLL: Same, but data is binary (not hex)     OK or ENN
+- *
+- *    c             Resume at current address              SNN   ( signal NN)
+- *    cAA..AA       Continue at address AA..AA             SNN
+- *    CNN;          Resume at current address with signal  SNN
+- *    CNN;AA..AA    Resume at address AA..AA with signal   SNN
+- *
+- *    s             Step one instruction                   SNN
+- *    sAA..AA       Step one instruction from AA..AA       SNN
+- *    SNN;          Step one instruction with signal       SNN
+- *    SNNAA..AA     Step one instruction from AA..AA w/NN  SNN
+- *
+- *    k             kill (Detach GDB)
+- *
+- *    d             Toggle debug flag
+- *    D             Detach GDB 
+- *
+- *    Hct           Set thread t for operations,           OK or ENN
+- *                  c = 'c' (step, cont), c = 'g' (other
+- *                  operations)
+- *
+- *    qC            Query current thread ID                QCpid
+- *    qfThreadInfo  Get list of current threads (first)    m<id>
+- *    qsThreadInfo   "    "  "     "      "   (subsequent)
+- *    qOffsets      Get section offsets                  Text=x;Data=y;Bss=z
+- * 
+- *    TXX           Find if thread XX is alive             OK or ENN
+- *    ?             What was the last sigval ?             SNN   (signal NN)
+- *    O             Output to GDB console
+- *
+- * Remote communication protocol.
+- *
+- *    A debug packet whose contents are <data> is encapsulated for
+- *    transmission in the form:
+- *
+- *       $ <data> # CSUM1 CSUM2
+- *
+- *       <data> must be ASCII alphanumeric and cannot include characters
+- *       '$' or '#'.  If <data> starts with two characters followed by
+- *       ':', then the existing stubs interpret this as a sequence number.
+- *
+- *       CSUM1 and CSUM2 are ascii hex representation of an 8-bit 
+- *       checksum of <data>, the most significant nibble is sent first.
+- *       the hex digits 0-9,a-f are used.
+- *
+- *    Receiver responds with:
+- *
+- *       +       - if CSUM is correct and ready for next packet
+- *       -       - if CSUM is incorrect
+- *
+- * Responses can be run-length encoded to save space.  A '*' means that
+- * the next character is an ASCII encoding giving a repeat count which
+- * stands for that many repititions of the character preceding the '*'.
+- * The encoding is n+29, yielding a printable character where n >=3 
+- * (which is where RLE starts to win).  Don't use an n > 126. 
+- *
+- * So "0* " means the same as "0000".
+- */
+-
+-#include <linux/string.h>
+-#include <linux/kernel.h>
+-#include <linux/sched.h>
+-#include <linux/smp.h>
+-#include <linux/spinlock.h>
+-#include <linux/delay.h>
+-#include <linux/linkage.h>
+-#include <linux/init.h>
+-
+-#include <asm/system.h>
+-#include <asm/current.h>
+-#include <asm/signal.h>
+-#include <asm/pgtable.h>
+-#include <asm/ptrace.h>
+-#include <asm/kgdb.h>
+-
+-#ifdef CONFIG_SH_KGDB_CONSOLE
+-#include <linux/console.h>
+-#endif
+-
+-/* Function pointers for linkage */
+-kgdb_debug_hook_t *kgdb_debug_hook;
+-kgdb_bus_error_hook_t *kgdb_bus_err_hook;
+-
+-int (*kgdb_getchar)(void);
+-void (*kgdb_putchar)(int);
+-
+-static void put_debug_char(int c)
+-{
+-      if (!kgdb_putchar)
+-              return;
+-      (*kgdb_putchar)(c);
+-}
+-static int get_debug_char(void)
+-{
+-      if (!kgdb_getchar)
+-              return -1;
+-      return (*kgdb_getchar)();
+-}
+-
+-/* Num chars in in/out bound buffers, register packets need NUMREGBYTES * 2 */
+-#define BUFMAX 1024
+-#define NUMREGBYTES (MAXREG*4)
+-#define OUTBUFMAX (NUMREGBYTES*2+512)
+-
+-enum regs {
+-      R0 = 0, R1,  R2,  R3,   R4,   R5,  R6, R7,
+-      R8, R9, R10, R11, R12,  R13,  R14, R15,
+-      PC, PR, GBR, VBR, MACH, MACL, SR,
+-      /*  */
+-      MAXREG
+-};
+-
+-static unsigned int registers[MAXREG];
+-struct kgdb_regs trap_registers;
+-
+-char kgdb_in_gdb_mode;
+-char in_nmi;                  /* Set during NMI to prevent reentry */
+-int kgdb_nofault;             /* Boolean to ignore bus errs (i.e. in GDB) */
+-int kgdb_enabled = 1;         /* Default to enabled, cmdline can disable */
+-int kgdb_halt;
+-
+-/* Exposed for user access */
+-struct task_struct *kgdb_current;
+-unsigned int kgdb_g_imask;
+-int kgdb_trapa_val;
+-int kgdb_excode;
+-
+-/* Default values for SCI (can override via kernel args in setup.c) */
+-#ifndef CONFIG_KGDB_DEFPORT
+-#define CONFIG_KGDB_DEFPORT 1
+-#endif
+-
+-#ifndef CONFIG_KGDB_DEFBAUD
+-#define CONFIG_KGDB_DEFBAUD 115200
+-#endif
+-
+-#if defined(CONFIG_KGDB_DEFPARITY_E)
+-#define CONFIG_KGDB_DEFPARITY 'E'
+-#elif defined(CONFIG_KGDB_DEFPARITY_O)
+-#define CONFIG_KGDB_DEFPARITY 'O'
+-#else /* CONFIG_KGDB_DEFPARITY_N */
+-#define CONFIG_KGDB_DEFPARITY 'N'
+-#endif
+-
+-#ifdef CONFIG_KGDB_DEFBITS_7
+-#define CONFIG_KGDB_DEFBITS '7'
+-#else /* CONFIG_KGDB_DEFBITS_8 */
+-#define CONFIG_KGDB_DEFBITS '8'
+-#endif
+-
+-/* SCI/UART settings, used in kgdb_console_setup() */
+-int  kgdb_portnum = CONFIG_KGDB_DEFPORT;
+-int  kgdb_baud = CONFIG_KGDB_DEFBAUD;
+-char kgdb_parity = CONFIG_KGDB_DEFPARITY;
+-char kgdb_bits = CONFIG_KGDB_DEFBITS;
+-
+-/* Jump buffer for setjmp/longjmp */
+-static jmp_buf rem_com_env;
+-
+-/* TRA differs sh3/4 */
+-#if defined(CONFIG_CPU_SH3)
+-#define TRA 0xffffffd0
+-#elif defined(CONFIG_CPU_SH4)
+-#define TRA 0xff000020
+-#endif
+-
+-/* Macros for single step instruction identification */
+-#define OPCODE_BT(op)         (((op) & 0xff00) == 0x8900)
+-#define OPCODE_BF(op)         (((op) & 0xff00) == 0x8b00)
+-#define OPCODE_BTF_DISP(op)   (((op) & 0x80) ? (((op) | 0xffffff80) << 1) : \
+-                            (((op) & 0x7f ) << 1))
+-#define OPCODE_BFS(op)        (((op) & 0xff00) == 0x8f00)
+-#define OPCODE_BTS(op)        (((op) & 0xff00) == 0x8d00)
+-#define OPCODE_BRA(op)        (((op) & 0xf000) == 0xa000)
+-#define OPCODE_BRA_DISP(op)   (((op) & 0x800) ? (((op) | 0xfffff800) << 1) : \
+-                            (((op) & 0x7ff) << 1))
+-#define OPCODE_BRAF(op)       (((op) & 0xf0ff) == 0x0023)
+-#define OPCODE_BRAF_REG(op)   (((op) & 0x0f00) >> 8)
+-#define OPCODE_BSR(op)        (((op) & 0xf000) == 0xb000)
+-#define OPCODE_BSR_DISP(op)   (((op) & 0x800) ? (((op) | 0xfffff800) << 1) : \
+-                            (((op) & 0x7ff) << 1))
+-#define OPCODE_BSRF(op)       (((op) & 0xf0ff) == 0x0003)
+-#define OPCODE_BSRF_REG(op)   (((op) >> 8) & 0xf)
+-#define OPCODE_JMP(op)        (((op) & 0xf0ff) == 0x402b)
+-#define OPCODE_JMP_REG(op)    (((op) >> 8) & 0xf)
+-#define OPCODE_JSR(op)        (((op) & 0xf0ff) == 0x400b)
+-#define OPCODE_JSR_REG(op)    (((op) >> 8) & 0xf)
+-#define OPCODE_RTS(op)        ((op) == 0xb)
+-#define OPCODE_RTE(op)        ((op) == 0x2b)
+-
+-#define SR_T_BIT_MASK           0x1
+-#define STEP_OPCODE             0xc320
+-#define BIOS_CALL_TRAP          0x3f
+-
+-/* Exception codes as per SH-4 core manual */
+-#define ADDRESS_ERROR_LOAD_VEC   7
+-#define ADDRESS_ERROR_STORE_VEC  8
+-#define TRAP_VEC                 11
+-#define INVALID_INSN_VEC         12
+-#define INVALID_SLOT_VEC         13
+-#define NMI_VEC                  14
+-#define USER_BREAK_VEC           15
+-#define SERIAL_BREAK_VEC         58
+-
+-/* Misc static */
+-static int stepped_address;
+-static short stepped_opcode;
+-static const char hexchars[] = "0123456789abcdef";
+-static char in_buffer[BUFMAX];
+-static char out_buffer[OUTBUFMAX];
+-
+-static void kgdb_to_gdb(const char *s);
+-
+-#ifdef CONFIG_KGDB_THREAD
+-static struct task_struct *trapped_thread;
+-static struct task_struct *current_thread;
+-typedef unsigned char threadref[8];
+-#define BUF_THREAD_ID_SIZE 16
+-#endif
+-
+-/* Return addr as a real volatile address */
+-static inline unsigned int ctrl_inl(const unsigned long addr)
+-{
+-      return *(volatile unsigned long *) addr;
+-}
+-
+-/* Correctly set *addr using volatile */
+-static inline void ctrl_outl(const unsigned int b, unsigned long addr)
+-{
+-      *(volatile unsigned long *) addr = b;
+-}
+-
+-/* Get high hex bits */
+-static char highhex(const int x)
+-{
+-      return hexchars[(x >> 4) & 0xf];
+-}
+-
+-/* Get low hex bits */
+-static char lowhex(const int x)
+-{
+-      return hexchars[x & 0xf];
+-}
+-
+-/* Convert ch to hex */
+-static int hex(const char ch)
+-{
+-      if ((ch >= 'a') && (ch <= 'f'))
+-              return (ch - 'a' + 10);
+-      if ((ch >= '0') && (ch <= '9'))
+-              return (ch - '0');
+-      if ((ch >= 'A') && (ch <= 'F'))
+-              return (ch - 'A' + 10);
+-      return (-1);
+-}
+-
+-/* Convert the memory pointed to by mem into hex, placing result in buf.
+-   Returns a pointer to the last char put in buf (null) */
+-static char *mem_to_hex(const char *mem, char *buf, const int count)
+-{
+-      int i;
+-      int ch;
+-      unsigned short s_val;
+-      unsigned long l_val;
+-
+-      /* Check for 16 or 32 */
+-      if (count == 2 && ((long) mem & 1) == 0) {
+-              s_val = *(unsigned short *) mem;
+-              mem = (char *) &s_val;
+-      } else if (count == 4 && ((long) mem & 3) == 0) {
+-              l_val = *(unsigned long *) mem;
+-              mem = (char *) &l_val;
+-      }
+-      for (i = 0; i < count; i++) {
+-              ch = *mem++;
+-              *buf++ = highhex(ch);
+-              *buf++ = lowhex(ch);
+-      }
+-      *buf = 0;
+-      return (buf);
+-}
+-
+-/* Convert the hex array pointed to by buf into binary, to be placed in mem.
+-   Return a pointer to the character after the last byte written */
+-static char *hex_to_mem(const char *buf, char *mem, const int count)
+-{
+-      int i;
+-      unsigned char ch;
+-
+-      for (i = 0; i < count; i++) {
+-              ch = hex(*buf++) << 4;
+-              ch = ch + hex(*buf++);
+-              *mem++ = ch;
+-      }
+-      return (mem);
+-}
+-
+-/* While finding valid hex chars, convert to an integer, then return it */
+-static int hex_to_int(char **ptr, int *int_value)
+-{
+-      int num_chars = 0;
+-      int hex_value;
+-
+-      *int_value = 0;
+-
+-      while (**ptr) {
+-              hex_value = hex(**ptr);
+-              if (hex_value >= 0) {
+-                      *int_value = (*int_value << 4) | hex_value;
+-                      num_chars++;
+-              } else
+-                      break;
+-              (*ptr)++;
+-      }
+-      return num_chars;
+-}
+-
+-/*  Copy the binary array pointed to by buf into mem.  Fix $, #,
+-    and 0x7d escaped with 0x7d.  Return a pointer to the character 
+-    after the last byte written. */
+-static char *ebin_to_mem(const char *buf, char *mem, int count)
+-{
+-      for (; count > 0; count--, buf++) {
+-              if (*buf == 0x7d)
+-                      *mem++ = *(++buf) ^ 0x20;
+-              else
+-                      *mem++ = *buf;
+-      }
+-      return mem;
+-}
+-
+-/* Pack a hex byte */
+-static char *pack_hex_byte(char *pkt, int byte)
+-{
+-      *pkt++ = hexchars[(byte >> 4) & 0xf];
+-      *pkt++ = hexchars[(byte & 0xf)];
+-      return pkt;
+-}
+-
+-#ifdef CONFIG_KGDB_THREAD
+-
+-/* Pack a thread ID */
+-static char *pack_threadid(char *pkt, threadref * id)
+-{
+-      char *limit;
+-      unsigned char *altid;
+-
+-      altid = (unsigned char *) id;
+-
+-      limit = pkt + BUF_THREAD_ID_SIZE;
+-      while (pkt < limit)
+-              pkt = pack_hex_byte(pkt, *altid++);
+-      return pkt;
+-}
+-
+-/* Convert an integer into our threadref */
+-static void int_to_threadref(threadref * id, const int value)
+-{
+-      unsigned char *scan = (unsigned char *) id;
+-      int i = 4;
+-
+-      while (i--)
+-              *scan++ = 0;
+-
+-      *scan++ = (value >> 24) & 0xff;
+-      *scan++ = (value >> 16) & 0xff;
+-      *scan++ = (value >> 8) & 0xff;
+-      *scan++ = (value & 0xff);
+-}
+-
+-/* Return a task structure ptr for a particular pid */
+-static struct task_struct *get_thread(int pid)
+-{
+-      struct task_struct *thread;
+-
+-      /* Use PID_MAX w/gdb for pid 0 */
+-      if (pid == PID_MAX) pid = 0;
+-
+-      /* First check via PID */
+-      thread = find_task_by_pid(pid);
+-
+-      if (thread)
+-              return thread;
+-
+-      /* Start at the start */
+-      thread = init_tasks[0];
+-
+-      /* Walk along the linked list of tasks */
+-      do {
+-              if (thread->pid == pid)
+-                      return thread;
+-              thread = thread->next_task;
+-      } while (thread != init_tasks[0]);
+-
+-      return NULL;
+-}
+-
+-#endif /* CONFIG_KGDB_THREAD */
+-
+-/* Scan for the start char '$', read the packet and check the checksum */
+-static void get_packet(char *buffer, int buflen)
+-{
+-      unsigned char checksum;
+-      unsigned char xmitcsum;
+-      int i;
+-      int count;
+-      char ch;
+-
+-      do {
+-              /* Ignore everything until the start character */
+-              while ((ch = get_debug_char()) != '$');
+-
+-              checksum = 0;
+-              xmitcsum = -1;
+-              count = 0;
+-
+-              /* Now, read until a # or end of buffer is found */
+-              while (count < (buflen - 1)) {
+-                      ch = get_debug_char();
+-
+-                      if (ch == '#')
+-                              break;
+-
+-                      checksum = checksum + ch;
+-                      buffer[count] = ch;
+-                      count = count + 1;
+-              }
+-
+-              buffer[count] = 0;
+-
+-              /* Continue to read checksum following # */
+-              if (ch == '#') {
+-                      xmitcsum = hex(get_debug_char()) << 4;
+-                      xmitcsum += hex(get_debug_char());
+-
+-                      /* Checksum */
+-                      if (checksum != xmitcsum)
+-                              put_debug_char('-');    /* Failed checksum */
+-                      else {
+-                              /* Ack successful transfer */
+-                              put_debug_char('+');
+-
+-                              /* If a sequence char is present, reply 
+-                                 the sequence ID */
+-                              if (buffer[2] == ':') {
+-                                      put_debug_char(buffer[0]);
+-                                      put_debug_char(buffer[1]);
+-
+-                                      /* Remove sequence chars from buffer */
+-                                      count = strlen(buffer);
+-                                      for (i = 3; i <= count; i++)
+-                                              buffer[i - 3] = buffer[i];
+-                              }
+-                      }
+-              }
+-      }
+-      while (checksum != xmitcsum);   /* Keep trying while we fail */
+-}
+-
+-/* Send the packet in the buffer with run-length encoding */
+-static void put_packet(char *buffer)
+-{
+-      int checksum;
+-      char *src;
+-      int runlen;
+-      int encode;
+-
+-      do {
+-              src = buffer;
+-              put_debug_char('$');
+-              checksum = 0;
+-
+-              /* Continue while we still have chars left */
+-              while (*src) {
+-                      /* Check for runs up to 99 chars long */
+-                      for (runlen = 1; runlen < 99; runlen++) {
+-                              if (src[0] != src[runlen])
+-                                      break;
+-                      }
+-
+-                      if (runlen > 3) {
+-                              /* Got a useful amount, send encoding */
+-                              encode = runlen + ' ' - 4;
+-                              put_debug_char(*src);   checksum += *src;
+-                              put_debug_char('*');    checksum += '*';
+-                              put_debug_char(encode); checksum += encode;
+-                              src += runlen;
+-                      } else {
+-                              /* Otherwise just send the current char */
+-                              put_debug_char(*src);   checksum += *src;
+-                              src += 1;
+-                      }
+-              }
+-
+-              /* '#' Separator, put high and low components of checksum */
+-              put_debug_char('#');
+-              put_debug_char(highhex(checksum));
+-              put_debug_char(lowhex(checksum));
+-      }
+-      while ((get_debug_char()) != '+');      /* While no ack */
+-}
+-
+-/* A bus error has occurred - perform a longjmp to return execution and
+-   allow handling of the error */
+-static void kgdb_handle_bus_error(void)
+-{
+-      longjmp(rem_com_env, 1);
+-}
+-
+-/* Translate SH-3/4 exception numbers to unix-like signal values */
+-static int compute_signal(const int excep_code)
+-{
+-      int sigval;
+-
+-      switch (excep_code) {
+-
+-      case INVALID_INSN_VEC:
+-      case INVALID_SLOT_VEC:
+-              sigval = SIGILL;
+-              break;
+-      case ADDRESS_ERROR_LOAD_VEC:
+-      case ADDRESS_ERROR_STORE_VEC:
+-              sigval = SIGSEGV;
+-              break;
+-
+-      case SERIAL_BREAK_VEC:
+-      case NMI_VEC:
+-              sigval = SIGINT;
+-              break;
+-
+-      case USER_BREAK_VEC:
+-      case TRAP_VEC:
+-              sigval = SIGTRAP;
+-              break;
+-
+-      default:
+-              sigval = SIGBUS;        /* "software generated" */
+-              break;
+-      }
+-
+-      return (sigval);
+-}
+-
+-/* Make a local copy of the registers passed into the handler (bletch) */
+-static void kgdb_regs_to_gdb_regs(const struct kgdb_regs *regs,
+-                                int *gdb_regs)
+-{
+-      gdb_regs[R0] = regs->regs[R0];
+-      gdb_regs[R1] = regs->regs[R1];
+-      gdb_regs[R2] = regs->regs[R2];
+-      gdb_regs[R3] = regs->regs[R3];
+-      gdb_regs[R4] = regs->regs[R4];
+-      gdb_regs[R5] = regs->regs[R5];
+-      gdb_regs[R6] = regs->regs[R6];
+-      gdb_regs[R7] = regs->regs[R7];
+-      gdb_regs[R8] = regs->regs[R8];
+-      gdb_regs[R9] = regs->regs[R9];
+-      gdb_regs[R10] = regs->regs[R10];
+-      gdb_regs[R11] = regs->regs[R11];
+-      gdb_regs[R12] = regs->regs[R12];
+-      gdb_regs[R13] = regs->regs[R13];
+-      gdb_regs[R14] = regs->regs[R14];
+-      gdb_regs[R15] = regs->regs[R15];
+-      gdb_regs[PC] = regs->pc;
+-      gdb_regs[PR] = regs->pr;
+-      gdb_regs[GBR] = regs->gbr;
+-      gdb_regs[MACH] = regs->mach;
+-      gdb_regs[MACL] = regs->macl;
+-      gdb_regs[SR] = regs->sr;
+-      gdb_regs[VBR] = regs->vbr;
+-}
+-
+-/* Copy local gdb registers back to kgdb regs, for later copy to kernel */
+-static void gdb_regs_to_kgdb_regs(const int *gdb_regs,
+-                                struct kgdb_regs *regs)
+-{
+-      regs->regs[R0] = gdb_regs[R0];
+-      regs->regs[R1] = gdb_regs[R1];
+-      regs->regs[R2] = gdb_regs[R2];
+-      regs->regs[R3] = gdb_regs[R3];
+-      regs->regs[R4] = gdb_regs[R4];
+-      regs->regs[R5] = gdb_regs[R5];
+-      regs->regs[R6] = gdb_regs[R6];
+-      regs->regs[R7] = gdb_regs[R7];
+-      regs->regs[R8] = gdb_regs[R8];
+-      regs->regs[R9] = gdb_regs[R9];
+-      regs->regs[R10] = gdb_regs[R10];
+-      regs->regs[R11] = gdb_regs[R11];
+-      regs->regs[R12] = gdb_regs[R12];
+-      regs->regs[R13] = gdb_regs[R13];
+-      regs->regs[R14] = gdb_regs[R14];
+-      regs->regs[R15] = gdb_regs[R15];
+-      regs->pc = gdb_regs[PC];
+-      regs->pr = gdb_regs[PR];
+-      regs->gbr = gdb_regs[GBR];
+-      regs->mach = gdb_regs[MACH];
+-      regs->macl = gdb_regs[MACL];
+-      regs->sr = gdb_regs[SR];
+-      regs->vbr = gdb_regs[VBR];
+-}
+-
+-#ifdef CONFIG_KGDB_THREAD
+-/* Make a local copy of registers from the specified thread */
+-asmlinkage void ret_from_fork(void);
+-static void thread_regs_to_gdb_regs(const struct task_struct *thread,
+-                                  int *gdb_regs)
+-{
+-      int regno;
+-      int *tregs;
+-
+-      /* Initialize to zero */
+-      for (regno = 0; regno < MAXREG; regno++)
+-              gdb_regs[regno] = 0;
+-
+-      /* Just making sure... */
+-      if (thread == NULL)
+-              return;
+-
+-      /* A new fork has pt_regs on the stack from a fork() call */
+-      if (thread->thread.pc == (unsigned long)ret_from_fork) {
+-
+-              int vbr_val;
+-              struct pt_regs *kregs;
+-              kregs = (struct pt_regs*)thread->thread.sp;
+-
+-              gdb_regs[R0] = kregs->regs[R0];
+-              gdb_regs[R1] = kregs->regs[R1];
+-              gdb_regs[R2] = kregs->regs[R2];
+-              gdb_regs[R3] = kregs->regs[R3];
+-              gdb_regs[R4] = kregs->regs[R4];
+-              gdb_regs[R5] = kregs->regs[R5];
+-              gdb_regs[R6] = kregs->regs[R6];
+-              gdb_regs[R7] = kregs->regs[R7];
+-              gdb_regs[R8] = kregs->regs[R8];
+-              gdb_regs[R9] = kregs->regs[R9];
+-              gdb_regs[R10] = kregs->regs[R10];
+-              gdb_regs[R11] = kregs->regs[R11];
+-              gdb_regs[R12] = kregs->regs[R12];
+-              gdb_regs[R13] = kregs->regs[R13];
+-              gdb_regs[R14] = kregs->regs[R14];
+-              gdb_regs[R15] = kregs->regs[R15];
+-              gdb_regs[PC] = kregs->pc;
+-              gdb_regs[PR] = kregs->pr;
+-              gdb_regs[GBR] = kregs->gbr;
+-              gdb_regs[MACH] = kregs->mach;
+-              gdb_regs[MACL] = kregs->macl;
+-              gdb_regs[SR] = kregs->sr;
+-
+-              asm("stc vbr, %0":"=r"(vbr_val));
+-              gdb_regs[VBR] = vbr_val;
+-              return;
+-      }
+-
+-      /* Otherwise, we have only some registers from switch_to() */
+-      tregs = (int *)thread->thread.sp;
+-      gdb_regs[R15] = (int)tregs;
+-      gdb_regs[R14] = *tregs++;
+-      gdb_regs[R13] = *tregs++;
+-      gdb_regs[R12] = *tregs++;
+-      gdb_regs[R11] = *tregs++;
+-      gdb_regs[R10] = *tregs++;
+-      gdb_regs[R9] = *tregs++;
+-      gdb_regs[R8] = *tregs++;
+-      gdb_regs[PR] = *tregs++;
+-      gdb_regs[GBR] = *tregs++;
+-      gdb_regs[PC] = thread->thread.pc;
+-}
+-#endif /* CONFIG_KGDB_THREAD */
+-
+-/* Calculate the new address for after a step */
+-static short *get_step_address(void)
+-{
+-      short op = *(short *) trap_registers.pc;
+-      long addr;
+-
+-      /* BT */
+-      if (OPCODE_BT(op)) {
+-              if (trap_registers.sr & SR_T_BIT_MASK)
+-                      addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op);
+-              else
+-                      addr = trap_registers.pc + 2;
+-      }
+-
+-      /* BTS */
+-      else if (OPCODE_BTS(op)) {
+-              if (trap_registers.sr & SR_T_BIT_MASK)
+-                      addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op);
+-              else
+-                      addr = trap_registers.pc + 4;   /* Not in delay slot */
+-      }
+-
+-      /* BF */
+-      else if (OPCODE_BF(op)) {
+-              if (!(trap_registers.sr & SR_T_BIT_MASK))
+-                      addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op);
+-              else
+-                      addr = trap_registers.pc + 2;
+-      }
+-
+-      /* BFS */
+-      else if (OPCODE_BFS(op)) {
+-              if (!(trap_registers.sr & SR_T_BIT_MASK))
+-                      addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op);
+-              else
+-                      addr = trap_registers.pc + 4;   /* Not in delay slot */
+-      }
+-
+-      /* BRA */
+-      else if (OPCODE_BRA(op))
+-              addr = trap_registers.pc + 4 + OPCODE_BRA_DISP(op);
+-
+-      /* BRAF */
+-      else if (OPCODE_BRAF(op))
+-              addr = trap_registers.pc + 4
+-                  + trap_registers.regs[OPCODE_BRAF_REG(op)];
+-
+-      /* BSR */
+-      else if (OPCODE_BSR(op))
+-              addr = trap_registers.pc + 4 + OPCODE_BSR_DISP(op);
+-
+-      /* BSRF */
+-      else if (OPCODE_BSRF(op))
+-              addr = trap_registers.pc + 4
+-                  + trap_registers.regs[OPCODE_BSRF_REG(op)];
+-
+-      /* JMP */
+-      else if (OPCODE_JMP(op))
+-              addr = trap_registers.regs[OPCODE_JMP_REG(op)];
+-
+-      /* JSR */
+-      else if (OPCODE_JSR(op))
+-              addr = trap_registers.regs[OPCODE_JSR_REG(op)];
+-
+-      /* RTS */
+-      else if (OPCODE_RTS(op))
+-              addr = trap_registers.pr;
+-
+-      /* RTE */
+-      else if (OPCODE_RTE(op))
+-              addr = trap_registers.regs[15];
+-
+-      /* Other */
+-      else
+-              addr = trap_registers.pc + 2;
+-
+-      kgdb_flush_icache_range(addr, addr + 2);
+-      return (short *) addr;
+-}
+-
+-/* Set up a single-step.  Replace the instruction immediately after the 
+-   current instruction (i.e. next in the expected flow of control) with a
+-   trap instruction, so that returning will cause only a single instruction
+-   to be executed. Note that this model is slightly broken for instructions
+-   with delay slots (e.g. B[TF]S, BSR, BRA etc), where both the branch
+-   and the instruction in the delay slot will be executed. */
+-static void do_single_step(void)
+-{
+-      unsigned short *addr = 0;
+-
+-      /* Determine where the target instruction will send us to */
+-      addr = get_step_address();
+-      stepped_address = (int)addr;
+-
+-      /* Replace it */
+-      stepped_opcode = *(short *)addr;
+-      *addr = STEP_OPCODE;
+-
+-      /* Flush and return */
+-      kgdb_flush_icache_range((long) addr, (long) addr + 2);
+-      return;
+-}
+-
+-/* Undo a single step */
+-static void undo_single_step(void)
+-{
+-      /* If we have stepped, put back the old instruction */
+-      /* Use stepped_address in case we stopped elsewhere */
+-      if (stepped_opcode != 0) {
+-              *(short*)stepped_address = stepped_opcode;
+-              kgdb_flush_icache_range(stepped_address, stepped_address + 2);
+-      }
+-      stepped_opcode = 0;
+-}
+-
+-/* Send a signal message */
+-static void send_signal_msg(const int signum)
+-{
+-#ifndef CONFIG_KGDB_THREAD
+-      out_buffer[0] = 'S';
+-      out_buffer[1] = highhex(signum);
+-      out_buffer[2] = lowhex(signum);
+-      out_buffer[3] = 0;
+-      put_packet(out_buffer);
+-#else /* CONFIG_KGDB_THREAD */
+-      int threadid;
+-      threadref thref;
+-      char *out = out_buffer;
+-      const char *tstring = "thread";
+-
+-      *out++ = 'T';
+-      *out++ = highhex(signum);
+-      *out++ = lowhex(signum);
+-
+-      while (*tstring) {
+-              *out++ = *tstring++;
+-      }
+-      *out++ = ':';
+-
+-      threadid = trapped_thread->pid;
+-      if (threadid == 0) threadid = PID_MAX;
+-      int_to_threadref(&thref, threadid);
+-      pack_threadid(out, &thref);
+-      out += BUF_THREAD_ID_SIZE;
+-      *out++ = ';';
+-
+-      *out = 0;
+-      put_packet(out_buffer);
+-#endif /* CONFIG_KGDB_THREAD */
+-}
+-
+-/* Reply that all was well */
+-static void send_ok_msg(void)
+-{
+-      strcpy(out_buffer, "OK");
+-      put_packet(out_buffer);
+-}
+-
+-/* Reply that an error occurred */
+-static void send_err_msg(void)
+-{
+-      strcpy(out_buffer, "E01");
+-      put_packet(out_buffer);
+-}
+-
+-/* Empty message indicates unrecognised command */
+-static void send_empty_msg(void)
+-{
+-      put_packet("");
+-}
+-
+-/* Read memory due to 'm' message */
+-static void read_mem_msg(void)
+-{
+-      char *ptr;
+-      int addr;
+-      int length;
+-
+-      /* Jmp, disable bus error handler */
+-      if (setjmp(rem_com_env) == 0) {
+-
+-              kgdb_nofault = 1;
+-
+-              /* Walk through, have m<addr>,<length> */
+-              ptr = &in_buffer[1];
+-              if (hex_to_int(&ptr, &addr) && (*ptr++ == ','))
+-                      if (hex_to_int(&ptr, &length)) {
+-                              ptr = 0;
+-                              if (length * 2 > OUTBUFMAX)
+-                                      length = OUTBUFMAX / 2;
+-                              mem_to_hex((char *) addr, out_buffer, length);
+-                      }
+-              if (ptr)
+-                      send_err_msg();
+-              else
+-                      put_packet(out_buffer);
+-      } else
+-              send_err_msg();
+-
+-      /* Restore bus error handler */
+-      kgdb_nofault = 0;
+-}
+-
+-/* Write memory due to 'M' or 'X' message */
+-static void write_mem_msg(int binary)
+-{
+-      char *ptr;
+-      int addr;
+-      int length;
+-
+-      if (setjmp(rem_com_env) == 0) {
+-
+-              kgdb_nofault = 1;
+-
+-              /* Walk through, have M<addr>,<length>:<data> */
+-              ptr = &in_buffer[1];
+-              if (hex_to_int(&ptr, &addr) && (*ptr++ == ','))
+-                      if (hex_to_int(&ptr, &length) && (*ptr++ == ':')) {
+-                              if (binary)
+-                                      ebin_to_mem(ptr, (char*)addr, length);
+-                              else
+-                                      hex_to_mem(ptr, (char*)addr, length);
+-                              kgdb_flush_icache_range(addr, addr + length);
+-                              ptr = 0;
+-                              send_ok_msg();
+-                      }
+-              if (ptr)
+-                      send_err_msg();
+-      } else
+-              send_err_msg();
+-
+-      /* Restore bus error handler */
+-      kgdb_nofault = 0;
+-}
+-
+-/* Continue message  */
+-static void continue_msg(void)
+-{
+-      /* Try to read optional parameter, PC unchanged if none */
+-      char *ptr = &in_buffer[1];
+-      int addr;
+-
+-      if (hex_to_int(&ptr, &addr))
+-              trap_registers.pc = addr;
+-}
+-
+-/* Continue message with signal */
+-static void continue_with_sig_msg(void)
+-{
+-      int signal;
+-      char *ptr = &in_buffer[1];
+-      int addr;
+-
+-      /* Report limitation */
+-      kgdb_to_gdb("Cannot force signal in kgdb, continuing anyway.\n");
+-
+-      /* Signal */
+-      hex_to_int(&ptr, &signal);
+-      if (*ptr == ';')
+-              ptr++;
+-
+-      /* Optional address */
+-      if (hex_to_int(&ptr, &addr))
+-              trap_registers.pc = addr;
+-}
+-
+-/* Step message */
+-static void step_msg(void)
+-{
+-      continue_msg();
+-      do_single_step();
+-}
+-
+-/* Step message with signal */
+-static void step_with_sig_msg(void)
+-{
+-      continue_with_sig_msg();
+-      do_single_step();
+-}
+-
+-/* Send register contents */
+-static void send_regs_msg(void)
+-{
+-#ifdef CONFIG_KGDB_THREAD
+-      if (!current_thread)
+-              kgdb_regs_to_gdb_regs(&trap_registers, registers);
+-      else
+-              thread_regs_to_gdb_regs(current_thread, registers);
+-#else
+-      kgdb_regs_to_gdb_regs(&trap_registers, registers);
+-#endif
+-
+-      mem_to_hex((char *) registers, out_buffer, NUMREGBYTES);
+-      put_packet(out_buffer);
+-}
+-
+-/* Set register contents - currently can't set other thread's registers */
+-static void set_regs_msg(void)
+-{
+-#ifdef CONFIG_KGDB_THREAD
+-      if (!current_thread) {
+-#endif
+-              kgdb_regs_to_gdb_regs(&trap_registers, registers);
+-              hex_to_mem(&in_buffer[1], (char *) registers, NUMREGBYTES);
+-              gdb_regs_to_kgdb_regs(registers, &trap_registers);
+-              send_ok_msg();
+-#ifdef CONFIG_KGDB_THREAD
+-      } else
+-              send_err_msg();
+-#endif
+-}
+-
+-
+-#ifdef CONFIG_KGDB_THREAD
+-
+-/* Set the status for a thread */
+-void set_thread_msg(void)
+-{
+-      int threadid;
+-      struct task_struct *thread = NULL;
+-      char *ptr;
+-
+-      switch (in_buffer[1]) {
+-
+-              /* To select which thread for gG etc messages, i.e. supported */
+-      case 'g':
+-
+-              ptr = &in_buffer[2];
+-              hex_to_int(&ptr, &threadid);
+-              thread = get_thread(threadid);
+-
+-              /* If we haven't found it */
+-              if (!thread) {
+-                      send_err_msg();
+-                      break;
+-              }
+-
+-              /* Set current_thread (or not) */
+-              if (thread == trapped_thread)
+-                      current_thread = NULL;
+-              else
+-                      current_thread = thread;
+-              send_ok_msg();
+-              break;
+-
+-      /* To select which thread for cCsS messages, i.e. unsupported */
+-      case 'c':
+-              send_ok_msg();
+-              break;
+-
+-      default:
+-              send_empty_msg();
+-              break;
+-      }
+-}
+-
+-/* Is a thread alive? */
+-static void thread_status_msg(void)
+-{
+-      char *ptr;
+-      int threadid;
+-      struct task_struct *thread = NULL;
+-
+-      ptr = &in_buffer[1];
+-      hex_to_int(&ptr, &threadid);
+-      thread = get_thread(threadid);
+-      if (thread)
+-              send_ok_msg();
+-      else
+-              send_err_msg();
+-}
+-/* Send the current thread ID */
+-static void thread_id_msg(void)
+-{
+-      int threadid;
+-      threadref thref;
+-
+-      out_buffer[0] = 'Q';
+-      out_buffer[1] = 'C';
+-
+-      if (current_thread)
+-              threadid = current_thread->pid;
+-      else if (trapped_thread)
+-              threadid = trapped_thread->pid;
+-      else /* Impossible, but just in case! */
+-      {
+-              send_err_msg();
+-              return;
+-      }
+-
+-      /* Translate pid 0 to PID_MAX for gdb */
+-      if (threadid == 0) threadid = PID_MAX;
+-
+-      int_to_threadref(&thref, threadid);
+-      pack_threadid(out_buffer + 2, &thref);
+-      out_buffer[2 + BUF_THREAD_ID_SIZE] = '\0';
+-      put_packet(out_buffer);
+-}
+-
+-/* Send thread info */
+-static void thread_info_msg(void)
+-{
+-      struct task_struct *thread = NULL;
+-      int threadid;
+-      char *pos;
+-      threadref thref;
+-
+-      /* Start with 'm' */
+-      out_buffer[0] = 'm';
+-      pos = &out_buffer[1];
+-
+-      /* For all possible thread IDs - this will overrun if > 44 threads! */
+-      /* Start at 1 and include PID_MAX (since GDB won't use pid 0...) */
+-      for (threadid = 1; threadid <= PID_MAX; threadid++) {
+-
+-              read_lock(&tasklist_lock);
+-              thread = get_thread(threadid);
+-              read_unlock(&tasklist_lock);
+-
+-              /* If it's a valid thread */
+-              if (thread) {
+-                      int_to_threadref(&thref, threadid);
+-                      pack_threadid(pos, &thref);
+-                      pos += BUF_THREAD_ID_SIZE;
+-                      *pos++ = ',';
+-              }
+-      }
+-      *--pos = 0;             /* Lose final comma */
+-      put_packet(out_buffer);
+-
+-}
+-
+-/* Return printable info for gdb's 'info threads' command */
+-static void thread_extra_info_msg(void)
+-{
+-      int threadid;
+-      struct task_struct *thread = NULL;
+-      char buffer[20], *ptr;
+-      int i;
+-
+-      /* Extract thread ID */
+-      ptr = &in_buffer[17];
+-      hex_to_int(&ptr, &threadid);
+-      thread = get_thread(threadid);
+-
+-      /* If we don't recognise it, say so */
+-      if (thread == NULL)
+-              strcpy(buffer, "(unknown)");
+-      else
+-              strcpy(buffer, thread->comm);
+-
+-      /* Construct packet */
+-      for (i = 0, ptr = out_buffer; buffer[i]; i++)
+-              ptr = pack_hex_byte(ptr, buffer[i]);
+-
+-      if (thread->thread.pc == (unsigned long)ret_from_fork) {
+-              strcpy(buffer, "<new fork>");
+-              for (i = 0; buffer[i]; i++)
+-                      ptr = pack_hex_byte(ptr, buffer[i]);
+-      }
+-
+-      *ptr = '\0';
+-      put_packet(out_buffer);
+-}
+-
+-/* Handle all qFooBarBaz messages - have to use an if statement as
+-   opposed to a switch because q messages can have > 1 char id. */
+-static void query_msg(void)
+-{
+-      const char *q_start = &in_buffer[1];
+-
+-      /* qC = return current thread ID */
+-      if (strncmp(q_start, "C", 1) == 0)
+-              thread_id_msg();
+-
+-      /* qfThreadInfo = query all threads (first) */
+-      else if (strncmp(q_start, "fThreadInfo", 11) == 0)
+-              thread_info_msg();
+-
+-      /* qsThreadInfo = query all threads (subsequent). We know we have sent
+-         them all after the qfThreadInfo message, so there are no to send */
+-      else if (strncmp(q_start, "sThreadInfo", 11) == 0)
+-              put_packet("l");        /* el = last */
+-
+-      /* qThreadExtraInfo = supply printable information per thread */
+-      else if (strncmp(q_start, "ThreadExtraInfo", 15) == 0)
+-              thread_extra_info_msg();
+-
+-      /* Unsupported - empty message as per spec */
+-      else
+-              send_empty_msg();
+-}
+-#endif /* CONFIG_KGDB_THREAD */
+-
+-/*
+- * Bring up the ports..
+- */
+-static int kgdb_serial_setup(void)
+-{
+-      extern int kgdb_console_setup(struct console *co, char *options);
+-      struct console dummy;
+-
+-      kgdb_console_setup(&dummy, 0);
+-
+-      return 0;
+-}
+-
+-/* The command loop, read and act on requests */
+-static void kgdb_command_loop(const int excep_code, const int trapa_value)
+-{
+-      int sigval;
+-
+-      if (excep_code == NMI_VEC) {
+-#ifndef CONFIG_KGDB_NMI
+-              KGDB_PRINTK("Ignoring unexpected NMI?\n");
+-              return;
+-#else /* CONFIG_KGDB_NMI */
+-              if (!kgdb_enabled) {
+-                      kgdb_enabled = 1;
+-                      kgdb_init();
+-              }
+-#endif /* CONFIG_KGDB_NMI */
+-      }
+-
+-      /* Ignore if we're disabled */
+-      if (!kgdb_enabled)
+-              return;
+-
+-#ifdef CONFIG_KGDB_THREAD
+-      /* Until GDB specifies a thread */
+-      current_thread = NULL;
+-      trapped_thread = current;
+-#endif
+-
+-      /* Enter GDB mode (e.g. after detach) */
+-      if (!kgdb_in_gdb_mode) {
+-              /* Do serial setup, notify user, issue preemptive ack */
+-              kgdb_serial_setup();
+-              KGDB_PRINTK("Waiting for GDB (on %s%d at %d baud)\n",
+-                          (kgdb_porttype ? kgdb_porttype->name : ""),
+-                          kgdb_portnum, kgdb_baud);
+-              kgdb_in_gdb_mode = 1;
+-              put_debug_char('+');
+-      }
+-
+-      /* Reply to host that an exception has occurred */
+-      sigval = compute_signal(excep_code);
+-      send_signal_msg(sigval);
+-
+-      /* TRAP_VEC exception indicates a software trap inserted in place of
+-         code by GDB so back up PC by one instruction, as this instruction
+-         will later be replaced by its original one.  Do NOT do this for
+-         trap 0xff, since that indicates a compiled-in breakpoint which
+-         will not be replaced (and we would retake the trap forever) */
+-      if ((excep_code == TRAP_VEC) && (trapa_value != (0xff << 2))) {
+-              trap_registers.pc -= 2;
+-      }
+-
+-      /* Undo any stepping we may have done */
+-      undo_single_step();
+-
+-      while (1) {
+-
+-              out_buffer[0] = 0;
+-              get_packet(in_buffer, BUFMAX);
+-
+-              /* Examine first char of buffer to see what we need to do */
+-              switch (in_buffer[0]) {
+-
+-              case '?':       /* Send which signal we've received */
+-                      send_signal_msg(sigval);
+-                      break;
+-
+-              case 'g':       /* Return the values of the CPU registers */
+-                      send_regs_msg();
+-                      break;
+-
+-              case 'G':       /* Set the value of the CPU registers */
+-                      set_regs_msg();
+-                      break;
+-
+-              case 'm':       /* Read LLLL bytes address AA..AA */
+-                      read_mem_msg();
+-                      break;
+-
+-              case 'M':       /* Write LLLL bytes address AA..AA, ret OK */
+-                      write_mem_msg(0);       /* 0 = data in hex */
+-                      break;
+-
+-              case 'X':       /* Write LLLL bytes esc bin address AA..AA */
+-                      if (kgdb_bits == '8')
+-                              write_mem_msg(1); /* 1 = data in binary */
+-                      else
+-                              send_empty_msg();
+-                      break;
+-
+-              case 'C':       /* Continue, signum included, we ignore it */
+-                      continue_with_sig_msg();
+-                      return;
+-
+-              case 'c':       /* Continue at address AA..AA (optional) */
+-                      continue_msg();
+-                      return;
+-
+-              case 'S':       /* Step, signum included, we ignore it */
+-                      step_with_sig_msg();
+-                      return;
+-
+-              case 's':       /* Step one instruction from AA..AA */
+-                      step_msg();
+-                      return;
+-
+-#ifdef CONFIG_KGDB_THREAD
+-
+-              case 'H':       /* Task related */
+-                      set_thread_msg();
+-                      break;
+-
+-              case 'T':       /* Query thread status */
+-                      thread_status_msg();
+-                      break;
+-
+-              case 'q':       /* Handle query - currently thread-related */
+-                      query_msg();
+-                      break;
+-#endif
+-
+-              case 'k':       /* 'Kill the program' with a kernel ? */
+-                      break;
+-
+-              case 'D':       /* Detach from program, send reply OK */
+-                      kgdb_in_gdb_mode = 0;
+-                      send_ok_msg();
+-                      get_debug_char();
+-                      return;
+-
+-              default:
+-                      send_empty_msg();
+-                      break;
+-              }
+-      }
+-}
+-
+-/* There has been an exception, most likely a breakpoint. */
+-void kgdb_handle_exception(struct pt_regs *regs)
+-{
+-      int excep_code, vbr_val;
+-      int count;
+-      int trapa_value = ctrl_inl(TRA);
+-
+-      /* Copy kernel regs (from stack) */
+-      for (count = 0; count < 16; count++)
+-              trap_registers.regs[count] = regs->regs[count];
+-      trap_registers.pc = regs->pc;
+-      trap_registers.pr = regs->pr;
+-      trap_registers.sr = regs->sr;
+-      trap_registers.gbr = regs->gbr;
+-      trap_registers.mach = regs->mach;
+-      trap_registers.macl = regs->macl;
+-
+-      asm("stc vbr, %0":"=r"(vbr_val));
+-      trap_registers.vbr = vbr_val;
+-
+-      /* Get excode for command loop call, user access */
+-      asm("stc r2_bank, %0":"=r"(excep_code));
+-      kgdb_excode = excep_code;
+-
+-      /* Other interesting environment items for reference */
+-      asm("stc r6_bank, %0":"=r"(kgdb_g_imask));
+-      kgdb_current = current;
+-      kgdb_trapa_val = trapa_value;
+-
+-      /* Act on the exception */
+-      kgdb_command_loop(excep_code >> 5, trapa_value);
+-
+-      kgdb_current = NULL;
+-
+-      /* Copy back the (maybe modified) registers */
+-      for (count = 0; count < 16; count++)
+-              regs->regs[count] = trap_registers.regs[count];
+-      regs->pc = trap_registers.pc;
+-      regs->pr = trap_registers.pr;
+-      regs->sr = trap_registers.sr;
+-      regs->gbr = trap_registers.gbr;
+-      regs->mach = trap_registers.mach;
+-      regs->macl = trap_registers.macl;
+-
+-      vbr_val = trap_registers.vbr;
+-      asm("ldc %0, vbr": :"r"(vbr_val));
+-
+-      return;
+-}
+-
+-/* Trigger a breakpoint by function */
+-void breakpoint(void)
+-{
+-      if (!kgdb_enabled) {
+-              kgdb_enabled = 1;
+-              kgdb_init();
+-      }
+-      BREAKPOINT();
+-}
+-
+-/* Initialise the KGDB data structures and serial configuration */
+-int kgdb_init(void)
+-{
+-      if (!kgdb_enabled)
+-              return 1;
+-
+-      in_nmi = 0;
+-      kgdb_nofault = 0;
+-      stepped_opcode = 0;
+-      kgdb_in_gdb_mode = 0;
+-
+-      if (kgdb_serial_setup() != 0) {
+-              KGDB_PRINTK("serial setup error\n");
+-              return -1;
+-      }
+-
+-      /* Init ptr to exception handler */
+-      kgdb_debug_hook = kgdb_handle_exception;
+-      kgdb_bus_err_hook = kgdb_handle_bus_error;
+-
+-      /* Enter kgdb now if requested, or just report init done */
+-      if (kgdb_halt) {
+-              kgdb_in_gdb_mode = 1;
+-              put_debug_char('+');
+-              breakpoint();
+-      }
+-      else
+-      {
+-              KGDB_PRINTK("stub is initialized.\n");
+-      }
+-
+-      return 0;
+-}
+-
+-/* Make function available for "user messages"; console will use it too. */
+-
+-char gdbmsgbuf[BUFMAX];
+-#define MAXOUT ((BUFMAX-2)/2)
+-
+-static void kgdb_msg_write(const char *s, unsigned count)
+-{
+-      int i;
+-      int wcount;
+-      char *bufptr;
+-
+-      /* 'O'utput */
+-      gdbmsgbuf[0] = 'O';
+-
+-      /* Fill and send buffers... */
+-      while (count > 0) {
+-              bufptr = gdbmsgbuf + 1;
+-
+-              /* Calculate how many this time */
+-              wcount = (count > MAXOUT) ? MAXOUT : count;
+-              
+-              /* Pack in hex chars */
+-              for (i = 0; i < wcount; i++)
+-                      bufptr = pack_hex_byte(bufptr, s[i]);
+-              *bufptr = '\0';
+-
+-              /* Move up */
+-              s += wcount;
+-              count -= wcount;
+-
+-              /* Write packet */
+-              put_packet(gdbmsgbuf);
+-      }
+-}
+-
+-static void kgdb_to_gdb(const char *s)
+-{
+-      kgdb_msg_write(s, strlen(s));
+-}
+-
+-#ifdef CONFIG_SH_KGDB_CONSOLE
+-void kgdb_console_write(struct console *co, const char *s, unsigned count)
+-{
+-      /* Bail if we're not talking to GDB */
+-      if (!kgdb_in_gdb_mode)
+-              return;
+-
+-      kgdb_msg_write(s, count);
+-}
+-#endif
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/setup.c linux-2.6.18.kgdb/arch/sh/kernel/setup.c
+--- linux-2.6.18/arch/sh/kernel/setup.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/sh/kernel/setup.c   2008-06-10 16:19:47.000000000 +0400
+@@ -28,10 +28,6 @@
+ #include <asm/setup.h>
+ #include <asm/clock.h>
+ 
+-#ifdef CONFIG_SH_KGDB
+-#include <asm/kgdb.h>
+-static int kgdb_parse_options(char *options);
+-#endif
+ extern void * __rd_start, * __rd_end;
+ /*
+  * Machine setup..
+@@ -528,93 +524,3 @@ struct seq_operations cpuinfo_op = {
+       .show   = show_cpuinfo,
+ };
+ #endif /* CONFIG_PROC_FS */
+-
+-#ifdef CONFIG_SH_KGDB
+-/*
+- * Parse command-line kgdb options.  By default KGDB is enabled,
+- * entered on error (or other action) using default serial info.
+- * The command-line option can include a serial port specification
+- * and an action to override default or configured behavior.
+- */
+-struct kgdb_sermap kgdb_sci_sermap =
+-{ "ttySC", 5, kgdb_sci_setup, NULL };
+-
+-struct kgdb_sermap *kgdb_serlist = &kgdb_sci_sermap;
+-struct kgdb_sermap *kgdb_porttype = &kgdb_sci_sermap;
+-
+-void kgdb_register_sermap(struct kgdb_sermap *map)
+-{
+-      struct kgdb_sermap *last;
+-
+-      for (last = kgdb_serlist; last->next; last = last->next)
+-              ;
+-      last->next = map;
+-      if (!map->namelen) {
+-              map->namelen = strlen(map->name);
+-      }
+-}
+-
+-static int __init kgdb_parse_options(char *options)
+-{
+-      char c;
+-      int baud;
+-
+-      /* Check for port spec (or use default) */
+-
+-      /* Determine port type and instance */
+-      if (!memcmp(options, "tty", 3)) {
+-              struct kgdb_sermap *map = kgdb_serlist;
+-
+-              while (map && memcmp(options, map->name, map->namelen))
+-                      map = map->next;
+-
+-              if (!map) {
+-                      KGDB_PRINTK("unknown port spec in %s\n", options);
+-                      return -1;
+-              }
+-
+-              kgdb_porttype = map;
+-              kgdb_serial_setup = map->setup_fn;
+-              kgdb_portnum = options[map->namelen] - '0';
+-              options += map->namelen + 1;
+-
+-              options = (*options == ',') ? options+1 : options;
+-
+-              /* Read optional parameters (baud/parity/bits) */
+-              baud = simple_strtoul(options, &options, 10);
+-              if (baud != 0) {
+-                      kgdb_baud = baud;
+-
+-                      c = toupper(*options);
+-                      if (c == 'E' || c == 'O' || c == 'N') {
+-                              kgdb_parity = c;
+-                              options++;
+-                      }
+-
+-                      c = *options;
+-                      if (c == '7' || c == '8') {
+-                              kgdb_bits = c;
+-                              options++;
+-                      }
+-                      options = (*options == ',') ? options+1 : options;
+-              }
+-      }
+-
+-      /* Check for action specification */
+-      if (!memcmp(options, "halt", 4)) {
+-              kgdb_halt = 1;
+-              options += 4;
+-      } else if (!memcmp(options, "disabled", 8)) {
+-              kgdb_enabled = 0;
+-              options += 8;
+-      }
+-
+-      if (*options) {
+-                KGDB_PRINTK("ignored unknown options: %s\n", options);
+-              return 0;
+-      }
+-      return 1;
+-}
+-__setup("kgdb=", kgdb_parse_options);
+-#endif /* CONFIG_SH_KGDB */
+-
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/time.c linux-2.6.18.kgdb/arch/sh/kernel/time.c
+--- linux-2.6.18/arch/sh/kernel/time.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/sh/kernel/time.c    2008-06-10 16:19:47.000000000 +0400
+@@ -184,12 +184,4 @@ void __init time_init(void)
+        */
+       sys_timer = get_sys_timer();
+       printk(KERN_INFO "Using %s for system timer\n", sys_timer->name);
+-
+-#if defined(CONFIG_SH_KGDB)
+-      /*
+-       * Set up kgdb as requested. We do it here because the serial
+-       * init uses the timer vars we just set up for figuring baud.
+-       */
+-      kgdb_init();
+-#endif
+ }
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/traps.c linux-2.6.18.kgdb/arch/sh/kernel/traps.c
+--- linux-2.6.18/arch/sh/kernel/traps.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/sh/kernel/traps.c   2008-06-10 16:19:47.000000000 +0400
+@@ -26,6 +26,7 @@
+ #include <linux/spinlock.h>
+ #include <linux/module.h>
+ #include <linux/kallsyms.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/system.h>
+ #include <asm/uaccess.h>
+@@ -34,17 +35,8 @@
+ #include <asm/processor.h>
+ #include <asm/sections.h>
+ 
+-#ifdef CONFIG_SH_KGDB
+-#include <asm/kgdb.h>
+-#define CHK_REMOTE_DEBUG(regs)                                               \
+-{                                                                            \
+-  if ((kgdb_debug_hook != (kgdb_debug_hook_t *) NULL) && (!user_mode(regs))) \
+-  {                                                                          \
+-    (*kgdb_debug_hook)(regs);                                                \
+-  }                                                                          \
+-}
+-#else
+-#define CHK_REMOTE_DEBUG(regs)
++#ifndef CONFIG_KGDB
++#define kgdb_handle_exception(t, s, e, r)
+ #endif
+ 
+ #define DO_ERROR(trapnr, signr, str, name, tsk)                               \
+@@ -65,7 +57,7 @@ asmlinkage void do_##name(unsigned long 
+       local_irq_enable();                                             \
+       tsk->thread.error_code = error_code;                            \
+       tsk->thread.trap_no = trapnr;                                   \
+-        CHK_REMOTE_DEBUG(&regs);                                      \
++      kgdb_handle_exception(trapnr, signr, error_code, &regs);        \
+       force_sig(signr, tsk);                                          \
+       die_if_no_fixup(str,&regs,error_code);                          \
+ }
+@@ -92,10 +84,12 @@ void die(const char * str, struct pt_reg
+ {
+       static int die_counter;
+ 
++#ifdef CONFIG_KGDB
++      kgdb_handle_exception(1, SIGTRAP, err, regs);
++#endif
+       console_verbose();
+       spin_lock_irq(&die_lock);
+       printk("%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
+-      CHK_REMOTE_DEBUG(regs);
+       show_regs(regs);
+       spin_unlock_irq(&die_lock);
+       do_exit(SIGSEGV);
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/mm/extable.c linux-2.6.18.kgdb/arch/sh/mm/extable.c
+--- linux-2.6.18/arch/sh/mm/extable.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/sh/mm/extable.c     2008-06-10 16:19:47.000000000 +0400
+@@ -5,6 +5,7 @@
+  */
+ 
+ #include <linux/module.h>
++#include <linux/kgdb.h>
+ #include <asm/uaccess.h>
+ 
+ int fixup_exception(struct pt_regs *regs)
+@@ -16,6 +17,12 @@ int fixup_exception(struct pt_regs *regs
+               regs->pc = fixup->fixup;
+               return 1;
+       }
++#ifdef CONFIG_KGDB
++      if (atomic_read(&debugger_active) && kgdb_may_fault)
++              /* Restore our previous state. */
++              kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++              /* Never reached. */
++#endif
+ 
+       return 0;
+ }
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/mm/fault-nommu.c linux-2.6.18.kgdb/arch/sh/mm/fault-nommu.c
+--- linux-2.6.18/arch/sh/mm/fault-nommu.c      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/sh/mm/fault-nommu.c 2008-06-10 16:19:47.000000000 +0400
+@@ -29,10 +29,6 @@
+ #include <asm/mmu_context.h>
+ #include <asm/cacheflush.h>
+ 
+-#if defined(CONFIG_SH_KGDB)
+-#include <asm/kgdb.h>
+-#endif
+-
+ extern void die(const char *,struct pt_regs *,long);
+ 
+ /*
+@@ -43,11 +39,6 @@ extern void die(const char *,struct pt_r
+ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
+                             unsigned long address)
+ {
+-#if defined(CONFIG_SH_KGDB)
+-      if (kgdb_nofault && kgdb_bus_err_hook)
+-              kgdb_bus_err_hook();
+-#endif
+-
+       /*
+        * Oops. The kernel tried to access some bad page. We'll have to
+        * terminate things with extreme prejudice.
+@@ -69,11 +60,6 @@ asmlinkage void do_page_fault(struct pt_
+ asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess,
+                              unsigned long address)
+ {
+-#if defined(CONFIG_SH_KGDB)
+-      if (kgdb_nofault && kgdb_bus_err_hook)
+-              kgdb_bus_err_hook();
+-#endif
+-
+       if (address >= TASK_SIZE)
+               return 1;
+ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/mm/fault.c linux-2.6.18.kgdb/arch/sh/mm/fault.c
+--- linux-2.6.18/arch/sh/mm/fault.c    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/sh/mm/fault.c       2008-06-10 16:19:47.000000000 +0400
+@@ -28,7 +28,6 @@
+ #include <asm/pgalloc.h>
+ #include <asm/mmu_context.h>
+ #include <asm/cacheflush.h>
+-#include <asm/kgdb.h>
+ 
+ extern void die(const char *,struct pt_regs *,long);
+ 
+@@ -45,11 +44,6 @@ asmlinkage void do_page_fault(struct pt_
+       struct vm_area_struct * vma;
+       unsigned long page;
+ 
+-#ifdef CONFIG_SH_KGDB
+-      if (kgdb_nofault && kgdb_bus_err_hook)
+-              kgdb_bus_err_hook();
+-#endif
+-
+       tsk = current;
+       mm = tsk->mm;
+ 
+@@ -153,6 +147,7 @@ no_context:
+       }
+       die("Oops", regs, writeaccess);
+       do_exit(SIGKILL);
++      dump_stack();
+ 
+ /*
+  * We ran out of memory, or some other thing happened to us that made
+@@ -202,11 +197,6 @@ asmlinkage int __do_page_fault(struct pt
+       spinlock_t *ptl;
+       int ret = 1;
+ 
+-#ifdef CONFIG_SH_KGDB
+-      if (kgdb_nofault && kgdb_bus_err_hook)
+-              kgdb_bus_err_hook();
+-#endif
+-
+ #ifdef CONFIG_SH_STORE_QUEUES
+       addrmax = P4SEG_STORE_QUE + 0x04000000;
+ #endif
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/x86_64/Kconfig.debug linux-2.6.18.kgdb/arch/x86_64/Kconfig.debug
+--- linux-2.6.18/arch/x86_64/Kconfig.debug     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/x86_64/Kconfig.debug        2008-06-10 16:19:41.000000000 +0400
+@@ -55,7 +55,4 @@ config DEBUG_STACK_USAGE
+ 
+         This option will slow down process creation somewhat.
+ 
+-#config X86_REMOTE_DEBUG
+-#       bool "kgdb debugging stub"
+-
+ endmenu
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/x86_64/kernel/Makefile linux-2.6.18.kgdb/arch/x86_64/kernel/Makefile
+--- linux-2.6.18/arch/x86_64/kernel/Makefile   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/x86_64/kernel/Makefile      2008-06-10 16:19:41.000000000 +0400
+@@ -33,6 +33,7 @@ obj-$(CONFIG_IOMMU)          += pci-gart.o apert
+ obj-$(CONFIG_CALGARY_IOMMU)   += pci-calgary.o tce.o
+ obj-$(CONFIG_SWIOTLB)         += pci-swiotlb.o
+ obj-$(CONFIG_KPROBES)         += kprobes.o
++obj-$(CONFIG_KGDB)            += kgdb.o kgdb-jmp.o
+ obj-$(CONFIG_X86_PM_TIMER)    += pmtimer.o
+ obj-$(CONFIG_X86_VSMP)                += vsmp.o
+ obj-$(CONFIG_K8_NB)           += k8.o
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/x86_64/kernel/entry.S linux-2.6.18.kgdb/arch/x86_64/kernel/entry.S
+--- linux-2.6.18/arch/x86_64/kernel/entry.S    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/x86_64/kernel/entry.S       2008-06-10 16:19:58.000000000 +0400
+@@ -42,6 +42,7 @@
+ #include <asm/hw_irq.h>
+ #include <asm/page.h>
+ #include <asm/irqflags.h>
++#include <asm/kgdb.h>
+ 
+       .code64
+ 
+@@ -881,6 +882,7 @@ error_exit:                
+       RESTORE_ARGS 0,8,0                                              
+       jmp iret_label
+       CFI_ENDPROC
++      CFI_END_FRAME(kernel_thread)
+ 
+ error_kernelspace:
+       incl %ebx
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/x86_64/kernel/kgdb-jmp.S linux-2.6.18.kgdb/arch/x86_64/kernel/kgdb-jmp.S
+--- linux-2.6.18/arch/x86_64/kernel/kgdb-jmp.S 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/x86_64/kernel/kgdb-jmp.S    2008-06-10 16:19:41.000000000 +0400
+@@ -0,0 +1,65 @@
++/*
++ * arch/x86_64/kernel/kgdb-jmp.S
++ *
++ * Save and restore system registers so that within a limited frame we
++ * may have a fault and "jump back" to a known safe location.
++ *
++ * Author: Tom Rini <trini@kernel.crashing.org>
++ *
++ * Cribbed from glibc, which carries the following:
++ * Copyright (C) 2001, 2003, 2004 Free Software Foundation, Inc.
++ * Copyright (C) 2005 by MontaVista Software.
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of
++ * any kind, whether express or implied.
++ */
++
++#include <linux/linkage.h>
++
++#define JB_RBX                0
++#define JB_RBP                1
++#define JB_R12                2
++#define JB_R13                3
++#define JB_R14                4
++#define JB_R15                5
++#define JB_RSP                6
++#define JB_PC         7
++
++      .code64
++
++/* This must be called prior to kgdb_fault_longjmp and
++ * kgdb_fault_longjmp must not be called outside of the context of the
++ * last call to kgdb_fault_setjmp.
++ */
++ENTRY(kgdb_fault_setjmp)
++      /* Save registers. */
++      movq %rbx, (JB_RBX*8)(%rdi)
++      movq %rbp, (JB_RBP*8)(%rdi)
++      movq %r12, (JB_R12*8)(%rdi)
++      movq %r13, (JB_R13*8)(%rdi)
++      movq %r14, (JB_R14*8)(%rdi)
++      movq %r15, (JB_R15*8)(%rdi)
++      leaq 8(%rsp), %rdx      /* Save SP as it will be after we return. */
++      movq %rdx, (JB_RSP*8)(%rdi)
++      movq (%rsp), %rax       /* Save PC we are returning to now. */
++      movq %rax, (JB_PC*8)(%rdi)
++      /* Set return value for setjmp. */
++      mov $0,%eax
++      movq (JB_PC*8)(%rdi),%rdx
++      movq (JB_RSP*8)(%rdi),%rsp
++      jmpq *%rdx
++
++ENTRY(kgdb_fault_longjmp)
++      /* Restore registers. */
++      movq (JB_RBX*8)(%rdi),%rbx
++      movq (JB_RBP*8)(%rdi),%rbp
++      movq (JB_R12*8)(%rdi),%r12
++      movq (JB_R13*8)(%rdi),%r13
++      movq (JB_R14*8)(%rdi),%r14
++      movq (JB_R15*8)(%rdi),%r15
++      /* Set return value for setjmp. */
++      movq (JB_PC*8)(%rdi),%rdx
++      movq (JB_RSP*8)(%rdi),%rsp
++      mov $1,%eax
++      jmpq *%rdx
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/x86_64/kernel/kgdb.c linux-2.6.18.kgdb/arch/x86_64/kernel/kgdb.c
+--- linux-2.6.18/arch/x86_64/kernel/kgdb.c     1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/arch/x86_64/kernel/kgdb.c        2008-06-10 16:19:41.000000000 +0400
+@@ -0,0 +1,474 @@
++/*
++ *
++ * This program is free software; you can redistribute it and/or modify it
++ * under the terms of the GNU General Public License as published by the
++ * Free Software Foundation; either version 2, or (at your option) any
++ * later version.
++ *
++ * This program is distributed in the hope that it will be useful, but
++ * WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * General Public License for more details.
++ *
++ */
++
++/*
++ * Copyright (C) 2004 Amit S. Kale <amitkale@linsyssoft.com>
++ * Copyright (C) 2000-2001 VERITAS Software Corporation.
++ * Copyright (C) 2002 Andi Kleen, SuSE Labs
++ * Copyright (C) 2004 LinSysSoft Technologies Pvt. Ltd.
++ */
++/****************************************************************************
++ *  Contributor:     Lake Stevens Instrument Division$
++ *  Written by:      Glenn Engel $
++ *  Updated by:            Amit Kale<akale@veritas.com>
++ *  Modified for 386 by Jim Kingdon, Cygnus Support.
++ *  Original kgdb, compatibility with 2.1.xx kernel by
++ *  David Grothe <dave@gcom.com>
++ *  Integrated into 2.2.5 kernel by Tigran Aivazian <tigran@sco.com>
++ *  X86_64 changes from Andi Kleen's patch merged by Jim Houston
++ */
++
++#include <linux/string.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/smp.h>
++#include <linux/spinlock.h>
++#include <linux/delay.h>
++#include <asm/system.h>
++#include <asm/ptrace.h>               /* for linux pt_regs struct */
++#include <linux/kgdb.h>
++#include <linux/init.h>
++#include <asm/apicdef.h>
++#include <asm/mach_apic.h>
++#include <asm/kdebug.h>
++#include <asm/debugreg.h>
++
++/* Put the error code here just in case the user cares.  */
++int gdb_x86_64errcode;
++/* Likewise, the vector number here (since GDB only gets the signal
++   number through the usual means, and that's not very specific).  */
++int gdb_x86_64vector = -1;
++
++extern atomic_t cpu_doing_single_step;
++
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      gdb_regs[_RAX] = regs->rax;
++      gdb_regs[_RBX] = regs->rbx;
++      gdb_regs[_RCX] = regs->rcx;
++      gdb_regs[_RDX] = regs->rdx;
++      gdb_regs[_RSI] = regs->rsi;
++      gdb_regs[_RDI] = regs->rdi;
++      gdb_regs[_RBP] = regs->rbp;
++      gdb_regs[_PS] = regs->eflags;
++      gdb_regs[_PC] = regs->rip;
++      gdb_regs[_R8] = regs->r8;
++      gdb_regs[_R9] = regs->r9;
++      gdb_regs[_R10] = regs->r10;
++      gdb_regs[_R11] = regs->r11;
++      gdb_regs[_R12] = regs->r12;
++      gdb_regs[_R13] = regs->r13;
++      gdb_regs[_R14] = regs->r14;
++      gdb_regs[_R15] = regs->r15;
++      gdb_regs[_RSP] = regs->rsp;
++}
++
++extern void thread_return(void);
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
++{
++      gdb_regs[_RAX] = 0;
++      gdb_regs[_RBX] = 0;
++      gdb_regs[_RCX] = 0;
++      gdb_regs[_RDX] = 0;
++      gdb_regs[_RSI] = 0;
++      gdb_regs[_RDI] = 0;
++      gdb_regs[_RBP] = *(unsigned long *)p->thread.rsp;
++      gdb_regs[_PS] = *(unsigned long *)(p->thread.rsp + 8);
++      gdb_regs[_PC] = (unsigned long)&thread_return;
++      gdb_regs[_R8] = 0;
++      gdb_regs[_R9] = 0;
++      gdb_regs[_R10] = 0;
++      gdb_regs[_R11] = 0;
++      gdb_regs[_R12] = 0;
++      gdb_regs[_R13] = 0;
++      gdb_regs[_R14] = 0;
++      gdb_regs[_R15] = 0;
++      gdb_regs[_RSP] = p->thread.rsp;
++}
++
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++      regs->rax = gdb_regs[_RAX];
++      regs->rbx = gdb_regs[_RBX];
++      regs->rcx = gdb_regs[_RCX];
++      regs->rdx = gdb_regs[_RDX];
++      regs->rsi = gdb_regs[_RSI];
++      regs->rdi = gdb_regs[_RDI];
++      regs->rbp = gdb_regs[_RBP];
++      regs->eflags = gdb_regs[_PS];
++      regs->rip = gdb_regs[_PC];
++      regs->r8 = gdb_regs[_R8];
++      regs->r9 = gdb_regs[_R9];
++      regs->r10 = gdb_regs[_R10];
++      regs->r11 = gdb_regs[_R11];
++      regs->r12 = gdb_regs[_R12];
++      regs->r13 = gdb_regs[_R13];
++      regs->r14 = gdb_regs[_R14];
++      regs->r15 = gdb_regs[_R15];
++#if 0                         /* can't change these */
++      regs->rsp = gdb_regs[_RSP];
++      regs->ss = gdb_regs[_SS];
++      regs->fs = gdb_regs[_FS];
++      regs->gs = gdb_regs[_GS];
++#endif
++
++}                             /* gdb_regs_to_regs */
++
++struct hw_breakpoint {
++      unsigned enabled;
++      unsigned type;
++      unsigned len;
++      unsigned long addr;
++} breakinfo[4] = { {
++enabled:0}, {
++enabled:0}, {
++enabled:0}, {
++enabled:0}};
++
++void kgdb_correct_hw_break(void)
++{
++      int breakno;
++      int correctit;
++      int breakbit;
++      unsigned long dr7;
++
++      asm volatile ("movq %%db7, %0\n":"=r" (dr7):);
++      do {
++              unsigned long addr0, addr1, addr2, addr3;
++              asm volatile ("movq %%db0, %0\n"
++                            "movq %%db1, %1\n"
++                            "movq %%db2, %2\n"
++                            "movq %%db3, %3\n":"=r" (addr0), "=r"(addr1),
++                            "=r"(addr2), "=r"(addr3):);
++      } while (0);
++      correctit = 0;          /* set when DR7 has to be rewritten */
++      for (breakno = 0; breakno < 4; breakno++) {     /* all of breakinfo[] */
++              breakbit = 2 << (breakno << 1);
++              if (!(dr7 & breakbit) && breakinfo[breakno].enabled) {
++                      correctit = 1;
++                      dr7 |= breakbit;
++                      dr7 &= ~(0xf0000UL << (breakno << 2));
++                      dr7 |= (((breakinfo[breakno].len << 2) |
++                               breakinfo[breakno].type) << 16) <<
++                          (breakno << 2);
++                      switch (breakno) {
++                      case 0:
++                              asm volatile ("movq %0, %%dr0\n"::"r"
++                                            (breakinfo[breakno].addr));
++                              break;
++
++                      case 1:
++                              asm volatile ("movq %0, %%dr1\n"::"r"
++                                            (breakinfo[breakno].addr));
++                              break;
++
++                      case 2:
++                              asm volatile ("movq %0, %%dr2\n"::"r"
++                                            (breakinfo[breakno].addr));
++                              break;
++
++                      case 3:
++                              asm volatile ("movq %0, %%dr3\n"::"r"
++                                            (breakinfo[breakno].addr));
++                              break;
++                      }
++              } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) {
++                      correctit = 1;
++                      dr7 &= ~breakbit;
++                      dr7 &= ~(0xf0000UL << (breakno << 2));
++              }
++      }
++      if (correctit) {
++              asm volatile ("movq %0, %%db7\n"::"r" (dr7));
++      }
++}
++
++int kgdb_remove_hw_break(unsigned long addr)
++{
++      int i, idx = -1;
++      for (i = 0; i < 4; i++) {
++              if (breakinfo[i].addr == addr && breakinfo[i].enabled) {
++                      idx = i;
++                      break;
++              }
++      }
++      if (idx == -1)
++              return -1;
++
++      breakinfo[idx].enabled = 0;
++      return 0;
++}
++
++int kgdb_set_hw_break(unsigned long addr)
++{
++      int i, idx = -1;
++      for (i = 0; i < 4; i++) {
++              if (!breakinfo[i].enabled) {
++                      idx = i;
++                      break;
++              }
++      }
++      if (idx == -1)
++              return -1;
++
++      breakinfo[idx].enabled = 1;
++      breakinfo[idx].type = 1;
++      breakinfo[idx].len = 1;
++      breakinfo[idx].addr = addr;
++      return 0;
++}
++
++int remove_hw_break(unsigned breakno)
++{
++      if (!breakinfo[breakno].enabled) {
++              return -1;
++      }
++      breakinfo[breakno].enabled = 0;
++      return 0;
++}
++
++int set_hw_break(unsigned breakno, unsigned type, unsigned len,
++                 unsigned long addr)    /* must not truncate to 32 bits */
++{
++      if (breakinfo[breakno].enabled)
++              return -1;
++      breakinfo[breakno].enabled = 1;
++      breakinfo[breakno].type = type;
++      breakinfo[breakno].len = len;
++      breakinfo[breakno].addr = addr;
++      return 0;
++}
++
++void kgdb_disable_hw_debug(struct pt_regs *regs)
++{
++      /* Disable hardware debugging while we are in kgdb */
++      asm volatile ("movq %0,%%db7": /* no output */ :"r" (0UL));
++}
++
++void kgdb_post_master_code(struct pt_regs *regs, int e_vector, int err_code)
++{
++      /* Master processor is completely in the debugger */
++      gdb_x86_64vector = e_vector;
++      gdb_x86_64errcode = err_code;
++}
++
++void kgdb_roundup_cpus(unsigned long flags)
++{
++      send_IPI_allbutself(APIC_DM_NMI);
++}
++
++int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
++                             char *remcomInBuffer, char *remcomOutBuffer,
++                             struct pt_regs *linux_regs)
++{
++      unsigned long addr, length;
++      unsigned long breakno, breaktype;
++      char *ptr;
++      int newPC;
++      unsigned long dr6;
++
++      switch (remcomInBuffer[0]) {
++      case 'c':
++      case 's':
++              /* try to read optional parameter, pc unchanged if no parm */
++              ptr = &remcomInBuffer[1];
++              if (kgdb_hex2long(&ptr, &addr))
++                      linux_regs->rip = addr;
++              newPC = linux_regs->rip;
++
++              /* clear the trace bit */
++              linux_regs->eflags &= ~TF_MASK;
++
++              atomic_set(&cpu_doing_single_step, -1);
++              /* set the trace bit if we're stepping */
++              if (remcomInBuffer[0] == 's') {
++                      linux_regs->eflags |= TF_MASK;
++                      debugger_step = 1;
++                      if (kgdb_contthread)
++                              atomic_set(&cpu_doing_single_step,
++                                         smp_processor_id());
++
++              }
++
++              asm volatile ("movq %%db6, %0\n":"=r" (dr6));
++              if (!(dr6 & 0x4000)) {
++                      for (breakno = 0; breakno < 4; ++breakno) {
++                              if (dr6 & (1 << breakno)) {
++                                      if (breakinfo[breakno].type == 0) {
++                                              /* Set restore flag */
++                                              linux_regs->eflags |=
++                                                  X86_EFLAGS_RF;
++                                              break;
++                                      }
++                              }
++                      }
++              }
++              kgdb_correct_hw_break();
++              asm volatile ("movq %0, %%db6\n"::"r" (0UL));
++
++              return (0);
++
++      case 'Y':
++              ptr = &remcomInBuffer[1];
++              kgdb_hex2long(&ptr, &breakno);
++              ptr++;
++              kgdb_hex2long(&ptr, &breaktype);
++              ptr++;
++              kgdb_hex2long(&ptr, &length);
++              ptr++;
++              kgdb_hex2long(&ptr, &addr);
++              if (set_hw_break(breakno & 0x3, breaktype & 0x3,
++                               length & 0x3, addr) == 0)
++                      strcpy(remcomOutBuffer, "OK");
++              else
++                      strcpy(remcomOutBuffer, "ERROR");
++              break;
++
++              /* Remove hardware breakpoint */
++      case 'y':
++              ptr = &remcomInBuffer[1];
++              kgdb_hex2long(&ptr, &breakno);
++              if (remove_hw_break(breakno & 0x3) == 0)
++                      strcpy(remcomOutBuffer, "OK");
++              else
++                      strcpy(remcomOutBuffer, "ERROR");
++              break;
++
++      }                       /* switch */
++      return -1;
++}
++
++static struct pt_regs *in_interrupt_stack(unsigned long rsp, int cpu)
++{
++      struct pt_regs *regs;
++      unsigned long end = (unsigned long)cpu_pda(cpu)->irqstackptr;
++      if (rsp <= end && rsp >= end - IRQSTACKSIZE + 8) {
++              regs = *(((struct pt_regs **)end) - 1);
++              return regs;
++      }
++      return NULL;
++}
++
++static struct pt_regs *in_exception_stack(unsigned long rsp, int cpu)
++{
++      int i;
++      struct tss_struct *init_tss = &__get_cpu_var(init_tss);
++      for (i = 0; i < N_EXCEPTION_STACKS; i++)
++              if (rsp >= init_tss[cpu].ist[i] &&
++                  rsp <= init_tss[cpu].ist[i] + EXCEPTION_STKSZ) {
++                      struct pt_regs *r =
++                          (void *)init_tss[cpu].ist[i] + EXCEPTION_STKSZ;
++                      return r - 1;
++              }
++      return NULL;
++}
++
++void kgdb_shadowinfo(struct pt_regs *regs, char *buffer, unsigned threadid)
++{
++      static char intr_desc[] = "Stack at interrupt entrypoint";
++      static char exc_desc[] = "Stack at exception entrypoint";
++      struct pt_regs *stregs;
++      int cpu = hard_smp_processor_id();
++
++      if ((stregs = in_interrupt_stack(regs->rsp, cpu)))
++              kgdb_mem2hex(intr_desc, buffer, strlen(intr_desc));
++      else if ((stregs = in_exception_stack(regs->rsp, cpu)))
++              kgdb_mem2hex(exc_desc, buffer, strlen(exc_desc));
++}
++
++struct task_struct *kgdb_get_shadow_thread(struct pt_regs *regs, int threadid)
++{
++      struct pt_regs *stregs;
++      int cpu = hard_smp_processor_id();
++
++      if ((stregs = in_interrupt_stack(regs->rsp, cpu)))
++              return current;
++      else if ((stregs = in_exception_stack(regs->rsp, cpu)))
++              return current;
++
++      return NULL;
++}
++
++struct pt_regs *kgdb_shadow_regs(struct pt_regs *regs, int threadid)
++{
++      struct pt_regs *stregs;
++      int cpu = hard_smp_processor_id();
++
++      if ((stregs = in_interrupt_stack(regs->rsp, cpu)))
++              return stregs;
++      else if ((stregs = in_exception_stack(regs->rsp, cpu)))
++              return stregs;
++
++      return NULL;
++}
++
++/* Register KGDB with the die_chain so that we hook into all of the right
++ * spots. */
++static int kgdb_notify(struct notifier_block *self, unsigned long cmd,
++                     void *ptr)
++{
++      struct die_args *args = ptr;
++      struct pt_regs *regs = args->regs;
++
++      if (cmd == DIE_PAGE_FAULT_NO_CONTEXT && atomic_read(&debugger_active)
++                      && kgdb_may_fault) {
++              kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++              return NOTIFY_STOP;
++      /* CPU roundup? */
++      } else if (atomic_read(&debugger_active) && cmd == DIE_NMI_IPI) {
++              kgdb_nmihook(smp_processor_id(), regs);
++              return NOTIFY_STOP;
++              /* See if KGDB is interested. */
++      } else if (cmd == DIE_PAGE_FAULT || user_mode(regs) ||
++                 cmd == DIE_NMI_IPI || (cmd == DIE_DEBUG &&
++                                        atomic_read(&debugger_active)))
++              /* Userspace events, normal watchdog event, or spurious
++               * debug exception.  Ignore. */
++              return NOTIFY_DONE;
++
++      kgdb_handle_exception(args->trapnr, args->signr, args->err, regs);
++
++      return NOTIFY_STOP;
++}
++
++static struct notifier_block kgdb_notifier = {
++      .notifier_call = kgdb_notify,
++      .priority = 0x7fffffff, /* we need to be notified first */
++};
++
++int kgdb_arch_init(void)
++{
++      atomic_notifier_chain_register(&die_chain, &kgdb_notifier);
++      return 0;
++}
++/*
++ * Skip an int3 exception when it occurs after a breakpoint has been
++ * removed. Backtrack eip by 1 since the int3 would have caused it to
++ * increment by 1.
++ */
++
++int kgdb_skipexception(int exception, struct pt_regs *regs)
++{
++      if (exception == 3 && kgdb_isremovedbreak(regs->rip - 1)) {
++              regs->rip -= 1;
++              return 1;
++      }
++      return 0;
++}
++
++struct kgdb_arch arch_kgdb_ops = {
++      .gdb_bpt_instr = {0xcc},
++      .flags = KGDB_HW_BREAKPOINT,
++      .shadowth = 1,
++};
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/x86_64/mm/fault.c linux-2.6.18.kgdb/arch/x86_64/mm/fault.c
+--- linux-2.6.18/arch/x86_64/mm/fault.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/arch/x86_64/mm/fault.c   2008-06-10 16:19:36.000000000 +0400
+@@ -557,6 +557,10 @@ no_context:
+       if (is_errata93(regs, address))
+               return; 
+ 
++      if (notify_die(DIE_PAGE_FAULT_NO_CONTEXT, "no context", regs,
++                              error_code, 14, SIGSEGV) == NOTIFY_STOP)
++              return;
++
+ /*
+  * Oops. The kernel tried to access some bad page. We'll have to
+  * terminate things with extreme prejudice.
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/char/keyboard.c linux-2.6.18.kgdb/drivers/char/keyboard.c
+--- linux-2.6.18/drivers/char/keyboard.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/char/keyboard.c  2008-06-10 16:20:02.000000000 +0400
+@@ -1174,6 +1174,7 @@ static void kbd_keycode(unsigned int key
+               sysrq_down = 0;
+       if (sysrq_down && down && !rep) {
+               handle_sysrq(kbd_sysrq_xlate[keycode], regs, tty);
++              sysrq_down = 0;         /* In case we miss the 'up' event. */
+               return;
+       }
+ #endif
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/net/Makefile linux-2.6.18.kgdb/drivers/net/Makefile
+--- linux-2.6.18/drivers/net/Makefile  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/net/Makefile     2008-06-10 16:19:13.000000000 +0400
+@@ -216,6 +216,7 @@ obj-$(CONFIG_ETRAX_ETHERNET) += cris/
+ obj-$(CONFIG_ENP2611_MSF_NET) += ixp2000/
+ 
+ obj-$(CONFIG_NETCONSOLE) += netconsole.o
++obj-$(CONFIG_KGDBOE) += kgdboe.o
+ 
+ obj-$(CONFIG_FS_ENET) += fs_enet/
+ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/net/kgdboe.c linux-2.6.18.kgdb/drivers/net/kgdboe.c
+--- linux-2.6.18/drivers/net/kgdboe.c  1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/drivers/net/kgdboe.c     2008-06-10 16:19:13.000000000 +0400
+@@ -0,0 +1,294 @@
++/*
++ * drivers/net/kgdboe.c
++ *
++ * A network interface for GDB.
++ * Based upon 'gdbserial' by David Grothe <dave@gcom.com>
++ * and Scott Foehner <sfoehner@engr.sgi.com>
++ *
++ * Maintainers: Amit S. Kale <amitkale@linsyssoft.com> and
++ *            Tom Rini <trini@kernel.crashing.org>
++ *
++ * 2004 (c) Amit S. Kale <amitkale@linsyssoft.com>
++ * 2004-2005 (c) MontaVista Software, Inc.
++ * 2005 (c) Wind River Systems, Inc.
++ *
++ * Contributors at various stages not listed above:
++ * San Mehat <nettwerk@biodome.org>, Robert Walsh <rjwalsh@durables.org>,
++ * wangdi <wangdi@clusterfs.com>, Matt Mackall <mpm@selenic.com>,
++ * Pavel Machek <pavel@suse.cz>, Jason Wessel <jason.wessel@windriver.com>
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of any
++ * kind, whether express or implied.
++ */
++
++#include <linux/kernel.h>
++#include <linux/interrupt.h>
++#include <linux/string.h>
++#include <linux/kgdb.h>
++#include <linux/netpoll.h>
++#include <linux/init.h>
++
++#include <asm/atomic.h>
++
++#define IN_BUF_SIZE 512               /* power of 2, please */
++#define NOT_CONFIGURED_STRING "not_configured"
++#define OUT_BUF_SIZE 30               /* We don't want to send too big of a packet. */
++#define MAX_KGDBOE_CONFIG_STR 256
++
++static char in_buf[IN_BUF_SIZE], out_buf[OUT_BUF_SIZE];
++static int in_head, in_tail, out_count;
++static atomic_t in_count;
++/* 0 = unconfigured, 1 = netpoll options parsed, 2 = fully configured. */
++static int configured;
++static struct kgdb_io local_kgdb_io_ops;
++static int use_dynamic_mac;
++
++MODULE_DESCRIPTION("KGDB driver for network interfaces");
++MODULE_LICENSE("GPL");
++static char config[MAX_KGDBOE_CONFIG_STR] = NOT_CONFIGURED_STRING;
++static struct kparam_string kps = {
++      .string = config,
++      .maxlen = MAX_KGDBOE_CONFIG_STR,
++};
++
++static void rx_hook(struct netpoll *np, int port, char *msg, int len,
++                  struct sk_buff *skb)
++{
++      int i;
++
++      np->remote_port = port;
++
++      /* Copy the MAC address if we need to. */
++      if (use_dynamic_mac) {
++              memcpy(np->remote_mac, eth_hdr(skb)->h_source,
++                              sizeof(np->remote_mac));
++              use_dynamic_mac = 0;
++      }
++
++      /*
++       * This could be GDB trying to attach.  But it could also be GDB
++       * finishing up a session, with kgdb_connected=0 but GDB sending
++       * an ACK for the final packet.  To make sure we don't try and
++       * make a breakpoint when GDB is leaving, make sure that if
++       * !kgdb_connected the only len == 1 packet we allow is ^C.
++       */
++      if (!kgdb_connected && (len != 1 || msg[0] == 3) &&
++          !atomic_read(&kgdb_setting_breakpoint)) {
++              tasklet_schedule(&kgdb_tasklet_breakpoint);
++      }
++
++      for (i = 0; i < len; i++) {
++              if (msg[i] == 3)
++                      tasklet_schedule(&kgdb_tasklet_breakpoint);
++
++              if (atomic_read(&in_count) >= IN_BUF_SIZE) {
++                      /* buffer overflow, clear it */
++                      in_head = in_tail = 0;
++                      atomic_set(&in_count, 0);
++                      break;
++              }
++              in_buf[in_head++] = msg[i];
++              in_head &= (IN_BUF_SIZE - 1);
++              atomic_inc(&in_count);
++      }
++}
++
++static struct netpoll np = {
++      .dev_name = "eth0",
++      .name = "kgdboe",
++      .rx_hook = rx_hook,
++      .local_port = 6443,
++      .remote_port = 6442,
++      .remote_mac = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
++};
++
++static void eth_pre_exception_handler(void)
++{
++      /* Increment the module count when the debugger is active */
++      if (!kgdb_connected)
++              try_module_get(THIS_MODULE);
++      netpoll_set_trap(1);
++}
++
++static void eth_post_exception_handler(void)
++{
++      /* decrement the module count when the debugger detaches */
++      if (!kgdb_connected)
++              module_put(THIS_MODULE);
++      netpoll_set_trap(0);
++}
++
++static int eth_get_char(void)
++{
++      int chr;
++
++      while (atomic_read(&in_count) == 0)
++              netpoll_poll(&np);
++
++      chr = in_buf[in_tail++];
++      in_tail &= (IN_BUF_SIZE - 1);
++      atomic_dec(&in_count);
++      return chr;
++}
++
++static void eth_flush_buf(void)
++{
++      if (out_count && np.dev) {
++              netpoll_send_udp(&np, out_buf, out_count);
++              memset(out_buf, 0, sizeof(out_buf));
++              out_count = 0;
++      }
++}
++
++static void eth_put_char(u8 chr)
++{
++      out_buf[out_count++] = chr;
++      if (out_count == OUT_BUF_SIZE)
++              eth_flush_buf();
++}
++
++static int option_setup(char *opt)
++{
++      char opt_scratch[MAX_KGDBOE_CONFIG_STR];
++
++      /* New configuration: copy it in, truncating to fit config[]. */
++      if (opt != config)
++              strlcpy(config, opt, sizeof(config));
++      /* But work on a copy as netpoll_parse_options will eat it. */
++      strlcpy(opt_scratch, opt, sizeof(opt_scratch));
++      configured = !netpoll_parse_options(&np, opt_scratch);
++
++      use_dynamic_mac = 1;
++
++      return 0;
++}
++__setup("kgdboe=", option_setup);
++
++/* With our config string set by some means, configure kgdboe. */
++static int configure_kgdboe(void)
++{
++      /* Try out the string. */
++      option_setup(config);
++
++      if (!configured) {
++              printk(KERN_ERR "kgdboe: configuration incorrect - kgdboe not "
++                     "loaded.\n");
++              printk(KERN_ERR "  Usage: kgdboe=[src-port]@[src-ip]/[dev],"
++                              "[tgt-port]@<tgt-ip>/<tgt-macaddr>\n");
++              return -EINVAL;
++      }
++
++      /* Bring it up. */
++      if (netpoll_setup(&np)) {
++              printk(KERN_ERR "kgdboe: netpoll_setup failed kgdboe failed\n");
++              return -EINVAL;
++      }
++
++      if (kgdb_register_io_module(&local_kgdb_io_ops)) {
++              netpoll_cleanup(&np);
++              return -EINVAL;
++      }
++
++      configured = 2;
++
++      return 0;
++}
++
++static int init_kgdboe(void)
++{
++      int ret;
++
++      /* Already done? */
++      if (configured == 2)
++              return 0;
++
++      /* OK, go ahead and do it. */
++      ret = configure_kgdboe();
++
++      if (configured == 2)
++              printk(KERN_INFO "kgdboe: debugging over ethernet enabled\n");
++
++      return ret;
++}
++
++static void cleanup_kgdboe(void)
++{
++      netpoll_cleanup(&np);
++      configured = 0;
++      kgdb_unregister_io_module(&local_kgdb_io_ops);
++}
++
++static int param_set_kgdboe_var(const char *kmessage, struct kernel_param *kp)
++{
++      char kmessage_save[MAX_KGDBOE_CONFIG_STR];
++      int msg_len = strlen(kmessage);
++
++      if (msg_len + 1 > MAX_KGDBOE_CONFIG_STR) {
++              printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
++                     kp->name, MAX_KGDBOE_CONFIG_STR - 1);
++              return -ENOSPC;
++      }
++
++      if (kgdb_connected) {
++              printk(KERN_ERR "kgdboe: Cannot reconfigure while KGDB is "
++                              "connected.\n");
++              return 0;
++      }
++
++      /* Start the reconfiguration process by saving the old string */
++      strncpy(kmessage_save, config, sizeof(kmessage_save));
++
++
++      /* Copy in the new param and strip out invalid characters so we
++       * can optionally specify the MAC.
++       */
++      strncpy(config, kmessage, sizeof(config));
++      msg_len--;
++      while (msg_len > 0 &&
++                      (config[msg_len] < ',' || config[msg_len] > 'f')) {
++              config[msg_len] = '\0';
++              msg_len--;
++      }
++
++      /* Check to see if we are unconfiguring the io module and that it
++       * was in a fully configured state, as this is the only time that
++       * netpoll_cleanup should get called
++       */
++      if (configured == 2 && strcmp(config, NOT_CONFIGURED_STRING) == 0) {
++              printk(KERN_INFO "kgdboe: reverting to unconfigured state\n");
++              cleanup_kgdboe();
++              return 0;
++      } else
++              /* Go and configure with the new params. */
++              configure_kgdboe();
++
++      if (configured == 2)
++              return 0;
++
++      /* If the new string was invalid, revert to the previous state, which
++       * is at a minimum not_configured. */
++      strncpy(config, kmessage_save, sizeof(config));
++      if (strcmp(kmessage_save, NOT_CONFIGURED_STRING) != 0) {
++              printk(KERN_INFO "kgdboe: reverting to prior configuration\n");
++              /* revert back to the original config */
++              strncpy(config, kmessage_save, sizeof(config));
++              configure_kgdboe();
++      }
++      return 0;
++}
++
++static struct kgdb_io local_kgdb_io_ops = {
++      .read_char = eth_get_char,
++      .write_char = eth_put_char,
++      .init = init_kgdboe,
++      .flush = eth_flush_buf,
++      .pre_exception = eth_pre_exception_handler,
++      .post_exception = eth_post_exception_handler
++};
++
++module_init(init_kgdboe);
++module_exit(cleanup_kgdboe);
++module_param_call(kgdboe, param_set_kgdboe_var, param_get_string, &kps, 0644);
++MODULE_PARM_DESC(kgdboe, " kgdboe=[src-port]@[src-ip]/[dev],"
++               "[tgt-port]@<tgt-ip>/<tgt-macaddr>\n");
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/8250.c linux-2.6.18.kgdb/drivers/serial/8250.c
+--- linux-2.6.18/drivers/serial/8250.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/serial/8250.c    2008-06-10 16:19:03.000000000 +0400
+@@ -2628,6 +2628,25 @@ void serial8250_unregister_port(int line
+ }
+ EXPORT_SYMBOL(serial8250_unregister_port);
+ 
++/**
++ *    serial8250_unregister_by_port - remove a 16x50 serial port
++ *    at runtime.
++ *    @port: A &struct uart_port that describes the port to remove.
++ *
++ *    Remove one serial port.  This may not be called from interrupt
++ *    context.  We hand the port back to our control.
++ */
++void serial8250_unregister_by_port(struct uart_port *port)
++{
++      struct uart_8250_port *uart;
++
++      uart = serial8250_find_match_or_unused(port);
++
++      if (uart)
++              serial8250_unregister_port(uart->port.line);
++}
++EXPORT_SYMBOL(serial8250_unregister_by_port);
++
+ static int __init serial8250_init(void)
+ {
+       int ret, i;
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/8250_kgdb.c linux-2.6.18.kgdb/drivers/serial/8250_kgdb.c
+--- linux-2.6.18/drivers/serial/8250_kgdb.c    1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/drivers/serial/8250_kgdb.c       2008-06-10 16:19:03.000000000 +0400
+@@ -0,0 +1,516 @@
++/*
++ * 8250 interface for kgdb.
++ *
++ * This is a merging of many different drivers, and all of the people have
++ * had an impact in some form or another:
++ *
++ * 2004-2005 (c) MontaVista Software, Inc.
++ * 2005-2006 (c) Wind River Systems, Inc.
++ *
++ * Amit Kale <amitkale@emsyssoft.com>, David Grothe <dave@gcom.com>,
++ * Scott Foehner <sfoehner@engr.sgi.com>, George Anzinger <george@mvista.com>,
++ * Robert Walsh <rjwalsh@durables.org>, wangdi <wangdi@clusterfs.com>,
++ * San Mehat, Tom Rini <trini@mvista.com>,
++ * Jason Wessel <jason.wessel@windriver.com>
++ */
++
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/kgdb.h>
++#include <linux/interrupt.h>
++#include <linux/tty.h>
++#include <linux/serial.h>
++#include <linux/serial_reg.h>
++#include <linux/serialP.h>
++#include <linux/ioport.h>
++
++#include <asm/io.h>
++#include <asm/serial.h>               /* For BASE_BAUD and SERIAL_PORT_DFNS */
++
++#include "8250.h"
++
++#define GDB_BUF_SIZE  512     /* power of 2, please */
++
++MODULE_DESCRIPTION("KGDB driver for the 8250");
++MODULE_LICENSE("GPL");
++/* These will conflict with early_param otherwise. */
++#ifdef CONFIG_KGDB_8250_MODULE
++static char config[256];
++module_param_string(kgdb8250, config, 256, 0);
++MODULE_PARM_DESC(kgdb8250,
++               " kgdb8250=<io or mmio>,<address>,<baud rate>,<irq>\n");
++static struct kgdb_io local_kgdb_io_ops;
++#endif                                /* CONFIG_KGDB_8250_MODULE */
++
++/* Speed of the UART. */
++static int kgdb8250_baud;
++
++/* Flag for if we need to call request_mem_region */
++static int kgdb8250_needs_request_mem_region;
++
++static char kgdb8250_buf[GDB_BUF_SIZE];
++static atomic_t kgdb8250_buf_in_cnt;
++static int kgdb8250_buf_out_inx;
++
++/* Old-style serial definitions, if existent, and a counter. */
++#ifdef CONFIG_KGDB_SIMPLE_SERIAL
++static int __initdata should_copy_rs_table = 1;
++static struct serial_state old_rs_table[] __initdata = {
++#ifdef SERIAL_PORT_DFNS
++      SERIAL_PORT_DFNS
++#endif
++};
++#endif
++
++/* Our internal table of UARTS. */
++#define UART_NR       CONFIG_SERIAL_8250_NR_UARTS
++static struct uart_port kgdb8250_ports[UART_NR];
++
++static struct uart_port *current_port;
++
++/* Base of the UART. */
++static void *kgdb8250_addr;
++
++/* Forward declarations. */
++static int kgdb8250_uart_init(void);
++static int __init kgdb_init_io(void);
++static int __init kgdb8250_opt(char *str);
++
++/* These are much shorter calls to ioread8/iowrite8 that take into
++ * account our shifts, etc. */
++static inline unsigned int kgdb_ioread(u8 mask)
++{
++      return ioread8(kgdb8250_addr + (mask << current_port->regshift));
++}
++
++static inline void kgdb_iowrite(u8 val, u8 mask)
++{
++      iowrite8(val, kgdb8250_addr + (mask << current_port->regshift));
++}
++
++/*
++ * Wait until the interface can accept a char, then write it.
++ */
++static void kgdb_put_debug_char(u8 chr)
++{
++      while (!(kgdb_ioread(UART_LSR) & UART_LSR_THRE)) ;
++
++      kgdb_iowrite(chr, UART_TX);
++}
++
++/*
++ * Get a byte from the hardware data buffer and return it
++ */
++static int read_data_bfr(void)
++{
++      char it = kgdb_ioread(UART_LSR);
++
++      if (it & UART_LSR_DR)
++              return kgdb_ioread(UART_RX);
++
++      /*
++       * If we have a framing or parity error assume somebody messed with
++       * our uart.  Reprogram it and send '-' both ways...
++       */
++      if (it & 0xc) {
++              kgdb8250_uart_init();
++              kgdb_put_debug_char('-');
++              return '-';
++      }
++
++      return -1;
++}
++
++/*
++ * Get a char, busy-waiting on the hardware until one is available.
++ * Empty the receive buffer first, then look at the interface hardware.
++ */
++static int kgdb_get_debug_char(void)
++{
++      int retchr;
++
++      /* intr routine has q'd chars */
++      if (atomic_read(&kgdb8250_buf_in_cnt) != 0) {
++              retchr = kgdb8250_buf[kgdb8250_buf_out_inx++];
++              kgdb8250_buf_out_inx &= (GDB_BUF_SIZE - 1);
++              atomic_dec(&kgdb8250_buf_in_cnt);
++              return retchr;
++      }
++
++      do {
++              retchr = read_data_bfr();
++      } while (retchr < 0);
++
++      return retchr;
++}
++
++/*
++ * This is the receiver interrupt routine for the GDB stub.
++ * All that we need to do is verify that the interrupt happened on the
++ * line we're in charge of.  If this is true, schedule a breakpoint and
++ * return.
++ */
++static irqreturn_t
++kgdb8250_interrupt(int irq, void *dev_id, struct pt_regs *regs)
++{
++      if (kgdb_ioread(UART_IIR) & UART_IIR_RDI) {
++              /* Throw away the data if another I/O routine is active. */
++              if (kgdb_io_ops.read_char != kgdb_get_debug_char &&
++                              (kgdb_ioread(UART_LSR) & UART_LSR_DR))
++                      kgdb_ioread(UART_RX);
++              else
++                      breakpoint();
++      }
++
++      return IRQ_HANDLED;
++}
++
++/*
++ *  Initializes the UART.
++ *  Returns:
++ *    0 on success, -1 on failure.
++ */
++static int
++kgdb8250_uart_init (void)
++{
++      unsigned int ier, base_baud = current_port->uartclk ?
++              current_port->uartclk / 16 : BASE_BAUD;
++
++      /* test uart existence */
++      if(kgdb_ioread(UART_LSR) == 0xff)
++              return -1;
++
++      /* disable interrupts */
++      kgdb_iowrite(0, UART_IER);
++
++#if defined(CONFIG_ARCH_OMAP1510)
++      /* Workaround to enable 115200 baud on OMAP1510 internal ports */
++      if (cpu_is_omap1510() && is_omap_port((void *)kgdb8250_addr)) {
++              if (kgdb8250_baud == 115200) {
++                      base_baud = 1;
++                      kgdb8250_baud = 1;
++                      kgdb_iowrite(1, UART_OMAP_OSC_12M_SEL);
++              } else
++                      kgdb_iowrite(0, UART_OMAP_OSC_12M_SEL);
++      }
++#endif
++      /* set DLAB */
++      kgdb_iowrite(UART_LCR_DLAB, UART_LCR);
++
++      /* set baud */
++      kgdb_iowrite((base_baud / kgdb8250_baud) & 0xff, UART_DLL);
++      kgdb_iowrite((base_baud / kgdb8250_baud) >> 8, UART_DLM);
++
++      /* reset DLAB, set LCR */
++      kgdb_iowrite(UART_LCR_WLEN8, UART_LCR);
++
++      /* set DTR and RTS */
++      kgdb_iowrite(UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS, UART_MCR);
++
++      /* setup fifo */
++      kgdb_iowrite(UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR
++              | UART_FCR_CLEAR_XMIT | UART_FCR_TRIGGER_8,
++              UART_FCR);
++
++      /* clear pending interrupts */
++      kgdb_ioread(UART_IIR);
++      kgdb_ioread(UART_RX);
++      kgdb_ioread(UART_LSR);
++      kgdb_ioread(UART_MSR);
++
++      /* turn on RX interrupt only */
++      kgdb_iowrite(UART_IER_RDI, UART_IER);
++
++      /*
++       * Borrowed from the main 8250 driver.
++       * Try writing and reading the UART_IER_UUE bit (b6).
++       * If it works, this is probably one of the Xscale platform's
++       * internal UARTs.
++       * We're going to explicitly set the UUE bit to 0 before
++       * trying to write and read a 1 just to make sure it's not
++       * already a 1 and maybe locked there before we even start.
++       */
++      ier = kgdb_ioread(UART_IER);
++      kgdb_iowrite(ier & ~UART_IER_UUE, UART_IER);
++      if (!(kgdb_ioread(UART_IER) & UART_IER_UUE)) {
++              /*
++               * OK it's in a known zero state, try writing and reading
++               * without disturbing the current state of the other bits.
++               */
++              kgdb_iowrite(ier | UART_IER_UUE, UART_IER);
++              if (kgdb_ioread(UART_IER) & UART_IER_UUE)
++                      /*
++                       * It's an Xscale.
++                       */
++                      ier |= UART_IER_UUE | UART_IER_RTOIE;
++      }
++      kgdb_iowrite(ier, UART_IER);
++      return 0;
++}
++
++/*
++ * Copy the old serial_state table to our uart_port table if we haven't
++ * had values specifically configured in.  We need to make sure this only
++ * happens once.
++ */
++static void __init kgdb8250_copy_rs_table(void)
++{
++#ifdef CONFIG_KGDB_SIMPLE_SERIAL
++      int i;
++
++      if (!should_copy_rs_table)
++              return;
++
++      for (i = 0; i < ARRAY_SIZE(old_rs_table); i++) {
++              kgdb8250_ports[i].iobase = old_rs_table[i].port;
++              kgdb8250_ports[i].irq = irq_canonicalize(old_rs_table[i].irq);
++              kgdb8250_ports[i].uartclk = old_rs_table[i].baud_base * 16;
++              kgdb8250_ports[i].membase = old_rs_table[i].iomem_base;
++              kgdb8250_ports[i].iotype = old_rs_table[i].io_type;
++              kgdb8250_ports[i].regshift = old_rs_table[i].iomem_reg_shift;
++              kgdb8250_ports[i].line = i;
++      }
++
++      should_copy_rs_table = 0;
++#endif
++}
++
++/*
++ * Hookup our IRQ line now that it is safe to do so, after we grab any
++ * memory regions we might need to.  If we haven't been initialized yet,
++ * go ahead and copy the old_rs_table in.
++ */
++static void __init kgdb8250_late_init(void)
++{
++      /* Try and copy the old_rs_table. */
++      kgdb8250_copy_rs_table();
++
++#if defined(CONFIG_SERIAL_8250) || defined(CONFIG_SERIAL_8250_MODULE)
++      /* Take the port away from the main driver. */
++      serial8250_unregister_by_port(current_port);
++
++      /* Now reinit the port as the above has disabled things. */
++      kgdb8250_uart_init();
++#endif
++      /* We may need to call request_mem_region() first. */
++      if (kgdb8250_needs_request_mem_region)
++              request_mem_region(current_port->mapbase,
++                                 8 << current_port->regshift, "kgdb");
++      if (request_irq(current_port->irq, kgdb8250_interrupt, SA_SHIRQ,
++                      "GDB-stub", current_port) < 0)
++              printk(KERN_ERR "KGDB failed to request the serial IRQ (%d)\n",
++                     current_port->irq);
++}
++
++static __init int kgdb_init_io(void)
++{
++      /* Give us the basic table of uarts. */
++      kgdb8250_copy_rs_table();
++
++      /* We're either a module and parse a config string, or we have a
++       * semi-static config. */
++#ifdef CONFIG_KGDB_8250_MODULE
++      if (strlen(config)) {
++              if (kgdb8250_opt(config))
++                      return -EINVAL;
++      } else {
++              printk(KERN_ERR "kgdb8250: argument error, usage: "
++                     "kgdb8250=<io or mmio>,<address>,<baud rate>,<irq>\n");
++              return -EINVAL;
++      }
++#elif defined(CONFIG_KGDB_SIMPLE_SERIAL)
++      kgdb8250_baud = CONFIG_KGDB_BAUDRATE;
++
++      /* Setup our pointer to the serial port now. */
++      current_port = &kgdb8250_ports[CONFIG_KGDB_PORT_NUM];
++#else
++      if (kgdb8250_opt(CONFIG_KGDB_8250_CONF_STRING))
++              return -EINVAL;
++#endif
++
++
++      /* Internal driver setup. */
++      switch (current_port->iotype) {
++      case UPIO_MEM:
++              if (current_port->mapbase)
++                      kgdb8250_needs_request_mem_region = 1;
++              if (current_port->flags & UPF_IOREMAP) {
++                      current_port->membase = ioremap(current_port->mapbase,
++                                              8 << current_port->regshift);
++                      if (!current_port->membase)
++                              return -EIO;    /* Failed. */
++              }
++              kgdb8250_addr = current_port->membase;
++              break;
++      case UPIO_PORT:
++      default:
++              kgdb8250_addr = ioport_map(current_port->iobase,
++                                         8 << current_port->regshift);
++              if (!kgdb8250_addr)
++                      return -EIO;    /* Failed. */
++      }
++
++      if (kgdb8250_uart_init() == -1) {
++              printk(KERN_ERR "kgdb8250: init failed\n");
++              return -EIO;
++      }
++#ifdef CONFIG_KGDB_8250_MODULE
++      /* Attach the kgdb irq. When this is built into the kernel, it
++       * is called as a part of late_init sequence.
++       */
++      kgdb8250_late_init();
++      if (kgdb_register_io_module(&local_kgdb_io_ops))
++              return -EINVAL;
++
++      printk(KERN_INFO "kgdb8250: debugging enabled\n");
++#endif                                /* CONFIG_KGDB_8250_MODULE */
++
++      return 0;
++}
++
++#ifdef CONFIG_KGDB_8250_MODULE
++/* If it is a module the kgdb_io_ops should be a static which
++ * is passed to the KGDB I/O initialization
++ */
++static struct kgdb_io local_kgdb_io_ops = {
++#else                         /* ! CONFIG_KGDB_8250_MODULE */
++struct kgdb_io kgdb_io_ops = {
++#endif                                /* ! CONFIG_KGDB_8250_MODULE */
++      .read_char = kgdb_get_debug_char,
++      .write_char = kgdb_put_debug_char,
++      .init = kgdb_init_io,
++      .late_init = kgdb8250_late_init,
++};
++
++/**
++ *    kgdb8250_add_port - Define a serial port for use with KGDB
++ *    @i: The index of the port being added
++ *    @serial_req: The &struct uart_port describing the port
++ *
++ *    On platforms where we must register the serial device
++ *    dynamically, this is the best option if a platform also normally
++ *    calls early_serial_setup().
++ */
++void __init kgdb8250_add_port(int i, struct uart_port *serial_req)
++{
++      /* Make sure we've got the built-in data before we override. */
++      kgdb8250_copy_rs_table();
++
++      /* Copy the whole thing over. */
++      if (current_port != &kgdb8250_ports[i])
++                memcpy(&kgdb8250_ports[i], serial_req, sizeof(struct uart_port));
++}
++
++/**
++ *    kgdb8250_add_platform_port - Define a serial port for use with KGDB
++ *    @i: The index of the port being added
++ *    @p: The &struct plat_serial8250_port describing the port
++ *
++ *    On platforms where we must register the serial device
++ *    dynamically, this is the best option if a platform normally
++ *    handles uart setup with an array of &struct plat_serial8250_port.
++ */
++void __init kgdb8250_add_platform_port(int i, struct plat_serial8250_port *p)
++{
++      /* Make sure we've got the built-in data before we override. */
++      kgdb8250_copy_rs_table();
++
++      kgdb8250_ports[i].iobase = p->iobase;
++      kgdb8250_ports[i].membase = p->membase;
++      kgdb8250_ports[i].irq = p->irq;
++      kgdb8250_ports[i].uartclk = p->uartclk;
++      kgdb8250_ports[i].regshift = p->regshift;
++      kgdb8250_ports[i].iotype = p->iotype;
++      kgdb8250_ports[i].flags = p->flags;
++      kgdb8250_ports[i].mapbase = p->mapbase;
++}
++
++/*
++ * Syntax for this cmdline option is:
++ * kgdb8250=<io, mmap or mmio>,<address>,<baud rate>,<irq>
++ */
++static int __init kgdb8250_opt(char *str)
++{
++      /* We'll fill out and use the first slot. */
++      current_port = &kgdb8250_ports[0];
++
++      if (!strncmp(str, "io", 2)) {
++              current_port->iotype = UPIO_PORT;
++              str += 2;
++      } else if (!strncmp(str, "mmap", 4)) {
++              current_port->iotype = UPIO_MEM;
++              current_port->flags |= UPF_IOREMAP;
++              str += 4;
++      } else if (!strncmp(str, "mmio", 4)) {
++              current_port->iotype = UPIO_MEM;
++              current_port->flags &= ~UPF_IOREMAP;
++              str += 4;
++      } else
++              goto errout;
++
++      if (*str != ',')
++              goto errout;
++      str++;
++
++      if (current_port->iotype == UPIO_PORT)
++              current_port->iobase = simple_strtoul(str, &str, 16);
++      else {
++              if (current_port->flags & UPF_IOREMAP)
++                      current_port->mapbase =
++                              (unsigned long) simple_strtoul(str, &str, 16);
++              else
++                      current_port->membase =
++                              (void *) simple_strtoul(str, &str, 16);
++      }
++
++      if (*str != ',')
++              goto errout;
++      str++;
++
++      kgdb8250_baud = simple_strtoul(str, &str, 10);
++      if (!kgdb8250_baud)
++              goto errout;
++
++      if (*str != ',')
++              goto errout;
++      str++;
++
++      current_port->irq = simple_strtoul(str, &str, 10);
++
++#ifdef CONFIG_KGDB_SIMPLE_SERIAL
++      should_copy_rs_table = 0;
++#endif
++
++      return 0;
++
++      errout:
++      printk(KERN_ERR "Invalid syntax for option kgdb8250=\n");
++      return 1;
++}
++
++#ifdef CONFIG_KGDB_8250_MODULE
++static void cleanup_kgdb8250(void)
++{
++      kgdb_unregister_io_module(&local_kgdb_io_ops);
++
++      /* Clean up the irq and memory */
++      free_irq(current_port->irq, current_port);
++
++      if (kgdb8250_needs_request_mem_region)
++              release_mem_region(current_port->mapbase,
++                                 8 << current_port->regshift);
++      /* Hook up the serial port back to what it was previously
++       * hooked up to.
++       */
++#if defined(CONFIG_SERIAL_8250) || defined(CONFIG_SERIAL_8250_MODULE)
++      /* Give the port back to the 8250 driver. */
++      serial8250_register_port(current_port);
++#endif
++}
++
++module_init(kgdb_init_io);
++module_exit(cleanup_kgdb8250);
++#else                         /* ! CONFIG_KGDB_8250_MODULE */
++early_param("kgdb8250", kgdb8250_opt);
++#endif                                /* ! CONFIG_KGDB_8250_MODULE */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/Kconfig linux-2.6.18.kgdb/drivers/serial/Kconfig
+--- linux-2.6.18/drivers/serial/Kconfig        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/serial/Kconfig   2008-06-10 16:19:03.000000000 +0400
+@@ -106,7 +106,7 @@ config SERIAL_8250_CS
+ 
+ config SERIAL_8250_NR_UARTS
+       int "Maximum number of 8250/16550 serial ports"
+-      depends on SERIAL_8250
++      depends on SERIAL_8250 || KGDB_8250
+       default "4"
+       help
+         Set this to the number of serial ports you want the driver
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/Makefile linux-2.6.18.kgdb/drivers/serial/Makefile
+--- linux-2.6.18/drivers/serial/Makefile       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/serial/Makefile  2008-06-10 16:19:22.000000000 +0400
+@@ -47,6 +47,7 @@ obj-$(CONFIG_SERIAL_IMX) += imx.o
+ obj-$(CONFIG_SERIAL_MPC52xx) += mpc52xx_uart.o
+ obj-$(CONFIG_SERIAL_ICOM) += icom.o
+ obj-$(CONFIG_SERIAL_M32R_SIO) += m32r_sio.o
++obj-$(CONFIG_KGDB_MPSC) += mpsc_kgdb.o
+ obj-$(CONFIG_SERIAL_MPSC) += mpsc.o
+ obj-$(CONFIG_ETRAX_SERIAL) += crisv10.o
+ obj-$(CONFIG_SERIAL_JSM) += jsm/
+@@ -56,3 +57,4 @@ obj-$(CONFIG_SERIAL_SGI_IOC4) += ioc4_se
+ obj-$(CONFIG_SERIAL_SGI_IOC3) += ioc3_serial.o
+ obj-$(CONFIG_SERIAL_AT91) += at91_serial.o
+ obj-$(CONFIG_SERIAL_NETX) += netx-serial.o
++obj-$(CONFIG_KGDB_8250) += 8250_kgdb.o
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/amba-pl011.c linux-2.6.18.kgdb/drivers/serial/amba-pl011.c
+--- linux-2.6.18/drivers/serial/amba-pl011.c   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/serial/amba-pl011.c      2008-06-10 16:19:51.000000000 +0400
+@@ -340,7 +340,7 @@ static int pl011_startup(struct uart_por
+       /*
+        * Allocate the IRQ
+        */
+-      retval = request_irq(uap->port.irq, pl011_int, 0, "uart-pl011", uap);
++      retval = request_irq(uap->port.irq, pl011_int, SA_SHIRQ, "uart-pl011", uap);
+       if (retval)
+               goto clk_dis;
+ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/cpm_uart/Makefile linux-2.6.18.kgdb/drivers/serial/cpm_uart/Makefile
+--- linux-2.6.18/drivers/serial/cpm_uart/Makefile      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/serial/cpm_uart/Makefile 2008-06-10 16:19:22.000000000 +0400
+@@ -7,5 +7,6 @@ obj-$(CONFIG_SERIAL_CPM) += cpm_uart.o
+ # Select the correct platform objects.
+ cpm_uart-objs-$(CONFIG_CPM2)  += cpm_uart_cpm2.o
+ cpm_uart-objs-$(CONFIG_8xx)   += cpm_uart_cpm1.o
++cpm_uart-objs-$(CONFIG_KGDB_CPM_UART) += cpm_uart_kgdb.o
+ 
+ cpm_uart-objs := cpm_uart_core.o $(cpm_uart-objs-y)
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/cpm_uart/cpm_uart.h linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart.h
+--- linux-2.6.18/drivers/serial/cpm_uart/cpm_uart.h    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart.h       2008-06-10 16:19:22.000000000 +0400
+@@ -51,6 +51,39 @@
+ 
+ #define SCC_WAIT_CLOSING 100
+ 
++#ifdef CONFIG_KGDB_CPM_UART
++
++/* Speed of the debug UART. */
++#if defined(CONFIG_KGDB_9600BAUD)
++#define KGDB_BAUD B9600
++#elif defined(CONFIG_KGDB_19200BAUD)
++#define KGDB_BAUD B19200
++#elif defined(CONFIG_KGDB_38400BAUD)
++#define KGDB_BAUD B38400
++#elif defined(CONFIG_KGDB_57600BAUD)
++#define KGDB_BAUD B57600
++#else
++#define KGDB_BAUD B115200     /* Start with this if not given */
++#endif
++
++#ifdef CONFIG_KGDB_CPM_UART_SCC1
++#define KGDB_PINFO_INDEX      UART_SCC1
++#elif CONFIG_KGDB_CPM_UART_SCC2
++#define KGDB_PINFO_INDEX      UART_SCC2
++#elif CONFIG_KGDB_CPM_UART_SCC3
++#define KGDB_PINFO_INDEX      UART_SCC3
++#elif CONFIG_KGDB_CPM_UART_SCC4
++#define KGDB_PINFO_INDEX      UART_SCC4
++#elif CONFIG_KGDB_CPM_UART_SMC1
++#define KGDB_PINFO_INDEX      UART_SMC1
++#elif CONFIG_KGDB_CPM_UART_SMC2
++#define KGDB_PINFO_INDEX      UART_SMC2
++#else
++#error The S(M)CC for kgdb console is undefined
++#endif
++
++#endif /* CONFIG_KGDB_CPM_UART */
++
+ struct uart_cpm_port {
+       struct uart_port        port;
+       u16                     rx_nrfifos;
+@@ -87,6 +120,9 @@ extern int cpm_uart_port_map[UART_NR];
+ extern int cpm_uart_nr;
+ extern struct uart_cpm_port cpm_uart_ports[UART_NR];
+ 
++void cpm_uart_early_write(int index, const char *s, u_int count);
++int cpm_uart_early_setup(int index,int early);
++
+ /* these are located in their respective files */
+ void cpm_line_cr_cmd(int line, int cmd);
+ int cpm_uart_init_portdesc(void);
+@@ -133,5 +169,4 @@ static inline void *cpm2cpu_addr(unsigne
+       return 0;
+ }
+ 
+-
+ #endif /* CPM_UART_H */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/cpm_uart/cpm_uart_core.c linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart_core.c
+--- linux-2.6.18/drivers/serial/cpm_uart/cpm_uart_core.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart_core.c  2008-06-10 16:19:22.000000000 +0400
+@@ -1070,22 +1070,17 @@ int cpm_uart_drv_get_platform_data(struc
+       return 0;
+ }
+ 
+-#ifdef CONFIG_SERIAL_CPM_CONSOLE
+-/*
+- *    Print a string to the serial port trying not to disturb
+- *    any possible real use of the port...
+- *
+- *    Note that this is called with interrupts already disabled
+- */
+-static void cpm_uart_console_write(struct console *co, const char *s,
++void cpm_uart_early_write(int index, const char *s,
+                                  u_int count)
+ {
+-      struct uart_cpm_port *pinfo =
+-          &cpm_uart_ports[cpm_uart_port_map[co->index]];
++      struct uart_cpm_port *pinfo;
+       unsigned int i;
+       volatile cbd_t *bdp, *bdbase;
+       volatile unsigned char *cp;
+ 
++      BUG_ON(index>UART_NR);
++      pinfo = &cpm_uart_ports[index];
++
+       /* Get the address of the host memory buffer.
+        */
+       bdp = pinfo->tx_cur;
+@@ -1149,16 +1144,11 @@ static void cpm_uart_console_write(struc
+       pinfo->tx_cur = (volatile cbd_t *) bdp;
+ }
+ 
+-
+-static int __init cpm_uart_console_setup(struct console *co, char *options)
++int cpm_uart_early_setup(int index, int early)
+ {
++      int ret;
+       struct uart_port *port;
+       struct uart_cpm_port *pinfo;
+-      int baud = 38400;
+-      int bits = 8;
+-      int parity = 'n';
+-      int flow = 'n';
+-      int ret;
+ 
+       struct fs_uart_platform_info *pdata;
+       struct platform_device* pdev = early_uart_get_pdev(co->index);
+@@ -1169,8 +1159,9 @@ static int __init cpm_uart_console_setup
+               cpm_uart_init_portdesc();
+       }
+ 
++      BUG_ON(index>UART_NR);
+       port =
+-          (struct uart_port *)&cpm_uart_ports[cpm_uart_port_map[co->index]];
++              (struct uart_port *)&cpm_uart_ports[index];
+       pinfo = (struct uart_cpm_port *)port;
+       if (!pdev) {
+               if (pinfo->set_lineif)
+@@ -1184,19 +1175,6 @@ static int __init cpm_uart_console_setup
+               cpm_uart_drv_get_platform_data(pdev, 1);
+       }
+ 
+-      pinfo->flags |= FLAG_CONSOLE;
+-
+-      if (options) {
+-              uart_parse_options(options, &baud, &parity, &bits, &flow);
+-      } else {
+-              bd_t *bd = (bd_t *) __res;
+-
+-              if (bd->bi_baudrate)
+-                      baud = bd->bi_baudrate;
+-              else
+-                      baud = 9600;
+-      }
+-
+       if (IS_SMC(pinfo)) {
+               pinfo->smcp->smc_smcm &= ~(SMCM_RX | SMCM_TX);
+               pinfo->smcp->smc_smcmr &= ~(SMCMR_REN | SMCMR_TEN);
+@@ -1204,8 +1182,7 @@ static int __init cpm_uart_console_setup
+               pinfo->sccp->scc_sccm &= ~(UART_SCCM_TX | UART_SCCM_RX);
+               pinfo->sccp->scc_gsmrl &= ~(SCC_GSMRL_ENR | SCC_GSMRL_ENT);
+       }
+-
+-      ret = cpm_uart_allocbuf(pinfo, 1);
++      ret = cpm_uart_allocbuf(pinfo, early);
+ 
+       if (ret)
+               return ret;
+@@ -1217,6 +1194,56 @@ static int __init cpm_uart_console_setup
+       else
+               cpm_uart_init_scc(pinfo);
+ 
++      return 0;
++}
++
++#ifdef CONFIG_SERIAL_CPM_CONSOLE
++/*
++ *    Print a string to the serial port trying not to disturb
++ *    any possible real use of the port...
++ *
++ *    Note that this is called with interrupts already disabled
++ */
++
++static void cpm_uart_console_write(struct console *co, const char *s,
++                                 u_int count)
++{
++      cpm_uart_early_write(cpm_uart_port_map[co->index],s,count);
++}
++
++/*
++ * Set up the console.  Be careful: this is called early!
++ */
++static int __init cpm_uart_console_setup(struct console *co, char *options)
++{
++      struct uart_port *port;
++      struct uart_cpm_port *pinfo;
++      int baud = 115200;
++      int bits = 8;
++      int parity = 'n';
++      int flow = 'n';
++      int ret;
++
++      port =
++          (struct uart_port *)&cpm_uart_ports[cpm_uart_port_map[co->index]];
++      pinfo = (struct uart_cpm_port *)port;
++
++      pinfo->flags |= FLAG_CONSOLE;
++
++      if (options) {
++              uart_parse_options(options, &baud, &parity, &bits, &flow);
++      } else {
++              bd_t *bd = (bd_t *) __res;
++
++              if (bd->bi_baudrate)
++                      baud = bd->bi_baudrate;
++              else
++                      baud = 9600;
++      }
++
++      ret = cpm_uart_early_setup(cpm_uart_port_map[co->index], 1);
++      if(ret)
++              return ret;
+       uart_set_options(port, co, baud, parity, bits, flow);
+ 
+       return 0;
+@@ -1364,6 +1391,12 @@ static int cpm_uart_init(void) {
+ 
+               for (i = 0; i < cpm_uart_nr; i++) {
+                       int con = cpm_uart_port_map[i];
++
++#ifdef CONFIG_KGDB_CPM_UART
++              /* We are not interested in ports already used by kgdb */
++              if(con == KGDB_PINFO_INDEX)
++                      continue;
++#endif
+                       cpm_uart_ports[con].port.line = i;
+                       cpm_uart_ports[con].port.flags = UPF_BOOT_AUTOCONF;
+                       uart_add_one_port(&cpm_reg, &cpm_uart_ports[con].port);
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/cpm_uart/cpm_uart_cpm1.c linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart_cpm1.c
+--- linux-2.6.18/drivers/serial/cpm_uart/cpm_uart_cpm1.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart_cpm1.c  2008-06-10 16:19:22.000000000 +0400
+@@ -52,6 +52,7 @@ void cpm_line_cr_cmd(int line, int cmd)
+ {
+       ushort val;
+       volatile cpm8xx_t *cp = cpmp;
++      unsigned *bcsr_io;
+ 
+       switch (line) {
+       case UART_SMC1:
+@@ -94,12 +95,35 @@ void scc1_lineif(struct uart_cpm_port *p
+ {
+       /* XXX SCC1: insert port configuration here */
+       pinfo->brg = 1;
++
++#if defined (CONFIG_MPC885ADS) || defined (CONFIG_MPC86XADS)
++      bcsr_io = ioremap(BCSR1, sizeof(unsigned long));
++
++      if (bcsr_io == NULL) {
++              printk(KERN_CRIT "Could not remap BCSR\n");
++              return;
++      }
++      out_be32(bcsr_io, in_be32(bcsr_io) & ~BCSR1_RS232EN_1);
++      iounmap(bcsr_io);
++#endif
+ }
+ 
+ void scc2_lineif(struct uart_cpm_port *pinfo)
+ {
+       /* XXX SCC2: insert port configuration here */
+       pinfo->brg = 2;
++      unsigned *bcsr_io;
++
++#if defined (CONFIG_MPC885ADS) || defined (CONFIG_MPC86XADS)
++      bcsr_io = ioremap(BCSR1, sizeof(unsigned long));
++
++      if (bcsr_io == NULL) {
++              printk(KERN_CRIT "Could not remap BCSR\n");
++              return;
++      }
++        out_be32(bcsr_io, in_be32(bcsr_io) & ~BCSR1_RS232EN_2);
++      iounmap(bcsr_io);
++#endif
+ }
+ 
+ void scc3_lineif(struct uart_cpm_port *pinfo)
+@@ -188,6 +212,10 @@ int cpm_uart_init_portdesc(void)
+ {
+       pr_debug("CPM uart[-]:init portdesc\n");
+ 
++      /* Skip re-initialization if this was already called, which may
++      happen when an early kgdb breakpoint is on. */
++      if(cpm_uart_nr)
++              return 0;
+       cpm_uart_nr = 0;
+ #ifdef CONFIG_SERIAL_CPM_SMC1
+       cpm_uart_ports[UART_SMC1].smcp = &cpmp->cp_smc[0];
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/cpm_uart/cpm_uart_cpm2.c linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart_cpm2.c
+--- linux-2.6.18/drivers/serial/cpm_uart/cpm_uart_cpm2.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart_cpm2.c  2008-06-10 16:19:22.000000000 +0400
+@@ -256,6 +256,10 @@ int cpm_uart_init_portdesc(void)
+ {
+       pr_debug("CPM uart[-]:init portdesc\n");
+ 
++      /* Skip re-initialization if this was already called, which may
++      happen when an early kgdb breakpoint is on. */
++      if(cpm_uart_nr)
++              return 0;
+       cpm_uart_nr = 0;
+ #ifdef CONFIG_SERIAL_CPM_SMC1
+       cpm_uart_ports[UART_SMC1].smcp = (smc_t *) & cpm2_immr->im_smc[0];
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/cpm_uart/cpm_uart_kgdb.c linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart_kgdb.c
+--- linux-2.6.18/drivers/serial/cpm_uart/cpm_uart_kgdb.c       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart_kgdb.c  2008-06-10 16:19:22.000000000 +0400
+@@ -0,0 +1,195 @@
++/*
++ * drivers/serial/cpm_uart/cpm_uart_kgdb.c
++ *
++ * CPM UART interface for kgdb.
++ *
++ * Author: Vitaly Bordug <vbordug@ru.mvista.com>
++ *
++ * Used some bits from drivers/serial/kgdb_8250.c as a template
++ *
++ * 2005 (c) MontaVista Software, Inc. This file is licensed under
++ * the terms of the GNU General Public License version 2. This program
++ * is licensed "as is" without any warranty of any kind, whether express
++ * or implied.
++ */
++
++#include <linux/kgdb.h>
++#include <linux/config.h>
++#include <linux/kernel.h>
++#include <linux/init.h>
++#include <linux/interrupt.h>
++#include <linux/tty.h>
++#include <linux/serial.h>
++#include <linux/serial_core.h>
++#include <linux/serial_reg.h>
++
++#include <asm/io.h>
++#include <asm/serial.h>               /* For BASE_BAUD and SERIAL_PORT_DFNS */
++
++#include "cpm_uart.h"
++
++#define GDB_BUF_SIZE  512     /* power of 2, please */
++
++
++static char kgdb_buf[GDB_BUF_SIZE], *kgdbp;
++static int kgdb_chars;
++
++/* Forward declarations. */
++
++/*
++ * Receive character from the serial port.  This only works well
++ * before the port is initialize for real use.
++ */
++static int kgdb_wait_key(char *obuf)
++{
++      struct uart_cpm_port *pinfo;
++
++      u_char                          c, *cp;
++      volatile        cbd_t           *bdp;
++      int                             i;
++
++      pinfo = &cpm_uart_ports[KGDB_PINFO_INDEX];
++
++      /* Get the address of the host memory buffer.
++       */
++      bdp = pinfo->rx_cur;
++      while (bdp->cbd_sc & BD_SC_EMPTY);
++
++      /* If the buffer address is in the CPM DPRAM, don't
++       * convert it.
++       */
++      cp = cpm2cpu_addr(bdp->cbd_bufaddr);
++
++      if (obuf) {
++              i = c = bdp->cbd_datlen;
++              while (i-- > 0)
++              {
++                      *obuf++ = *cp++;
++              }
++      } else {
++              c = *cp;
++      }
++      bdp->cbd_sc |= BD_SC_EMPTY;
++
++      if (bdp->cbd_sc & BD_SC_WRAP) {
++              bdp = pinfo->rx_bd_base;
++      } else {
++              bdp++;
++      }
++      pinfo->rx_cur = (cbd_t *)bdp;
++
++      return((int)c);
++}
++
++
++/*
++ * Wait until the interface can accept a char, then write it.
++ */
++static void
++kgdb_put_debug_char(int chr)
++{
++      static char ch[2];
++      ch[0]=(char)chr;
++      cpm_uart_early_write(KGDB_PINFO_INDEX, ch, 1);
++}
++
++
++/*
++ * Return the next received char, blocking in kgdb_wait_key() when none is
++ * buffered.  Chars already held in kgdb_buf are handed out first.
++ */
++static int
++kgdb_get_debug_char(void)
++{
++      if (kgdb_chars<=0) {    /* local buffer drained: refill it */
++              kgdb_chars = kgdb_wait_key(kgdb_buf);
++              kgdbp = kgdb_buf;
++      }
++      kgdb_chars--;
++
++      return (*kgdbp++);
++}
++/* Build a termios from baud/parity/bits/flow and apply it to cpm_uart_ports[index]. */
++static void termios_set_options(int index,
++               int baud, int parity, int bits, int flow)
++{
++      struct termios termios;
++      struct uart_port *port;
++      struct uart_cpm_port *pinfo;
++
++      BUG_ON(index >= UART_NR);       /* index == UART_NR is already out of range */
++
++      port =
++          (struct uart_port *)&cpm_uart_ports[index];
++      pinfo = (struct uart_cpm_port *)port;
++
++      /*
++       * Ensure that the serial console lock is initialised
++       * early.
++       */
++      spin_lock_init(&port->lock);
++
++      memset(&termios, 0, sizeof(struct termios));
++
++      termios.c_cflag = CREAD | HUPCL | CLOCAL;
++
++      termios.c_cflag |= baud;        /* OR-ed in as-is; presumably a Bxxx speed code */
++
++      if (bits == 7)
++              termios.c_cflag |= CS7;
++      else
++              termios.c_cflag |= CS8; /* anything other than 7 means 8 data bits */
++
++      switch (parity) {
++      case 'o': case 'O':
++              termios.c_cflag |= PARODD;
++              /*fall through*/
++      case 'e': case 'E':
++              termios.c_cflag |= PARENB;
++              break;
++      }
++
++      if (flow == 'r')
++              termios.c_cflag |= CRTSCTS;
++
++      port->ops->set_termios(port, &termios, NULL);
++}
++
++/*
++ *  Returns:
++ *    0 on success, 1 on failure.
++ */
++static int kgdb_init(void)
++{
++      struct uart_port *port;
++      struct uart_cpm_port *pinfo;
++
++      int use_bootmem = 0; /* use dma by default */
++
++      if(!cpm_uart_nr)
++      {
++              use_bootmem = 1;
++              cpm_uart_init_portdesc();
++      }
++      port = (struct uart_port *)&cpm_uart_ports[KGDB_PINFO_INDEX];
++      pinfo = (struct uart_cpm_port *)port;
++
++      if (cpm_uart_early_setup(KGDB_PINFO_INDEX, use_bootmem))
++              return 1;
++
++      termios_set_options(KGDB_PINFO_INDEX, KGDB_BAUD,'n',8,'n');
++        if (IS_SMC(pinfo))
++                pinfo->smcp->smc_smcm |= SMCM_TX;
++        else
++                pinfo->sccp->scc_sccm |= UART_SCCM_TX;
++
++      return 0;
++}
++
++
++struct kgdb_io kgdb_io_ops = {
++      .read_char = kgdb_get_debug_char,
++      .write_char = kgdb_put_debug_char,
++      .init = kgdb_init,
++};
++
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/mpsc.c linux-2.6.18.kgdb/drivers/serial/mpsc.c
+--- linux-2.6.18/drivers/serial/mpsc.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/serial/mpsc.c    2008-06-10 16:19:22.000000000 +0400
+@@ -242,6 +242,11 @@ struct mpsc_port_info *mpsc_device_remov
+ #define       MPSC_RCRR                       0x0004
+ #define       MPSC_TCRR                       0x0008
+ 
++/* MPSC Interrupt registers (offset from MV64x60_SDMA_INTR_OFFSET) */
++#define MPSC_INTR_CAUSE                        0x0004
++#define MPSC_INTR_MASK                 0x0084
++#define MPSC_INTR_CAUSE_RCC            (1<<6)
++
+ /* Serial DMA Controller Interface Registers */
+ #define       SDMA_SDC                        0x0000
+ #define       SDMA_SDCM                       0x0008
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/mpsc_kgdb.c linux-2.6.18.kgdb/drivers/serial/mpsc_kgdb.c
+--- linux-2.6.18/drivers/serial/mpsc_kgdb.c    1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/drivers/serial/mpsc_kgdb.c       2008-06-10 16:19:22.000000000 +0400
+@@ -0,0 +1,299 @@
++/*
++ * drivers/serial/mpsc_kgdb.c
++ *
++ * KGDB driver for the Marvell MultiProtocol Serial Controller (MPSC)
++ *
++ * Based on the polled boot loader driver by Ajit Prem (ajit.prem@motorola.com)
++ *
++ * Author: Randy Vinson <rvinson@mvista.com>
++ *
++ * 2005 (c) MontaVista Software, Inc.
++ * This program is free software; you can redistribute  it and/or modify it
++ * under  the terms of  the GNU General  Public License as published by the
++ * Free Software Foundation;  either version 2 of the  License, or (at your
++ * option) any later version.
++ */
++
++#include <linux/config.h>
++#include <linux/kgdb.h>
++#include <linux/mv643xx.h>
++#include <linux/device.h>
++#include <asm/mv64x60.h>
++#include <asm/serial.h>
++#include <asm/io.h>
++#include <asm/delay.h>
++
++#include "mpsc.h"
++
++/* Speed of the UART. */
++static int kgdbmpsc_baud = CONFIG_KGDB_BAUDRATE;
++
++/* Index of the UART, matches ttyMX naming. */
++static int kgdbmpsc_ttyMM = CONFIG_KGDB_PORT_NUM;
++
++#define MPSC_INTR_REG_SELECT(x)       ((x) + (8 * kgdbmpsc_ttyMM))
++
++static int kgdbmpsc_init(void);
++
++static struct platform_device mpsc_dev, shared_dev;
++
++static void __iomem *mpsc_base;
++static void __iomem *brg_base;
++static void __iomem *routing_base;
++static void __iomem *sdma_base;
++
++static unsigned int mpsc_irq;
++
++static void kgdb_write_debug_char(int c)
++{
++      u32 data;
++
++      data = readl(mpsc_base + MPSC_MPCR);
++      writeb(c, mpsc_base + MPSC_CHR_1);
++      mb();
++      data = readl(mpsc_base + MPSC_CHR_2);
++      data |= MPSC_CHR_2_TTCS;
++      writel(data, mpsc_base + MPSC_CHR_2);
++      mb();
++
++      while (readl(mpsc_base + MPSC_CHR_2) & MPSC_CHR_2_TTCS) ;
++}
++
++static int kgdb_get_debug_char(void)
++{
++      unsigned char c;
++
++      while (!(readl(sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_CAUSE)) &
++               MPSC_INTR_CAUSE_RCC)) ;
++
++      c = readb(mpsc_base + MPSC_CHR_10 + (1 << 1));
++      mb();
++      writeb(c, mpsc_base + MPSC_CHR_10 + (1 << 1));
++      mb();
++      writel(~MPSC_INTR_CAUSE_RCC, sdma_base +
++             MPSC_INTR_REG_SELECT(MPSC_INTR_CAUSE));
++      return (c);
++}
++
++/*
++ * This is the receiver interrupt routine for the GDB stub.
++ * All that we need to do is verify that the interrupt happened on the
++ * line we're in charge of.  If so, and the RCC cause bit is set, call
++ * breakpoint() and return.
++ */
++static irqreturn_t
++kgdbmpsc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
++{
++      if (irq != mpsc_irq)
++              return IRQ_NONE;
++      /*
++       * If some other CPU is already in KGDB then this is a spurious
++       * interrupt, so return without even checking for a byte.
++       */
++      if (atomic_read(&debugger_active))
++              return IRQ_NONE;
++
++      if (readl(sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_CAUSE)) &
++          MPSC_INTR_CAUSE_RCC)
++              breakpoint();
++
++      return IRQ_HANDLED;
++}
++
++static int __init kgdbmpsc_init(void)
++{
++      struct mpsc_pdata *pdata;
++      u32 cdv;
++
++      if (!brg_base || !mpsc_base || !routing_base || !sdma_base)
++              return -1;
++
++      /* Set MPSC Routing to enable both ports */
++      writel(0x0, routing_base + MPSC_MRR);
++
++      /* MPSC 0/1 Rx & Tx get clocks BRG0/1 */
++      writel(0x00000100, routing_base + MPSC_RCRR);
++      writel(0x00000100, routing_base + MPSC_TCRR);
++
++      /* Disable all MPSC interrupts and clear any pending interrupts */
++      writel(0, sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_MASK));
++      writel(0, sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_CAUSE));
++
++      pdata = (struct mpsc_pdata *)mpsc_dev.dev.platform_data;
++
++      /* cdv = (clock/(2*16*baud rate)) for 16X mode. */
++      cdv = ((pdata->brg_clk_freq / (32 * kgdbmpsc_baud)) - 1);
++      writel((pdata->brg_clk_src << 18) | (1 << 16) | cdv,
++             brg_base + BRG_BCR);
++
++      /* Put MPSC into UART mode, no null modem, 16x clock mode */
++      writel(0x000004c4, mpsc_base + MPSC_MMCRL);
++      writel(0x04400400, mpsc_base + MPSC_MMCRH);
++
++      writel(0, mpsc_base + MPSC_CHR_1);
++      writel(0, mpsc_base + MPSC_CHR_9);
++      writel(0, mpsc_base + MPSC_CHR_10);
++      writel(4, mpsc_base + MPSC_CHR_3);
++      writel(0x20000000, mpsc_base + MPSC_CHR_4);
++      writel(0x9000, mpsc_base + MPSC_CHR_5);
++      writel(0, mpsc_base + MPSC_CHR_6);
++      writel(0, mpsc_base + MPSC_CHR_7);
++      writel(0, mpsc_base + MPSC_CHR_8);
++
++      /* 8 data bits, 1 stop bit */
++      writel((3 << 12), mpsc_base + MPSC_MPCR);
++
++      /* Enter "hunt" mode */
++      writel((1 << 31), mpsc_base + MPSC_CHR_2);
++
++      udelay(100);
++      return 0;
++}
++
++static void __iomem *__init
++kgdbmpsc_map_resource(struct platform_device *pd, int type, int num)
++{
++      void __iomem *base = NULL;
++      struct resource *r;
++
++      if ((r = platform_get_resource(pd, IORESOURCE_MEM, num)))
++              base = ioremap(r->start, r->end - r->start + 1);
++      return base;
++}
++
++static void __iomem *__init
++kgdbmpsc_unmap_resource(struct platform_device *pd, int type, int num,
++                      void __iomem * base)
++{
++      if (base)
++              iounmap(base);
++      return NULL;
++}
++
++static void __init
++kgdbmpsc_reserve_resource(struct platform_device *pd, int type, int num)
++{
++      struct resource *r;
++
++      if ((r = platform_get_resource(pd, IORESOURCE_MEM, num)))
++              request_mem_region(r->start, r->end - r->start + 1, "kgdb");
++}
++
++static int __init kgdbmpsc_local_init(void)
++{
++      if (!mpsc_dev.num_resources || !shared_dev.num_resources)
++              return 1;       /* failure */
++
++      mpsc_base = kgdbmpsc_map_resource(&mpsc_dev, IORESOURCE_MEM,
++                                        MPSC_BASE_ORDER);
++      brg_base = kgdbmpsc_map_resource(&mpsc_dev, IORESOURCE_MEM,
++                                       MPSC_BRG_BASE_ORDER);
++
++      /* get the platform data for the shared registers and get them mapped */
++      routing_base = kgdbmpsc_map_resource(&shared_dev,
++                                           IORESOURCE_MEM,
++                                           MPSC_ROUTING_BASE_ORDER);
++      sdma_base =
++          kgdbmpsc_map_resource(&shared_dev, IORESOURCE_MEM,
++                                MPSC_SDMA_INTR_BASE_ORDER);
++
++      mpsc_irq = platform_get_irq(&mpsc_dev, 1);
++
++      if (mpsc_base && brg_base && routing_base && sdma_base)
++              return 0;       /* success */
++
++      return 1;               /* failure */
++}
++
++static void __init kgdbmpsc_local_exit(void)
++{
++      if (sdma_base)
++              sdma_base = kgdbmpsc_unmap_resource(&shared_dev, IORESOURCE_MEM,
++                                                  MPSC_SDMA_INTR_BASE_ORDER,
++                                                  sdma_base);
++      if (routing_base)
++              routing_base = kgdbmpsc_unmap_resource(&shared_dev,
++                                                     IORESOURCE_MEM,
++                                                     MPSC_ROUTING_BASE_ORDER,
++                                                     routing_base);
++      if (brg_base)
++              brg_base = kgdbmpsc_unmap_resource(&mpsc_dev, IORESOURCE_MEM,
++                                                 MPSC_BRG_BASE_ORDER,
++                                                 brg_base);
++      if (mpsc_base)
++              mpsc_base = kgdbmpsc_unmap_resource(&mpsc_dev, IORESOURCE_MEM,
++                                                  MPSC_BASE_ORDER, mpsc_base);
++}
++
++static void __init kgdbmpsc_update_pdata(struct platform_device *pdev)
++{
++
++      snprintf(pdev->dev.bus_id, BUS_ID_SIZE, "%s%u", pdev->name, pdev->id);
++}
++
++static int __init kgdbmpsc_pdev_init(void)
++{
++      struct platform_device *pdev;
++
++      /* get the platform data for the specified port. */
++      pdev = mv64x60_early_get_pdev_data(MPSC_CTLR_NAME, kgdbmpsc_ttyMM, 1);
++      if (pdev) {
++              memcpy(&mpsc_dev, pdev, sizeof(struct platform_device));
++              if (platform_notify) {
++                      kgdbmpsc_update_pdata(&mpsc_dev);
++                      platform_notify(&mpsc_dev.dev);
++              }
++
++              /* get the platform data for the shared registers. */
++              pdev = mv64x60_early_get_pdev_data(MPSC_SHARED_NAME, 0, 0);
++              if (pdev) {
++                      memcpy(&shared_dev, pdev,
++                             sizeof(struct platform_device));
++                      if (platform_notify) {
++                              kgdbmpsc_update_pdata(&shared_dev);
++                              platform_notify(&shared_dev.dev);
++                      }
++              }
++      }
++      return 0;
++}
++
++postcore_initcall(kgdbmpsc_pdev_init);
++
++static int __init kgdbmpsc_init_io(void)
++{
++
++      kgdbmpsc_pdev_init();
++
++      if (kgdbmpsc_local_init()) {
++              kgdbmpsc_local_exit();
++              return -1;
++      }
++
++      if (kgdbmpsc_init() == -1)
++              return -1;
++      return 0;
++}
++
++static void __init kgdbmpsc_hookup_irq(void)
++{
++      unsigned int msk;
++      if (!request_irq(mpsc_irq, kgdbmpsc_interrupt, 0, "kgdb mpsc", NULL)) {
++              /* Enable interrupt */
++              msk = readl(sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_MASK));
++              msk |= MPSC_INTR_CAUSE_RCC;
++              writel(msk, sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_MASK));
++
++              kgdbmpsc_reserve_resource(&mpsc_dev, IORESOURCE_MEM,
++                                        MPSC_BASE_ORDER);
++              kgdbmpsc_reserve_resource(&mpsc_dev, IORESOURCE_MEM,
++                                        MPSC_BRG_BASE_ORDER);
++      }
++}
++
++struct kgdb_io kgdb_io_ops = {
++      .read_char = kgdb_get_debug_char,
++      .write_char = kgdb_write_debug_char,
++      .init = kgdbmpsc_init_io,
++      .late_init = kgdbmpsc_hookup_irq,
++};
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/pxa.c linux-2.6.18.kgdb/drivers/serial/pxa.c
+--- linux-2.6.18/drivers/serial/pxa.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/serial/pxa.c     2008-06-10 16:19:51.000000000 +0400
+@@ -42,6 +42,9 @@
+ #include <linux/tty.h>
+ #include <linux/tty_flip.h>
+ #include <linux/serial_core.h>
++#ifdef CONFIG_KGDB_CONSOLE
++#include <linux/kgdb.h>
++#endif
+ 
+ #include <asm/io.h>
+ #include <asm/hardware.h>
+@@ -692,6 +695,8 @@ serial_pxa_console_init(void)
+ console_initcall(serial_pxa_console_init);
+ 
+ #define PXA_CONSOLE   &serial_pxa_console
++#elif defined(CONFIG_KGDB_CONSOLE)
++#define PXA_CONSOLE   &kgdbcons
+ #else
+ #define PXA_CONSOLE   NULL
+ #endif
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/serial_core.c linux-2.6.18.kgdb/drivers/serial/serial_core.c
+--- linux-2.6.18/drivers/serial/serial_core.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/serial/serial_core.c     2008-06-10 16:19:03.000000000 +0400
+@@ -33,6 +33,7 @@
+ #include <linux/serial.h> /* for serial_state and serial_icounter_struct */
+ #include <linux/delay.h>
+ #include <linux/mutex.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/irq.h>
+ #include <asm/uaccess.h>
+@@ -65,6 +66,12 @@ static struct lock_class_key port_lock_k
+ #define uart_console(port)    (0)
+ #endif
+ 
++#ifdef CONFIG_KGDB_CONSOLE
++#define uart_kgdb(port)               (port->cons && !strcmp(port->cons->name, "kgdb"))
++#else
++#define uart_kgdb(port)               (0)
++#endif
++
+ static void uart_change_speed(struct uart_state *state, struct termios *old_termios);
+ static void uart_wait_until_sent(struct tty_struct *tty, int timeout);
+ static void uart_change_pm(struct uart_state *state, int pm_state);
+@@ -1673,6 +1680,9 @@ static int uart_line_info(char *buf, str
+                       port->iotype == UPIO_MEM ? port->mapbase :
+                                               (unsigned long) port->iobase,
+                       port->irq);
++      if (port->iotype == UPIO_MEM)
++              ret += sprintf(buf+ret, " membase 0x%08lX",
++                                         (unsigned long) port->membase);
+ 
+       if (port->type == PORT_UNKNOWN) {
+               strcat(buf, "\n");
+@@ -2038,7 +2048,8 @@ uart_report_port(struct uart_driver *drv
+       case UPIO_AU:
+       case UPIO_TSI:
+               snprintf(address, sizeof(address),
+-                       "MMIO 0x%lx", port->mapbase);
++                      "MMIO map 0x%lx mem 0x%lx", port->mapbase,
++                      (unsigned long) port->membase);
+               break;
+       default:
+               strlcpy(address, "*unknown*", sizeof(address));
+@@ -2090,9 +2101,9 @@ uart_configure_port(struct uart_driver *
+ 
+               /*
+                * Power down all ports by default, except the
+-               * console if we have one.
++               * console (real or kgdb) if we have one.
+                */
+-              if (!uart_console(port))
++              if (!uart_console(port) && !uart_kgdb(port))
+                       uart_change_pm(state, 3);
+       }
+ }
+@@ -2284,6 +2295,12 @@ int uart_add_one_port(struct uart_driver
+        */
+       port->flags &= ~UPF_DEAD;
+ 
++#if defined(CONFIG_KGDB_8250)
++      /* Add any 8250-like ports we find later. */
++      if (port->type <= PORT_MAX_8250)
++              kgdb8250_add_port(port->line, port);
++#endif
++
+  out:
+       mutex_unlock(&state->mutex);
+       mutex_unlock(&port_mutex);
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/serial_txx9.c linux-2.6.18.kgdb/drivers/serial/serial_txx9.c
+--- linux-2.6.18/drivers/serial/serial_txx9.c  2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/serial/serial_txx9.c     2008-06-10 16:19:28.000000000 +0400
+@@ -1164,6 +1164,96 @@ static struct pci_driver serial_txx9_pci
+ MODULE_DEVICE_TABLE(pci, serial_txx9_pci_tbl);
+ #endif /* ENABLE_SERIAL_TXX9_PCI */
+ 
++/******************************************************************************/
++/* BEG: KDBG Routines                                                         */
++/******************************************************************************/
++
++#ifdef CONFIG_KGDB
++int kgdb_init_count = 0;
++
++void txx9_sio_kgdb_hook(unsigned int port, unsigned int baud_rate)
++{
++      static struct resource kgdb_resource;
++      int ret;
++      struct uart_txx9_port *up = &serial_txx9_ports[port];
++
++      /* prevent initialization by driver */
++      kgdb_resource.name = "serial_txx9(debug)";
++      kgdb_resource.start = (unsigned long)up->port.membase;
++      kgdb_resource.end = (unsigned long)(up->port.membase + 36 - 1);
++      kgdb_resource.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
++
++      ret = request_resource(&iomem_resource, &kgdb_resource);
++      if(ret == -EBUSY)
++              printk(" serial_txx9(debug): request_resource failed\n");
++
++      return;
++}
++void
++txx9_sio_kdbg_init( unsigned int port_number )
++{
++      if (port_number == 1) {
++              txx9_sio_kgdb_hook(port_number, 38400);
++      } else {
++              printk("Bad Port Number [%u] != [1]\n",port_number);
++      }
++      return;
++}
++
++u8
++txx9_sio_kdbg_rd( void )
++{
++      unsigned int status,ch;
++      struct uart_txx9_port *up = &serial_txx9_ports[1];
++
++      if (kgdb_init_count == 0) {
++              txx9_sio_kdbg_init(1);
++              kgdb_init_count = 1;
++      }
++
++      while (1) {
++              status = sio_in(up, TXX9_SIDISR);
++              if ( status & 0x1f ) {
++                      ch = sio_in(up, TXX9_SIRFIFO );
++                      break;
++              }
++      }
++
++      return (ch);
++}
++
++int
++txx9_sio_kdbg_wr( u8 ch )
++{
++      unsigned int status;
++      struct uart_txx9_port *up = &serial_txx9_ports[1];
++
++      if (kgdb_init_count == 0) {
++              txx9_sio_kdbg_init(1);
++              kgdb_init_count = 1;
++      }
++
++      while (1) {
++              status = sio_in(up, TXX9_SICISR);
++              if (status & TXX9_SICISR_TRDY) {
++                      if ( ch == '\n' ) {
++                              txx9_sio_kdbg_wr( '\r' );
++                      }
++                      sio_out(up, TXX9_SITFIFO, (u32)ch );
++
++                      break;
++              }
++      }
++
++      return (1);
++}
++#endif /* CONFIG_KGDB */
++
++
++/******************************************************************************/
++/* END: KDBG Routines                                                         */
++/******************************************************************************/
++
+ static int __init serial_txx9_init(void)
+ {
+       int ret;
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/sh-sci.c linux-2.6.18.kgdb/drivers/serial/sh-sci.c
+--- linux-2.6.18/drivers/serial/sh-sci.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/serial/sh-sci.c  2008-06-10 16:19:47.000000000 +0400
+@@ -42,6 +42,7 @@
+ #include <linux/console.h>
+ #include <linux/bitops.h>
+ #include <linux/generic_serial.h>
++#include <linux/kgdb.h>
+ 
+ #ifdef CONFIG_CPU_FREQ
+ #include <linux/notifier.h>
+@@ -67,14 +68,16 @@
+ 
+ #include "sh-sci.h"
+ 
+-#ifdef CONFIG_SH_KGDB
+-#include <asm/kgdb.h>
+-
+-static int kgdb_get_char(struct sci_port *port);
+-static void kgdb_put_char(struct sci_port *port, char c);
+-static void kgdb_handle_error(struct sci_port *port);
+-static struct sci_port *kgdb_sci_port;
+-#endif /* CONFIG_SH_KGDB */
++#ifdef CONFIG_KGDB_SH_SCI
++/* Speed of the UART. */
++static int kgdbsci_baud = CONFIG_KGDB_BAUDRATE;
++
++/* Index of the UART, matches ttySCX naming. */
++static int kgdbsci_ttySC = CONFIG_KGDB_PORT_NUM;
++
++/* Shorthand for the sci_ports[] entry selected by kgdbsci_ttySC. */
++#define KGDBPORT      sci_ports[kgdbsci_ttySC]
++#endif /* CONFIG_KGDB_SH_SCI */
+ 
+ #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
+ static struct sci_port *serial_console_port = 0;
+@@ -87,20 +90,17 @@ static void sci_start_rx(struct uart_por
+ static void sci_stop_rx(struct uart_port *port);
+ static int sci_request_irq(struct sci_port *port);
+ static void sci_free_irq(struct sci_port *port);
++static void sci_set_termios(struct uart_port *port, struct termios *termios,
++                      struct termios *old);
++static int kgdbsci_init(void);
+ 
+ static struct sci_port sci_ports[];
+ static struct uart_driver sci_uart_driver;
+ 
+ #define SCI_NPORTS sci_uart_driver.nr
+ 
+-#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_SH_KGDB)
+-
+-static void handle_error(struct uart_port *port)
+-{                             /* Clear error flags */
+-      sci_out(port, SCxSR, SCxSR_ERROR_CLEAR(port));
+-}
+-
+-static int get_char(struct uart_port *port)
++#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_KGDB_SH_SCI)
++static int get_char_for_gdb(struct uart_port *port)
+ {
+       unsigned long flags;
+       unsigned short status;
+@@ -110,7 +110,8 @@ static int get_char(struct uart_port *po
+         do {
+               status = sci_in(port, SCxSR);
+               if (status & SCxSR_ERRORS(port)) {
+-                      handle_error(port);
++                      /* Clear error flags. */
++                      sci_out(port, SCxSR, SCxSR_ERROR_CLEAR(port));
+                       continue;
+               }
+       } while (!(status & SCxSR_RDxF(port)));
+@@ -121,21 +122,7 @@ static int get_char(struct uart_port *po
+ 
+       return c;
+ }
+-
+-/* Taken from sh-stub.c of GDB 4.18 */
+-static const char hexchars[] = "0123456789abcdef";
+-
+-static __inline__ char highhex(int  x)
+-{
+-      return hexchars[(x >> 4) & 0xf];
+-}
+-
+-static __inline__ char lowhex(int  x)
+-{
+-      return hexchars[x & 0xf];
+-}
+-
+-#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_SH_KGDB */
++#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_KGDB_SH_SCI */
+ 
+ /*
+  * Send the packet in buffer.  The host gets one chance to read it.
+@@ -167,21 +154,14 @@ static void put_string(struct sci_port *
+       const unsigned char *p = buffer;
+       int i;
+ 
+-#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_SH_KGDB)
+-      int checksum;
+-      int usegdb=0;
+-
+ #ifdef CONFIG_SH_STANDARD_BIOS
++      int checksum;
++       const char hexchars[] = "0123456789abcdef";
++      
+       /* This call only does a trap the first time it is
+        * called, and so is safe to do here unconditionally
+        */
+-      usegdb |= sh_bios_in_gdb_mode();
+-#endif
+-#ifdef CONFIG_SH_KGDB
+-      usegdb |= (kgdb_in_gdb_mode && (port == kgdb_sci_port));
+-#endif
+-
+-      if (usegdb) {
++      if (sh_bios_in_gdb_mode()) {
+           /*  $<packet info>#<checksum>. */
+           do {
+               unsigned char c;
+@@ -193,18 +173,18 @@ static void put_string(struct sci_port *
+                       int h, l;
+ 
+                       c = *p++;
+-                      h = highhex(c);
+-                      l = lowhex(c);
++                      h = hexchars[c >> 4];
++                      l = hexchars[c % 16];
+                       put_char(port, h);
+                       put_char(port, l);
+                       checksum += h + l;
+               }
+               put_char(port, '#');
+-              put_char(port, highhex(checksum));
+-              put_char(port, lowhex(checksum));
++              put_char(port, hexchars[checksum >> 4]);
++              put_char(port, hexchars[checksum % 16]);
+           } while  (get_char(port) != '+');
+       } else
+-#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_SH_KGDB */
++#endif /* CONFIG_SH_STANDARD_BIOS */
+       for (i=0; i<count; i++) {
+               if (*p == 10)
+                       put_char(port, '\r');
+@@ -214,90 +194,163 @@ static void put_string(struct sci_port *
+ #endif /* CONFIG_SERIAL_SH_SCI_CONSOLE */
+ 
+ 
+-#ifdef CONFIG_SH_KGDB
+-
+-/* Is the SCI ready, ie is there a char waiting? */
+-static int kgdb_is_char_ready(struct sci_port *port)
++#ifdef CONFIG_KGDB_SH_SCI
++static int kgdbsci_read_char(void)
+ {
+-        unsigned short status = sci_in(port, SCxSR);
+-
+-        if (status & (SCxSR_ERRORS(port) | SCxSR_BRK(port)))
+-                kgdb_handle_error(port);
+-
+-        return (status & SCxSR_RDxF(port));
++      return get_char_for_gdb(&KGDBPORT.port);
+ }
+ 
+-/* Write a char */
+-static void kgdb_put_char(struct sci_port *port, char c)
++/* Called from kgdbstub.c to put a character, just a wrapper */
++static void kgdbsci_write_char(int c)
+ {
+-        unsigned short status;
+-
+-        do
+-                status = sci_in(port, SCxSR);
+-        while (!(status & SCxSR_TDxE(port)));
++      unsigned short status;
+ 
+-        sci_out(port, SCxTDR, c);
+-        sci_in(port, SCxSR);    /* Dummy read */
+-        sci_out(port, SCxSR, SCxSR_TDxE_CLEAR(port));
++      do
++              status = sci_in(&KGDBPORT.port, SCxSR);
++      while (!(status & SCxSR_TDxE(&KGDBPORT.port)));
++
++      sci_out(&KGDBPORT.port, SCxTDR, c);
++      sci_in(&KGDBPORT.port, SCxSR);  /* Dummy read */
++      sci_out(&KGDBPORT.port, SCxSR, SCxSR_TDxE_CLEAR(&KGDBPORT.port));
+ }
+ 
+-/* Get a char if there is one, else ret -1 */
+-static int kgdb_get_char(struct sci_port *port)
++#ifndef CONFIG_SERIAL_SH_SCI_CONSOLE
++/* If we don't have console, we never hookup IRQs.  But we need to
++ * hookup one so that we can interrupt the system.
++ */
++static irqreturn_t kgdbsci_rx_interrupt(int irq, void *ptr,
++              struct pt_regs *regs)
+ {
+-        int c;
++      struct uart_port *port = ptr;
+ 
+-        if (kgdb_is_char_ready(port) == 0)
+-                c = -1;
+-        else {
+-                c = sci_in(port, SCxRDR);
+-                sci_in(port, SCxSR);    /* Dummy read */
+-                sci_out(port, SCxSR, SCxSR_RDxF_CLEAR(port));
+-        }
++      if (!(sci_in(port, SCxSR) & SCxSR_RDxF(port)))
++              return IRQ_NONE;
+ 
+-        return c;
+-}
+-
+-/* Called from kgdbstub.c to get a character, i.e. is blocking */
+-static int kgdb_sci_getchar(void)
+-{
+-        volatile int c;
++      if (kgdb_io_ops.init != kgdbsci_init) {
++              /* Throw away the data if another I/O routine is active */
++              get_char_for_gdb(&KGDBPORT.port);
++      } else
++              /* We've got an interrupt, so go ahead and call breakpoint() */
++              breakpoint();
+ 
+-        /* Keep trying to read a character, this could be neater */
+-        while ((c = kgdb_get_char(kgdb_sci_port)) < 0);
++      sci_in(port, SCxSR); /* dummy read */
++      sci_out(port, SCxSR, SCxSR_RDxF_CLEAR(port));
+ 
+-        return c;
++      return IRQ_HANDLED;
+ }
+ 
+-/* Called from kgdbstub.c to put a character, just a wrapper */
+-static void kgdb_sci_putchar(int c)
++static irqreturn_t kgdbsci_mpxed_interrupt(int irq, void *ptr,
++              struct pt_regs *regs)
+ {
++        unsigned short ssr_status, scr_status;
++        struct uart_port *port = ptr;
++
++        ssr_status = sci_in(port,SCxSR);
++        scr_status = sci_in(port,SCSCR);
++
++      /* Rx Interrupt */
++        if ((ssr_status&0x0002) && (scr_status&0x0040))
++              kgdbsci_rx_interrupt(irq, ptr, regs);
+ 
+-        kgdb_put_char(kgdb_sci_port, c);
++      return IRQ_HANDLED;
+ }
+ 
+-/* Clear any errors on the SCI */
+-static void kgdb_handle_error(struct sci_port *port)
++static void __init kgdbsci_lateinit(void)
+ {
+-        sci_out(port, SCxSR, SCxSR_ERROR_CLEAR(port));  /* Clear error flags */
++      if (KGDBPORT.irqs[0] == KGDBPORT.irqs[1]) {
++              if (!KGDBPORT.irqs[0]) {
++                      printk(KERN_ERR "kgdbsci: Cannot allocate irq.\n");
++                      return;
++              }
++              if (request_irq(KGDBPORT.irqs[0], kgdbsci_mpxed_interrupt,
++                                      SA_INTERRUPT, "kgdbsci",
++                                      &KGDBPORT.port)) {
++                      printk(KERN_ERR "kgdbsci: Cannot allocate irq.\n");
++                      return;
++              }
++      } else {
++              if (KGDBPORT.irqs[1])
++                      request_irq(KGDBPORT.irqs[1],
++                                      kgdbsci_rx_interrupt, SA_INTERRUPT,
++                                      "kgdbsci", &KGDBPORT.port);
++      }
+ }
++#endif
+ 
+-/* Breakpoint if there's a break sent on the serial port */
+-static void kgdb_break_interrupt(int irq, void *ptr, struct pt_regs *regs)
++/*
++ * We use the normal init routine to setup the port, so we can't be
++ * in here too early.
++ */
++static int kgdbsci_init(void)
+ {
+-        struct sci_port *port = ptr;
+-        unsigned short status = sci_in(port, SCxSR);
++      struct termios termios;
+ 
+-        if (status & SCxSR_BRK(port)) {
++      memset(&termios, 0, sizeof(struct termios));
+ 
+-                /* Break into the debugger if a break is detected */
+-                BREAKPOINT();
++      termios.c_cflag = CREAD | HUPCL | CLOCAL | CS8;
++      switch (kgdbsci_baud) {
++      case 9600:
++              termios.c_cflag |= B9600;
++              break;
++      case 19200:
++              termios.c_cflag |= B19200;
++              break;
++      case 38400:
++              termios.c_cflag |= B38400;
++              break;
++      case 57600:
++              termios.c_cflag |= B57600;
++              break;
++      case 115200:
++              termios.c_cflag |= B115200;
++              break;
++      }
++      sci_set_termios(&KGDBPORT.port, &termios, NULL);
+ 
+-                /* Clear */
+-                sci_out(port, SCxSR, SCxSR_BREAK_CLEAR(port));
+-        }
++      return 0;
+ }
+ 
+-#endif /* CONFIG_SH_KGDB */
++struct kgdb_io kgdb_io_ops = {
++      .read_char = kgdbsci_read_char,
++      .write_char = kgdbsci_write_char,
++      .init = kgdbsci_init,
++#ifndef CONFIG_SERIAL_SH_SCI_CONSOLE
++      .late_init = kgdbsci_lateinit,
++#else /* ! CONFIG_SERIAL_SH_SCI_CONSOLE */
++      .late_init = NULL,
++#endif /* ! CONFIG_SERIAL_SH_SCI_CONSOLE */
++      .pre_exception = NULL,
++      .post_exception = NULL
++};
++
++/*
++ * Parse the "kgdbsci=ttyno,baudrate" option; str is the text after '='.
++ */
++static int __init
++kgdbsci_opt(char *str)
++{
++      /* Only SCI_NPORTS ports exist: valid ttyno is 0..SCI_NPORTS-1. */
++      if (*str < '0' || *str >= SCI_NPORTS + '0')
++               goto errout;
++      kgdbsci_ttySC = *str - '0';
++      str++;
++      if (*str != ',')
++               goto errout;
++      str++;
++      kgdbsci_baud = simple_strtoul(str, &str, 10);
++      if (kgdbsci_baud != 9600 && kgdbsci_baud != 19200 &&
++          kgdbsci_baud != 38400 && kgdbsci_baud != 57600 &&
++          kgdbsci_baud != 115200)
++               goto errout;
++
++      return 1;       /* __setup convention: non-zero means "handled" */
++
++errout:
++      printk(KERN_ERR "Invalid syntax for option kgdbsci=\n");
++      return 1;
++}
++__setup("kgdbsci=", kgdbsci_opt);
++#endif /* CONFIG_KGDB_SH_SCI */
+ 
+ #if defined(__H8300S__)
+ enum { sci_disable, sci_enable };
+@@ -555,6 +608,16 @@ static inline void sci_receive_chars(str
+                                       continue;
+                               }
+ 
++#ifdef CONFIG_KGDB_SH_SCI
++                              /* We assume that a ^C on the port KGDB
++                               * is using means that KGDB wants to
++                               * interrupt the running system.
++                               */
++                              if (port->line == KGDBPORT.port.line &&
++                                              c == 3)
++                                      breakpoint();
++#endif
++
+                               /* Store data and status */
+                               if (status&SCxSR_FER(port)) {
+                                       flag = TTY_FRAME;
+@@ -1618,6 +1681,7 @@ static int __init sci_console_init(void)
+ console_initcall(sci_console_init);
+ #endif /* CONFIG_SERIAL_SH_SCI_CONSOLE */
+ 
++#if 0
+ #ifdef CONFIG_SH_KGDB
+ /*
+  * FIXME: Most of this can go away.. at the moment, we rely on
+@@ -1663,30 +1727,9 @@ int __init kgdb_console_setup(struct con
+       return uart_set_options(port, co, baud, parity, bits, flow);
+ }
+ #endif /* CONFIG_SH_KGDB */
++#endif /* 0 */
+ 
+-#ifdef CONFIG_SH_KGDB_CONSOLE
+-static struct console kgdb_console = {
+-        .name         = "ttySC",
+-        .write                = kgdb_console_write,
+-        .setup                = kgdb_console_setup,
+-        .flags                = CON_PRINTBUFFER | CON_ENABLED,
+-        .index                = -1,
+-      .data           = &sci_uart_driver,
+-};
+-
+-/* Register the KGDB console so we get messages (d'oh!) */
+-static int __init kgdb_console_init(void)
+-{
+-      register_console(&kgdb_console);
+-      return 0;
+-}
+-
+-console_initcall(kgdb_console_init);
+-#endif /* CONFIG_SH_KGDB_CONSOLE */
+-
+-#if defined(CONFIG_SH_KGDB_CONSOLE)
+-#define SCI_CONSOLE   &kgdb_console
+-#elif defined(CONFIG_SERIAL_SH_SCI_CONSOLE)
++#ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
+ #define SCI_CONSOLE   &serial_console
+ #else
+ #define SCI_CONSOLE   0
+@@ -1757,4 +1800,3 @@ static void __exit sci_exit(void)
+ 
+ module_init(sci_init);
+ module_exit(sci_exit);
+-
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-arm/kgdb.h linux-2.6.18.kgdb/include/asm-arm/kgdb.h
+--- linux-2.6.18/include/asm-arm/kgdb.h        1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/asm-arm/kgdb.h   2008-06-10 16:19:58.000000000 +0400
+@@ -0,0 +1,92 @@
++/*
++ * include/asm-arm/kgdb.h
++ *
++ * ARM KGDB support
++ *
++ * Author: Deepak Saxena <dsaxena@mvista.com>
++ *
++ * Copyright (C) 2002 MontaVista Software Inc.
++ *
++ */
++
++#ifndef __ASM_KGDB_H__
++#define __ASM_KGDB_H__
++
++#include <linux/config.h>
++#include <asm/ptrace.h>
++#include <asm-generic/kgdb.h>
++
++
++/*
++ * GDB assumes that we're a user process being debugged, so
++ * it will send us an SWI command to write into memory as the
++ * debug trap. When an SWI occurs, the next instruction addr is
++ * placed into R14_svc before jumping to the vector trap.
++ * This doesn't work for kernel debugging: as we are already in SVC
++ * mode, the SWI would overwrite R14_svc and we would lose the
++ * kernel's LR, which is a bad thing.
++ *
++ * By doing this as an undefined instruction trap, we force a mode
++ * switch from SVC to UND mode, allowing us to save full kernel state.
++ *
++ * We also define a KGDB_COMPILED_BREAK which can be used to compile
++ * in breakpoints. This is important for things like sysrq-G and for
++ * the initial breakpoint from trap_init().
++ *
++ * Note to ARM HW designers: Add real trap support like SH && PPC to
++ * make our lives much much simpler. :)
++ */
++#define       BREAK_INSTR_SIZE                4
++#define GDB_BREAKINST                   0xef9f0001
++#define KGDB_BREAKINST                  0xe7ffdefe
++#define KGDB_COMPILED_BREAK             0xe7ffdeff
++#define CACHE_FLUSH_IS_SAFE           1
++
++#ifndef       __ASSEMBLY__
++
++#define       BREAKPOINT()                    asm(".word      0xe7ffdeff")
++
++
++extern void kgdb_handle_bus_error(void);
++extern int kgdb_fault_expected;
++#endif /* !__ASSEMBLY__ */
++
++/*
++ * From Amit S. Kale:
++ *
++ * In the register packet, words 0-15 are R0 to R10, FP, IP, SP, LR, PC. But
++ * Register 16 isn't cpsr. GDB passes CPSR in word 25. There are 9 words in
++ * between which are unused. Passing only 26 words to gdb is sufficient.
++ * GDB can figure out that floating point registers are not passed.
++ * GDB_MAX_REGS should be 26.
++ */
++#define       GDB_MAX_REGS            (26)
++
++#define       KGDB_MAX_NO_CPUS        1
++#define       BUFMAX                  400
++#define       NUMREGBYTES             (GDB_MAX_REGS << 2)
++#define       NUMCRITREGBYTES         (32 << 2)
++
++#define       _R0             0
++#define       _R1             1
++#define       _R2             2
++#define       _R3             3
++#define       _R4             4
++#define       _R5             5
++#define       _R6             6
++#define       _R7             7
++#define       _R8             8
++#define       _R9             9
++#define       _R10            10
++#define       _FP             11
++#define       _IP             12
++#define       _SP             13
++#define       _LR             14
++#define       _PC             15
++#define       _CPSR           (GDB_MAX_REGS - 1)
++
++/* So that we can denote the end of a frame for tracing, in the simple
++ * case. */
++#define CFI_END_FRAME(func)   __CFI_END_FRAME(_PC,_SP,func)
++
++#endif /* __ASM_KGDB_H__ */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-arm/system.h linux-2.6.18.kgdb/include/asm-arm/system.h
+--- linux-2.6.18/include/asm-arm/system.h      2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/include/asm-arm/system.h 2008-06-10 16:19:51.000000000 +0400
+@@ -444,6 +444,47 @@ static inline unsigned long __xchg(unsig
+ extern void disable_hlt(void);
+ extern void enable_hlt(void);
+ 
++#define       __HAVE_ARCH_CMPXCHG     1
++
++#include <asm/types.h>
++
++static inline unsigned long __cmpxchg_u32(volatile int *m, unsigned long old,
++                                      unsigned long new)
++{
++      u32 retval;
++      unsigned long flags;
++
++      local_irq_save(flags);
++      retval = *m;
++      if (retval == old)
++              *m = new;
++      local_irq_restore(flags);       /* implies memory barrier  */
++
++      return retval;
++}
++
++/* This function doesn't exist, so you'll get a linker error
++   if something tries to do an invalid cmpxchg().  */
++extern void __cmpxchg_called_with_bad_pointer(void);
++
++static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old,
++      unsigned long new, int size)
++{
++      switch (size) {
++      case 4:
++              return __cmpxchg_u32(ptr, old, new);
++      }
++      __cmpxchg_called_with_bad_pointer();
++      return old;
++}
++
++#define cmpxchg(ptr,o,n)                                               \
++  ({                                                                   \
++     __typeof__(*(ptr)) _o_ = (o);                                     \
++     __typeof__(*(ptr)) _n_ = (n);                                     \
++     (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_,                 \
++                                  (unsigned long)_n_, sizeof(*(ptr))); \
++  })
+ #endif /* __ASSEMBLY__ */
+ 
+ #define arch_align_stack(x) (x)
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-generic/kgdb.h linux-2.6.18.kgdb/include/asm-generic/kgdb.h
+--- linux-2.6.18/include/asm-generic/kgdb.h    1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/asm-generic/kgdb.h       2008-06-10 16:19:58.000000000 +0400
+@@ -0,0 +1,34 @@
++/*
++ * include/asm-generic/kgdb.h
++ *
++ * This provides the assembly level information so that KGDB can provide
++ * a GDB that has been patched with enough information to know to stop
++ * trying to unwind the function.
++ *
++ * Author: Tom Rini <trini@kernel.crashing.org>
++ *
++ * 2005 (c) MontaVista Software, Inc. This file is licensed under the terms
++ * of the GNU General Public License version 2. This program is licensed
++ * "as is" without any warranty of any kind, whether express or implied.
++ */
++
++#ifndef __ASM_GENERIC_KGDB_H__
++#define __ASM_GENERIC_KGDB_H__
++
++#include <linux/dwarf2-lang.h>
++#ifdef __ASSEMBLY__
++#ifdef CONFIG_KGDB
++/* This MUST be put at the end of a given assembly function */
++#define __CFI_END_FRAME(pc,sp,func)                   \
++CAT3(.Lend_,func,:)                                   \
++      CFI_preamble(func,pc,0x1,-DATA_ALIGN_FACTOR)    \
++      CFA_define_reference(sp, 0)                     \
++      CFA_undefine_reg(pc)                            \
++      CFI_postamble()                                 \
++      FDE_preamble(func,func,CAT3(.Lend,_,func))      \
++      FDE_postamble()
++#else
++#define __CFI_END_FRAME(pc,sp,fn)
++#endif                                /* CONFIG_KGDB */
++#endif                                /* __ASSEMBLY__ */
++#endif                                /* __ASM_GENERIC_KGDB_H__ */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-i386/kdebug.h linux-2.6.18.kgdb/include/asm-i386/kdebug.h
+--- linux-2.6.18/include/asm-i386/kdebug.h     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/include/asm-i386/kdebug.h        2008-06-10 16:19:17.000000000 +0400
+@@ -39,6 +39,7 @@ enum die_val {
+       DIE_CALL,
+       DIE_NMI_IPI,
+       DIE_PAGE_FAULT,
++      DIE_PAGE_FAULT_NO_CONTEXT,
+ };
+ 
+ static inline int notify_die(enum die_val val, const char *str,
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-i386/kgdb.h linux-2.6.18.kgdb/include/asm-i386/kgdb.h
+--- linux-2.6.18/include/asm-i386/kgdb.h       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/asm-i386/kgdb.h  2008-06-10 16:19:58.000000000 +0400
+@@ -0,0 +1,58 @@
++#ifdef __KERNEL__
++#ifndef _ASM_KGDB_H_
++#define _ASM_KGDB_H_
++
++/*
++ * Copyright (C) 2001-2004 Amit S. Kale
++ */
++
++#include <asm-generic/kgdb.h>
++
++/*
++ *  Note that this register image is in a different order than
++ *  the register image that Linux produces at interrupt time.
++ *
++ *  Linux's register image is defined by struct pt_regs in ptrace.h.
++ *  Just why GDB uses a different order is a historical mystery.
++ */
++#define _EAX  0
++#define _ECX  1
++#define _EDX  2
++#define _EBX  3
++#define _ESP  4
++#define _EBP  5
++#define _ESI  6
++#define _EDI  7
++#define _PC   8
++#define _EIP  8
++#define _PS   9
++#define _EFLAGS       9
++#define _CS   10
++#define _SS   11
++#define _DS   12
++#define _ES   13
++#define _FS   14
++#define _GS   15
++
++/* So that we can denote the end of a frame for tracing, in the simple
++ * case. */
++#define CFI_END_FRAME(func)   __CFI_END_FRAME(_EIP,_ESP,func)
++
++#ifndef __ASSEMBLY__
++/************************************************************************/
++/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/
++/* at least NUMREGBYTES*2 are needed for register packets */
++/* Longer buffer is needed to list all threads */
++#define BUFMAX                        1024
++
++/* Number of bytes of registers.  */
++#define NUMREGBYTES           64
++/* Number of bytes of registers we need to save for a setjmp/longjmp. */
++#define NUMCRITREGBYTES               24
++
++#define BREAKPOINT()          asm("   int $3") /* no ';': caller adds it */
++#define BREAK_INSTR_SIZE      1
++#define CACHE_FLUSH_IS_SAFE   1
++#endif                                /* !__ASSEMBLY__ */
++#endif                                /* _ASM_KGDB_H_ */
++#endif                                /* __KERNEL__ */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-ia64/kdebug.h linux-2.6.18.kgdb/include/asm-ia64/kdebug.h
+--- linux-2.6.18/include/asm-ia64/kdebug.h     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/include/asm-ia64/kdebug.h        2008-06-10 16:19:32.000000000 +0400
+@@ -72,6 +72,7 @@ enum die_val {
+       DIE_KDEBUG_LEAVE,
+       DIE_KDUMP_ENTER,
+       DIE_KDUMP_LEAVE,
++      DIE_PAGE_FAULT_NO_CONTEXT,
+ };
+ 
+ static inline int notify_die(enum die_val val, char *str, struct pt_regs *regs,
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-ia64/kgdb.h linux-2.6.18.kgdb/include/asm-ia64/kgdb.h
+--- linux-2.6.18/include/asm-ia64/kgdb.h       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/asm-ia64/kgdb.h  2008-06-10 16:19:32.000000000 +0400
+@@ -0,0 +1,36 @@
++#ifdef __KERNEL__
++#ifndef _ASM_KGDB_H_
++#define _ASM_KGDB_H_
++
++/*
++ * Copyright (C) 2001-2004 Amit S. Kale
++ */
++
++#include <linux/threads.h>
++
++/************************************************************************/
++/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/
++/* at least NUMREGBYTES*2 are needed for register packets */
++/* Longer buffer is needed to list all threads */
++#define BUFMAX                        1024
++
++/* Number of bytes of registers.  We set this to 0 so that certain GDB
++ * packets will fail, forcing the use of others, which are more friendly
++ * on ia64. */
++#define NUMREGBYTES           0
++
++#define NUMCRITREGBYTES               (70*8)
++#define JMP_REGS_ALIGNMENT    __attribute__ ((aligned (16)))
++
++#define BREAKNUM              0x00003333300LL
++#define KGDBBREAKNUM          0x6665UL
++#define BREAKPOINT()          asm volatile ("break.m 0x6665")
++#define BREAK_INSTR_SIZE      16
++#define CACHE_FLUSH_IS_SAFE   1
++
++struct pt_regs;
++extern volatile int kgdb_hwbreak_sstep[NR_CPUS];
++extern void smp_send_nmi_allbutself(void);
++extern void kgdb_wait_ipi(struct pt_regs *);
++#endif                                /* _ASM_KGDB_H_ */
++#endif                                /* __KERNEL__ */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-mips/kdebug.h linux-2.6.18.kgdb/include/asm-mips/kdebug.h
+--- linux-2.6.18/include/asm-mips/kdebug.h     1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/asm-mips/kdebug.h        2008-06-10 16:19:28.000000000 +0400
+@@ -0,0 +1,47 @@
++/*
++ *
++ * Copyright (C) 2004  MontaVista Software Inc.
++ * Author: Manish Lachwani, mlachwani@mvista.com or manish@koffee-break.com
++ *
++ * This program is free software; you can redistribute  it and/or modify it
++ * under  the terms of  the GNU General  Public License as published by the
++ * Free Software Foundation;  either version 2 of the  License, or (at your
++ * option) any later version.
++ *
++ */
++#ifndef _MIPS_KDEBUG_H
++#define _MIPS_KDEBUG_H
++
++#include <linux/notifier.h>
++
++struct pt_regs;
++
++struct die_args {
++      struct pt_regs *regs;
++      const char *str;
++      long err;
++};
++
++int register_die_notifier(struct notifier_block *nb);
++extern struct notifier_block *mips_die_chain;
++
++enum die_val {
++      DIE_OOPS = 1,
++      DIE_PANIC,
++      DIE_DIE,
++      DIE_KERNELDEBUG,
++      DIE_TRAP,
++      DIE_PAGE_FAULT,
++};
++
++/*
++ * trap number can be computed from regs and signr can be computed using
++ * compute_signal()
++ */
++static inline int notify_die(enum die_val val,char *str,struct pt_regs *regs,long err)
++{
++      struct die_args args = { .regs=regs, .str=str, .err=err };
++      return notifier_call_chain(&mips_die_chain, val, &args);
++}
++
++#endif /* _MIPS_KDEBUG_H */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-mips/kgdb.h linux-2.6.18.kgdb/include/asm-mips/kgdb.h
+--- linux-2.6.18/include/asm-mips/kgdb.h       1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/asm-mips/kgdb.h  2008-06-10 16:19:58.000000000 +0400
+@@ -0,0 +1,34 @@
++#ifdef __KERNEL__
++#ifndef _ASM_KGDB_H_
++#define _ASM_KGDB_H_
++
++#ifndef __ASSEMBLY__
++#if (_MIPS_ISA == _MIPS_ISA_MIPS1) || (_MIPS_ISA == _MIPS_ISA_MIPS2)
++typedef u32 gdb_reg_t;
++#elif (_MIPS_ISA == _MIPS_ISA_MIPS3) || (_MIPS_ISA == _MIPS_ISA_MIPS4)
++typedef u64 gdb_reg_t;
++#else
++#error need to do
++#endif /* _MIPS_ISA */
++#endif /* !__ASSEMBLY__ (gdb_reg_t typedefs are C-only) */
++#include <asm-generic/kgdb.h>
++
++#ifndef __ASSEMBLY__
++#define BUFMAX                        2048
++#define NUMREGBYTES           (90*sizeof(gdb_reg_t))
++#define NUMCRITREGBYTES               (12*sizeof(gdb_reg_t))
++#define BREAK_INSTR_SIZE      4
++#define BREAKPOINT()          __asm__ __volatile__(           \
++                                      ".globl breakinst\n\t"  \
++                                      ".set\tnoreorder\n\t"   \
++                                      "nop\n"                 \
++                                      "breakinst:\tbreak\n\t" \
++                                      "nop\n\t"               \
++                                      ".set\treorder")
++#define CACHE_FLUSH_IS_SAFE   0
++
++extern int kgdb_early_setup;
++
++#endif                                /* !__ASSEMBLY__ */
++#endif                                /* _ASM_KGDB_H_ */
++#endif                                /* __KERNEL__ */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-powerpc/kgdb.h linux-2.6.18.kgdb/include/asm-powerpc/kgdb.h
+--- linux-2.6.18/include/asm-powerpc/kgdb.h    1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/asm-powerpc/kgdb.h       2008-06-10 16:19:58.000000000 +0400
+@@ -0,0 +1,74 @@
++/*
++ * include/asm-powerpc/kgdb.h
++ *
++ * The PowerPC (32/64) specific defines / externs for KGDB.  Based on
++ * the previous 32bit and 64bit specific files, which had the following
++ * copyrights:
++ *
++ * PPC64 Mods (C) 2005 Frank Rowand (frowand@mvista.com)
++ * PPC Mods (C) 2004 Tom Rini (trini@mvista.com)
++ * PPC Mods (C) 2003 John Whitney (john.whitney@timesys.com)
++ * PPC Mods (C) 1998 Michael Tesch (tesch@cs.wisc.edu)
++ *
++ *
++ * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
++ * Author: Tom Rini <trini@kernel.crashing.org>
++ *
++ * 2006 (c) MontaVista Software, Inc. This file is licensed under
++ * the terms of the GNU General Public License version 2. This program
++ * is licensed "as is" without any warranty of any kind, whether express
++ * or implied.
++ */
++#ifdef __KERNEL__
++#ifndef __POWERPC_KGDB_H__
++#define __POWERPC_KGDB_H__
++
++#include <asm-generic/kgdb.h>
++#ifndef __ASSEMBLY__
++
++#define BREAK_INSTR_SIZE      4
++#define BUFMAX                        ((NUMREGBYTES * 2) + 512)
++#define OUTBUFMAX             ((NUMREGBYTES * 2) + 512)
++#define BREAKPOINT()          asm(".long 0x7d821008") /* twge r2, r2 */
++#define CACHE_FLUSH_IS_SAFE   1
++
++/* The number of bytes of registers we have to save depends on a few
++ * things.  For 64bit we default to not including vector registers and
++ * vector state registers. */
++#ifdef CONFIG_PPC64
++/*
++ * 64 bit (8 byte) registers:
++ *   32 gpr, 32 fpr, nip, msr, link, ctr
++ * 32 bit (4 byte) registers:
++ *   ccr, xer, fpscr
++ */
++#define NUMREGBYTES           ((68 * 8) + (3 * 4))
++#if 0
++/* The following adds in vector registers and vector state registers. */
++/* 128 bit (16 byte) registers:
++ *   32 vr
++ * 64 bit (8 byte) registers:
++ *   32 gpr, 32 fpr, nip, msr, link, ctr
++ * 32 bit (4 byte) registers:
++ *   ccr, xer, fpscr, vscr, vrsave
++ */
++#define NUMREGBYTES           ((128 * 16) + (68 * 8) + (5 * 4))
++#endif
++#define NUMCRITREGBYTES               184
++#else /* CONFIG_PPC32 */
++/* On non-E500 family PPC32 we determine the size by picking the last
++ * register we need, but on E500 we skip sections so we list what we
++ * need to store, and add it up. */
++#ifndef CONFIG_E500
++#define MAXREG                        (PT_FPSCR+1)
++#else
++/* 32 GPRs (8 bytes), nip, msr, ccr, link, ctr, xer, acc (8 bytes), spefscr*/
++#define MAXREG                 ((32*2)+6+2+1)
++#endif
++#define NUMREGBYTES           (MAXREG * sizeof(int))
++/* CR/LR, R1, R2, R13-R31 inclusive. */
++#define NUMCRITREGBYTES               (23 * sizeof(int))
++#endif /* 32/64 */
++#endif /* !(__ASSEMBLY__) */
++#endif /* !__POWERPC_KGDB_H__ */
++#endif /* __KERNEL__ */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-ppc/kgdb.h linux-2.6.18.kgdb/include/asm-ppc/kgdb.h
+--- linux-2.6.18/include/asm-ppc/kgdb.h        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/include/asm-ppc/kgdb.h   2008-06-10 16:19:22.000000000 +0400
+@@ -1,57 +1,18 @@
+-/*
+- * kgdb.h: Defines and declarations for serial line source level
+- *         remote debugging of the Linux kernel using gdb.
+- *
+- * PPC Mods (C) 1998 Michael Tesch (tesch@cs.wisc.edu)
+- *
+- * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
+- */
+ #ifdef __KERNEL__
+-#ifndef _PPC_KGDB_H
+-#define _PPC_KGDB_H
+-
++#ifndef __PPC_KGDB_H__
++#define __PPC_KGDB_H__
++#include <asm-powerpc/kgdb.h>
+ #ifndef __ASSEMBLY__
+-
+-/* Things specific to the gen550 backend. */
+-struct uart_port;
+-
+-extern void gen550_progress(char *, unsigned short);
+-extern void gen550_kgdb_map_scc(void);
+-extern void gen550_init(int, struct uart_port *);
+-
+-/* Things specific to the pmac backend. */
+-extern void zs_kgdb_hook(int tty_num);
+-
+-/* To init the kgdb engine. (called by serial hook)*/
+-extern void set_debug_traps(void);
+-
+-/* To enter the debugger explicitly. */
+-extern void breakpoint(void);
+-
+-/* For taking exceptions
+- * these are defined in traps.c
+- */
+-extern int (*debugger)(struct pt_regs *regs);
++ /* For taking exceptions
++  * these are defined in traps.c
++  */
++struct pt_regs;
++extern void (*debugger)(struct pt_regs *regs);
+ extern int (*debugger_bpt)(struct pt_regs *regs);
+ extern int (*debugger_sstep)(struct pt_regs *regs);
+ extern int (*debugger_iabr_match)(struct pt_regs *regs);
+ extern int (*debugger_dabr_match)(struct pt_regs *regs);
+ extern void (*debugger_fault_handler)(struct pt_regs *regs);
+-
+-/* What we bring to the party */
+-int kgdb_bpt(struct pt_regs *regs);
+-int kgdb_sstep(struct pt_regs *regs);
+-void kgdb(struct pt_regs *regs);
+-int kgdb_iabr_match(struct pt_regs *regs);
+-int kgdb_dabr_match(struct pt_regs *regs);
+-
+-/*
+- * external low-level support routines (ie macserial.c)
+- */
+-extern void kgdb_interruptible(int); /* control interrupts from serial */
+-extern void putDebugChar(char);   /* write a single character      */
+-extern char getDebugChar(void);   /* read and return a single char */
+-
+-#endif /* !(__ASSEMBLY__) */
+-#endif /* !(_PPC_KGDB_H) */
++#endif /* !__ASSEMBLY__ */
++#endif /* __PPC_KGDB_H__ */
+ #endif /* __KERNEL__ */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-ppc/machdep.h linux-2.6.18.kgdb/include/asm-ppc/machdep.h
+--- linux-2.6.18/include/asm-ppc/machdep.h     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/include/asm-ppc/machdep.h        2008-06-10 16:19:22.000000000 +0400
+@@ -72,9 +72,7 @@ struct machdep_calls {
+       unsigned long   (*find_end_of_memory)(void);
+       void            (*setup_io_mappings)(void);
+ 
+-      void            (*early_serial_map)(void);
+       void            (*progress)(char *, unsigned short);
+-      void            (*kgdb_map_scc)(void);
+ 
+       unsigned char   (*nvram_read_val)(int addr);
+       void            (*nvram_write_val)(int addr, unsigned char val);
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-ppc/mv64x60.h linux-2.6.18.kgdb/include/asm-ppc/mv64x60.h
+--- linux-2.6.18/include/asm-ppc/mv64x60.h     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/include/asm-ppc/mv64x60.h        2008-06-10 16:19:22.000000000 +0400
+@@ -348,6 +348,8 @@ u32 mv64x60_calc_mem_size(struct mv64x60
+ 
+ void mv64x60_progress_init(u32 base);
+ void mv64x60_mpsc_progress(char *s, unsigned short hex);
++struct platform_device * mv64x60_early_get_pdev_data(const char *name,
++              int id, int remove);
+ 
+ extern struct mv64x60_32bit_window
+       gt64260_32bit_windows[MV64x60_32BIT_WIN_COUNT];
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-ppc/mv64x60_defs.h linux-2.6.18.kgdb/include/asm-ppc/mv64x60_defs.h
+--- linux-2.6.18/include/asm-ppc/mv64x60_defs.h        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/include/asm-ppc/mv64x60_defs.h   2008-06-10 16:19:22.000000000 +0400
+@@ -57,7 +57,8 @@
+ #define       MV64x60_IRQ_I2C                         37
+ #define       MV64x60_IRQ_BRG                         39
+ #define       MV64x60_IRQ_MPSC_0                      40
+-#define       MV64x60_IRQ_MPSC_1                      42
++#define       MV64360_IRQ_MPSC_1                      41
++#define       GT64260_IRQ_MPSC_1                      42
+ #define       MV64x60_IRQ_COMM                        43
+ #define       MV64x60_IRQ_P0_GPP_0_7                  56
+ #define       MV64x60_IRQ_P0_GPP_8_15                 57
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-sh/kgdb.h linux-2.6.18.kgdb/include/asm-sh/kgdb.h
+--- linux-2.6.18/include/asm-sh/kgdb.h 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/include/asm-sh/kgdb.h    2008-06-10 16:19:58.000000000 +0400
+@@ -2,94 +2,40 @@
+  * May be copied or modified under the terms of the GNU General Public
+  * License.  See linux/COPYING for more information.
+  *
+- * Based on original code by Glenn Engel, Jim Kingdon,
+- * David Grothe <dave@gcom.com>, Tigran Aivazian, <tigran@sco.com> and
+- * Amit S. Kale <akale@veritas.com>
+- * 
+- * Super-H port based on sh-stub.c (Ben Lee and Steve Chamberlain) by
+- * Henry Bell <henry.bell@st.com>
+- * 
+- * Header file for low-level support for remote debug using GDB. 
++ * Based on a file that was modified or based on files by: Glenn Engel,
++ * Jim Kingdon, David Grothe <dave@gcom.com>, Tigran Aivazian <tigran@sco.com>,
++ * Amit S. Kale <akale@veritas.com>, sh-stub.c from Ben Lee and
++ * Steve Chamberlain, Henry Bell <henry.bell@st.com>
++ *
++ * Maintainer: Tom Rini <trini@kernel.crashing.org>
+  *
+  */
+ 
+ #ifndef __KGDB_H
+ #define __KGDB_H
+ 
+-#include <asm/ptrace.h>
+-
+-struct console;
++#include <asm-generic/kgdb.h>
++/* Based on sh-gdb.c from gdb-6.1, Glenn
++     Engel at HP  Ben Lee and Steve Chamberlain */
++#define NUMREGBYTES   112     /* 92 */
++#define NUMCRITREGBYTES       (9 << 2)
++#define BUFMAX                400
+ 
+-/* Same as pt_regs but has vbr in place of syscall_nr */
++#ifndef __ASSEMBLY__
+ struct kgdb_regs {
+         unsigned long regs[16];
+         unsigned long pc;
+         unsigned long pr;
+-        unsigned long sr;
+         unsigned long gbr;
++        unsigned long vbr;
+         unsigned long mach;
+         unsigned long macl;
+-        unsigned long vbr;
+-};
+-
+-/* State info */
+-extern char kgdb_in_gdb_mode;
+-extern int kgdb_done_init;
+-extern int kgdb_enabled;
+-extern int kgdb_nofault;      /* Ignore bus errors (in gdb mem access) */
+-extern int kgdb_halt;         /* Execute initial breakpoint at startup */
+-extern char in_nmi;           /* Debounce flag to prevent NMI reentry*/
+-
+-/* SCI */
+-extern int kgdb_portnum;
+-extern int kgdb_baud;
+-extern char kgdb_parity;
+-extern char kgdb_bits;
+-extern int kgdb_console_setup(struct console *, char *);
+-
+-/* Init and interface stuff */
+-extern int kgdb_init(void);
+-extern int (*kgdb_serial_setup)(void);
+-extern int (*kgdb_getchar)(void);
+-extern void (*kgdb_putchar)(int);
+-
+-struct kgdb_sermap {
+-      char *name;
+-      int namelen;
+-      int (*setup_fn)(struct console *, char *);
+-      struct kgdb_sermap *next;
++        unsigned long sr;
+ };
+-extern void kgdb_register_sermap(struct kgdb_sermap *map);
+-extern struct kgdb_sermap *kgdb_porttype;
+ 
+-/* Trap functions */
+-typedef void (kgdb_debug_hook_t)(struct pt_regs *regs); 
+-typedef void (kgdb_bus_error_hook_t)(void);
+-extern kgdb_debug_hook_t  *kgdb_debug_hook;
+-extern kgdb_bus_error_hook_t *kgdb_bus_err_hook;
+-
+-extern void breakpoint(void);
+-
+-/* Console */
+-struct console;
+-void kgdb_console_write(struct console *co, const char *s, unsigned count);
+-void kgdb_console_init(void);
+-
+-/* Prototypes for jmp fns */
+-#define _JBLEN 9
+-typedef        int jmp_buf[_JBLEN];
+-extern void    longjmp(jmp_buf __jmpb, int __retval);
+-extern int     setjmp(jmp_buf __jmpb);
+-
+-/* Variadic macro to print our own message to the console */
+-#define KGDB_PRINTK(...) printk("KGDB: " __VA_ARGS__)
+-
+-/* Forced breakpoint */
+-#define BREAKPOINT() do {                                     \
+-  if (kgdb_enabled) {                                         \
+-    asm volatile("trapa   #0xff");                            \
+-  }                                                           \
+-} while (0)
++#define BREAKPOINT()          asm("trapa #0xff") /* no ';': caller adds it */
++#define BREAK_INSTR_SIZE      2
++#define CACHE_FLUSH_IS_SAFE   1
+ 
+ /* KGDB should be able to flush all kernel text space */
+ #if defined(CONFIG_CPU_SH4)
+@@ -102,30 +48,5 @@ extern int     setjmp(jmp_buf __jmpb);
+ #else
+ #define kgdb_flush_icache_range(start, end)   do { } while (0)
+ #endif
+-
+-/* Kernel assert macros */
+-#ifdef CONFIG_KGDB_KERNEL_ASSERTS
+-
+-/* Predefined conditions */
+-#define KA_VALID_ERRNO(errno) ((errno) > 0 && (errno) <= EMEDIUMTYPE)
+-#define KA_VALID_PTR_ERR(ptr) KA_VALID_ERRNO(-PTR_ERR(ptr))
+-#define KA_VALID_KPTR(ptr)  (!(ptr) || \
+-              ((void *)(ptr) >= (void *)PAGE_OFFSET &&  \
+-               (void *)(ptr) < ERR_PTR(-EMEDIUMTYPE)))
+-#define KA_VALID_PTRORERR(errptr) \
+-               (KA_VALID_KPTR(errptr) || KA_VALID_PTR_ERR(errptr))
+-#define KA_HELD_GKL()  (current->lock_depth >= 0)
+-
+-/* The actual assert */
+-#define KGDB_ASSERT(condition, message) do {                   \
+-       if (!(condition) && (kgdb_enabled)) {                   \
+-               KGDB_PRINTK("Assertion failed at %s:%d: %s\n",  \
+-                                  __FILE__, __LINE__, message);\
+-               BREAKPOINT();                                   \
+-       }                                                       \
+-} while (0)
+-#else
+-#define KGDB_ASSERT(condition, message)
+-#endif
+-
++#endif                                /* !__ASSEMBLY__ */
+ #endif
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-sh/system.h linux-2.6.18.kgdb/include/asm-sh/system.h
+--- linux-2.6.18/include/asm-sh/system.h       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/include/asm-sh/system.h  2008-06-10 16:19:47.000000000 +0400
+@@ -6,6 +6,7 @@
+  * Copyright (C) 2002 Paul Mundt
+  */
+ 
++#include <asm/types.h>
+ 
+ /*
+  *    switch_to() should switch tasks to task nr n, first
+@@ -260,6 +261,45 @@ static __inline__ unsigned long __xchg(u
+       return x;
+ }
+ 
++static inline unsigned long __cmpxchg_u32(volatile int * m, unsigned long old,
++      unsigned long new)
++{
++      __u32 retval;
++      unsigned long flags;
++
++      local_irq_save(flags);
++      retval = *m;
++      if (retval == old)
++              *m = new;
++      local_irq_restore(flags);       /* implies memory barrier  */
++      return retval;
++}
++
++/* This function doesn't exist, so you'll get a linker error
++ * if something tries to do an invalid cmpxchg(). */
++extern void __cmpxchg_called_with_bad_pointer(void);
++
++#define __HAVE_ARCH_CMPXCHG   1
++
++static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old,
++              unsigned long new, int size)
++{
++      switch (size) {
++      case 4:
++              return __cmpxchg_u32(ptr, old, new);
++      }
++      __cmpxchg_called_with_bad_pointer();
++      return old;
++}
++
++#define cmpxchg(ptr,o,n)                                               \
++  ({                                                                   \
++     __typeof__(*(ptr)) _o_ = (o);                                     \
++     __typeof__(*(ptr)) _n_ = (n);                                     \
++     (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_,                 \
++                                  (unsigned long)_n_, sizeof(*(ptr))); \
++  })
++
+ /* XXX
+  * disable hlt during certain critical i/o operations
+  */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-x86_64/kdebug.h linux-2.6.18.kgdb/include/asm-x86_64/kdebug.h
+--- linux-2.6.18/include/asm-x86_64/kdebug.h   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/include/asm-x86_64/kdebug.h      2008-06-10 16:19:36.000000000 +0400
+@@ -34,6 +34,7 @@ enum die_val {
+       DIE_CALL,
+       DIE_NMI_IPI,
+       DIE_PAGE_FAULT,
++      DIE_PAGE_FAULT_NO_CONTEXT,
+ };
+ 
+ static inline int notify_die(enum die_val val, const char *str,
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-x86_64/kgdb.h linux-2.6.18.kgdb/include/asm-x86_64/kgdb.h
+--- linux-2.6.18/include/asm-x86_64/kgdb.h     1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/asm-x86_64/kgdb.h        2008-06-10 16:19:58.000000000 +0400
+@@ -0,0 +1,54 @@
++#ifdef __KERNEL__
++#ifndef _ASM_KGDB_H_
++#define _ASM_KGDB_H_
++
++/*
++ * Copyright (C) 2001-2004 Amit S. Kale
++ */
++
++#include <asm-generic/kgdb.h>
++
++/*
++ *  Note that this register image is in a different order than
++ *  the register image that Linux produces at interrupt time.
++ *
++ *  Linux's register image is defined by struct pt_regs in ptrace.h.
++ *  Just why GDB uses a different order is a historical mystery.
++ */
++#define _RAX  0
++#define _RDX  1
++#define _RCX  2
++#define _RBX  3
++#define _RSI  4
++#define _RDI  5
++#define _RBP  6
++#define _RSP  7
++#define _R8   8
++#define _R9   9
++#define _R10  10
++#define _R11  11
++#define _R12  12
++#define _R13  13
++#define _R14  14
++#define _R15  15
++#define _PC   16
++#define _PS   17
++
++/* Number of bytes of registers.  */
++#define NUMREGBYTES           ((_PS+1)*8)
++#define NUMCRITREGBYTES               (8 * 8)         /* 8 registers. */
++
++/* Help GDB to know when to stop backtracing. */
++#define CFI_END_FRAME(func)   __CFI_END_FRAME(_PC,_RSP,func)
++#ifndef __ASSEMBLY__
++/* BUFMAX defines the maximum number of characters in inbound/outbound
++ * buffers; at least NUMREGBYTES*2 are needed for register packets, and
++ * a longer buffer is needed to list all threads. */
++#define BUFMAX                        1024
++#define BREAKPOINT()          asm("   int $3");
++#define CHECK_EXCEPTION_STACK() ((&__get_cpu_var(init_tss))[0].ist[0])
++#define BREAK_INSTR_SIZE      1
++#define CACHE_FLUSH_IS_SAFE   1
++#endif                                /* !__ASSEMBLY__ */
++#endif                                /* _ASM_KGDB_H_ */
++#endif                                /* __KERNEL__ */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-x86_64/system.h linux-2.6.18.kgdb/include/asm-x86_64/system.h
+--- linux-2.6.18/include/asm-x86_64/system.h   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/include/asm-x86_64/system.h      2008-06-10 16:19:42.000000000 +0400
+@@ -21,7 +21,9 @@
+       ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15"
+ 
+ #define switch_to(prev,next,last) \
+-      asm volatile(SAVE_CONTEXT                                                   \
++       asm volatile(".globl __switch_to_begin\n\t"                                \
++                   "__switch_to_begin:\n\t"                                     \
++                   SAVE_CONTEXT                                                 \
+                    "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
+                    "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
+                    "call __switch_to\n\t"                                       \
+@@ -33,6 +35,8 @@
+                    "movq %%rax,%%rdi\n\t"                                       \
+                    "jc   ret_from_fork\n\t"                                     \
+                    RESTORE_CONTEXT                                                \
++                   ".globl __switch_to_end\n\t"                                 \
++                   "__switch_to_end:\n\t"                                       \
+                    : "=a" (last)                                                \
+                    : [next] "S" (next), [prev] "D" (prev),                      \
+                      [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/linux/dwarf2-defs.h linux-2.6.18.kgdb/include/linux/dwarf2-defs.h
+--- linux-2.6.18/include/linux/dwarf2-defs.h   1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/linux/dwarf2-defs.h      2008-06-10 16:22:59.000000000 +0400
+@@ -0,0 +1,515 @@
++#ifndef  _ELF_DWARF2_H
++/* Machine generated from dwarf2.h by scripts/dwarfh.awk */
++#define _ELF_DWARF2_H
++#define DW_TAG_padding         0x00
++#define DW_TAG_array_type      0x01
++#define DW_TAG_class_type      0x02
++#define DW_TAG_entry_point     0x03
++#define DW_TAG_enumeration_type        0x04
++#define DW_TAG_formal_parameter        0x05
++#define DW_TAG_imported_declaration    0x08
++#define DW_TAG_label   0x0a
++#define DW_TAG_lexical_block   0x0b
++#define DW_TAG_member  0x0d
++#define DW_TAG_pointer_type    0x0f
++#define DW_TAG_reference_type  0x10
++#define DW_TAG_compile_unit    0x11
++#define DW_TAG_string_type     0x12
++#define DW_TAG_structure_type  0x13
++#define DW_TAG_subroutine_type         0x15
++#define DW_TAG_typedef         0x16
++#define DW_TAG_union_type      0x17
++#define DW_TAG_unspecified_parameters  0x18
++#define DW_TAG_variant         0x19
++#define DW_TAG_common_block    0x1a
++#define DW_TAG_common_inclusion        0x1b
++#define DW_TAG_inheritance     0x1c
++#define DW_TAG_inlined_subroutine      0x1d
++#define DW_TAG_module  0x1e
++#define DW_TAG_ptr_to_member_type      0x1f
++#define DW_TAG_set_type        0x20
++#define DW_TAG_subrange_type   0x21
++#define DW_TAG_with_stmt       0x22
++#define DW_TAG_access_declaration      0x23
++#define DW_TAG_base_type       0x24
++#define DW_TAG_catch_block     0x25
++#define DW_TAG_const_type      0x26
++#define DW_TAG_constant        0x27
++#define DW_TAG_enumerator      0x28
++#define DW_TAG_file_type       0x29
++#define DW_TAG_friend  0x2a
++#define DW_TAG_namelist        0x2b
++#define DW_TAG_namelist_item   0x2c
++#define DW_TAG_packed_type     0x2d
++#define DW_TAG_subprogram      0x2e
++#define DW_TAG_template_type_param     0x2f
++#define DW_TAG_template_value_param    0x30
++#define DW_TAG_thrown_type     0x31
++#define DW_TAG_try_block       0x32
++#define DW_TAG_variant_part    0x33
++#define DW_TAG_variable        0x34
++#define DW_TAG_volatile_type   0x35
++#define DW_TAG_dwarf_procedure         0x36
++#define DW_TAG_restrict_type   0x37
++#define DW_TAG_interface_type  0x38
++#define DW_TAG_namespace       0x39
++#define DW_TAG_imported_module         0x3a
++#define DW_TAG_unspecified_type        0x3b
++#define DW_TAG_partial_unit    0x3c
++#define DW_TAG_imported_unit   0x3d
++#define DW_TAG_MIPS_loop       0x4081
++#define DW_TAG_HP_array_descriptor     0x4090
++#define DW_TAG_format_label    0x4101
++#define DW_TAG_function_template       0x4102
++#define DW_TAG_class_template  0x4103
++#define DW_TAG_GNU_BINCL       0x4104
++#define DW_TAG_GNU_EINCL       0x4105
++#define DW_TAG_upc_shared_type         0x8765
++#define DW_TAG_upc_strict_type         0x8766
++#define DW_TAG_upc_relaxed_type        0x8767
++#define DW_TAG_PGI_kanji_type  0xA000
++#define DW_TAG_PGI_interface_block     0xA020
++#define DW_TAG_lo_user        0x4080
++#define DW_TAG_hi_user        0xffff
++#define DW_children_no   0
++#define       DW_children_yes  1
++#define DW_FORM_addr   0x01
++#define DW_FORM_block2         0x03
++#define DW_FORM_block4         0x04
++#define DW_FORM_data2  0x05
++#define DW_FORM_data4  0x06
++#define DW_FORM_data8  0x07
++#define DW_FORM_string         0x08
++#define DW_FORM_block  0x09
++#define DW_FORM_block1         0x0a
++#define DW_FORM_data1  0x0b
++#define DW_FORM_flag   0x0c
++#define DW_FORM_sdata  0x0d
++#define DW_FORM_strp   0x0e
++#define DW_FORM_udata  0x0f
++#define DW_FORM_ref_addr       0x10
++#define DW_FORM_ref1   0x11
++#define DW_FORM_ref2   0x12
++#define DW_FORM_ref4   0x13
++#define DW_FORM_ref8   0x14
++#define DW_FORM_ref_udata      0x15
++#define DW_FORM_indirect       0x16
++#define DW_AT_sibling  0x01
++#define DW_AT_location         0x02
++#define DW_AT_name     0x03
++#define DW_AT_ordering         0x09
++#define DW_AT_subscr_data      0x0a
++#define DW_AT_byte_size        0x0b
++#define DW_AT_bit_offset       0x0c
++#define DW_AT_bit_size         0x0d
++#define DW_AT_element_list     0x0f
++#define DW_AT_stmt_list        0x10
++#define DW_AT_low_pc   0x11
++#define DW_AT_high_pc  0x12
++#define DW_AT_language         0x13
++#define DW_AT_member   0x14
++#define DW_AT_discr    0x15
++#define DW_AT_discr_value      0x16
++#define DW_AT_visibility       0x17
++#define DW_AT_import   0x18
++#define DW_AT_string_length    0x19
++#define DW_AT_common_reference         0x1a
++#define DW_AT_comp_dir         0x1b
++#define DW_AT_const_value      0x1c
++#define DW_AT_containing_type  0x1d
++#define DW_AT_default_value    0x1e
++#define DW_AT_inline   0x20
++#define DW_AT_is_optional      0x21
++#define DW_AT_lower_bound      0x22
++#define DW_AT_producer         0x25
++#define DW_AT_prototyped       0x27
++#define DW_AT_return_addr      0x2a
++#define DW_AT_start_scope      0x2c
++#define DW_AT_stride_size      0x2e
++#define DW_AT_upper_bound      0x2f
++#define DW_AT_abstract_origin  0x31
++#define DW_AT_accessibility    0x32
++#define DW_AT_address_class    0x33
++#define DW_AT_artificial       0x34
++#define DW_AT_base_types       0x35
++#define DW_AT_calling_convention       0x36
++#define DW_AT_count    0x37
++#define DW_AT_data_member_location     0x38
++#define DW_AT_decl_column      0x39
++#define DW_AT_decl_file        0x3a
++#define DW_AT_decl_line        0x3b
++#define DW_AT_declaration      0x3c
++#define DW_AT_discr_list       0x3d
++#define DW_AT_encoding         0x3e
++#define DW_AT_external         0x3f
++#define DW_AT_frame_base       0x40
++#define DW_AT_friend   0x41
++#define DW_AT_identifier_case  0x42
++#define DW_AT_macro_info       0x43
++#define DW_AT_namelist_items   0x44
++#define DW_AT_priority         0x45
++#define DW_AT_segment  0x46
++#define DW_AT_specification    0x47
++#define DW_AT_static_link      0x48
++#define DW_AT_type     0x49
++#define DW_AT_use_location     0x4a
++#define DW_AT_variable_parameter       0x4b
++#define DW_AT_virtuality       0x4c
++#define DW_AT_vtable_elem_location     0x4d
++#define DW_AT_allocated        0x4e
++#define DW_AT_associated       0x4f
++#define DW_AT_data_location    0x50
++#define DW_AT_stride   0x51
++#define DW_AT_entry_pc         0x52
++#define DW_AT_use_UTF8         0x53
++#define DW_AT_extension        0x54
++#define DW_AT_ranges   0x55
++#define DW_AT_trampoline       0x56
++#define DW_AT_call_column      0x57
++#define DW_AT_call_file        0x58
++#define DW_AT_call_line        0x59
++#define DW_AT_MIPS_fde         0x2001
++#define DW_AT_MIPS_loop_begin  0x2002
++#define DW_AT_MIPS_tail_loop_begin     0x2003
++#define DW_AT_MIPS_epilog_begin        0x2004
++#define DW_AT_MIPS_loop_unroll_factor  0x2005
++#define DW_AT_MIPS_software_pipeline_depth     0x2006
++#define DW_AT_MIPS_linkage_name        0x2007
++#define DW_AT_MIPS_stride      0x2008
++#define DW_AT_MIPS_abstract_name       0x2009
++#define DW_AT_MIPS_clone_origin        0x200a
++#define DW_AT_MIPS_has_inlines         0x200b
++#define DW_AT_HP_block_index   0x2000
++#define DW_AT_HP_unmodifiable  0x2001
++#define DW_AT_HP_actuals_stmt_list     0x2010
++#define DW_AT_HP_proc_per_section      0x2011
++#define DW_AT_HP_raw_data_ptr  0x2012
++#define DW_AT_HP_pass_by_reference     0x2013
++#define DW_AT_HP_opt_level     0x2014
++#define DW_AT_HP_prof_version_id       0x2015
++#define DW_AT_HP_opt_flags     0x2016
++#define DW_AT_HP_cold_region_low_pc    0x2017
++#define DW_AT_HP_cold_region_high_pc   0x2018
++#define DW_AT_HP_all_variables_modifiable      0x2019
++#define DW_AT_HP_linkage_name  0x201a
++#define DW_AT_HP_prof_flags    0x201b
++#define DW_AT_sf_names         0x2101
++#define DW_AT_src_info         0x2102
++#define DW_AT_mac_info         0x2103
++#define DW_AT_src_coords       0x2104
++#define DW_AT_body_begin       0x2105
++#define DW_AT_body_end         0x2106
++#define DW_AT_GNU_vector       0x2107
++#define DW_AT_VMS_rtnbeg_pd_address    0x2201
++#define DW_AT_upc_threads_scaled       0x3210
++#define DW_AT_PGI_lbase        0x3a00
++#define DW_AT_PGI_soffset      0x3a01
++#define DW_AT_PGI_lstride      0x3a02
++#define DW_AT_lo_user 0x2000  /* Implementation-defined range start.  */
++#define DW_AT_hi_user 0x3ff0  /* Implementation-defined range end.  */
++#define DW_OP_addr     0x03
++#define DW_OP_deref    0x06
++#define DW_OP_const1u  0x08
++#define DW_OP_const1s  0x09
++#define DW_OP_const2u  0x0a
++#define DW_OP_const2s  0x0b
++#define DW_OP_const4u  0x0c
++#define DW_OP_const4s  0x0d
++#define DW_OP_const8u  0x0e
++#define DW_OP_const8s  0x0f
++#define DW_OP_constu   0x10
++#define DW_OP_consts   0x11
++#define DW_OP_dup      0x12
++#define DW_OP_drop     0x13
++#define DW_OP_over     0x14
++#define DW_OP_pick     0x15
++#define DW_OP_swap     0x16
++#define DW_OP_rot      0x17
++#define DW_OP_xderef   0x18
++#define DW_OP_abs      0x19
++#define DW_OP_and      0x1a
++#define DW_OP_div      0x1b
++#define DW_OP_minus    0x1c
++#define DW_OP_mod      0x1d
++#define DW_OP_mul      0x1e
++#define DW_OP_neg      0x1f
++#define DW_OP_not      0x20
++#define DW_OP_or       0x21
++#define DW_OP_plus     0x22
++#define DW_OP_plus_uconst      0x23
++#define DW_OP_shl      0x24
++#define DW_OP_shr      0x25
++#define DW_OP_shra     0x26
++#define DW_OP_xor      0x27
++#define DW_OP_bra      0x28
++#define DW_OP_eq       0x29
++#define DW_OP_ge       0x2a
++#define DW_OP_gt       0x2b
++#define DW_OP_le       0x2c
++#define DW_OP_lt       0x2d
++#define DW_OP_ne       0x2e
++#define DW_OP_skip     0x2f
++#define DW_OP_lit0     0x30
++#define DW_OP_lit1     0x31
++#define DW_OP_lit2     0x32
++#define DW_OP_lit3     0x33
++#define DW_OP_lit4     0x34
++#define DW_OP_lit5     0x35
++#define DW_OP_lit6     0x36
++#define DW_OP_lit7     0x37
++#define DW_OP_lit8     0x38
++#define DW_OP_lit9     0x39
++#define DW_OP_lit10    0x3a
++#define DW_OP_lit11    0x3b
++#define DW_OP_lit12    0x3c
++#define DW_OP_lit13    0x3d
++#define DW_OP_lit14    0x3e
++#define DW_OP_lit15    0x3f
++#define DW_OP_lit16    0x40
++#define DW_OP_lit17    0x41
++#define DW_OP_lit18    0x42
++#define DW_OP_lit19    0x43
++#define DW_OP_lit20    0x44
++#define DW_OP_lit21    0x45
++#define DW_OP_lit22    0x46
++#define DW_OP_lit23    0x47
++#define DW_OP_lit24    0x48
++#define DW_OP_lit25    0x49
++#define DW_OP_lit26    0x4a
++#define DW_OP_lit27    0x4b
++#define DW_OP_lit28    0x4c
++#define DW_OP_lit29    0x4d
++#define DW_OP_lit30    0x4e
++#define DW_OP_lit31    0x4f
++#define DW_OP_reg0     0x50
++#define DW_OP_reg1     0x51
++#define DW_OP_reg2     0x52
++#define DW_OP_reg3     0x53
++#define DW_OP_reg4     0x54
++#define DW_OP_reg5     0x55
++#define DW_OP_reg6     0x56
++#define DW_OP_reg7     0x57
++#define DW_OP_reg8     0x58
++#define DW_OP_reg9     0x59
++#define DW_OP_reg10    0x5a
++#define DW_OP_reg11    0x5b
++#define DW_OP_reg12    0x5c
++#define DW_OP_reg13    0x5d
++#define DW_OP_reg14    0x5e
++#define DW_OP_reg15    0x5f
++#define DW_OP_reg16    0x60
++#define DW_OP_reg17    0x61
++#define DW_OP_reg18    0x62
++#define DW_OP_reg19    0x63
++#define DW_OP_reg20    0x64
++#define DW_OP_reg21    0x65
++#define DW_OP_reg22    0x66
++#define DW_OP_reg23    0x67
++#define DW_OP_reg24    0x68
++#define DW_OP_reg25    0x69
++#define DW_OP_reg26    0x6a
++#define DW_OP_reg27    0x6b
++#define DW_OP_reg28    0x6c
++#define DW_OP_reg29    0x6d
++#define DW_OP_reg30    0x6e
++#define DW_OP_reg31    0x6f
++#define DW_OP_breg0    0x70
++#define DW_OP_breg1    0x71
++#define DW_OP_breg2    0x72
++#define DW_OP_breg3    0x73
++#define DW_OP_breg4    0x74
++#define DW_OP_breg5    0x75
++#define DW_OP_breg6    0x76
++#define DW_OP_breg7    0x77
++#define DW_OP_breg8    0x78
++#define DW_OP_breg9    0x79
++#define DW_OP_breg10   0x7a
++#define DW_OP_breg11   0x7b
++#define DW_OP_breg12   0x7c
++#define DW_OP_breg13   0x7d
++#define DW_OP_breg14   0x7e
++#define DW_OP_breg15   0x7f
++#define DW_OP_breg16   0x80
++#define DW_OP_breg17   0x81
++#define DW_OP_breg18   0x82
++#define DW_OP_breg19   0x83
++#define DW_OP_breg20   0x84
++#define DW_OP_breg21   0x85
++#define DW_OP_breg22   0x86
++#define DW_OP_breg23   0x87
++#define DW_OP_breg24   0x88
++#define DW_OP_breg25   0x89
++#define DW_OP_breg26   0x8a
++#define DW_OP_breg27   0x8b
++#define DW_OP_breg28   0x8c
++#define DW_OP_breg29   0x8d
++#define DW_OP_breg30   0x8e
++#define DW_OP_breg31   0x8f
++#define DW_OP_regx     0x90
++#define DW_OP_fbreg    0x91
++#define DW_OP_bregx    0x92
++#define DW_OP_piece    0x93
++#define DW_OP_deref_size       0x94
++#define DW_OP_xderef_size      0x95
++#define DW_OP_nop      0x96
++#define DW_OP_push_object_address      0x97
++#define DW_OP_call2    0x98
++#define DW_OP_call4    0x99
++#define DW_OP_call_ref         0x9a
++#define DW_OP_GNU_push_tls_address     0xe0
++#define DW_OP_HP_unknown       0xe0
++#define DW_OP_HP_is_value      0xe1
++#define DW_OP_HP_fltconst4     0xe2
++#define DW_OP_HP_fltconst8     0xe3
++#define DW_OP_HP_mod_range     0xe4
++#define DW_OP_HP_unmod_range   0xe5
++#define DW_OP_HP_tls   0xe6
++#define DW_OP_lo_user 0xe0    /* Implementation-defined range start.  */
++#define DW_OP_hi_user 0xff    /* Implementation-defined range end.  */
++#define DW_ATE_void    0x0
++#define DW_ATE_address         0x1
++#define DW_ATE_boolean         0x2
++#define DW_ATE_complex_float   0x3
++#define DW_ATE_float   0x4
++#define DW_ATE_signed  0x5
++#define DW_ATE_signed_char     0x6
++#define DW_ATE_unsigned        0x7
++#define DW_ATE_unsigned_char   0x8
++#define DW_ATE_imaginary_float         0x9
++#define DW_ATE_HP_float80      0x80
++#define DW_ATE_HP_complex_float80      0x81
++#define DW_ATE_HP_float128     0x82
++#define DW_ATE_HP_complex_float128     0x83
++#define DW_ATE_HP_floathpintel         0x84
++#define DW_ATE_HP_imaginary_float80    0x85
++#define DW_ATE_HP_imaginary_float128   0x86
++#define       DW_ATE_lo_user 0x80
++#define       DW_ATE_hi_user 0xff
++#define DW_ORD_row_major       0
++#define DW_ORD_col_major       1
++#define DW_ACCESS_public       1
++#define DW_ACCESS_protected    2
++#define DW_ACCESS_private      3
++#define DW_VIS_local   1
++#define DW_VIS_exported        2
++#define DW_VIS_qualified       3
++#define DW_VIRTUALITY_none     0
++#define DW_VIRTUALITY_virtual  1
++#define DW_VIRTUALITY_pure_virtual     2
++#define DW_ID_case_sensitive   0
++#define DW_ID_up_case  1
++#define DW_ID_down_case        2
++#define DW_ID_case_insensitive         3
++#define DW_CC_normal   0x1
++#define DW_CC_program  0x2
++#define DW_CC_nocall   0x3
++#define DW_CC_lo_user 0x40
++#define DW_CC_hi_user 0xff
++#define DW_INL_not_inlined     0
++#define DW_INL_inlined         1
++#define DW_INL_declared_not_inlined    2
++#define DW_INL_declared_inlined        3
++#define DW_DSC_label   0
++#define DW_DSC_range   1
++#define DW_LNS_extended_op     0
++#define DW_LNS_copy    1
++#define DW_LNS_advance_pc      2
++#define DW_LNS_advance_line    3
++#define DW_LNS_set_file        4
++#define DW_LNS_set_column      5
++#define DW_LNS_negate_stmt     6
++#define DW_LNS_set_basic_block         7
++#define DW_LNS_const_add_pc    8
++#define DW_LNS_fixed_advance_pc        9
++#define DW_LNS_set_prologue_end        10
++#define DW_LNS_set_epilogue_begin      11
++#define DW_LNS_set_isa         12
++#define DW_LNE_end_sequence    1
++#define DW_LNE_set_address     2
++#define DW_LNE_define_file     3
++#define DW_LNE_HP_negate_is_UV_update  0x11
++#define DW_LNE_HP_push_context         0x12
++#define DW_LNE_HP_pop_context  0x13
++#define DW_LNE_HP_set_file_line_column         0x14
++#define DW_LNE_HP_set_routine_name     0x15
++#define DW_LNE_HP_set_sequence         0x16
++#define DW_LNE_HP_negate_post_semantics        0x17
++#define DW_LNE_HP_negate_function_exit         0x18
++#define DW_LNE_HP_negate_front_end_logical     0x19
++#define DW_LNE_HP_define_proc  0x20
++#define DW_CFA_advance_loc     0x40
++#define DW_CFA_offset  0x80
++#define DW_CFA_restore         0xc0
++#define DW_CFA_nop     0x00
++#define DW_CFA_set_loc         0x01
++#define DW_CFA_advance_loc1    0x02
++#define DW_CFA_advance_loc2    0x03
++#define DW_CFA_advance_loc4    0x04
++#define DW_CFA_offset_extended         0x05
++#define DW_CFA_restore_extended        0x06
++#define DW_CFA_undefined       0x07
++#define DW_CFA_same_value      0x08
++#define DW_CFA_register        0x09
++#define DW_CFA_remember_state  0x0a
++#define DW_CFA_restore_state   0x0b
++#define DW_CFA_def_cfa         0x0c
++#define DW_CFA_def_cfa_register        0x0d
++#define DW_CFA_def_cfa_offset  0x0e
++#define DW_CFA_def_cfa_expression      0x0f
++#define DW_CFA_expression      0x10
++#define DW_CFA_offset_extended_sf      0x11
++#define DW_CFA_def_cfa_sf      0x12
++#define DW_CFA_def_cfa_offset_sf       0x13
++#define DW_CFA_MIPS_advance_loc8       0x1d
++#define DW_CFA_GNU_window_save         0x2d
++#define DW_CFA_GNU_args_size   0x2e
++#define DW_CFA_GNU_negative_offset_extended    0x2f
++#define DW_CIE_ID       0xffffffff
++#define DW_CIE_VERSION          1
++#define DW_CFA_extended   0
++#define DW_CFA_lo_user    0x1c
++#define DW_CFA_hi_user    0x3f
++#define DW_CHILDREN_no                     0x00
++#define DW_CHILDREN_yes                    0x01
++#define DW_ADDR_none          0
++#define DW_LANG_C89    0x0001
++#define DW_LANG_C      0x0002
++#define DW_LANG_Ada83  0x0003
++#define DW_LANG_C_plus_plus    0x0004
++#define DW_LANG_Cobol74        0x0005
++#define DW_LANG_Cobol85        0x0006
++#define DW_LANG_Fortran77      0x0007
++#define DW_LANG_Fortran90      0x0008
++#define DW_LANG_Pascal83       0x0009
++#define DW_LANG_Modula2        0x000a
++#define DW_LANG_Java   0x000b
++#define DW_LANG_C99    0x000c
++#define DW_LANG_Ada95  0x000d
++#define DW_LANG_Fortran95      0x000e
++#define DW_LANG_Mips_Assembler         0x8001
++#define DW_LANG_Upc    0x8765
++#define DW_LANG_lo_user 0x8000        /* Implementation-defined range start.  */
++#define DW_LANG_hi_user 0xffff        /* Implementation-defined range end.  */
++#define DW_MACINFO_define      1
++#define DW_MACINFO_undef       2
++#define DW_MACINFO_start_file  3
++#define DW_MACINFO_end_file    4
++#define DW_MACINFO_vendor_ext  255
++#define DW_EH_PE_absptr               0x00
++#define DW_EH_PE_omit         0xff
++#define DW_EH_PE_uleb128      0x01
++#define DW_EH_PE_udata2               0x02
++#define DW_EH_PE_udata4               0x03
++#define DW_EH_PE_udata8               0x04
++#define DW_EH_PE_sleb128      0x09
++#define DW_EH_PE_sdata2               0x0A
++#define DW_EH_PE_sdata4               0x0B
++#define DW_EH_PE_sdata8               0x0C
++#define DW_EH_PE_signed               0x08
++#define DW_EH_PE_pcrel                0x10
++#define DW_EH_PE_textrel      0x20
++#define DW_EH_PE_datarel      0x30
++#define DW_EH_PE_funcrel      0x40
++#define DW_EH_PE_aligned      0x50
++#define DW_EH_PE_indirect     0x80
++#endif
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/linux/dwarf2-lang.h linux-2.6.18.kgdb/include/linux/dwarf2-lang.h
+--- linux-2.6.18/include/linux/dwarf2-lang.h   1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/linux/dwarf2-lang.h      2008-06-10 16:19:58.000000000 +0400
+@@ -0,0 +1,300 @@
++#ifndef DWARF2_LANG
++#define DWARF2_LANG
++
++/*
++ * This is free software; you can redistribute it and/or modify it under
++ * the terms of the GNU General Public License as published by the Free
++ * Software Foundation; either version 2, or (at your option) any later
++ * version.
++ */
++/*
++ * This file defines macros that allow generation of DWARF debug records
++ * for asm files.  This file is platform independent.  Register numbers
++ * (which are about the only thing that is platform dependent) are to be
++ * supplied by a platform defined file.
++ */
++/*
++ * We need this to work for both asm and C.  In asm we are using the
++ * old comment trick to concatenate while C uses the new ANSI thing.
++ * Here we have concat macro...  The multi level thing is to allow any
++ * macros used in the names to be resolved prior to the cat (at which
++ * time they are no longer the same string).
++ */
++#define CAT3(a,b,c) _CAT3(a,b,c)
++#define _CAT3(a,b,c) __CAT3(a,b,c)
++#ifndef __STDC__
++#define __CAT3(a,b,c) a/**/b/**/c
++#else
++#define __CAT3(a,b,c) a##b##c
++#endif
++#ifdef __ASSEMBLY__
++#define IFC(a)
++#define IFN_C(a) a
++#define NL ;
++#define QUOTE_THIS(a) a
++#define DWARF_preamble .section .debug_frame,"",%progbits;
++#else
++#define IFC(a) a
++#define IFN_C(a)
++#define NL \n\t
++#define QUOTE_THIS(a) _QUOTE_THIS(a)
++#define _QUOTE_THIS(a) #a
++/* Don't let CPP see the " and , \042=" \054=, */
++#define DWARF_preamble .section .debug_frame \054\042\042\054%progbits
++#endif
++
++#ifdef CONFIG_64BIT
++#define DATA_ALIGN_FACTOR     8
++#define ADDR_LOC              .quad
++#else
++#define DATA_ALIGN_FACTOR     4
++#define ADDR_LOC              .long
++#endif
++
++#include <linux/dwarf2-defs.h>
++/*
++ * This macro starts a debug frame section.  The debug_frame describes
++ * where to find the registers that the enclosing function saved on
++ * entry.
++ *
++ * ORD is used by the label generator and should be the same as what is
++ * passed to CFI_postamble.
++ *
++ * pc,        pc register gdb ordinal.
++ *
++ * code_align this is the factor used to define locations or regions
++ * where the given definitions apply.  If you use labels to define these
++ * this should be 1.
++ *
++ * data_align this is the factor used to define register offsets.  If
++ * you use struct offset, this should be the size of the register in
++ * bytes or the negative of that.  This is how it is used: you will
++ * define a register as the reference register, say the stack pointer,
++ * then you will say where a register is located relative to this
++ * reference register's value, say 40 for register 3 (the gdb register
++ * number).  The <40> will be multiplied by <data_align> to define the
++ * byte offset of the given register (3, in this example).  So if your
++ * <40> is the byte offset and the reference register points at the
++ * beginning, you would want 1 for data_align.  If <40> was the 40th
++ * 4-byte element in that structure you would want 4.  And if your
++ * reference register points at the end of the structure you would want
++ * a negative data_align value(and you would have to do other math as
++ * well).
++ */
++
++#define CFI_preamble(ORD, pc, code_align, data_align) \
++         DWARF_preamble       NL                              \
++      .align DATA_ALIGN_FACTOR NL                     \
++        .globl CAT3(frame,_,ORD) NL                   \
++CAT3(frame,_,ORD): NL                                 \
++      .long 7f-6f NL                                  \
++6:                                                    \
++      .long   DW_CIE_ID NL                            \
++      .byte   DW_CIE_VERSION NL                       \
++      .byte 0  NL                                     \
++      .uleb128 code_align NL                          \
++      .sleb128 data_align NL                          \
++      .byte pc NL
++
++/*
++ * After the above macro and prior to the CFI_postamble, you need to
++ * define the initial state.  This starts with defining the reference
++ * register and, usually the pc.  Here are some helper macros:
++ */
++
++#define CFA_define_reference(reg, offset)     \
++      .byte DW_CFA_def_cfa NL                 \
++      .uleb128 reg NL                         \
++      .uleb128 (offset) NL
++
++#define CFA_define_offset(reg, offset)                \
++      .byte (DW_CFA_offset + reg) NL          \
++      .uleb128 (offset) NL
++
++#define CFA_restore(reg)                      \
++        .byte (DW_CFA_restore + reg) NL
++
++#define CFI_postamble()                               \
++      .align DATA_ALIGN_FACTOR NL                             \
++7: NL                                         \
++.previous NL
++
++/*
++ * So now your code pushes stuff on the stack, you need a new location
++ * and the rules for what to do.  This starts a running description of
++ * the call frame.  You need to describe what changes with respect to
++ * the call registers as the location of the pc moves through the code.
++ * The following builds an FDE (Frame Description Entry).  Like the
++ * above, it has a preamble and a postamble.  It also is tied to the CFI
++ * above.
++ * The preamble macro is tied to the CFI thru the first parameter.  The
++ * second is the code start address and then the code end address+1.
++ */
++#define FDE_preamble(ORD, initial_address, end_address)       \
++        DWARF_preamble NL                             \
++      .align DATA_ALIGN_FACTOR NL                                     \
++      .long 9f-8f NL                                  \
++8:                                                    \
++      .long CAT3(frame,_,ORD) NL                      \
++      ADDR_LOC initial_address NL                     \
++      ADDR_LOC (end_address - initial_address) NL
++
++#define FDE_postamble()                               \
++      .align DATA_ALIGN_FACTOR NL                             \
++9:     NL                                     \
++.previous NL
++
++/*
++ * That done, you can now add registers, subtract registers, move the
++ * reference and even change the reference.  You can also define a new
++ * area of code the info applies to.  For discontinuous bits you should
++ * start a new FDE.  You may have as many as you like.
++ */
++
++/*
++ * To advance the code location (pc) by <bytes> (0x3f max)
++ */
++
++#define CFA_advance_loc(bytes)                        \
++      .byte DW_CFA_advance_loc+bytes NL
++
++/*
++ * This one is good for 0xff or 255
++ */
++#define CFA_advance_loc1(bytes)                       \
++      .byte DW_CFA_advance_loc1 NL            \
++        .byte bytes NL
++
++#define CFA_undefine_reg(reg)                 \
++        .byte DW_CFA_undefined NL             \
++      .uleb128 reg NL
++/*
++ * With the above you can define all the register locations.  But
++ * suppose the reference register moves... Takes the new offset NOT an
++ * increment.  This is how esp is tracked if it is not saved.
++ */
++
++#define CFA_define_cfa_offset(offset)         \
++      .byte DW_CFA_def_cfa_offset NL          \
++      .uleb128 (offset) NL
++/*
++ * Or suppose you want to use a different reference register...
++ */
++#define CFA_define_cfa_register(reg)          \
++      .byte DW_CFA_def_cfa_register NL        \
++      .uleb128 reg NL
++
++/*
++ * If you want to mess with the stack pointer, here is the expression.
++ * The stack starts empty.
++ */
++#define CFA_def_cfa_expression                        \
++        .byte DW_CFA_def_cfa_expression       NL      \
++      .uleb128 20f-10f NL                     \
++10:     NL
++/*
++ * This expression is to be used for other regs.  The stack starts with the
++ * stack address.
++ */
++
++#define CFA_expression(reg)                   \
++        .byte DW_CFA_expression        NL             \
++        .uleb128 reg NL                               \
++      .uleb128 20f-10f NL                     \
++10:     NL
++/*
++ * Here we do the expression stuff.  You should code the above followed
++ *  by expression OPs followed by CFA_expression_end.
++ */
++
++
++#define CFA_expression_end                    \
++20:    NL
++
++#define CFA_exp_OP_const4s(a)                 \
++        .byte DW_OP_const4s NL                        \
++        .long a NL
++
++#define  CFA_exp_OP_swap  .byte DW_OP_swap NL
++#define  CFA_exp_OP_dup  .byte DW_OP_dup NL
++#define  CFA_exp_OP_drop  .byte DW_OP_drop NL
++/*
++ * These replace the top one (deref, neg, not) or top two stack elements
++ * with the result.  Top comes first where it matters.  True is 1, false 0.
++ */
++#define  CFA_exp_OP_deref .byte DW_OP_deref NL
++#define  CFA_exp_OP_and   .byte DW_OP_and NL
++#define  CFA_exp_OP_div   .byte DW_OP_div NL
++#define  CFA_exp_OP_minus .byte DW_OP_minus NL
++#define  CFA_exp_OP_mod   .byte DW_OP_mod NL
++#define  CFA_exp_OP_neg   .byte DW_OP_neg NL
++#define  CFA_exp_OP_plus  .byte DW_OP_plus NL
++#define  CFA_exp_OP_not   .byte DW_OP_not NL
++#define  CFA_exp_OP_or    .byte DW_OP_or NL
++#define  CFA_exp_OP_xor   .byte DW_OP_xor NL
++#define  CFA_exp_OP_le    .byte DW_OP_le NL
++#define  CFA_exp_OP_ge    .byte DW_OP_ge NL
++#define  CFA_exp_OP_eq    .byte DW_OP_eq NL
++#define  CFA_exp_OP_lt    .byte DW_OP_lt NL
++#define  CFA_exp_OP_gt    .byte DW_OP_gt NL
++#define  CFA_exp_OP_ne    .byte DW_OP_ne NL
++/*
++ * These take a parameter as noted
++ */
++/*
++ * Unconditional skip to loc. loc is a label (loc:)
++ */
++#define CFA_exp_OP_skip(loc)                  \
++         .byte DW_OP_skip  NL                         \
++       .hword  loc-.-2 NL
++/*
++ * Conditional skip to loc (TOS != 0, TOS--) (loc is a label)
++ */
++#define CFA_exp_OP_bra(loc)                   \
++         .byte DW_OP_bra NL                   \
++       .hword loc-.-2 NL
++
++/*
++ * TOS += no (an unsigned number)
++ */
++#define CFA_exp_OP_plus_uconst(no)            \
++         .byte DW_OP_plus_uconst NL           \
++         .uleb128 no NL
++
++/*
++ * ++TOS = no (an unsigned number)
++ */
++#define CFA_exp_OP_constu(no)                 \
++         .byte DW_OP_constu NL                        \
++       .uleb128 no NL
++/*
++ * ++TOS = no (a signed number)
++ */
++#define CFA_exp_OP_consts(no)                 \
++         .byte DW_OP_consts NL                        \
++       .sleb128 no NL
++/*
++ * ++TOS = no (an unsigned byte)
++ */
++#define CFA_exp_OP_const1u(no)                        \
++         .byte DW_OP_const1u NL                       \
++       .byte no NL
++
++
++/*
++ * ++TOS = no (an address)
++ */
++#define CFA_exp_OP_addr(no)                   \
++         .byte DW_OP_addr NL                  \
++       .long no NL
++
++/*
++ * Push current frames value for "reg" + offset
++ * We take advantage of the opcode assignments to make this a literal reg
++ * rather than use the DW_OP_bregx opcode.
++ */
++
++#define CFA_exp_OP_breg(reg,offset)           \
++         .byte DW_OP_breg0+reg NL             \
++         .sleb128 offset NL
++#endif
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/linux/dwarf2.h linux-2.6.18.kgdb/include/linux/dwarf2.h
+--- linux-2.6.18/include/linux/dwarf2.h        1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/linux/dwarf2.h   2008-06-10 16:19:58.000000000 +0400
+@@ -0,0 +1,775 @@
++/* Declarations and definitions of codes relating to the DWARF2 symbolic
++   debugging information format.
++   Copyright (C) 1992, 1993, 1995, 1996, 1997, 1999, 2000, 2001, 2002,
++   2003 Free Software Foundation, Inc.
++
++   Written by Gary Funck (gary@intrepid.com) The Ada Joint Program
++   Office (AJPO), Florida State Unviversity and Silicon Graphics Inc.
++   provided support for this effort -- June 21, 1995.
++
++   Derived from the DWARF 1 implementation written by Ron Guilmette
++   (rfg@netcom.com), November 1990.
++
++   This file is part of GCC.
++
++   GCC is free software; you can redistribute it and/or modify it under
++   the terms of the GNU General Public License as published by the Free
++   Software Foundation; either version 2, or (at your option) any later
++   version.
++
++   GCC is distributed in the hope that it will be useful, but WITHOUT
++   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
++   License for more details.
++
++   You should have received a copy of the GNU General Public License
++   along with GCC; see the file COPYING.  If not, write to the Free
++   Software Foundation, 59 Temple Place - Suite 330, Boston, MA
++   02111-1307, USA.  */
++
++/* This file is derived from the DWARF specification (a public document)
++   Revision 2.0.0 (July 27, 1993) developed by the UNIX International
++   Programming Languages Special Interest Group (UI/PLSIG) and distributed
++   by UNIX International.  Copies of this specification are available from
++   UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054.
++
++   This file also now contains definitions from the DWARF 3 specification.  */
++
++/* This file is shared between GCC and GDB, and should not contain
++   prototypes.  */
++
++#ifndef _ELF_DWARF2_H
++#define _ELF_DWARF2_H
++
++/* Structure found in the .debug_line section.  */
++typedef struct
++{
++  unsigned char li_length          [4];
++  unsigned char li_version         [2];
++  unsigned char li_prologue_length [4];
++  unsigned char li_min_insn_length [1];
++  unsigned char li_default_is_stmt [1];
++  unsigned char li_line_base       [1];
++  unsigned char li_line_range      [1];
++  unsigned char li_opcode_base     [1];
++}
++DWARF2_External_LineInfo;
++
++typedef struct
++{
++  unsigned long  li_length;
++  unsigned short li_version;
++  unsigned int   li_prologue_length;
++  unsigned char  li_min_insn_length;
++  unsigned char  li_default_is_stmt;
++  int            li_line_base;
++  unsigned char  li_line_range;
++  unsigned char  li_opcode_base;
++}
++DWARF2_Internal_LineInfo;
++
++/* Structure found in .debug_pubnames section.  */
++typedef struct
++{
++  unsigned char pn_length  [4];
++  unsigned char pn_version [2];
++  unsigned char pn_offset  [4];
++  unsigned char pn_size    [4];
++}
++DWARF2_External_PubNames;
++
++typedef struct
++{
++  unsigned long  pn_length;
++  unsigned short pn_version;
++  unsigned long  pn_offset;
++  unsigned long  pn_size;
++}
++DWARF2_Internal_PubNames;
++
++/* Structure found in .debug_info section.  */
++typedef struct
++{
++  unsigned char  cu_length        [4];
++  unsigned char  cu_version       [2];
++  unsigned char  cu_abbrev_offset [4];
++  unsigned char  cu_pointer_size  [1];
++}
++DWARF2_External_CompUnit;
++
++typedef struct
++{
++  unsigned long  cu_length;
++  unsigned short cu_version;
++  unsigned long  cu_abbrev_offset;
++  unsigned char  cu_pointer_size;
++}
++DWARF2_Internal_CompUnit;
++
++typedef struct
++{
++  unsigned char  ar_length       [4];
++  unsigned char  ar_version      [2];
++  unsigned char  ar_info_offset  [4];
++  unsigned char  ar_pointer_size [1];
++  unsigned char  ar_segment_size [1];
++}
++DWARF2_External_ARange;
++
++typedef struct
++{
++  unsigned long  ar_length;
++  unsigned short ar_version;
++  unsigned long  ar_info_offset;
++  unsigned char  ar_pointer_size;
++  unsigned char  ar_segment_size;
++}
++DWARF2_Internal_ARange;
++
++
++/* Tag names and codes.  */
++enum dwarf_tag
++  {
++    DW_TAG_padding = 0x00,
++    DW_TAG_array_type = 0x01,
++    DW_TAG_class_type = 0x02,
++    DW_TAG_entry_point = 0x03,
++    DW_TAG_enumeration_type = 0x04,
++    DW_TAG_formal_parameter = 0x05,
++    DW_TAG_imported_declaration = 0x08,
++    DW_TAG_label = 0x0a,
++    DW_TAG_lexical_block = 0x0b,
++    DW_TAG_member = 0x0d,
++    DW_TAG_pointer_type = 0x0f,
++    DW_TAG_reference_type = 0x10,
++    DW_TAG_compile_unit = 0x11,
++    DW_TAG_string_type = 0x12,
++    DW_TAG_structure_type = 0x13,
++    DW_TAG_subroutine_type = 0x15,
++    DW_TAG_typedef = 0x16,
++    DW_TAG_union_type = 0x17,
++    DW_TAG_unspecified_parameters = 0x18,
++    DW_TAG_variant = 0x19,
++    DW_TAG_common_block = 0x1a,
++    DW_TAG_common_inclusion = 0x1b,
++    DW_TAG_inheritance = 0x1c,
++    DW_TAG_inlined_subroutine = 0x1d,
++    DW_TAG_module = 0x1e,
++    DW_TAG_ptr_to_member_type = 0x1f,
++    DW_TAG_set_type = 0x20,
++    DW_TAG_subrange_type = 0x21,
++    DW_TAG_with_stmt = 0x22,
++    DW_TAG_access_declaration = 0x23,
++    DW_TAG_base_type = 0x24,
++    DW_TAG_catch_block = 0x25,
++    DW_TAG_const_type = 0x26,
++    DW_TAG_constant = 0x27,
++    DW_TAG_enumerator = 0x28,
++    DW_TAG_file_type = 0x29,
++    DW_TAG_friend = 0x2a,
++    DW_TAG_namelist = 0x2b,
++    DW_TAG_namelist_item = 0x2c,
++    DW_TAG_packed_type = 0x2d,
++    DW_TAG_subprogram = 0x2e,
++    DW_TAG_template_type_param = 0x2f,
++    DW_TAG_template_value_param = 0x30,
++    DW_TAG_thrown_type = 0x31,
++    DW_TAG_try_block = 0x32,
++    DW_TAG_variant_part = 0x33,
++    DW_TAG_variable = 0x34,
++    DW_TAG_volatile_type = 0x35,
++    /* DWARF 3.  */
++    DW_TAG_dwarf_procedure = 0x36,
++    DW_TAG_restrict_type = 0x37,
++    DW_TAG_interface_type = 0x38,
++    DW_TAG_namespace = 0x39,
++    DW_TAG_imported_module = 0x3a,
++    DW_TAG_unspecified_type = 0x3b,
++    DW_TAG_partial_unit = 0x3c,
++    DW_TAG_imported_unit = 0x3d,
++    /* SGI/MIPS Extensions.  */
++    DW_TAG_MIPS_loop = 0x4081,
++    /* HP extensions.  See: ftp://ftp.hp.com/pub/lang/tools/WDB/wdb-4.0.tar.gz .  */
++    DW_TAG_HP_array_descriptor = 0x4090,
++    /* GNU extensions.  */
++    DW_TAG_format_label = 0x4101,     /* For FORTRAN 77 and Fortran 90.  */
++    DW_TAG_function_template = 0x4102,        /* For C++.  */
++    DW_TAG_class_template = 0x4103,   /* For C++.  */
++    DW_TAG_GNU_BINCL = 0x4104,
++    DW_TAG_GNU_EINCL = 0x4105,
++    /* Extensions for UPC.  See: http://upc.gwu.edu/~upc.  */
++    DW_TAG_upc_shared_type = 0x8765,
++    DW_TAG_upc_strict_type = 0x8766,
++    DW_TAG_upc_relaxed_type = 0x8767,
++    /* PGI (STMicroelectronics) extensions.  No documentation available.  */
++    DW_TAG_PGI_kanji_type      = 0xA000,
++    DW_TAG_PGI_interface_block = 0xA020
++  };
++
++#define DW_TAG_lo_user        0x4080
++#define DW_TAG_hi_user        0xffff
++
++/* Flag that tells whether entry has a child or not.  */
++#define DW_children_no   0
++#define       DW_children_yes  1
++
++/* Form names and codes.  */
++enum dwarf_form
++  {
++    DW_FORM_addr = 0x01,
++    DW_FORM_block2 = 0x03,
++    DW_FORM_block4 = 0x04,
++    DW_FORM_data2 = 0x05,
++    DW_FORM_data4 = 0x06,
++    DW_FORM_data8 = 0x07,
++    DW_FORM_string = 0x08,
++    DW_FORM_block = 0x09,
++    DW_FORM_block1 = 0x0a,
++    DW_FORM_data1 = 0x0b,
++    DW_FORM_flag = 0x0c,
++    DW_FORM_sdata = 0x0d,
++    DW_FORM_strp = 0x0e,
++    DW_FORM_udata = 0x0f,
++    DW_FORM_ref_addr = 0x10,
++    DW_FORM_ref1 = 0x11,
++    DW_FORM_ref2 = 0x12,
++    DW_FORM_ref4 = 0x13,
++    DW_FORM_ref8 = 0x14,
++    DW_FORM_ref_udata = 0x15,
++    DW_FORM_indirect = 0x16
++  };
++
++/* Attribute names and codes.  */
++enum dwarf_attribute
++  {
++    DW_AT_sibling = 0x01,
++    DW_AT_location = 0x02,
++    DW_AT_name = 0x03,
++    DW_AT_ordering = 0x09,
++    DW_AT_subscr_data = 0x0a,
++    DW_AT_byte_size = 0x0b,
++    DW_AT_bit_offset = 0x0c,
++    DW_AT_bit_size = 0x0d,
++    DW_AT_element_list = 0x0f,
++    DW_AT_stmt_list = 0x10,
++    DW_AT_low_pc = 0x11,
++    DW_AT_high_pc = 0x12,
++    DW_AT_language = 0x13,
++    DW_AT_member = 0x14,
++    DW_AT_discr = 0x15,
++    DW_AT_discr_value = 0x16,
++    DW_AT_visibility = 0x17,
++    DW_AT_import = 0x18,
++    DW_AT_string_length = 0x19,
++    DW_AT_common_reference = 0x1a,
++    DW_AT_comp_dir = 0x1b,
++    DW_AT_const_value = 0x1c,
++    DW_AT_containing_type = 0x1d,
++    DW_AT_default_value = 0x1e,
++    DW_AT_inline = 0x20,
++    DW_AT_is_optional = 0x21,
++    DW_AT_lower_bound = 0x22,
++    DW_AT_producer = 0x25,
++    DW_AT_prototyped = 0x27,
++    DW_AT_return_addr = 0x2a,
++    DW_AT_start_scope = 0x2c,
++    DW_AT_stride_size = 0x2e,
++    DW_AT_upper_bound = 0x2f,
++    DW_AT_abstract_origin = 0x31,
++    DW_AT_accessibility = 0x32,
++    DW_AT_address_class = 0x33,
++    DW_AT_artificial = 0x34,
++    DW_AT_base_types = 0x35,
++    DW_AT_calling_convention = 0x36,
++    DW_AT_count = 0x37,
++    DW_AT_data_member_location = 0x38,
++    DW_AT_decl_column = 0x39,
++    DW_AT_decl_file = 0x3a,
++    DW_AT_decl_line = 0x3b,
++    DW_AT_declaration = 0x3c,
++    DW_AT_discr_list = 0x3d,
++    DW_AT_encoding = 0x3e,
++    DW_AT_external = 0x3f,
++    DW_AT_frame_base = 0x40,
++    DW_AT_friend = 0x41,
++    DW_AT_identifier_case = 0x42,
++    DW_AT_macro_info = 0x43,
++    DW_AT_namelist_items = 0x44,
++    DW_AT_priority = 0x45,
++    DW_AT_segment = 0x46,
++    DW_AT_specification = 0x47,
++    DW_AT_static_link = 0x48,
++    DW_AT_type = 0x49,
++    DW_AT_use_location = 0x4a,
++    DW_AT_variable_parameter = 0x4b,
++    DW_AT_virtuality = 0x4c,
++    DW_AT_vtable_elem_location = 0x4d,
++    /* DWARF 3 values.  */
++    DW_AT_allocated     = 0x4e,
++    DW_AT_associated    = 0x4f,
++    DW_AT_data_location = 0x50,
++    DW_AT_stride        = 0x51,
++    DW_AT_entry_pc      = 0x52,
++    DW_AT_use_UTF8      = 0x53,
++    DW_AT_extension     = 0x54,
++    DW_AT_ranges        = 0x55,
++    DW_AT_trampoline    = 0x56,
++    DW_AT_call_column   = 0x57,
++    DW_AT_call_file     = 0x58,
++    DW_AT_call_line     = 0x59,
++    /* SGI/MIPS extensions.  */
++    DW_AT_MIPS_fde = 0x2001,
++    DW_AT_MIPS_loop_begin = 0x2002,
++    DW_AT_MIPS_tail_loop_begin = 0x2003,
++    DW_AT_MIPS_epilog_begin = 0x2004,
++    DW_AT_MIPS_loop_unroll_factor = 0x2005,
++    DW_AT_MIPS_software_pipeline_depth = 0x2006,
++    DW_AT_MIPS_linkage_name = 0x2007,
++    DW_AT_MIPS_stride = 0x2008,
++    DW_AT_MIPS_abstract_name = 0x2009,
++    DW_AT_MIPS_clone_origin = 0x200a,
++    DW_AT_MIPS_has_inlines = 0x200b,
++    /* HP extensions.  */
++    DW_AT_HP_block_index         = 0x2000,
++    DW_AT_HP_unmodifiable        = 0x2001, /* Same as DW_AT_MIPS_fde.  */
++    DW_AT_HP_actuals_stmt_list   = 0x2010,
++    DW_AT_HP_proc_per_section    = 0x2011,
++    DW_AT_HP_raw_data_ptr        = 0x2012,
++    DW_AT_HP_pass_by_reference   = 0x2013,
++    DW_AT_HP_opt_level           = 0x2014,
++    DW_AT_HP_prof_version_id     = 0x2015,
++    DW_AT_HP_opt_flags           = 0x2016,
++    DW_AT_HP_cold_region_low_pc  = 0x2017,
++    DW_AT_HP_cold_region_high_pc = 0x2018,
++    DW_AT_HP_all_variables_modifiable = 0x2019,
++    DW_AT_HP_linkage_name        = 0x201a,
++    DW_AT_HP_prof_flags          = 0x201b,  /* In comp unit of procs_info for -g.  */
++    /* GNU extensions.  */
++    DW_AT_sf_names   = 0x2101,
++    DW_AT_src_info   = 0x2102,
++    DW_AT_mac_info   = 0x2103,
++    DW_AT_src_coords = 0x2104,
++    DW_AT_body_begin = 0x2105,
++    DW_AT_body_end   = 0x2106,
++    DW_AT_GNU_vector = 0x2107,
++    /* VMS extensions.  */
++    DW_AT_VMS_rtnbeg_pd_address = 0x2201,
++    /* UPC extension.  */
++    DW_AT_upc_threads_scaled = 0x3210,
++    /* PGI (STMicroelectronics) extensions.  */
++    DW_AT_PGI_lbase    = 0x3a00,
++    DW_AT_PGI_soffset  = 0x3a01,
++    DW_AT_PGI_lstride  = 0x3a02
++  };
++
++#define DW_AT_lo_user 0x2000  /* Implementation-defined range start.  */
++#define DW_AT_hi_user 0x3ff0  /* Implementation-defined range end.  */
++
++/* Location atom names and codes.  */
++enum dwarf_location_atom
++  {
++    DW_OP_addr = 0x03,
++    DW_OP_deref = 0x06,
++    DW_OP_const1u = 0x08,
++    DW_OP_const1s = 0x09,
++    DW_OP_const2u = 0x0a,
++    DW_OP_const2s = 0x0b,
++    DW_OP_const4u = 0x0c,
++    DW_OP_const4s = 0x0d,
++    DW_OP_const8u = 0x0e,
++    DW_OP_const8s = 0x0f,
++    DW_OP_constu = 0x10,
++    DW_OP_consts = 0x11,
++    DW_OP_dup = 0x12,
++    DW_OP_drop = 0x13,
++    DW_OP_over = 0x14,
++    DW_OP_pick = 0x15,
++    DW_OP_swap = 0x16,
++    DW_OP_rot = 0x17,
++    DW_OP_xderef = 0x18,
++    DW_OP_abs = 0x19,
++    DW_OP_and = 0x1a,
++    DW_OP_div = 0x1b,
++    DW_OP_minus = 0x1c,
++    DW_OP_mod = 0x1d,
++    DW_OP_mul = 0x1e,
++    DW_OP_neg = 0x1f,
++    DW_OP_not = 0x20,
++    DW_OP_or = 0x21,
++    DW_OP_plus = 0x22,
++    DW_OP_plus_uconst = 0x23,
++    DW_OP_shl = 0x24,
++    DW_OP_shr = 0x25,
++    DW_OP_shra = 0x26,
++    DW_OP_xor = 0x27,
++    DW_OP_bra = 0x28,
++    DW_OP_eq = 0x29,
++    DW_OP_ge = 0x2a,
++    DW_OP_gt = 0x2b,
++    DW_OP_le = 0x2c,
++    DW_OP_lt = 0x2d,
++    DW_OP_ne = 0x2e,
++    DW_OP_skip = 0x2f,
++    DW_OP_lit0 = 0x30,
++    DW_OP_lit1 = 0x31,
++    DW_OP_lit2 = 0x32,
++    DW_OP_lit3 = 0x33,
++    DW_OP_lit4 = 0x34,
++    DW_OP_lit5 = 0x35,
++    DW_OP_lit6 = 0x36,
++    DW_OP_lit7 = 0x37,
++    DW_OP_lit8 = 0x38,
++    DW_OP_lit9 = 0x39,
++    DW_OP_lit10 = 0x3a,
++    DW_OP_lit11 = 0x3b,
++    DW_OP_lit12 = 0x3c,
++    DW_OP_lit13 = 0x3d,
++    DW_OP_lit14 = 0x3e,
++    DW_OP_lit15 = 0x3f,
++    DW_OP_lit16 = 0x40,
++    DW_OP_lit17 = 0x41,
++    DW_OP_lit18 = 0x42,
++    DW_OP_lit19 = 0x43,
++    DW_OP_lit20 = 0x44,
++    DW_OP_lit21 = 0x45,
++    DW_OP_lit22 = 0x46,
++    DW_OP_lit23 = 0x47,
++    DW_OP_lit24 = 0x48,
++    DW_OP_lit25 = 0x49,
++    DW_OP_lit26 = 0x4a,
++    DW_OP_lit27 = 0x4b,
++    DW_OP_lit28 = 0x4c,
++    DW_OP_lit29 = 0x4d,
++    DW_OP_lit30 = 0x4e,
++    DW_OP_lit31 = 0x4f,
++    DW_OP_reg0 = 0x50,
++    DW_OP_reg1 = 0x51,
++    DW_OP_reg2 = 0x52,
++    DW_OP_reg3 = 0x53,
++    DW_OP_reg4 = 0x54,
++    DW_OP_reg5 = 0x55,
++    DW_OP_reg6 = 0x56,
++    DW_OP_reg7 = 0x57,
++    DW_OP_reg8 = 0x58,
++    DW_OP_reg9 = 0x59,
++    DW_OP_reg10 = 0x5a,
++    DW_OP_reg11 = 0x5b,
++    DW_OP_reg12 = 0x5c,
++    DW_OP_reg13 = 0x5d,
++    DW_OP_reg14 = 0x5e,
++    DW_OP_reg15 = 0x5f,
++    DW_OP_reg16 = 0x60,
++    DW_OP_reg17 = 0x61,
++    DW_OP_reg18 = 0x62,
++    DW_OP_reg19 = 0x63,
++    DW_OP_reg20 = 0x64,
++    DW_OP_reg21 = 0x65,
++    DW_OP_reg22 = 0x66,
++    DW_OP_reg23 = 0x67,
++    DW_OP_reg24 = 0x68,
++    DW_OP_reg25 = 0x69,
++    DW_OP_reg26 = 0x6a,
++    DW_OP_reg27 = 0x6b,
++    DW_OP_reg28 = 0x6c,
++    DW_OP_reg29 = 0x6d,
++    DW_OP_reg30 = 0x6e,
++    DW_OP_reg31 = 0x6f,
++    DW_OP_breg0 = 0x70,
++    DW_OP_breg1 = 0x71,
++    DW_OP_breg2 = 0x72,
++    DW_OP_breg3 = 0x73,
++    DW_OP_breg4 = 0x74,
++    DW_OP_breg5 = 0x75,
++    DW_OP_breg6 = 0x76,
++    DW_OP_breg7 = 0x77,
++    DW_OP_breg8 = 0x78,
++    DW_OP_breg9 = 0x79,
++    DW_OP_breg10 = 0x7a,
++    DW_OP_breg11 = 0x7b,
++    DW_OP_breg12 = 0x7c,
++    DW_OP_breg13 = 0x7d,
++    DW_OP_breg14 = 0x7e,
++    DW_OP_breg15 = 0x7f,
++    DW_OP_breg16 = 0x80,
++    DW_OP_breg17 = 0x81,
++    DW_OP_breg18 = 0x82,
++    DW_OP_breg19 = 0x83,
++    DW_OP_breg20 = 0x84,
++    DW_OP_breg21 = 0x85,
++    DW_OP_breg22 = 0x86,
++    DW_OP_breg23 = 0x87,
++    DW_OP_breg24 = 0x88,
++    DW_OP_breg25 = 0x89,
++    DW_OP_breg26 = 0x8a,
++    DW_OP_breg27 = 0x8b,
++    DW_OP_breg28 = 0x8c,
++    DW_OP_breg29 = 0x8d,
++    DW_OP_breg30 = 0x8e,
++    DW_OP_breg31 = 0x8f,
++    DW_OP_regx = 0x90,
++    DW_OP_fbreg = 0x91,
++    DW_OP_bregx = 0x92,
++    DW_OP_piece = 0x93,
++    DW_OP_deref_size = 0x94,
++    DW_OP_xderef_size = 0x95,
++    DW_OP_nop = 0x96,
++    /* DWARF 3 extensions.  */
++    DW_OP_push_object_address = 0x97,
++    DW_OP_call2 = 0x98,
++    DW_OP_call4 = 0x99,
++    DW_OP_call_ref = 0x9a,
++    /* GNU extensions.  */
++    DW_OP_GNU_push_tls_address = 0xe0,
++    /* HP extensions.  */
++    DW_OP_HP_unknown     = 0xe0, /* Ouch, the same as GNU_push_tls_address.  */
++    DW_OP_HP_is_value    = 0xe1,
++    DW_OP_HP_fltconst4   = 0xe2,
++    DW_OP_HP_fltconst8   = 0xe3,
++    DW_OP_HP_mod_range   = 0xe4,
++    DW_OP_HP_unmod_range = 0xe5,
++    DW_OP_HP_tls         = 0xe6
++  };
++
++#define DW_OP_lo_user 0xe0    /* Implementation-defined range start.  */
++#define DW_OP_hi_user 0xff    /* Implementation-defined range end.  */
++
++/* Type encodings.  */
++enum dwarf_type
++  {
++    DW_ATE_void = 0x0,
++    DW_ATE_address = 0x1,
++    DW_ATE_boolean = 0x2,
++    DW_ATE_complex_float = 0x3,
++    DW_ATE_float = 0x4,
++    DW_ATE_signed = 0x5,
++    DW_ATE_signed_char = 0x6,
++    DW_ATE_unsigned = 0x7,
++    DW_ATE_unsigned_char = 0x8,
++    /* DWARF 3.  */
++    DW_ATE_imaginary_float = 0x9,
++    /* HP extensions.  */
++    DW_ATE_HP_float80            = 0x80, /* Floating-point (80 bit).  */
++    DW_ATE_HP_complex_float80    = 0x81, /* Complex floating-point (80 bit).  */
++    DW_ATE_HP_float128           = 0x82, /* Floating-point (128 bit).  */
++    DW_ATE_HP_complex_float128   = 0x83, /* Complex floating-point (128 bit).  */
++    DW_ATE_HP_floathpintel       = 0x84, /* Floating-point (82 bit IA64).  */
++    DW_ATE_HP_imaginary_float80  = 0x85,
++    DW_ATE_HP_imaginary_float128 = 0x86
++  };
++
++#define       DW_ATE_lo_user 0x80
++#define       DW_ATE_hi_user 0xff
++
++/* Array ordering names and codes.  */
++enum dwarf_array_dim_ordering
++  {
++    DW_ORD_row_major = 0,
++    DW_ORD_col_major = 1
++  };
++
++/* Access attribute.  */
++enum dwarf_access_attribute
++  {
++    DW_ACCESS_public = 1,
++    DW_ACCESS_protected = 2,
++    DW_ACCESS_private = 3
++  };
++
++/* Visibility.  */
++enum dwarf_visibility_attribute
++  {
++    DW_VIS_local = 1,
++    DW_VIS_exported = 2,
++    DW_VIS_qualified = 3
++  };
++
++/* Virtuality.  */
++enum dwarf_virtuality_attribute
++  {
++    DW_VIRTUALITY_none = 0,
++    DW_VIRTUALITY_virtual = 1,
++    DW_VIRTUALITY_pure_virtual = 2
++  };
++
++/* Case sensitivity.  */
++enum dwarf_id_case
++  {
++    DW_ID_case_sensitive = 0,
++    DW_ID_up_case = 1,
++    DW_ID_down_case = 2,
++    DW_ID_case_insensitive = 3
++  };
++
++/* Calling convention.  */
++enum dwarf_calling_convention
++  {
++    DW_CC_normal = 0x1,
++    DW_CC_program = 0x2,
++    DW_CC_nocall = 0x3
++  };
++
++#define DW_CC_lo_user 0x40
++#define DW_CC_hi_user 0xff
++
++/* Inline attribute.  */
++enum dwarf_inline_attribute
++  {
++    DW_INL_not_inlined = 0,
++    DW_INL_inlined = 1,
++    DW_INL_declared_not_inlined = 2,
++    DW_INL_declared_inlined = 3
++  };
++
++/* Discriminant lists.  */
++enum dwarf_discrim_list
++  {
++    DW_DSC_label = 0,
++    DW_DSC_range = 1
++  };
++
++/* Line number opcodes.  */
++enum dwarf_line_number_ops
++  {
++    DW_LNS_extended_op = 0,
++    DW_LNS_copy = 1,
++    DW_LNS_advance_pc = 2,
++    DW_LNS_advance_line = 3,
++    DW_LNS_set_file = 4,
++    DW_LNS_set_column = 5,
++    DW_LNS_negate_stmt = 6,
++    DW_LNS_set_basic_block = 7,
++    DW_LNS_const_add_pc = 8,
++    DW_LNS_fixed_advance_pc = 9,
++    /* DWARF 3.  */
++    DW_LNS_set_prologue_end = 10,
++    DW_LNS_set_epilogue_begin = 11,
++    DW_LNS_set_isa = 12
++  };
++
++/* Line number extended opcodes.  */
++enum dwarf_line_number_x_ops
++  {
++    DW_LNE_end_sequence = 1,
++    DW_LNE_set_address = 2,
++    DW_LNE_define_file = 3,
++    /* HP extensions.  */
++    DW_LNE_HP_negate_is_UV_update      = 0x11,
++    DW_LNE_HP_push_context             = 0x12,
++    DW_LNE_HP_pop_context              = 0x13,
++    DW_LNE_HP_set_file_line_column     = 0x14,
++    DW_LNE_HP_set_routine_name         = 0x15,
++    DW_LNE_HP_set_sequence             = 0x16,
++    DW_LNE_HP_negate_post_semantics    = 0x17,
++    DW_LNE_HP_negate_function_exit     = 0x18,
++    DW_LNE_HP_negate_front_end_logical = 0x19,
++    DW_LNE_HP_define_proc              = 0x20
++  };
++
++/* Call frame information.  */
++enum dwarf_call_frame_info
++  {
++    DW_CFA_advance_loc = 0x40,
++    DW_CFA_offset = 0x80,
++    DW_CFA_restore = 0xc0,
++    DW_CFA_nop = 0x00,
++    DW_CFA_set_loc = 0x01,
++    DW_CFA_advance_loc1 = 0x02,
++    DW_CFA_advance_loc2 = 0x03,
++    DW_CFA_advance_loc4 = 0x04,
++    DW_CFA_offset_extended = 0x05,
++    DW_CFA_restore_extended = 0x06,
++    DW_CFA_undefined = 0x07,
++    DW_CFA_same_value = 0x08,
++    DW_CFA_register = 0x09,
++    DW_CFA_remember_state = 0x0a,
++    DW_CFA_restore_state = 0x0b,
++    DW_CFA_def_cfa = 0x0c,
++    DW_CFA_def_cfa_register = 0x0d,
++    DW_CFA_def_cfa_offset = 0x0e,
++    /* DWARF 3.  */
++    DW_CFA_def_cfa_expression = 0x0f,
++    DW_CFA_expression = 0x10,
++    DW_CFA_offset_extended_sf = 0x11,
++    DW_CFA_def_cfa_sf = 0x12,
++    DW_CFA_def_cfa_offset_sf = 0x13,
++    /* SGI/MIPS specific.  */
++    DW_CFA_MIPS_advance_loc8 = 0x1d,
++    /* GNU extensions.  */
++    DW_CFA_GNU_window_save = 0x2d,
++    DW_CFA_GNU_args_size = 0x2e,
++    DW_CFA_GNU_negative_offset_extended = 0x2f
++  };
++
++#define DW_CIE_ID       0xffffffff
++#define DW_CIE_VERSION          1
++
++#define DW_CFA_extended   0
++#define DW_CFA_lo_user    0x1c
++#define DW_CFA_hi_user    0x3f
++
++#define DW_CHILDREN_no                     0x00
++#define DW_CHILDREN_yes                    0x01
++
++#define DW_ADDR_none          0
++
++/* Source language names and codes.  */
++enum dwarf_source_language
++  {
++    DW_LANG_C89 = 0x0001,
++    DW_LANG_C = 0x0002,
++    DW_LANG_Ada83 = 0x0003,
++    DW_LANG_C_plus_plus = 0x0004,
++    DW_LANG_Cobol74 = 0x0005,
++    DW_LANG_Cobol85 = 0x0006,
++    DW_LANG_Fortran77 = 0x0007,
++    DW_LANG_Fortran90 = 0x0008,
++    DW_LANG_Pascal83 = 0x0009,
++    DW_LANG_Modula2 = 0x000a,
++    DW_LANG_Java = 0x000b,
++    /* DWARF 3.  */
++    DW_LANG_C99 = 0x000c,
++    DW_LANG_Ada95 = 0x000d,
++    DW_LANG_Fortran95 = 0x000e,
++    /* MIPS.  */
++    DW_LANG_Mips_Assembler = 0x8001,
++    /* UPC.  */
++    DW_LANG_Upc = 0x8765
++  };
++
++#define DW_LANG_lo_user 0x8000        /* Implementation-defined range start.  */
++#define DW_LANG_hi_user 0xffff        /* Implementation-defined range end.  */
++
++/* Names and codes for macro information.  */
++enum dwarf_macinfo_record_type
++  {
++    DW_MACINFO_define = 1,
++    DW_MACINFO_undef = 2,
++    DW_MACINFO_start_file = 3,
++    DW_MACINFO_end_file = 4,
++    DW_MACINFO_vendor_ext = 255
++  };
++\f
++/* @@@ For use with GNU frame unwind information.  */
++
++#define DW_EH_PE_absptr               0x00
++#define DW_EH_PE_omit         0xff
++
++#define DW_EH_PE_uleb128      0x01
++#define DW_EH_PE_udata2               0x02
++#define DW_EH_PE_udata4               0x03
++#define DW_EH_PE_udata8               0x04
++#define DW_EH_PE_sleb128      0x09
++#define DW_EH_PE_sdata2               0x0A
++#define DW_EH_PE_sdata4               0x0B
++#define DW_EH_PE_sdata8               0x0C
++#define DW_EH_PE_signed               0x08
++
++#define DW_EH_PE_pcrel                0x10
++#define DW_EH_PE_textrel      0x20
++#define DW_EH_PE_datarel      0x30
++#define DW_EH_PE_funcrel      0x40
++#define DW_EH_PE_aligned      0x50
++
++#define DW_EH_PE_indirect     0x80
++
++#endif /* _ELF_DWARF2_H */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/linux/kgdb.h linux-2.6.18.kgdb/include/linux/kgdb.h
+--- linux-2.6.18/include/linux/kgdb.h  1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/linux/kgdb.h     2008-06-10 16:20:11.000000000 +0400
+@@ -0,0 +1,279 @@
++/*
++ * include/linux/kgdb.h
++ *
++ * This provides the hooks and functions that KGDB needs to share between
++ * the core, I/O and arch-specific portions.
++ *
++ * Author: Amit Kale <amitkale@linsyssoft.com> and
++ *         Tom Rini <trini@kernel.crashing.org>
++ *
++ * 2001-2004 (c) Amit S. Kale and 2003-2005 (c) MontaVista Software, Inc.
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of any
++ * kind, whether express or implied.
++ */
++#ifdef __KERNEL__
++#ifndef _KGDB_H_
++#define _KGDB_H_
++
++#include <asm/atomic.h>
++
++#ifdef CONFIG_KGDB
++#include <asm/kgdb.h>
++#include <linux/serial_8250.h>
++#include <linux/linkage.h>
++#include <linux/init.h>
++
++#ifndef CHECK_EXCEPTION_STACK
++#define CHECK_EXCEPTION_STACK()       1
++#endif
++
++struct tasklet_struct;
++struct pt_regs;
++struct task_struct;
++struct uart_port;
++
++#ifdef CONFIG_KGDB_CONSOLE
++extern struct console kgdbcons;
++#endif
++
++/* To enter the debugger explicitly. */
++extern void breakpoint(void);
++extern int kgdb_connected;
++extern int kgdb_may_fault;
++extern struct tasklet_struct kgdb_tasklet_breakpoint;
++
++extern atomic_t kgdb_setting_breakpoint;
++extern atomic_t cpu_doing_single_step;
++extern atomic_t kgdb_sync_softlockup[NR_CPUS];
++
++extern struct task_struct *kgdb_usethread, *kgdb_contthread;
++
++enum kgdb_bptype {
++      bp_breakpoint = '0',
++      bp_hardware_breakpoint,
++      bp_write_watchpoint,
++      bp_read_watchpoint,
++      bp_access_watchpoint
++};
++
++enum kgdb_bpstate {
++      bp_none = 0,
++      bp_removed,
++      bp_set,
++      bp_active
++};
++
++struct kgdb_bkpt {
++      unsigned long bpt_addr;
++      unsigned char saved_instr[BREAK_INSTR_SIZE];
++      enum kgdb_bptype type;
++      enum kgdb_bpstate state;
++};
++
++/* The maximum number of KGDB I/O modules that can be loaded */
++#define MAX_KGDB_IO_HANDLERS 3
++
++#ifndef MAX_BREAKPOINTS
++#define MAX_BREAKPOINTS               1000
++#endif
++
++#define KGDB_HW_BREAKPOINT    1
++
++/* Required functions. */
++/**
++ *    regs_to_gdb_regs - Convert ptrace regs to GDB regs
++ *    @gdb_regs: A pointer to hold the registers in the order GDB wants.
++ *    @regs: The &struct pt_regs of the current process.
++ *
++ *    Convert the pt_regs in @regs into the format for registers that
++ *    GDB expects, stored in @gdb_regs.
++ */
++extern void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs);
++
++/**
++ *    sleeping_thread_to_gdb_regs - Convert a sleeping thread's regs to GDB regs
++ *    @gdb_regs: A pointer to hold the registers in the order GDB wants.
++ *    @p: The &struct task_struct of the desired process.
++ *
++ *    Convert the register values of the sleeping process in @p to
++ *    the format that GDB expects.
++ *    This function is called when kgdb does not have access to the
++ *    &struct pt_regs and therefore it should fill the gdb registers
++ *    @gdb_regs with what has been saved in &struct thread_struct
++ *    thread field during switch_to.
++ */
++extern void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs,
++                                      struct task_struct *p);
++
++/**
++ *    gdb_regs_to_regs - Convert GDB regs to ptrace regs.
++ *    @gdb_regs: A pointer to hold the registers we've received from GDB.
++ *    @regs: A pointer to a &struct pt_regs to hold these values in.
++ *
++ *    Convert the GDB regs in @gdb_regs into the pt_regs, and store them
++ *    in @regs.
++ */
++extern void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs);
++
++/**
++ *    kgdb_arch_handle_exception - Handle architecture specific GDB packets.
++ *    @vector: The error vector of the exception that happened.
++ *    @signo: The signal number of the exception that happened.
++ *    @err_code: The error code of the exception that happened.
++ *    @remcom_in_buffer: The buffer of the packet we have read.
++ *    @remcom_out_buffer: The buffer, of %BUFMAX to write a packet into.
++ *    @regs: The &struct pt_regs of the current process.
++ *
++ *    This function MUST handle the 'c' and 's' command packets,
++ *    as well as packets to set / remove a hardware breakpoint, if used.
++ *    If there are additional packets which the hardware needs to handle,
++ *    they are handled here.  The code should return -1 if it wants to
++ *    process more packets, and a %0 or %1 if it wants to exit from the
++ *    kgdb hook.
++ */
++extern int kgdb_arch_handle_exception(int vector, int signo, int err_code,
++                                    char *remcom_in_buffer,
++                                    char *remcom_out_buffer,
++                                    struct pt_regs *regs);
++
++#ifndef JMP_REGS_ALIGNMENT
++#define JMP_REGS_ALIGNMENT
++#endif
++
++extern unsigned long kgdb_fault_jmp_regs[];
++
++/**
++ *    kgdb_fault_setjmp - Store state in case we fault.
++ *    @curr_context: An array to store state into.
++ *
++ *    Certain functions may try and access memory, and in doing so may
++ *    cause a fault.  When this happens, we trap it, restore state to
++ *    this call, and let ourself know that something bad has happened.
++ */
++extern asmlinkage int kgdb_fault_setjmp(unsigned long *curr_context);
++
++/**
++ *    kgdb_fault_longjmp - Restore state when we have faulted.
++ *    @curr_context: The previously stored state.
++ *
++ *    When something bad does happen, this function is called to
++ *    restore the known good state, and set the return value to 1, so
++ *    we know something bad happened.
++ */
++extern asmlinkage void kgdb_fault_longjmp(unsigned long *curr_context);
++
++/* Optional functions. */
++extern int kgdb_arch_init(void);
++extern void kgdb_disable_hw_debug(struct pt_regs *regs);
++extern void kgdb_post_master_code(struct pt_regs *regs, int e_vector,
++                                int err_code);
++extern void kgdb_roundup_cpus(unsigned long flags);
++extern int kgdb_set_hw_break(unsigned long addr);
++extern int kgdb_remove_hw_break(unsigned long addr);
++extern void kgdb_remove_all_hw_break(void);
++extern void kgdb_correct_hw_break(void);
++extern void kgdb_shadowinfo(struct pt_regs *regs, char *buffer,
++                          unsigned threadid);
++extern struct task_struct *kgdb_get_shadow_thread(struct pt_regs *regs,
++                                                int threadid);
++extern struct pt_regs *kgdb_shadow_regs(struct pt_regs *regs, int threadid);
++extern int kgdb_validate_break_address(unsigned long addr);
++extern int kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr);
++extern int kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle);
++
++/**
++ * struct kgdb_arch - Describe architecture specific values.
++ * @gdb_bpt_instr: The instruction to trigger a breakpoint.
++ * @flags: Flags for the breakpoint, currently just %KGDB_HW_BREAKPOINT.
++ * @shadowth: A value of %1 indicates we shadow information on processes.
++ * @set_breakpoint: Allow an architecture to specify how to set a software
++ * breakpoint.
++ * @remove_breakpoint: Allow an architecture to specify how to remove a
++ * software breakpoint.
++ * @set_hw_breakpoint: Allow an architecture to specify how to set a hardware
++ * breakpoint.
++ * @remove_hw_breakpoint: Allow an architecture to specify how to remove a
++ * hardware breakpoint.
++ *
++ * The @shadowth flag is an option to shadow information not retrievable by
++ * gdb otherwise.  This is deprecated in favor of a binutils which supports
++ * CFI macros.
++ */
++struct kgdb_arch {
++      unsigned char gdb_bpt_instr[BREAK_INSTR_SIZE];
++      unsigned long flags;
++      unsigned shadowth;
++      int (*set_breakpoint) (unsigned long, char *);
++      int (*remove_breakpoint)(unsigned long, char *);
++      int (*set_hw_breakpoint)(unsigned long, int, enum kgdb_bptype);
++      int (*remove_hw_breakpoint)(unsigned long, int, enum kgdb_bptype);
++};
++
++/* Thread reference */
++typedef unsigned char threadref[8];
++
++/**
++ * struct kgdb_io - Describe the interface for an I/O driver to talk with KGDB.
++ * @read_char: Pointer to a function that will return one char.
++ * @write_char: Pointer to a function that will write one char.
++ * @flush: Pointer to a function that will flush any pending writes.
++ * @init: Pointer to a function that will initialize the device.
++ * @late_init: Pointer to a function that will do any setup that has
++ * other dependencies.
++ * @pre_exception: Pointer to a function that will do any prep work for
++ * the I/O driver.
++ * @post_exception: Pointer to a function that will do any cleanup work
++ * for the I/O driver.
++ *
++ * The @init and @late_init function pointers allow for an I/O driver
++ * such as a serial driver to fully initialize the port with @init and
++ * be called very early, yet safely call request_irq() later in the boot
++ * sequence.
++ *
++ * @init is allowed to return a non-0 return value to indicate failure.
++ * If this is called early on, then KGDB will try again when it would call
++ * @late_init.  If it has failed later in boot as well, the user will be
++ * notified.
++ */
++struct kgdb_io {
++      int (*read_char) (void);
++      void (*write_char) (u8);
++      void (*flush) (void);
++      int (*init) (void);
++      void (*late_init) (void);
++      void (*pre_exception) (void);
++      void (*post_exception) (void);
++};
++
++extern struct kgdb_io kgdb_io_ops;
++extern struct kgdb_arch arch_kgdb_ops;
++extern int kgdb_initialized;
++
++extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops);
++extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops);
++
++extern void __init kgdb8250_add_port(int i, struct uart_port *serial_req);
++extern void __init kgdb8250_add_platform_port(int i, struct plat_serial8250_port *serial_req);
++
++extern int kgdb_hex2long(char **ptr, long *long_val);
++extern char *kgdb_mem2hex(char *mem, char *buf, int count);
++extern char *kgdb_hex2mem(char *buf, char *mem, int count);
++extern int kgdb_get_mem(char *addr, unsigned char *buf, int count);
++extern int kgdb_set_mem(char *addr, unsigned char *buf, int count);
++
++int kgdb_isremovedbreak(unsigned long addr);
++int kgdb_skipexception(int exception, struct pt_regs *regs);
++
++extern int kgdb_handle_exception(int ex_vector, int signo, int err_code,
++                              struct pt_regs *regs);
++extern void kgdb_nmihook(int cpu, void *regs);
++extern int debugger_step;
++extern atomic_t debugger_active;
++extern struct kgdb_arch *kgdb_ops;
++#else
++/* Stubs for when KGDB is not set. */
++static const atomic_t debugger_active = ATOMIC_INIT(0);
++#endif                                /* CONFIG_KGDB */
++#endif                                /* _KGDB_H_ */
++#endif                                /* __KERNEL__ */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/linux/module.h linux-2.6.18.kgdb/include/linux/module.h
+--- linux-2.6.18/include/linux/module.h        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/include/linux/module.h   2008-06-10 16:20:07.000000000 +0400
+@@ -224,8 +224,17 @@ enum module_state
+       MODULE_STATE_LIVE,
+       MODULE_STATE_COMING,
+       MODULE_STATE_GOING,
++      MODULE_STATE_GONE,
+ };
+ 
++#ifdef CONFIG_KGDB
++#define MAX_SECTNAME 31
++struct mod_section {
++       void *address;
++       char name[MAX_SECTNAME + 1];
++};
++#endif
++
+ /* Similar stuff for section attributes. */
+ #define MODULE_SECT_NAME_LEN 32
+ struct module_sect_attr
+@@ -253,6 +262,13 @@ struct module
+       /* Unique handle for this module */
+       char name[MODULE_NAME_LEN];
+ 
++#ifdef CONFIG_KGDB
++      /* keep kgdb info at the beginning so that gdb doesn't have a chance to
++       * miss out any fields */
++      unsigned long num_sections;
++      struct mod_section *mod_sections;
++#endif
++
+       /* Sysfs stuff. */
+       struct module_kobject mkobj;
+       struct module_param_attrs *param_attrs;
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/linux/netpoll.h linux-2.6.18.kgdb/include/linux/netpoll.h
+--- linux-2.6.18/include/linux/netpoll.h       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/include/linux/netpoll.h  2008-06-10 16:19:07.000000000 +0400
+@@ -17,7 +17,7 @@ struct netpoll;
+ struct netpoll {
+       struct net_device *dev;
+       char dev_name[16], *name;
+-      void (*rx_hook)(struct netpoll *, int, char *, int);
++      void (*rx_hook)(struct netpoll *, int, char *, int, struct sk_buff *);
+       void (*drop)(struct sk_buff *skb);
+       u32 local_ip, remote_ip;
+       u16 local_port, remote_port;
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/linux/serial_8250.h linux-2.6.18.kgdb/include/linux/serial_8250.h
+--- linux-2.6.18/include/linux/serial_8250.h   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/include/linux/serial_8250.h      2008-06-10 16:19:03.000000000 +0400
+@@ -56,6 +56,7 @@ struct uart_port;
+ 
+ int serial8250_register_port(struct uart_port *);
+ void serial8250_unregister_port(int line);
++void serial8250_unregister_by_port(struct uart_port *port);
+ void serial8250_suspend_port(int line);
+ void serial8250_resume_port(int line);
+ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/kernel/Makefile linux-2.6.18.kgdb/kernel/Makefile
+--- linux-2.6.18/kernel/Makefile       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/kernel/Makefile  2008-06-10 16:18:58.000000000 +0400
+@@ -42,6 +42,7 @@ obj-$(CONFIG_STOP_MACHINE) += stop_machi
+ obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
+ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
+ obj-$(CONFIG_KPROBES) += kprobes.o
++obj-$(CONFIG_KGDB) += kgdb.o kgdbarchlib.o
+ obj-$(CONFIG_SYSFS) += ksysfs.o
+ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/kernel/kgdb.c linux-2.6.18.kgdb/kernel/kgdb.c
+--- linux-2.6.18/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/kernel/kgdb.c    2008-06-10 16:20:11.000000000 +0400
+@@ -0,0 +1,1778 @@
++/*
++ * kernel/kgdb.c
++ *
++ * Maintainer: Tom Rini <trini@kernel.crashing.org>
++ *
++ * Copyright (C) 2000-2001 VERITAS Software Corporation.
++ * Copyright (C) 2002-2004 Timesys Corporation
++ * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
++ * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
++ * Copyright (C) 2004-2005 Tom Rini <trini@kernel.crashing.org>
++ * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
++ * Copyright (C) 2005 Wind River Systems, Inc.
++ *
++ * Contributors at various stages not listed above:
++ *  Jason Wessel ( jason.wessel@windriver.com )
++ *  George Anzinger <george@mvista.com>
++ *  Anurekh Saxena (anurekh.saxena@timesys.com)
++ *  Lake Stevens Instrument Division (Glenn Engel)
++ *  Jim Kingdon, Cygnus Support.
++ *
++ * Original KGDB stub: David Grothe <dave@gcom.com>,
++ * Tigran Aivazian <tigran@sco.com>
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of any
++ * kind, whether express or implied.
++ */
++
++#include <linux/string.h>
++#include <linux/kernel.h>
++#include <linux/interrupt.h>
++#include <linux/sched.h>
++#include <linux/smp.h>
++#include <linux/spinlock.h>
++#include <linux/delay.h>
++#include <linux/mm.h>
++#include <linux/threads.h>
++#include <linux/reboot.h>
++#include <asm/system.h>
++#include <asm/ptrace.h>
++#include <asm/uaccess.h>
++#include <linux/kgdb.h>
++#include <asm/atomic.h>
++#include <linux/notifier.h>
++#include <linux/module.h>
++#include <asm/cacheflush.h>
++#include <linux/init.h>
++#include <linux/sysrq.h>
++#include <linux/console.h>
++#include <linux/sched.h>
++#include <asm/byteorder.h>
++
++extern int pid_max;
++/* How many times to count all of the waiting CPUs */
++#define ROUNDUP_WAIT          640000  /* Arbitrary, increase if needed. */
++#define BUF_THREAD_ID_SIZE    16
++
++/*
++ * kgdb_initialized with a value of 1 indicates that kgdb is setup and is
++ * all ready to serve breakpoints and other kernel exceptions.  A value of
++ * -1 indicates that we have tried to initialize early, and need to try
++ * again later.
++ */
++int kgdb_initialized;
++/* Is a host GDB connected to us? */
++int kgdb_connected;
++/* Could we be about to try and access a bad memory location? If so we
++ * also need to flag that this has happened. */
++int kgdb_may_fault;
++/* All the KGDB handlers are installed */
++int kgdb_from_module_registered = 0;
++
++/* We provide a kgdb_io_ops structure that may be overridden. */
++struct kgdb_io __attribute__ ((weak)) kgdb_io_ops;
++
++static struct kgdb_io kgdb_io_ops_prev[MAX_KGDB_IO_HANDLERS];
++static int kgdb_io_handler_cnt = 0;
++
++/* Export the following symbols for use with kernel modules */
++EXPORT_SYMBOL(kgdb_io_ops);
++EXPORT_SYMBOL(kgdb_tasklet_breakpoint);
++EXPORT_SYMBOL(kgdb_connected);
++EXPORT_SYMBOL(kgdb_register_io_module);
++EXPORT_SYMBOL(kgdb_unregister_io_module);
++EXPORT_SYMBOL(debugger_active);
++
++/*
++ * Holds information about breakpoints in a kernel. These breakpoints are
++ * added and removed by gdb.
++ */
++struct kgdb_bkpt kgdb_break[MAX_BREAKPOINTS];
++
++static const char hexchars[] = "0123456789abcdef";
++
++static spinlock_t slavecpulocks[NR_CPUS];
++static atomic_t procindebug[NR_CPUS];
++atomic_t kgdb_setting_breakpoint;
++EXPORT_SYMBOL(kgdb_setting_breakpoint);
++struct task_struct *kgdb_usethread, *kgdb_contthread;
++
++int debugger_step;
++atomic_t debugger_active;
++
++/* Our I/O buffers. */
++static char remcom_in_buffer[BUFMAX];
++static char remcom_out_buffer[BUFMAX];
++/* Storage for the registers, in GDB format. */
++static unsigned long gdb_regs[(NUMREGBYTES + sizeof(unsigned long) - 1) /
++                            sizeof(unsigned long)];
++/* Storage of registers for handling a fault. */
++unsigned long kgdb_fault_jmp_regs[NUMCRITREGBYTES / sizeof(unsigned long)]
++ JMP_REGS_ALIGNMENT;
++static int kgdb_notify_reboot(struct notifier_block *this,
++                              unsigned long code ,void *x);
++struct debuggerinfo_struct {
++      void *debuggerinfo;
++      struct task_struct *task;
++} kgdb_info[NR_CPUS];
++
++/* to keep track of the CPU which is doing the single stepping*/
++atomic_t cpu_doing_single_step = ATOMIC_INIT(-1);
++
++atomic_t  kgdb_sync_softlockup[NR_CPUS] = {ATOMIC_INIT(0)};
++
++/* reboot notifier block */
++static struct notifier_block kgdb_reboot_notifier = {
++      .notifier_call  = kgdb_notify_reboot,
++      .next           = NULL,
++      .priority       = INT_MAX,
++};
++
++static int hex(char ch)       /* value of one hex digit, or -1 if not one */
++{
++      if ((ch >= 'a') && (ch <= 'f'))
++              return (ch - 'a' + 10);
++      if ((ch >= '0') && (ch <= '9'))
++              return (ch - '0');
++      if ((ch >= 'A') && (ch <= 'F'))
++              return (ch - 'A' + 10);
++      return (-1);            /* callers shift/OR this in unchecked */
++}
++
++/* scan for the sequence $<data>#<checksum>   */
++static void get_packet(char *buffer)
++{
++      unsigned char checksum;
++      unsigned char xmitcsum;
++      int count;
++      char ch;
++      if (!kgdb_io_ops.read_char)
++              return;
++      do {
++              /* Spin and wait around for the start character, ignore all
++               * other characters */
++              while ((ch = (kgdb_io_ops.read_char())) != '$') ;
++              kgdb_connected = 1;
++              checksum = 0;
++              xmitcsum = -1;
++
++              count = 0;
++
++              /* now, read until a # or end of buffer is found */
++              while (count < (BUFMAX - 1)) {
++                      ch = kgdb_io_ops.read_char();
++                      if (ch == '#')
++                              break;
++                      checksum = checksum + ch;
++                      buffer[count] = ch;
++                      count = count + 1;
++              }
++              buffer[count] = 0;
++
++              if (ch == '#') {
++                      xmitcsum = hex(kgdb_io_ops.read_char()) << 4;
++                      xmitcsum += hex(kgdb_io_ops.read_char());
++
++                      if (checksum != xmitcsum)
++                              /* failed checksum */
++                              kgdb_io_ops.write_char('-');
++                      else
++                              /* successful transfer */
++                              kgdb_io_ops.write_char('+');
++                      if (kgdb_io_ops.flush)
++                              kgdb_io_ops.flush();
++              }
++      } while (checksum != xmitcsum);
++}
++
++/*
++ * Send the packet in buffer.
++ * Check for gdb connection if asked for.
++ */
++static void put_packet(char *buffer)
++{
++      unsigned char checksum;
++      int count;
++      char ch;
++
++      if (!kgdb_io_ops.write_char)
++              return;
++      /* $<packet info>#<checksum>. */
++      while (1) {
++              kgdb_io_ops.write_char('$');
++              checksum = 0;
++              count = 0;
++
++              while ((ch = buffer[count])) {
++                      kgdb_io_ops.write_char(ch);
++                      checksum += ch;
++                      count++;
++              }
++
++              kgdb_io_ops.write_char('#');
++              kgdb_io_ops.write_char(hexchars[checksum >> 4]);
++              kgdb_io_ops.write_char(hexchars[checksum % 16]);
++              if (kgdb_io_ops.flush)
++                      kgdb_io_ops.flush();
++
++              /* Now see what we get in reply. */
++              ch = kgdb_io_ops.read_char();
++
++              if (ch == 3)
++                      ch = kgdb_io_ops.read_char();
++
++              /* If we get an ACK, we are done. */
++              if (ch == '+')
++                      return;
++
++              /* If we get the start of another packet, this means
++               * that GDB is attempting to reconnect.  We will NAK
++               * the packet being sent, and stop trying to send this
++               * packet. */
++              if (ch == '$') {
++                      kgdb_io_ops.write_char('-');
++                      if (kgdb_io_ops.flush)
++                              kgdb_io_ops.flush();
++                      return;
++              }
++      }
++}
++
++/*
++ * convert the memory pointed to by mem into hex, placing result in buf
++ * return a pointer to the last char put in buf (null). May return an error.
++ */
++char *kgdb_mem2hex(char *mem, char *buf, int count)
++{
++      kgdb_may_fault = 1;
++      if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) {
++              kgdb_may_fault = 0;
++              return ERR_PTR(-EINVAL);
++      }
++      /* Accessing some registers in a single load instruction is
++       * required to avoid bad side effects for some I/O registers.
++       */
++      if ((count == 2) && (((long)mem & 1) == 0)) {
++              unsigned short tmp_s = *(unsigned short *)mem;
++              mem += 2;
++#ifdef __BIG_ENDIAN
++              *buf++ = hexchars[(tmp_s >> 12) & 0xf];
++              *buf++ = hexchars[(tmp_s >> 8) & 0xf];
++              *buf++ = hexchars[(tmp_s >> 4) & 0xf];
++              *buf++ = hexchars[tmp_s & 0xf];
++#else
++              *buf++ = hexchars[(tmp_s >> 4) & 0xf];
++              *buf++ = hexchars[tmp_s & 0xf];
++              *buf++ = hexchars[(tmp_s >> 12) & 0xf];
++              *buf++ = hexchars[(tmp_s >> 8) & 0xf];
++#endif
++      } else if ((count == 4) && (((long)mem & 3) == 0)) {
++              unsigned long tmp_l = *(unsigned int *)mem;
++              mem += 4;
++#ifdef __BIG_ENDIAN
++              *buf++ = hexchars[(tmp_l >> 28) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 24) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 20) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 16) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 12) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 8) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 4) & 0xf];
++              *buf++ = hexchars[tmp_l & 0xf];
++#else
++              *buf++ = hexchars[(tmp_l >> 4) & 0xf];
++              *buf++ = hexchars[tmp_l & 0xf];
++              *buf++ = hexchars[(tmp_l >> 12) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 8) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 20) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 16) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 28) & 0xf];
++              *buf++ = hexchars[(tmp_l >> 24) & 0xf];
++#endif
++#ifdef CONFIG_64BIT
++      } else if ((count == 8) && (((long)mem & 7) == 0)) {
++              unsigned long long tmp_ll = *(unsigned long long *)mem;
++              mem += 8;
++#ifdef __BIG_ENDIAN
++              *buf++ = hexchars[(tmp_ll >> 60) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 56) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 52) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 48) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 44) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 40) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 36) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 32) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 28) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 24) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 20) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 16) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 12) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 8) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 4) & 0xf];
++              *buf++ = hexchars[tmp_ll & 0xf];
++#else
++              *buf++ = hexchars[(tmp_ll >> 4) & 0xf];
++              *buf++ = hexchars[tmp_ll & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 12) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 8) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 20) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 16) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 28) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 24) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 36) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 32) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 44) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 40) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 52) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 48) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 60) & 0xf];
++              *buf++ = hexchars[(tmp_ll >> 56) & 0xf];
++#endif
++#endif
++      } else {
++              while (count-- > 0) {
++                      unsigned char ch = *mem++;
++                      *buf++ = hexchars[ch >> 4];
++                      *buf++ = hexchars[ch & 0xf];
++              }
++      }
++      kgdb_may_fault = 0;
++      *buf = 0;
++      return (buf);
++}
++
++/*
++ * Copy the binary array pointed to by buf into mem.  Fix $, #, and
++ * 0x7d escaped with 0x7d.  Return a pointer to the character after
++ * the last byte written.
++ */
++static char *kgdb_ebin2mem(char *buf, char *mem, int count)
++{
++      kgdb_may_fault = 1;
++      if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) {
++              kgdb_may_fault = 0;
++              return ERR_PTR(-EINVAL);
++      }
++      for (; count > 0; count--, buf++) {
++              if (*buf == 0x7d)
++                      *mem++ = *(++buf) ^ 0x20;
++              else
++                      *mem++ = *buf;
++      }
++      kgdb_may_fault = 0;
++      return mem;
++}
++
++/*
++ * convert the hex array pointed to by buf into binary to be placed in mem
++ * return a pointer to the character AFTER the last byte written
++ * May return an error.
++ */
++char *kgdb_hex2mem(char *buf, char *mem, int count)
++{
++      kgdb_may_fault = 1;
++      if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) {
++              kgdb_may_fault = 0;
++              return ERR_PTR(-EINVAL);
++      }
++      if ((count == 2) && (((long)mem & 1) == 0)) {
++              unsigned short tmp_s = 0;
++#ifdef __BIG_ENDIAN
++              tmp_s |= hex(*buf++) << 12;
++              tmp_s |= hex(*buf++) << 8;
++              tmp_s |= hex(*buf++) << 4;
++              tmp_s |= hex(*buf++);
++#else
++              tmp_s |= hex(*buf++) << 4;
++              tmp_s |= hex(*buf++);
++              tmp_s |= hex(*buf++) << 12;
++              tmp_s |= hex(*buf++) << 8;
++#endif
++              *(unsigned short *)mem = tmp_s;
++              mem += 2;
++      } else if ((count == 4) && (((long)mem & 3) == 0)) {
++              unsigned long tmp_l = 0;
++#ifdef __BIG_ENDIAN
++              tmp_l |= hex(*buf++) << 28;
++              tmp_l |= hex(*buf++) << 24;
++              tmp_l |= hex(*buf++) << 20;
++              tmp_l |= hex(*buf++) << 16;
++              tmp_l |= hex(*buf++) << 12;
++              tmp_l |= hex(*buf++) << 8;
++              tmp_l |= hex(*buf++) << 4;
++              tmp_l |= hex(*buf++);
++#else
++              tmp_l |= hex(*buf++) << 4;
++              tmp_l |= hex(*buf++);
++              tmp_l |= hex(*buf++) << 12;
++              tmp_l |= hex(*buf++) << 8;
++              tmp_l |= hex(*buf++) << 20;
++              tmp_l |= hex(*buf++) << 16;
++              tmp_l |= hex(*buf++) << 28;
++              tmp_l |= hex(*buf++) << 24;
++#endif
++              *(unsigned long *)mem = tmp_l;
++              mem += 4;
++      } else {
++              int i;
++              for (i = 0; i < count; i++) {
++                      unsigned char ch = hex(*buf++) << 4;
++                      ch |= hex(*buf++);
++                      *mem++ = ch;
++              }
++      }
++      kgdb_may_fault = 0;
++      return (mem);
++}
++
++/*
++ * While we find nice hex chars, build a long_val.
++ * Return number of chars processed.
++ */
++int kgdb_hex2long(char **ptr, long *long_val)
++{
++      int hex_val, num = 0;
++
++      *long_val = 0;
++
++      while (**ptr) {
++              hex_val = hex(**ptr);
++              if (hex_val >= 0) {
++                      *long_val = (*long_val << 4) | hex_val;
++                      num++;
++              } else
++                      break;
++
++              (*ptr)++;
++      }
++
++      return (num);
++}
++
++/* Write memory due to an 'M' or 'X' packet; NULL on success, else ERR_PTR. */
++static char *write_mem_msg(int binary)
++{
++      char *ptr = &remcom_in_buffer[1];       /* skip the command letter */
++      unsigned long addr, length;
++
++      if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
++          kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
++              if (binary)
++                      ptr = kgdb_ebin2mem(ptr, (char *)addr, length);
++              else
++                      ptr = kgdb_hex2mem(ptr, (char *)addr, length);
++              if (IS_ERR(ptr))        /* never flush a range we faulted on */
++                      return ptr;
++              if (CACHE_FLUSH_IS_SAFE)
++                      flush_icache_range(addr, addr + length + 1);
++              return NULL;
++      }
++
++      return ERR_PTR(-EINVAL);        /* malformed "addr,length:" header */
++}
++
++static inline char *pack_hex_byte(char *pkt, int byte)
++{
++      *pkt++ = hexchars[(byte >> 4) & 0xf];
++      *pkt++ = hexchars[(byte & 0xf)];
++      return pkt;
++}
++
++static inline void error_packet(char *pkt, int error)
++{
++      error = -error;         /* callers pass a negative errno-style code */
++      pkt[0] = 'E';
++      pkt[1] = hexchars[(error / 10)];        /* decimal tens; needs error < 160 */
++      pkt[2] = hexchars[(error % 10)];        /* decimal units */
++      pkt[3] = '\0';
++}
++
++static char *pack_threadid(char *pkt, threadref * id)
++{
++      char *limit;
++      unsigned char *altid;
++
++      altid = (unsigned char *)id;
++      limit = pkt + BUF_THREAD_ID_SIZE;
++      while (pkt < limit)
++              pkt = pack_hex_byte(pkt, *altid++);
++
++      return pkt;
++}
++
++void int_to_threadref(threadref * id, int value)
++{
++      unsigned char *scan;
++      int i = 4;
++      /* threadref is 8 bytes: 4 zero bytes, then value most significant first */
++      scan = (unsigned char *)id;
++      while (i--)
++              *scan++ = 0;
++      *scan++ = (value >> 24) & 0xff;
++      *scan++ = (value >> 16) & 0xff;
++      *scan++ = (value >> 8) & 0xff;
++      *scan++ = (value & 0xff);
++}
++
++static struct task_struct *getthread(struct pt_regs *regs, int tid)
++{
++      if (last_pid == 0)
++              return current;
++
++      if (num_online_cpus() &&
++          (tid >= pid_max + num_online_cpus() + kgdb_ops->shadowth))
++              return NULL;
++
++      if (kgdb_ops->shadowth && (tid >= pid_max + num_online_cpus()))
++              return kgdb_get_shadow_thread(regs, tid - pid_max -
++                                            num_online_cpus());
++
++      if (tid >= pid_max)
++              return idle_task(tid - pid_max);
++
++      if (!tid)
++              return NULL;
++
++      return find_task_by_pid(tid);
++}
++
++#ifdef CONFIG_SMP
++static void kgdb_wait(struct pt_regs *regs)
++{
++      unsigned long flags;
++      int processor;
++
++      local_irq_save(flags);
++      processor = smp_processor_id();
++      kgdb_info[processor].debuggerinfo = regs;
++      kgdb_info[processor].task = current;
++      atomic_set(&procindebug[processor], 1);
++      atomic_set(&kgdb_sync_softlockup[smp_processor_id()], 1);
++
++      /* Wait till master processor goes completely into the debugger.
++       * FIXME: this looks racy */
++      while (!atomic_read(&procindebug[atomic_read(&debugger_active) - 1])) {
++              int i = 10;     /* an arbitrary number */
++
++              while (--i)
++                      cpu_relax();
++      }
++
++      /* Wait till master processor is done with debugging */
++      spin_lock_nested(&slavecpulocks[processor], processor);
++
++      /* This has been taken from x86 kgdb implementation and
++       * will be needed by architectures that have SMP support
++       */
++      kgdb_correct_hw_break();
++
++      kgdb_info[processor].debuggerinfo = NULL;
++      kgdb_info[processor].task = NULL;
++
++      /* Signal the master processor that we are done */
++      atomic_set(&procindebug[processor], 0);
++      spin_unlock(&slavecpulocks[processor]);
++      local_irq_restore(flags);
++}
++#endif
++
++/*
++ * Copy count bytes from addr into buf.  Returns 0, or -EINVAL on a fault
++ * or on reaching an address below TASK_SIZE (bytes before it are copied).
++ */
++int kgdb_get_mem(char *addr, unsigned char *buf, int count)
++{
++      kgdb_may_fault = 1;
++      if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) {
++              kgdb_may_fault = 0;
++              return -EINVAL;
++      }
++      for (; count && (unsigned long)addr >= TASK_SIZE; count--)
++              *buf++ = *addr++;
++      kgdb_may_fault = 0;     /* cleared on the bad-address exit as well */
++      return count ? -EINVAL : 0;
++}
++
++/*
++ * Copy count bytes from buf to addr.  Returns 0, or -EINVAL on a fault
++ * or on reaching an address below TASK_SIZE (bytes before it are written).
++ */
++int kgdb_set_mem(char *addr, unsigned char *buf, int count)
++{
++      kgdb_may_fault = 1;
++      if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) {
++              kgdb_may_fault = 0;
++              return -EINVAL;
++      }
++      for (; count && (unsigned long)addr >= TASK_SIZE; count--)
++              *addr++ = *buf++;
++      kgdb_may_fault = 0;     /* cleared on the bad-address exit as well */
++      return count ? -EINVAL : 0;
++}
++/*
++ * Plant every breakpoint that is in state bp_set and mark it bp_active.
++ *
++ * Stops at the first non-zero result from kgdb_arch_set_breakpoint() and
++ * returns it; returns 0 once the whole table has been walked.
++ */
++int kgdb_activate_sw_breakpoints(void)
++{
++      int slot;
++
++      for (slot = 0; slot < MAX_BREAKPOINTS; slot++) {
++              unsigned long bp_addr;
++              int rc;
++
++              if (kgdb_break[slot].state == bp_set) {
++                      bp_addr = kgdb_break[slot].bpt_addr;
++                      rc = kgdb_arch_set_breakpoint(bp_addr,
++                                      kgdb_break[slot].saved_instr);
++                      if (rc)
++                              return rc;
++
++                      /* Flush the patched instruction when it is safe. */
++                      if (CACHE_FLUSH_IS_SAFE && current->mm &&
++                                      bp_addr < TASK_SIZE)
++                              flush_cache_range(current->mm->mmap_cache,
++                                              bp_addr,
++                                              bp_addr + BREAK_INSTR_SIZE);
++                      else if (CACHE_FLUSH_IS_SAFE)
++                              flush_icache_range(bp_addr,
++                                              bp_addr + BREAK_INSTR_SIZE);
++
++                      kgdb_break[slot].state = bp_active;
++              }
++      }
++      return 0;
++}
++
++/*
++ * Record a software breakpoint request for addr in the kgdb_break table.
++ *
++ * Slot choice: a bp_removed entry for the same address is reused first,
++ * otherwise the first bp_none entry is taken.
++ *
++ * Returns 0 on success, the negative result of
++ * kgdb_validate_break_address() if that fails, -EEXIST if addr already
++ * has a bp_set entry, and -E2BIG if no slot is available.
++ */
++static int kgdb_set_sw_break(unsigned long addr)
++{
++      int slot;
++      int reuse_slot = -1;    /* first bp_removed entry for addr */
++      int free_slot = -1;     /* first bp_none entry */
++      int rc;
++
++      rc = kgdb_validate_break_address(addr);
++      if (rc < 0)
++              return rc;
++
++      /* The scan only reads the table, so one pass covers all checks. */
++      for (slot = 0; slot < MAX_BREAKPOINTS; slot++) {
++              if (kgdb_break[slot].state == bp_none) {
++                      if (free_slot == -1)
++                              free_slot = slot;
++                      continue;
++              }
++              if (kgdb_break[slot].bpt_addr != addr)
++                      continue;
++              if (kgdb_break[slot].state == bp_set)
++                      return -EEXIST;
++              if (kgdb_break[slot].state == bp_removed && reuse_slot == -1)
++                      reuse_slot = slot;
++      }
++
++      if (reuse_slot == -1)
++              reuse_slot = free_slot;
++      if (reuse_slot == -1)
++              return -E2BIG;
++
++      kgdb_break[reuse_slot].state = bp_set;
++      kgdb_break[reuse_slot].type = bp_breakpoint;
++      kgdb_break[reuse_slot].bpt_addr = addr;
++
++      return 0;
++}
++
++/*
++ * Lift every breakpoint that is in state bp_active and put it back into
++ * state bp_set.
++ *
++ * Stops at the first non-zero result from kgdb_arch_remove_breakpoint()
++ * and returns it; returns 0 once the whole table has been walked.
++ */
++int kgdb_deactivate_sw_breakpoints(void)
++{
++      int slot;
++
++      for (slot = 0; slot < MAX_BREAKPOINTS; slot++) {
++              unsigned long bp_addr;
++              int rc;
++
++              if (kgdb_break[slot].state != bp_active)
++                      continue;
++
++              bp_addr = kgdb_break[slot].bpt_addr;
++              rc = kgdb_arch_remove_breakpoint(bp_addr,
++                              kgdb_break[slot].saved_instr);
++              if (rc)
++                      return rc;
++
++              /* Flush the restored instruction when it is safe. */
++              if (CACHE_FLUSH_IS_SAFE) {
++                      if (current->mm && bp_addr < TASK_SIZE)
++                              flush_cache_range(current->mm->mmap_cache,
++                                              bp_addr,
++                                              bp_addr + BREAK_INSTR_SIZE);
++                      else
++                              flush_icache_range(bp_addr,
++                                              bp_addr + BREAK_INSTR_SIZE);
++              }
++              kgdb_break[slot].state = bp_set;
++      }
++      return 0;
++}
++
++/*
++ * Mark the first bp_set entry for addr as bp_removed.
++ *
++ * Returns 0 if such an entry was found, -ENOENT otherwise.
++ */
++static int kgdb_remove_sw_break(unsigned long addr)
++{
++      int slot;
++
++      for (slot = 0; slot < MAX_BREAKPOINTS; slot++) {
++              if (kgdb_break[slot].state != bp_set)
++                      continue;
++              if (kgdb_break[slot].bpt_addr != addr)
++                      continue;
++
++              kgdb_break[slot].state = bp_removed;
++              return 0;
++      }
++      return -ENOENT;
++}
++
++/*
++ * Report whether addr has an entry in state bp_removed.
++ *
++ * Returns 1 if one exists, 0 otherwise.
++ */
++int kgdb_isremovedbreak(unsigned long addr)
++{
++      int slot;
++
++      for (slot = 0; slot < MAX_BREAKPOINTS; slot++) {
++              if (kgdb_break[slot].state != bp_removed)
++                      continue;
++              if (kgdb_break[slot].bpt_addr == addr)
++                      return 1;
++      }
++      return 0;
++}
++
++/*
++ * Drop every software breakpoint in state bp_set (marking it bp_removed)
++ * and then clear the hardware breakpoints.
++ *
++ * Returns the first non-zero result from kgdb_arch_remove_breakpoint(),
++ * in which case the hardware breakpoints are left untouched; returns 0
++ * otherwise.
++ */
++int remove_all_break(void)
++{
++      int slot;
++
++      /* Memory breakpoints first. */
++      for (slot = 0; slot < MAX_BREAKPOINTS; slot++) {
++              unsigned long bp_addr;
++              int rc;
++
++              if (kgdb_break[slot].state == bp_set) {
++                      bp_addr = kgdb_break[slot].bpt_addr;
++                      rc = kgdb_arch_remove_breakpoint(bp_addr,
++                                      kgdb_break[slot].saved_instr);
++                      if (rc)
++                              return rc;
++                      kgdb_break[slot].state = bp_removed;
++              }
++      }
++
++      /* Then the hardware ones. */
++      kgdb_remove_all_hw_break();
++
++      return 0;
++}
++
++/*
++ * Map a pid to the thread id reported to gdb: a non-zero pid is returned
++ * unchanged, pid 0 becomes pid_max plus the current processor id.
++ */
++static inline int shadow_pid(int realpid)
++{
++      return realpid ? realpid : pid_max + smp_processor_id();
++}
++
++/* Scratch buffer for 'O' packets: the 'O', hex payload and terminator. */
++static char gdbmsgbuf[BUFMAX + 1];
++
++/*
++ * Send len bytes of s to the host as one or more 'O' (output) packets.
++ * Each byte is hex-encoded with pack_hex_byte(), so a single packet
++ * carries at most (BUFMAX - 2) / 2 bytes of the message.
++ */
++static void kgdb_msg_write(const char *s, int len)
++{
++      gdbmsgbuf[0] = 'O';
++
++      while (len > 0) {
++              char *out = gdbmsgbuf + 1;
++              int chunk;
++              int k;
++
++              /* Take as much as fits once every byte is doubled. */
++              chunk = ((len << 1) > (BUFMAX - 2)) ? (BUFMAX - 2) >> 1 : len;
++
++              for (k = 0; k < chunk; k++)
++                      out = pack_hex_byte(out, s[k]);
++              *out = '\0';
++
++              /* Advance past what was just encoded, then send it. */
++              s += chunk;
++              len -= chunk;
++
++              put_packet(gdbmsgbuf);
++      }
++}
++
++/*
++ * This function does all command processing for interfacing to gdb.
++ *
++ * ex_vector, signo, err_code and linux_regs describe the exception being
++ * reported; they are handed on to the arch hooks, and signo is encoded in
++ * the 'T' and 'S' stop replies.
++ *
++ * Returns 0 on a recursive entry or a skipped exception, otherwise the
++ * last value stored in error (normally the result of
++ * kgdb_arch_handle_exception()).
++ *
++ * Locking hierarchy:
++ *    interface locks, if any (begin_session)
++ *    kgdb lock (debugger_active)
++ *
++ * Note that since we can be in here prior to our cpumask being filled
++ * out, we err on the side of caution and loop over NR_CPUS instead
++ * of a for_each_online_cpu.
++ *
++ */
++int kgdb_handle_exception(int ex_vector, int signo, int err_code,
++                        struct pt_regs *linux_regs)
++{
++      unsigned long length, addr;
++      char *ptr;
++      unsigned long flags;
++      unsigned i;
++      long threadid;
++      threadref thref;
++      struct task_struct *thread = NULL;
++      unsigned procid;
++      int numshadowth = num_online_cpus() + kgdb_ops->shadowth;
++      long kgdb_usethreadid = 0;
++      int error = 0, all_cpus_synced = 0;
++      struct pt_regs *shadowregs;
++      int processor = smp_processor_id();
++      void *local_debuggerinfo;
++
++      /* Bail out on recursive debugger calls: this CPU already holds
++       * debugger_active. */
++      if (atomic_read(&debugger_active) == smp_processor_id() + 1)
++              return 0;
++
++      acquirelock:
++
++      /* Call the I/O drivers pre_exception routine if the I/O
++       * driver defined one
++       */
++      if (kgdb_io_ops.pre_exception)
++              kgdb_io_ops.pre_exception();
++
++      /*
++       * Interrupts will be restored by the 'trap return' code, except when
++       * single stepping.
++       */
++      local_irq_save(flags);
++
++      /* Hold debugger_active */
++      procid = smp_processor_id();
++
++      while (cmpxchg(&atomic_read(&debugger_active), 0, (procid + 1)) != 0) {
++              int i = 25;     /* an arbitrary number */
++
++              while (--i)
++                      cpu_relax();
++
++              if (atomic_read(&cpu_doing_single_step) != -1 &&
++                              atomic_read(&cpu_doing_single_step) != procid)
++                      udelay(1);
++      }
++
++      atomic_set(&kgdb_sync_softlockup[smp_processor_id()], 1);
++
++      /*
++       * Don't enter if the last instance of the exception handler wanted to
++       * come into the debugger again.
++       */
++      if (atomic_read(&cpu_doing_single_step) != -1 &&
++          atomic_read(&cpu_doing_single_step) != procid) {
++              atomic_set(&debugger_active, 0);
++              local_irq_restore(flags);
++              goto acquirelock;
++      }
++
++      /*
++      * Don't enter if we have hit a removed breakpoint.
++      */
++      if (kgdb_skipexception(ex_vector, linux_regs))
++              goto kgdb_restore;
++
++      kgdb_info[processor].debuggerinfo = linux_regs;
++      kgdb_info[processor].task = current;
++
++      kgdb_disable_hw_debug(linux_regs);
++
++      if (!debugger_step || !kgdb_contthread)
++              for (i = 0; i < NR_CPUS; i++)
++                      spin_lock_nested(&slavecpulocks[i], i);
++
++      /* Make sure we get the other CPUs */
++      if (!debugger_step || !kgdb_contthread)
++              kgdb_roundup_cpus(flags);
++
++      /* spin_lock code is good enough as a barrier so we don't
++       * need one here */
++      atomic_set(&procindebug[processor], 1);
++
++      /* Wait a reasonable time for the other CPUs to be notified and
++       * be waiting for us.  Very early on this could be imperfect
++       * as num_online_cpus() could be 0.*/
++      for (i = 0; i < ROUNDUP_WAIT; i++) {
++              int cpu, num = 0;
++              for (cpu = 0; cpu < NR_CPUS; cpu++) {
++                      if (atomic_read(&procindebug[cpu]))
++                              num++;
++              }
++              if (num >= num_online_cpus()) {
++                      all_cpus_synced = 1;
++                      break;
++              }
++      }
++
++      /* Clear the out buffer. */
++      memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
++
++      /* Master processor is completely in the debugger */
++      kgdb_post_master_code(linux_regs, ex_vector, err_code);
++      kgdb_deactivate_sw_breakpoints();
++      debugger_step = 0;
++      kgdb_contthread = NULL;
++
++      if (kgdb_connected) {
++              /* If we're still unable to roundup all of the CPUs,
++               * send an 'O' packet informing the user again. */
++              if (!all_cpus_synced)
++                      kgdb_msg_write("Not all CPUs have been synced for "
++                                     "KGDB\n", 39);
++              /* Reply to host that an exception has occurred */
++              ptr = remcom_out_buffer;
++              *ptr++ = 'T';
++              *ptr++ = hexchars[(signo >> 4) % 16];
++              *ptr++ = hexchars[signo % 16];
++              ptr += strlen(strcpy(ptr, "thread:"));
++              int_to_threadref(&thref, shadow_pid(current->pid));
++              ptr = pack_threadid(ptr, &thref);
++              *ptr++ = ';';
++
++              put_packet(remcom_out_buffer);
++      }
++
++      kgdb_usethread = kgdb_info[processor].task;
++      kgdb_usethreadid = shadow_pid(kgdb_info[processor].task->pid);
++
++      while (kgdb_io_ops.read_char) {
++              char *bpt_type;
++              error = 0;
++
++              /* Clear the out buffer. */
++              memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
++
++              get_packet(remcom_in_buffer);
++
++              switch (remcom_in_buffer[0]) {
++              case '?':
++                      /* We know that this packet is only sent
++                       * during initial connect.  So to be safe,
++                       * we clear out our breakpoints now incase
++                       * GDB is reconnecting. */
++                      remove_all_break();
++                      /* Also, if we haven't been able to roundup all
++                       * CPUs, send an 'O' packet informing the user
++                       * as much.  Only need to do this once. */
++                      if (!all_cpus_synced)
++                              kgdb_msg_write("Not all CPUs have been "
++                                             "synced for KGDB\n", 39);
++                      remcom_out_buffer[0] = 'S';
++                      /* Bound the index the same way the 'T' reply does. */
++                      remcom_out_buffer[1] = hexchars[(signo >> 4) % 16];
++                      remcom_out_buffer[2] = hexchars[signo % 16];
++                      break;
++
++              case 'g':       /* return the value of the CPU registers */
++                      thread = kgdb_usethread;
++
++                      if (!thread) {
++                              thread = kgdb_info[processor].task;
++                              local_debuggerinfo =
++                                  kgdb_info[processor].debuggerinfo;
++                      } else {
++                              local_debuggerinfo = NULL;
++                              for (i = 0; i < NR_CPUS; i++) {
++                                      /* Try to find the task on some other
++                                       * or possibly this node if we do not
++                                       * find the matching task then we try
++                                       * to approximate the results.
++                                       */
++                                      if (thread == kgdb_info[i].task)
++                                              local_debuggerinfo =
++                                                  kgdb_info[i].debuggerinfo;
++                              }
++                      }
++
++                      /* All threads that don't have debuggerinfo should be
++                       * in __schedule() sleeping, since all other CPUs
++                       * are in kgdb_wait, and thus have debuggerinfo. */
++                      if (kgdb_ops->shadowth &&
++                          kgdb_usethreadid >= pid_max + num_online_cpus()) {
++                              shadowregs = kgdb_shadow_regs(linux_regs,
++                                                            kgdb_usethreadid -
++                                                            pid_max -
++                                                            num_online_cpus
++                                                            ());
++                              if (!shadowregs) {
++                                      error_packet(remcom_out_buffer,
++                                                   -EINVAL);
++                                      break;
++                              }
++                              regs_to_gdb_regs(gdb_regs, shadowregs);
++                      } else if (local_debuggerinfo)
++                              regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
++                      else {
++                              /* Pull stuff saved during
++                               * switch_to; nothing else is
++                               * accessible (or even particularly relevant).
++                               * This should be enough for a stack trace. */
++                              sleeping_thread_to_gdb_regs(gdb_regs, thread);
++                      }
++                      kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer,
++                                   NUMREGBYTES);
++                      break;
++
++                      /* set the value of the CPU registers - return OK */
++              case 'G':
++                      kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs,
++                                   NUMREGBYTES);
++
++                      if (kgdb_usethread && kgdb_usethread != current)
++                              error_packet(remcom_out_buffer, -EINVAL);
++                      else {
++                              gdb_regs_to_regs(gdb_regs, linux_regs);
++                              strcpy(remcom_out_buffer, "OK");
++                      }
++                      break;
++
++                      /* mAA..AA,LLLL  Read LLLL bytes at address AA..AA */
++              case 'm':
++                      ptr = &remcom_in_buffer[1];
++                      if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
++                          kgdb_hex2long(&ptr, &length) > 0) {
++                              if (IS_ERR(ptr = kgdb_mem2hex((char *)addr,
++                                                            remcom_out_buffer,
++                                                            length)))
++                                      error_packet(remcom_out_buffer,
++                                                   PTR_ERR(ptr));
++                      } else
++                              error_packet(remcom_out_buffer, -EINVAL);
++                      break;
++
++                      /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
++              case 'M':
++                      if (IS_ERR(ptr = write_mem_msg(0)))
++                              error_packet(remcom_out_buffer, PTR_ERR(ptr));
++                      else
++                              strcpy(remcom_out_buffer, "OK");
++                      break;
++                      /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
++              case 'X':
++                      if (IS_ERR(ptr = write_mem_msg(1)))
++                              error_packet(remcom_out_buffer, PTR_ERR(ptr));
++                      else
++                              strcpy(remcom_out_buffer, "OK");
++                      break;
++
++                      /* kill or detach. KGDB should treat this like a
++                       * continue.
++                       */
++              case 'D':
++                      if ((error = remove_all_break()) < 0) {
++                              error_packet(remcom_out_buffer, error);
++                      } else {
++                              strcpy(remcom_out_buffer, "OK");
++                              kgdb_connected = 0;
++                      }
++                      put_packet(remcom_out_buffer);
++                      goto default_handle;
++
++              case 'k':
++                      /* Don't care about error from remove_all_break */
++                      remove_all_break();
++                      kgdb_connected = 0;
++                      goto default_handle;
++
++                      /* Reboot */
++              case 'R':
++                      /* For now, only honor R0 */
++                      if (strcmp(remcom_in_buffer, "R0") == 0) {
++                              printk(KERN_CRIT "Executing reboot\n");
++                              strcpy(remcom_out_buffer, "OK");
++                              put_packet(remcom_out_buffer);
++                              emergency_sync();
++                              /* Execution should not return from
++                               * machine_restart()
++                               */
++                              machine_restart(NULL);
++                              kgdb_connected = 0;
++                              goto default_handle;
++                      }
++                      /* Any other 'R' packet is unsupported: send the
++                       * empty reply instead of falling into the query
++                       * handler below. */
++                      break;
++
++                      /* query */
++              case 'q':
++                      switch (remcom_in_buffer[1]) {
++                      case 's':
++                      case 'f':
++                              if (memcmp(remcom_in_buffer + 2, "ThreadInfo",
++                                         10)) {
++                                      error_packet(remcom_out_buffer,
++                                                   -EINVAL);
++                                      break;
++                              }
++
++                              /*
++                               * If we have not yet completed in
++                               * pidhash_init() there isn't much we
++                               * can give back.
++                               */
++                              if (last_pid == 0) {
++                                      if (remcom_in_buffer[1] == 'f')
++                                              strcpy(remcom_out_buffer,
++                                                     "m0000000000000001");
++                                      break;
++                              }
++
++                              if (remcom_in_buffer[1] == 'f') {
++                                      threadid = 1;
++                              }
++                              remcom_out_buffer[0] = 'm';
++                              ptr = remcom_out_buffer + 1;
++                              for (i = 0; i < 17 && threadid < pid_max +
++                                   numshadowth; threadid++) {
++                                      thread = getthread(linux_regs,
++                                                         threadid);
++                                      if (thread) {
++                                              int_to_threadref(&thref,
++                                                               threadid);
++                                              pack_threadid(ptr, &thref);
++                                              ptr += 16;
++                                              *(ptr++) = ',';
++                                              i++;
++                                      }
++                              }
++                              *(--ptr) = '\0';
++                              break;
++
++                      case 'C':
++                              /* Current thread id */
++                              strcpy(remcom_out_buffer, "QC");
++
++                              threadid = shadow_pid(current->pid);
++
++                              int_to_threadref(&thref, threadid);
++                              pack_threadid(remcom_out_buffer + 2, &thref);
++                              break;
++                      case 'T':
++                              if (memcmp(remcom_in_buffer + 1,
++                                         "ThreadExtraInfo,", 16)) {
++                                      error_packet(remcom_out_buffer,
++                                                   -EINVAL);
++                                      break;
++                              }
++                              threadid = 0;
++                              ptr = remcom_in_buffer + 17;
++                              kgdb_hex2long(&ptr, &threadid);
++                              if (!getthread(linux_regs, threadid)) {
++                                      error_packet(remcom_out_buffer,
++                                                   -EINVAL);
++                                      break;
++                              }
++                              if (threadid < pid_max) {
++                                      kgdb_mem2hex(getthread(linux_regs,
++                                                             threadid)->comm,
++                                                   remcom_out_buffer, 16);
++                              } else if (threadid >= pid_max +
++                                         num_online_cpus()) {
++                                      kgdb_shadowinfo(linux_regs,
++                                                      remcom_out_buffer,
++                                                      threadid - pid_max -
++                                                      num_online_cpus());
++                              } else {
++                                      static char tmpstr[23 +
++                                                         BUF_THREAD_ID_SIZE];
++                                      sprintf(tmpstr, "Shadow task %d"
++                                              " for pid 0",
++                                              (int)(threadid - pid_max));
++                                      kgdb_mem2hex(tmpstr, remcom_out_buffer,
++                                                   strlen(tmpstr));
++                              }
++                              break;
++                      }
++                      break;
++
++                      /* task related */
++              case 'H':
++                      switch (remcom_in_buffer[1]) {
++                      case 'g':
++                              ptr = &remcom_in_buffer[2];
++                              kgdb_hex2long(&ptr, &threadid);
++                              thread = getthread(linux_regs, threadid);
++                              if (!thread && threadid > 0) {
++                                      error_packet(remcom_out_buffer,
++                                                   -EINVAL);
++                                      break;
++                              }
++                              kgdb_usethread = thread;
++                              kgdb_usethreadid = threadid;
++                              strcpy(remcom_out_buffer, "OK");
++                              break;
++
++                      case 'c':
++                              ptr = &remcom_in_buffer[2];
++                              kgdb_hex2long(&ptr, &threadid);
++                              if (!threadid) {
++                                      kgdb_contthread = NULL;
++                              } else {
++                                      thread = getthread(linux_regs,
++                                                         threadid);
++                                      if (!thread && threadid > 0) {
++                                              error_packet(remcom_out_buffer,
++                                                           -EINVAL);
++                                              break;
++                                      }
++                                      kgdb_contthread = thread;
++                              }
++                              strcpy(remcom_out_buffer, "OK");
++                              break;
++                      }
++                      break;
++
++                      /* Query thread status */
++              case 'T':
++                      ptr = &remcom_in_buffer[1];
++                      kgdb_hex2long(&ptr, &threadid);
++                      thread = getthread(linux_regs, threadid);
++                      if (thread)
++                              strcpy(remcom_out_buffer, "OK");
++                      else
++                              error_packet(remcom_out_buffer, -EINVAL);
++                      break;
++              /* Since GDB-5.3, it's been drafted that '0' is a software
++               * breakpoint, '1' is a hardware breakpoint, so let's do
++               * that.
++               */
++              case 'z':
++              case 'Z':
++                      bpt_type = &remcom_in_buffer[1];
++                      ptr = &remcom_in_buffer[2];
++
++                      if (kgdb_ops->set_hw_breakpoint && *bpt_type >= '1') {
++                              /* Unsupported */
++                              if (*bpt_type > '4')
++                                      break;
++                      } else if (*bpt_type != '0' && *bpt_type != '1')
++                              /* Unsupported. */
++                              break;
++                      /* Test if this is a hardware breakpoint, and
++                       * if we support it.  The parentheses matter: '!'
++                       * binds tighter than '&'. */
++                      if (*bpt_type == '1' &&
++                          !(kgdb_ops->flags & KGDB_HW_BREAKPOINT))
++                              /* Unsupported. */
++                              break;
++
++                      if (*(ptr++) != ',') {
++                              error_packet(remcom_out_buffer, -EINVAL);
++                              break;
++                      } else if (kgdb_hex2long(&ptr, &addr)) {
++                              if (*(ptr++) != ',' ||
++                                  !kgdb_hex2long(&ptr, &length)) {
++                                      error_packet(remcom_out_buffer,
++                                                   -EINVAL);
++                                      break;
++                              }
++                      } else {
++                              error_packet(remcom_out_buffer, -EINVAL);
++                              break;
++                      }
++
++                      if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
++                              error = kgdb_set_sw_break(addr);
++                      else if (remcom_in_buffer[0] == 'Z' && *bpt_type == '1')
++                              error = kgdb_set_hw_break(addr);
++                      else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
++                              error = kgdb_remove_sw_break(addr);
++                      else if (remcom_in_buffer[0] == 'z' && *bpt_type == '1')
++                              error = kgdb_remove_hw_break(addr);
++                      else if (remcom_in_buffer[0] == 'Z')
++                              error = kgdb_ops->set_hw_breakpoint(addr,
++                                                                  (int)length,
++                                                                  *bpt_type);
++                      else if (remcom_in_buffer[0] == 'z')
++                              error = kgdb_ops->remove_hw_breakpoint(addr,
++                                                                     (int)
++                                                                     length,
++                                                                     *bpt_type);
++
++                      if (error == 0)
++                              strcpy(remcom_out_buffer, "OK");
++                      else
++                              error_packet(remcom_out_buffer, error);
++
++                      break;
++              case 'c':
++              case 's':
++                      if (kgdb_contthread && kgdb_contthread != current) {
++                              /* Can't switch threads in kgdb */
++                              error_packet(remcom_out_buffer, -EINVAL);
++                              break;
++                      }
++                      kgdb_activate_sw_breakpoints();
++                      /* Followthrough to default processing */
++              default:
++                    default_handle:
++                      error = kgdb_arch_handle_exception(ex_vector, signo,
++                                                         err_code,
++                                                         remcom_in_buffer,
++                                                         remcom_out_buffer,
++                                                         linux_regs);
++
++                      if (error >= 0 || remcom_in_buffer[0] == 'D' ||
++                          remcom_in_buffer[0] == 'k')
++                              goto kgdb_exit;
++
++              }               /* switch */
++
++              /* reply to the request */
++              put_packet(remcom_out_buffer);
++      }
++
++      kgdb_exit:
++      /* Call the I/O driver's post_exception routine if the I/O
++       * driver defined one.
++       */
++      if (kgdb_io_ops.post_exception)
++              kgdb_io_ops.post_exception();
++
++      kgdb_info[processor].debuggerinfo = NULL;
++      kgdb_info[processor].task = NULL;
++      atomic_set(&procindebug[processor], 0);
++
++      if (!debugger_step || !kgdb_contthread) {
++              for (i = 0; i < NR_CPUS; i++)
++                      spin_unlock(&slavecpulocks[i]);
++              /* Wait till all the processors have quit
++               * from the debugger. */
++              for (i = 0; i < NR_CPUS; i++) {
++                      while (atomic_read(&procindebug[i])) {
++                              int j = 10;     /* an arbitrary number */
++
++                              while (--j)
++                                      cpu_relax();
++                      }
++              }
++      }
++
++#ifdef CONFIG_SMP
++      /* This delay has a real purpose.  The problem is that if you
++       * are single-stepping, you are sending an NMI to all the
++       * other processors to stop them.  Interrupts come in, but
++       * don't get handled.  Then you let them go just long enough
++       * to get into their interrupt routines and use up some stack.
++       * You stop them again, and then do the same thing.  After a
++       * while you blow the stack on the other processors.  This
++       * delay gives some time for interrupts to be cleared out on
++       * the other processors.
++       */
++      if (debugger_step)
++              mdelay(2);
++#endif
++kgdb_restore:
++      /* Free debugger_active */
++      atomic_set(&debugger_active, 0);
++      local_irq_restore(flags);
++
++      return error;
++}
++
++/*
++ * GDB places a breakpoint at this function to know dynamically
++ * loaded objects. It's not defined static so that only one instance with this
++ * name exists in the kernel.
++ */
++
++int module_event(struct notifier_block *self, unsigned long val, void *data)
++{
++      return 0;
++}
++
++static struct notifier_block kgdb_module_load_nb = {
++      .notifier_call = module_event,
++};
++
++void kgdb_nmihook(int cpu, void *regs)
++{
++#ifdef CONFIG_SMP
++      if (!atomic_read(&procindebug[cpu]) && atomic_read(&debugger_active) != (cpu + 1))
++              kgdb_wait((struct pt_regs *)regs);
++#endif
++}
++
++/*
++ * This is called when a panic happens.  All we need to do is
++ * breakpoint().
++ */
++static int kgdb_panic_notify(struct notifier_block *self, unsigned long cmd,
++                           void *ptr)
++{
++      breakpoint();
++
++      return 0;
++}
++
++static struct notifier_block kgdb_panic_notifier = {
++      .notifier_call = kgdb_panic_notify,
++};
++
++/*
++ * Initialization that needs to be done in either of our entry points.
++ */
++static void __init kgdb_internal_init(void)
++{
++      int i;
++
++      /* Initialize our spinlocks. */
++      for (i = 0; i < NR_CPUS; i++)
++              spin_lock_init(&slavecpulocks[i]);
++
++      for (i = 0; i < MAX_BREAKPOINTS; i++)
++              kgdb_break[i].state = bp_none;
++
++      /* Initialize the I/O handles */
++      memset(&kgdb_io_ops_prev, 0, sizeof(kgdb_io_ops_prev));
++
++      /* We can't do much if this fails */
++      register_module_notifier(&kgdb_module_load_nb);
++
++      kgdb_initialized = 1;
++}
++
++static void kgdb_register_for_panic(void)
++{
++      /* Register for panics(). */
++      /* The registration is done in the kgdb_register_for_panic
++       * routine because KGDB should not try to handle a panic when
++       * there are no kgdb_io_ops setup. It is assumed that the
++       * kgdb_io_ops are setup at the time this method is called.
++       */
++      if (!kgdb_from_module_registered) {
++              atomic_notifier_chain_register(&panic_notifier_list,
++                                      &kgdb_panic_notifier);
++              kgdb_from_module_registered = 1;
++      }
++}
++
++static void kgdb_unregister_for_panic(void)
++{
++      /* When this routine is called KGDB should unregister from the
++       * panic handler and clean up, making sure it is not handling any
++       * break exceptions at the time.
++       */
++      if (kgdb_from_module_registered) {
++              kgdb_from_module_registered = 0;
++              atomic_notifier_chain_unregister(&panic_notifier_list,
++                                        &kgdb_panic_notifier);
++      }
++}
++
++int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops)
++{
++
++      if (kgdb_connected) {
++              printk(KERN_ERR "kgdb: Cannot load I/O module while KGDB "
++                     "connected.\n");
++              return -EINVAL;
++      }
++
++      /* Save the old values so they can be restored */
++      if (kgdb_io_handler_cnt >= MAX_KGDB_IO_HANDLERS) {
++              printk(KERN_ERR "kgdb: No more I/O handles available.\n");
++              return -EINVAL;
++      }
++
++      /* Check to see if there is an existing driver and if so save its
++       * values.  Also check to make sure the same driver was not trying
++       * to re-register.
++       */
++      if (kgdb_io_ops.read_char != NULL &&
++        kgdb_io_ops.read_char != local_kgdb_io_ops->read_char) {
++              memcpy(&kgdb_io_ops_prev[kgdb_io_handler_cnt],
++                     &kgdb_io_ops, sizeof(struct kgdb_io));
++              kgdb_io_handler_cnt++;
++      }
++
++      /* Initialize the io values for this module */
++      memcpy(&kgdb_io_ops, local_kgdb_io_ops, sizeof(struct kgdb_io));
++
++      /* Make the call to register kgdb if it is not initialized */
++      kgdb_register_for_panic();
++
++      return 0;
++}
++
++void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops)
++{
++      int i;
++
++      /* Unregister KGDB if there were no other prior io hooks, else
++       * restore the io hooks.
++       */
++      if (kgdb_io_handler_cnt > 0 && kgdb_io_ops_prev[0].read_char != NULL) {
++              /* First check if the hook that is in use is the one being
++               * removed */
++              if (kgdb_io_ops.read_char == local_kgdb_io_ops->read_char) {
++                      /* Set 'i' to the value of where the list should be
++                       * shifted */
++                      i = kgdb_io_handler_cnt - 1;
++                      memcpy(&kgdb_io_ops, &kgdb_io_ops_prev[i],
++                             sizeof(struct kgdb_io));
++              } else {
++                      /* Simple case to remove an entry for an I/O handler
++                       * that is not in use */
++                      for (i = 0; i < kgdb_io_handler_cnt; i++) {
++                              if (kgdb_io_ops_prev[i].read_char ==
++                                  local_kgdb_io_ops->read_char)
++                                      break;
++                      }
++              }
++
++              /* Shift all the entries in the handler array so it is
++               * ordered from oldest to newest.
++               */
++              kgdb_io_handler_cnt--;
++              for (; i < kgdb_io_handler_cnt; i++) {
++                      memcpy(&kgdb_io_ops_prev[i], &kgdb_io_ops_prev[i + 1],
++                             sizeof(struct kgdb_io));
++              }
++              /* Handle the case if we are on the last element and set it
++               * to NULL; */
++              memset(&kgdb_io_ops_prev[kgdb_io_handler_cnt], 0,
++                              sizeof(struct kgdb_io));
++
++              if (kgdb_connected)
++                      printk(KERN_ERR "kgdb: WARNING: I/O method changed "
++                             "while kgdb was connected state.\n");
++      } else {
++              /* KGDB is no longer able to communicate out, so
++               * unregister our hooks and reset state. */
++              kgdb_unregister_for_panic();
++              if (kgdb_connected) {
++                      printk(KERN_CRIT "kgdb: I/O module was unloaded while "
++                                      "a debugging session was running.  "
++                                      "KGDB will be reset.\n");
++                      if (remove_all_break() < 0)
++                              printk(KERN_CRIT "kgdb: Reset failed.\n");
++                      kgdb_connected = 0;
++              }
++              memset(&kgdb_io_ops, 0, sizeof(struct kgdb_io));
++      }
++}
++
++/*
++ * There are times we need to call a tasklet to cause a breakpoint
++ * as calling breakpoint() at that point might be fatal.  We have to
++ * check that the exception stack is setup, as tasklets may be scheduled
++ * prior to this.  When that happens, it is up to the architecture to
++ * schedule this when it is safe to run.
++ */
++static void kgdb_tasklet_bpt(unsigned long ing)
++{
++      if(CHECK_EXCEPTION_STACK())
++              breakpoint();
++}
++
++DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
++
++/*
++ * This function can be called very early, either via early_param() or
++ * an explicit breakpoint() early on.
++ */
++static void __init kgdb_early_entry(void)
++{
++      /*
++       * Don't try and do anything until the architecture is able to
++       * setup the exception stack.  In this case, it is up to the
++       * architecture to hook in and look at us when they are ready.
++       */
++      if(!CHECK_EXCEPTION_STACK()) {
++              kgdb_initialized = -1;
++              tasklet_schedule(&kgdb_tasklet_breakpoint);
++              return;
++      }
++
++      /* Let the architecture do any setup that it needs to. */
++      kgdb_arch_init();
++
++      /* Now try the I/O. */
++      /* For early entry kgdb_io_ops.init must be defined */
++      if (!kgdb_io_ops.init || kgdb_io_ops.init()) {
++              /* Try again later. */
++              kgdb_initialized = -1;
++              return;
++      }
++
++      /* Finish up. */
++      kgdb_internal_init();
++
++      /* KGDB can assume that if kgdb_io_ops.init was defined that the
++       * panic registration should be performed at this time. This means
++       * kgdb_io_ops.init did not come from a kernel module and was
++       * initialized statically by a built in.
++       */
++      if (kgdb_io_ops.init)
++              kgdb_register_for_panic();
++}
++
++/*
++ * This function will always be invoked to make sure that KGDB will grab
++ * what it needs to so that if something happens while the system is
++ * running, KGDB will get involved.  If kgdb_early_entry() has already
++ * been invoked, there is little we need to do.
++ */
++static int __init kgdb_late_entry(void)
++{
++      int need_break = 0;
++
++      /* If kgdb_initialized is -1 then we were passed kgdbwait. */
++      if (kgdb_initialized == -1)
++              need_break = 1;
++
++      /*
++       * If we haven't tried to initialize KGDB yet, we need to call
++       * kgdb_arch_init before moving onto the I/O.
++       */
++      if (!kgdb_initialized)
++              kgdb_arch_init();
++
++      if (kgdb_initialized != 1) {
++              if (kgdb_io_ops.init && kgdb_io_ops.init()) {
++                      /* When KGDB allows I/O via modules and the core
++                       * I/O init fails KGDB must default to deferring the
++                       * I/O setup, and appropriately print an error about
++                       * it.
++                       */
++                      printk(KERN_ERR "kgdb: Could not setup core I/O "
++                             "for KGDB.\n");
++                      printk(KERN_INFO "kgdb: Defering I/O setup to kernel "
++                             "module.\n");
++                      memset(&kgdb_io_ops, 0, sizeof(struct kgdb_io));
++              }
++
++              kgdb_internal_init();
++
++              /* KGDB can assume that if kgdb_io_ops.init was defined that
++               * panic registration should be performed at this time. This means
++               * kgdb_io_ops.init did not come from a kernel module and was
++               * initialized statically by a built in.
++               */
++              if (kgdb_io_ops.init)
++                      kgdb_register_for_panic();
++      }
++
++      /* Registering to reboot notifier list*/
++      register_reboot_notifier(&kgdb_reboot_notifier);
++
++      /* Now do any late init of the I/O. */
++      if (kgdb_io_ops.late_init)
++              kgdb_io_ops.late_init();
++
++      if (need_break) {
++              printk(KERN_CRIT "kgdb: Waiting for connection from remote"
++                     " gdb...\n");
++              breakpoint();
++      }
++
++      return 0;
++}
++
++late_initcall(kgdb_late_entry);
++
++/*
++ * This function will generate a breakpoint exception.  It is used at the
++ * beginning of a program to sync up with a debugger and can be used
++ * otherwise as a quick means to stop program execution and "break" into
++ * the debugger.
++ */
++void breakpoint(void)
++{
++      if (kgdb_initialized != 1) {
++              kgdb_early_entry();
++              if (kgdb_initialized == 1)
++                      printk(KERN_CRIT "Waiting for connection from remote "
++                             "gdb...\n");
++              else {
++                      printk(KERN_CRIT "KGDB cannot initialize I/O yet.\n");
++                      return;
++              }
++      }
++
++      atomic_set(&kgdb_setting_breakpoint, 1);
++      wmb();
++      BREAKPOINT();
++      wmb();
++      atomic_set(&kgdb_setting_breakpoint, 0);
++}
++
++EXPORT_SYMBOL(breakpoint);
++
++#ifdef CONFIG_MAGIC_SYSRQ
++static void sysrq_handle_gdb(int key, struct pt_regs *pt_regs,
++                           struct tty_struct *tty)
++{
++      printk("Entering GDB stub\n");
++      breakpoint();
++}
++static struct sysrq_key_op sysrq_gdb_op = {
++      .handler = sysrq_handle_gdb,
++      .help_msg = "Gdb",
++      .action_msg = "GDB",
++};
++
++static int gdb_register_sysrq(void)
++{
++      printk("Registering GDB sysrq handler\n");
++      register_sysrq_key('g', &sysrq_gdb_op);
++      return 0;
++}
++
++module_init(gdb_register_sysrq);
++#endif
++
++static int kgdb_notify_reboot(struct notifier_block *this,
++                            unsigned long code, void *x)
++{
++
++      unsigned long flags;
++
++      /* If we're debugging, or KGDB has not connected, don't try
++       * and print. */
++      if (!kgdb_connected || atomic_read(&debugger_active) != 0)
++              return 0;
++      if ((code == SYS_RESTART) || (code == SYS_HALT) || (code == SYS_POWER_OFF)){
++              local_irq_save(flags);
++              put_packet("X00");
++              local_irq_restore(flags);
++      }
++      return NOTIFY_DONE;
++}
++
++#ifdef CONFIG_KGDB_CONSOLE
++void kgdb_console_write(struct console *co, const char *s, unsigned count)
++{
++      unsigned long flags;
++
++      /* If we're debugging, or KGDB has not connected, don't try
++       * and print. */
++      if (!kgdb_connected || atomic_read(&debugger_active) != 0)
++              return;
++
++      local_irq_save(flags);
++      kgdb_msg_write(s, count);
++      local_irq_restore(flags);
++}
++
++struct console kgdbcons = {
++      .name = "kgdb",
++      .write = kgdb_console_write,
++      .flags = CON_PRINTBUFFER | CON_ENABLED,
++};
++static int __init kgdb_console_init(void)
++{
++      register_console(&kgdbcons);
++      return 0;
++}
++
++console_initcall(kgdb_console_init);
++#endif
++
++static int __init opt_kgdb_enter(char *str)
++{
++      /* We've already done this by an explicit breakpoint() call. */
++      if (kgdb_initialized)
++              return 0;
++
++      /* Call breakpoint() which will take care of init. */
++      breakpoint();
++
++      return 0;
++}
++
++early_param("kgdbwait", opt_kgdb_enter);
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/kernel/kgdbarchlib.c linux-2.6.18.kgdb/kernel/kgdbarchlib.c
+--- linux-2.6.18/kernel/kgdbarchlib.c  1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/kernel/kgdbarchlib.c     2008-06-10 16:18:58.000000000 +0400
+@@ -0,0 +1,198 @@
++#include <linux/kgdb.h>
++
++struct kgdb_arch *kgdb_ops = &arch_kgdb_ops;
++
++/**
++ *    kgdb_arch_init - Perform any architecture specific initialization.
++ *
++ *    RETURN:
++ *    The return value is ignored.
++ *
++ *    This function will handle the initialization of any architecture
++ *    specific hooks.
++ */
++int __attribute__ ((weak))
++    kgdb_arch_init(void)
++{
++      return 0;
++}
++
++/**
++ *    kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb.
++ *    @regs: Current &struct pt_regs.
++ *
++ *    This function will be called if the particular architecture must
++ *    disable hardware debugging while it is processing gdb packets or
++ *    handling exception.
++ */
++void __attribute__ ((weak))
++    kgdb_disable_hw_debug(struct pt_regs *regs)
++{
++}
++
++/*
++ * Skip an int3 exception when it occurs after a breakpoint has been
++ * removed. Backtrack eip by 1 since the int3 would have caused it to
++ * increment by 1.
++ */
++int __attribute__ ((weak))
++      kgdb_skipexception(int exception, struct pt_regs *regs)
++{
++      return 0;
++}
++
++/**
++ *    kgdb_set_hw_break - Set a hardware breakpoint at @addr.
++ *    @addr: The address to set a hardware breakpoint at.
++ */
++int __attribute__ ((weak))
++    kgdb_set_hw_break(unsigned long addr)
++{
++      return 0;
++}
++
++/**
++ *    kgdb_remove_hw_break - Remove a hardware breakpoint at @addr.
++ *    @addr: The address to remove a hardware breakpoint from.
++ */
++int __attribute__ ((weak))
++    kgdb_remove_hw_break(unsigned long addr)
++{
++      return 0;
++}
++
++/**
++ *    kgdb_remove_all_hw_break - Clear all hardware breakpoints.
++ */
++void __attribute__ ((weak))
++    kgdb_remove_all_hw_break(void)
++{
++}
++
++/**
++ *    kgdb_correct_hw_break - Correct hardware breakpoints.
++ *
++ *    A hook to allow for changes to the hardware breakpoint, called
++ *    after a single step (s) or continue (c) packet, and once we're about
++ *    to let the kernel continue running.
++ *
++ *    This is used to set the hardware breakpoint registers for all the
++ *    slave cpus on an SMP configuration. This must be called after any
++ *    changes are made to the hardware breakpoints (such as by a single
++ *    step (s) or continue (c) packet). This is only required on
++ *    architectures that support SMP and every processor has its own set
++ *    of breakpoint registers.
++ */
++void __attribute__ ((weak))
++    kgdb_correct_hw_break(void)
++{
++}
++
++/**
++ *    kgdb_post_master_code - Save error vector/code numbers.
++ *    @regs: Original pt_regs.
++ *    @e_vector: Original error vector.
++ *    @err_code: Original error code.
++ *
++ *    This is needed on architectures which support SMP and KGDB.
++ *    This function is called after all the slave cpus have been put
++ *    to a known spin state and the master CPU has control over KGDB.
++ */
++
++void __attribute__ ((weak))
++    kgdb_post_master_code(struct pt_regs *regs, int e_vector, int err_code)
++{
++}
++
++/**
++ *    kgdb_roundup_cpus - Get other CPUs into a holding pattern
++ *    @flags: Current IRQ state
++ *
++ *    On SMP systems, we need to get the attention of the other CPUs
++ *    and get them into a known state.  This should do what is needed
++ *    to get the other CPUs to call kgdb_wait(). Note that on some arches,
++ *    the NMI approach is not used for rounding up all the CPUs. For example,
++ *    in case of MIPS, smp_call_function() is used to roundup CPUs. In
++ *    this case, we have to make sure that interrupts are enabled before
++ *    calling smp_call_function(). The argument to this function is
++ *    the flags that will be used when restoring the interrupts. There is
++ *    local_irq_save() call before kgdb_roundup_cpus().
++ */
++void __attribute__ ((weak))
++    kgdb_roundup_cpus(unsigned long flags)
++{
++}
++
++/**
++ *    kgdb_shadowinfo - Get shadowed information on @threadid.
++ *    @regs: The &struct pt_regs of the current process.
++ *    @buffer: A buffer of %BUFMAX size.
++ *    @threadid: The thread id of the shadowed process to get information on.
++ */
++void __attribute__ ((weak))
++    kgdb_shadowinfo(struct pt_regs *regs, char *buffer, unsigned threadid)
++{
++}
++
++/**
++ *    kgdb_get_shadow_thread - Get the shadowed &task_struct of @threadid.
++ *    @regs: The &struct pt_regs of the current thread.
++ *    @threadid: The thread id of the shadowed process to get information on.
++ *
++ *    RETURN:
++ *    This returns a pointer to the &struct task_struct of the shadowed
++ *    thread, @threadid.
++ */
++struct task_struct __attribute__ ((weak))
++    * kgdb_get_shadow_thread(struct pt_regs *regs, int threadid)
++{
++      return NULL;
++}
++
++/**
++ *    kgdb_shadow_regs - Return the shadowed registers of @threadid.
++ *    @regs: The &struct pt_regs of the current thread.
++ *    @threadid: The thread id we want the &struct pt_regs for.
++ *
++ *    RETURN:
++ *    A pointer to the &struct pt_regs of the shadowed thread @threadid.
++ */
++struct pt_regs __attribute__ ((weak))
++    * kgdb_shadow_regs(struct pt_regs *regs, int threadid)
++{
++      return NULL;
++}
++
++int __attribute__ ((weak))
++     kgdb_validate_break_address(unsigned long addr)
++{
++      int error = 0;
++      char tmp_variable[BREAK_INSTR_SIZE];
++      error = kgdb_get_mem((char *)addr, tmp_variable, BREAK_INSTR_SIZE);
++      return error;
++}
++
++int __attribute__ ((weak))
++     kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
++{
++      int error = 0;
++      if ((error = kgdb_get_mem((char *)addr,
++              saved_instr, BREAK_INSTR_SIZE)) < 0)
++                      return error;
++
++      if ((error = kgdb_set_mem((char *)addr, kgdb_ops->gdb_bpt_instr,
++              BREAK_INSTR_SIZE)) < 0)
++                      return error;
++      return 0;
++}
++
++int __attribute__ ((weak))
++     kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
++{
++
++      int error = 0;
++      if ((error =kgdb_set_mem((char *)addr, (char *)bundle,
++              BREAK_INSTR_SIZE)) < 0)
++                      return error;
++      return 0;
++}
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/kernel/module.c linux-2.6.18.kgdb/kernel/module.c
+--- linux-2.6.18/kernel/module.c       2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/kernel/module.c  2008-06-10 16:20:07.000000000 +0400
+@@ -64,6 +64,7 @@ static DEFINE_SPINLOCK(modlist_lock);
+ /* List of modules, protected by module_mutex AND modlist_lock */
+ static DEFINE_MUTEX(module_mutex);
+ static LIST_HEAD(modules);
++static DECLARE_MUTEX(notify_mutex);
+ 
+ static BLOCKING_NOTIFIER_HEAD(module_notify_list);
+ 
+@@ -700,6 +701,12 @@ sys_delete_module(const char __user *nam
+       if (ret != 0)
+               goto out;
+ 
++      down(&notify_mutex);
++      blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING,
++                              mod);
++      up(&notify_mutex);
++
++
+       /* Never wait if forced. */
+       if (!forced && module_refcount(mod) != 0)
+               wait_for_zero_refcount(mod);
+@@ -712,6 +719,11 @@ sys_delete_module(const char __user *nam
+       }
+       free_module(mod);
+ 
++      down(&notify_mutex);
++      blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GONE,
++                      NULL);
++      up(&notify_mutex);
++
+  out:
+       mutex_unlock(&module_mutex);
+       return ret;
+@@ -1112,6 +1124,11 @@ static void free_module(struct module *m
+       /* Arch-specific cleanup. */
+       module_arch_cleanup(mod);
+ 
++#ifdef CONFIG_KGDB
++      /* kgdb info */
++      vfree(mod->mod_sections);
++#endif
++
+       /* Module unload stuff */
+       module_unload_free(mod);
+ 
+@@ -1371,6 +1388,31 @@ static void setup_modinfo(struct module 
+       }
+ }
+ 
++#ifdef CONFIG_KGDB
++int add_modsects (struct module *mod, Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const
++                char *secstrings)
++{
++        int i;
++
++        mod->num_sections = hdr->e_shnum - 1;
++        mod->mod_sections = vmalloc((hdr->e_shnum - 1)*
++              sizeof (struct mod_section));
++
++        if (mod->mod_sections == NULL) {
++                return -ENOMEM;
++        }
++
++        for (i = 1; i < hdr->e_shnum; i++) {
++                mod->mod_sections[i - 1].address = (void *)sechdrs[i].sh_addr;
++                strncpy(mod->mod_sections[i - 1].name, secstrings +
++                                sechdrs[i].sh_name, MAX_SECTNAME);
++                mod->mod_sections[i - 1].name[MAX_SECTNAME] = '\0';
++      }
++
++      return 0;
++}
++#endif
++
+ #ifdef CONFIG_KALLSYMS
+ int is_exported(const char *name, const struct module *mod)
+ {
+@@ -1782,6 +1824,12 @@ static struct module *load_module(void _
+ 
+       add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
+ 
++#ifdef CONFIG_KGDB
++        if ((err = add_modsects(mod, hdr, sechdrs, secstrings)) < 0) {
++                goto nomodsectinfo;
++        }
++#endif
++
+       err = module_finalize(hdr, sechdrs, mod);
+       if (err < 0)
+               goto cleanup;
+@@ -1842,6 +1890,11 @@ static struct module *load_module(void _
+  arch_cleanup:
+       module_arch_cleanup(mod);
+  cleanup:
++
++#ifdef CONFIG_KGDB
++nomodsectinfo:
++       vfree(mod->mod_sections);
++#endif
+       module_unload_free(mod);
+       module_free(mod, mod->module_init);
+  free_core:
+@@ -1913,6 +1966,10 @@ sys_init_module(void __user *umod,
+               /* Init routine failed: abort.  Try to protect us from
+                    buggy refcounters. */
+               mod->state = MODULE_STATE_GOING;
++              down(&notify_mutex);
++              blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING,
++                              mod);
++              up(&notify_mutex);
+               synchronize_sched();
+               if (mod->unsafe)
+                       printk(KERN_ERR "%s: module is now stuck!\n",
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/kernel/sched.c linux-2.6.18.kgdb/kernel/sched.c
+--- linux-2.6.18/kernel/sched.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/kernel/sched.c   2008-06-10 16:18:58.000000000 +0400
+@@ -52,6 +52,7 @@
+ #include <linux/acct.h>
+ #include <linux/kprobes.h>
+ #include <linux/delayacct.h>
++#include <linux/kgdb.h>
+ #include <asm/tlb.h>
+ 
+ #include <asm/unistd.h>
+@@ -6790,6 +6791,9 @@ void __might_sleep(char *file, int line)
+ #ifdef in_atomic
+       static unsigned long prev_jiffy;        /* ratelimiting */
+ 
++      if (atomic_read(&debugger_active))
++              return;
++
+       if ((in_atomic() || irqs_disabled()) &&
+           system_state == SYSTEM_RUNNING && !oops_in_progress) {
+               if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/kernel/softlockup.c linux-2.6.18.kgdb/kernel/softlockup.c
+--- linux-2.6.18/kernel/softlockup.c   2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/kernel/softlockup.c      2008-06-10 16:20:11.000000000 +0400
+@@ -13,6 +13,7 @@
+ #include <linux/kthread.h>
+ #include <linux/notifier.h>
+ #include <linux/module.h>
++#include <linux/kgdb.h>
+ 
+ static DEFINE_SPINLOCK(print_lock);
+ 
+@@ -37,6 +38,9 @@ static struct notifier_block panic_block
+ void touch_softlockup_watchdog(void)
+ {
+       __raw_get_cpu_var(touch_timestamp) = jiffies;
++#ifdef CONFIG_KGDB
++      atomic_set(&kgdb_sync_softlockup[raw_smp_processor_id()], 0);
++#endif
+ }
+ EXPORT_SYMBOL(touch_softlockup_watchdog);
+ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/kernel/timer.c linux-2.6.18.kgdb/kernel/timer.c
+--- linux-2.6.18/kernel/timer.c        2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/kernel/timer.c   2008-06-10 16:20:11.000000000 +0400
+@@ -34,6 +34,7 @@
+ #include <linux/cpu.h>
+ #include <linux/syscalls.h>
+ #include <linux/delay.h>
++#include <linux/kgdb.h>
+ 
+ #include <asm/uaccess.h>
+ #include <asm/unistd.h>
+@@ -1257,7 +1258,11 @@ static void run_timer_softirq(struct sof
+  */
+ void run_local_timers(void)
+ {
++      int this_cpu = smp_processor_id();
+       raise_softirq(TIMER_SOFTIRQ);
++#ifdef CONFIG_KGDB
++      if(!atomic_read(&kgdb_sync_softlockup[this_cpu]))
++#endif
+       softlockup_tick();
+ }
+ 
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/lib/Kconfig.debug linux-2.6.18.kgdb/lib/Kconfig.debug
+--- linux-2.6.18/lib/Kconfig.debug     2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/lib/Kconfig.debug        2008-06-10 16:19:51.000000000 +0400
+@@ -315,7 +315,7 @@ config DEBUG_VM
+ 
+ config FRAME_POINTER
+       bool "Compile the kernel with frame pointers"
+-      depends on DEBUG_KERNEL && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML || S390)
++      depends on DEBUG_KERNEL && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML || S390 || SUPERH)
+       default y if DEBUG_INFO && UML
+       help
+         If you say Y here the resulting kernel image will be slightly larger
+@@ -368,3 +368,158 @@ config RCU_TORTURE_TEST
+         at boot time (you probably don't).
+         Say M if you want the RCU torture tests to build as a module.
+         Say N if you are unsure.
++
++config WANT_EXTRA_DEBUG_INFORMATION
++      bool
++      select DEBUG_INFO
++      select FRAME_POINTER if X86 || SUPERH
++      default n
++
++config KGDB
++      bool "KGDB: kernel debugging with remote gdb"
++      select WANT_EXTRA_DEBUG_INFORMATION
++      depends on DEBUG_KERNEL && (ARM || X86 || MIPS || (SUPERH && !SUPERH64) || IA64 || X86_64 || PPC)
++      help
++        If you say Y here, it will be possible to remotely debug the
++        kernel using gdb. It is strongly suggested that you enable
++        DEBUG_INFO, and if available on your platform, FRAME_POINTER.
++        Documentation of kernel debugger available at
++        http://kgdb.sourceforge.net as well as in DocBook form
++        in Documentation/DocBook/.  If unsure, say N.
++
++config KGDB_CONSOLE
++      bool "KGDB: Console messages through gdb"
++      depends on KGDB
++        help
++          If you say Y here, console messages will appear through gdb.
++          Other consoles such as tty or ttyS will continue to work as usual.
++          Note, that if you use this in conjunction with KGDB_ETH, if the
++          ethernet driver runs into an error condition during use with KGDB
++          it is possible to hit an infinite recursion, causing the kernel
++          to crash, and typically reboot.  For this reason, it is preferable
++          to use NETCONSOLE in conjunction with KGDB_ETH instead of
++          KGDB_CONSOLE.
++
++choice
++      prompt "Method for KGDB communication"
++      depends on KGDB
++      default KGDB_8250_NOMODULE
++      default KGDB_MPSC if SERIAL_MPSC
++      default KGDB_CPM_UART if (8xx || 8260)
++      default KGDB_SIBYTE if SIBYTE_SB1xxx_SOC
++      help
++        There are a number of different ways in which you can communicate
++        with KGDB.  The most common is via serial, with the 8250 driver
++        (should your hardware have an 8250, or ns1655x style uart).
++        Another option is to use the NETPOLL framework and UDP, should
++        your ethernet card support this.  Other options may exist.
++        You can elect to have one core I/O driver that is built into the
++        kernel for debugging as the kernel is booting, or using only
++        kernel modules.
++
++config KGDB_ONLY_MODULES
++      bool "KGDB: Use only kernel modules for I/O"
++      depends on MODULES
++      help
++        Use only kernel modules to configure KGDB I/O after the
++        kernel is booted.
++
++config KGDB_8250_NOMODULE
++      bool "KGDB: On generic serial port (8250)"
++      select KGDB_8250
++      help
++        Uses generic serial port (8250) to communicate with the host
++        GDB.  This is independent of the normal (SERIAL_8250) driver
++        for this chipset.
++
++config KGDBOE_NOMODULE
++      bool "KGDB: On ethernet - in kernel"
++      select KGDBOE
++      select NETPOLL
++      select NETPOLL_TRAP
++      select NETPOLL_RX
++      help
++        Uses the NETPOLL API to communicate with the host GDB via UDP.
++        In order for this to work, the ethernet interface specified must
++        support the NETPOLL API, and this must be initialized at boot.
++        See the documentation for syntax.
++
++config KGDB_MPSC
++      bool "KGDB on MV64x60 MPSC"
++      depends on SERIAL_MPSC
++      help
++        Uses a Marvell GT64260B or MV64x60 Multi-Purpose Serial
++        Controller (MPSC) channel. Note that the GT64260A is not
++        supported.
++
++config KGDB_CPM_UART
++      bool "KGDB: On CPM UART"
++      depends on PPC && (CPM2 || 8xx)
++      help
++        Uses CPM UART to communicate with the host GDB.
++
++config KGDB_SIBYTE
++      bool "KGDB: On the Broadcom SWARM serial port"
++      depends on MIPS && SIBYTE_SB1xxx_SOC
++endchoice
++
++config KGDBOE
++      tristate "KGDB: On ethernet" if !KGDBOE_NOMODULE
++      depends on m && KGDB
++      select NETPOLL
++      select NETPOLL_TRAP
++      select NETPOLL_RX
++      help
++        Uses the NETPOLL API to communicate with the host GDB via UDP.
++        In order for this to work, the ethernet interface specified must
++        support the NETPOLL API, and this must be initialized at boot.
++        See the documentation for syntax.
++
++config KGDB_8250
++      tristate "KGDB: On generic serial port (8250)" if !KGDB_8250_NOMODULE
++      depends on m && KGDB_ONLY_MODULES
++      help
++        Uses generic serial port (8250) to communicate with the host
++        GDB.  This is independent of the normal (SERIAL_8250) driver
++        for this chipset.
++
++config KGDB_SIMPLE_SERIAL
++      bool "Simple selection of KGDB serial port"
++      depends on KGDB_8250_NOMODULE
++      default y
++      help
++        If you say Y here, you will only have to pick the baud rate
++        and port number that you wish to use for KGDB.  Note that this
++        only works on architectures that register known serial ports
++        early on.  If you say N, you will have to provide, either here
++        or on the command line, the type (I/O or MMIO), IRQ and
++        address to use.  If in doubt, say Y.
++
++config KGDB_BAUDRATE
++      int "Debug serial port baud rate"
++      depends on (KGDB_8250 && KGDB_SIMPLE_SERIAL)
++      default "115200"
++      help
++        gdb and the kernel stub need to agree on the baud rate to be
++        used.  Standard rates from 9600 to 115200 are allowed, and this
++        may be overridden via the commandline.
++
++config KGDB_PORT_NUM
++      int "Serial port number for KGDB"
++      range 0 1 if KGDB_MPSC
++      range 0 3
++      depends on (KGDB_8250 && KGDB_SIMPLE_SERIAL) || KGDB_MPSC
++      default "1"
++      help
++        Pick the port number (0 based) for KGDB to use.
++
++config KGDB_8250_CONF_STRING
++      string "Configuration string for KGDB"
++      depends on KGDB_8250_NOMODULE && !KGDB_SIMPLE_SERIAL
++      default "io,2f8,115200,3" if X86
++      help
++        The format of this string should be <io or
++        mmio>,<address>,<baud rate>,<irq>.  For example, to use the
++        serial port on an i386 box located at 0x2f8 and 115200 baud
++        on IRQ 3 at use:
++        io,2f8,115200,3
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/net/core/netpoll.c linux-2.6.18.kgdb/net/core/netpoll.c
+--- linux-2.6.18/net/core/netpoll.c    2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/net/core/netpoll.c       2008-06-10 16:19:07.000000000 +0400
+@@ -519,7 +519,8 @@ int __netpoll_rx(struct sk_buff *skb)
+ 
+       np->rx_hook(np, ntohs(uh->source),
+                   (char *)(uh+1),
+-                  ulen - sizeof(struct udphdr));
++                  ulen - sizeof(struct udphdr),
++                  skb);
+ 
+       kfree_skb(skb);
+       return 1;
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/scripts/dwarfh.awk linux-2.6.18.kgdb/scripts/dwarfh.awk
+--- linux-2.6.18/scripts/dwarfh.awk    1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/scripts/dwarfh.awk       2008-06-10 16:19:58.000000000 +0400
+@@ -0,0 +1,19 @@
++BEGIN {
++      print "#ifndef  _ELF_DWARF_H"
++              print "/* Machine generated from dwarf2.h by scripts/dwarfh.awk */"
++}
++$2 == "=" {
++      gsub(/,/, "", $3)
++      print "#define " $1 "\t " $3
++}
++$1 == "#define" {
++      print $0
++      while( index($0,"\\") == length($0)){
++              getline
++              print $0
++      }
++}
++/.*/ {}
++END {
++      print "#endif"
++}
diff --git a/lustre/kernel_patches/patches/lockdep_chains-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/lockdep_chains-2.6.18-vanilla.patch

new file mode 100644 (file)

index 0000000..f0f3894
--- /dev/null
+++ b/lustre/kernel_patches/patches/lockdep_chains-2.6.18-vanilla.patch
@@ -0,0 +1,269 @@
+commit 443cd507ce7f78c6f8742b72736585c031d5a921
+Author: Huang, Ying <ying.huang@intel.com>
+Date:   Fri Jun 20 16:39:21 2008 +0800
+
+    lockdep: add lock_class information to lock_chain and output it
+    
+    This patch records array of lock_class into lock_chain, and export
+    lock_chain information via /proc/lockdep_chains.
+    
+    It is based on x86/master branch of git-x86 tree, and has been tested
+    on x86_64 platform.
+    
+    Signed-off-by: Huang Ying <ying.huang@intel.com>
+    Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
+    Signed-off-by: Ingo Molnar <mingo@elte.hu>
+
+diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
+index 4c4d236..b26fbc7 100644
+--- a/include/linux/lockdep.h
++++ b/include/linux/lockdep.h
+@@ -182,6 +182,9 @@ struct lock_list {
+  * We record lock dependency chains, so that we can cache them:
+  */
+ struct lock_chain {
++      u8                              irq_context;
++      u8                              depth;
++      u32                             base;   /* chain_hlocks[] index: 81920 slots, too many for u16 */
+       struct list_head                entry;
+       u64                             chain_key;
+ };
+diff --git a/kernel/lockdep.c b/kernel/lockdep.c
+index 81a4e4a..a796f1f 100644
+--- a/kernel/lockdep.c
++++ b/kernel/lockdep.c
+@@ -1458,7 +1458,14 @@ out_bug:
+ }
+ 
+ unsigned long nr_lock_chains;
+-static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
++struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
++atomic_t nr_chain_hlocks;
++static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
++
++struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
++{
++      return lock_classes + chain_hlocks[chain->base + i];
++}
+ 
+ /*
+  * Look up a dependency chain. If the key is not present yet then
+@@ -1466,9 +1473,14 @@ static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
+  * validated. If the key is already hashed, return 0.
+  */
+-static inline int lookup_chain_cache(u64 chain_key)
++static inline int lookup_chain_cache(struct task_struct *curr,
++                                   struct held_lock *hlock,
++                                   u64 chain_key)
+ {
++      struct lock_class *class = hlock->class;
+       struct list_head *hash_head = chainhashentry(chain_key);
+       struct lock_chain *chain;
++      struct held_lock *hlock_curr, *hlock_next;
++      int i, j, n;
+ 
+       if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+               return 0;
+@@ -1517,6 +1529,26 @@ cache_hit:
+       }
+       chain = lock_chains + nr_lock_chains++;
+       chain->chain_key = chain_key;
++      chain->irq_context = hlock->irq_context;
++      /* Find the first held_lock of current chain */
++      hlock_next = hlock;
++      for (i = curr->lockdep_depth - 1; i >= 0; i--) {
++              hlock_curr = curr->held_locks + i;
++              if (hlock_curr->irq_context != hlock_next->irq_context)
++                      break;
++              hlock_next = hlock;
++      }
++      i++;
++      chain->depth = curr->lockdep_depth + 1 - i;
++      n = atomic_add_return(chain->depth, &nr_chain_hlocks);
++      if (unlikely(n < MAX_LOCKDEP_CHAIN_HLOCKS)) {
++              chain->base = n - chain->depth;
++              for (j = 0; j < chain->depth - 1; j++, i++) {
++                      int lock_id = curr->held_locks[i].class - lock_classes;
++                      chain_hlocks[chain->base + j] = lock_id;
++              }
++              chain_hlocks[chain->base + j] = class - lock_classes;
++      }
+       list_add_tail_rcu(&chain->entry, hash_head);
+       debug_atomic_inc(&chain_lookup_misses);
+       inc_chains();
+@@ -1538,7 +1570,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
+        * (If lookup_chain_cache() returns with 1 it acquires
+        * hash_lock for us)
+        */
+-      if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) {
++      if (!trylock && (check == 2) && lookup_chain_cache(curr, hlock, chain_key)) {
+               /*
+                * Check whether last held lock:
+                *
+diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
+index 8ce09bc..db09b17 100644
+--- a/kernel/lockdep_internals.h
++++ b/kernel/lockdep_internals.h
+@@ -23,6 +23,8 @@
+ #define MAX_LOCKDEP_CHAINS_BITS       14
+ #define MAX_LOCKDEP_CHAINS    (1UL << MAX_LOCKDEP_CHAINS_BITS)
+ 
++#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
++
+ /*
+  * Stack-trace: tightly packed array of stack backtrace
+  * addresses. Protected by the hash_lock.
+@@ -30,15 +32,19 @@
+ #define MAX_STACK_TRACE_ENTRIES       262144UL
+ 
+ extern struct list_head all_lock_classes;
++extern struct lock_chain lock_chains[];
+ 
+ extern void
+ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4);
+ 
+ extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
+ 
++struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
++
+ extern unsigned long nr_lock_classes;
+ extern unsigned long nr_list_entries;
+ extern unsigned long nr_lock_chains;
++extern atomic_t nr_chain_hlocks;
+ extern unsigned long nr_stack_trace_entries;
+ 
+ extern unsigned int nr_hardirq_chains;
+diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
+index 688c5f1..14d052c 100644
+--- a/kernel/lockdep_proc.c
++++ b/kernel/lockdep_proc.c
+@@ -178,6 +178,110 @@ static const struct file_operations proc_lockdep_operations = {
+       .release        = seq_release,
+ };
+ 
++static void print_name(struct seq_file *m, struct lock_class *class)
++{
++      const char *name = class->name;
++      char str[128];
++
++      /* Unnamed class: print the name derived from its key and stop. */
++      if (!name) {
++              seq_printf(m, "%s", __get_key_name(class->key, str));
++              return;
++      }
++      seq_printf(m, "%s", name);
++      if (class->name_version > 1)
++              seq_printf(m, "#%d", class->name_version);
++      if (class->subclass)
++              seq_printf(m, "/%d", class->subclass);
++}
++
++/*
++ * Iterator for /proc/lockdep_chains.
++ *
++ * Position 0 is the SEQ_START_TOKEN header and position N (N >= 1) is
++ * lock_chains[N - 1]; positions past nr_lock_chains end the walk.
++ * lc_start() and lc_next() must agree on this mapping: seq_file may stop
++ * and later restart the walk at any position, and a mismatch makes chains
++ * get skipped in the output.
++ */
++static void *lc_start(struct seq_file *m, loff_t *pos)
++{
++      if (*pos == 0)
++              return SEQ_START_TOKEN;
++
++      if (*pos - 1 < nr_lock_chains)
++              return lock_chains + (*pos - 1);
++
++      return NULL;
++}
++
++/*
++ * Advance the position and resolve it through lc_start() so that both
++ * entry points share a single position-to-chain mapping.
++ */
++static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
++{
++      (*pos)++;
++      return lc_start(m, pos);
++}
++
++/* lc_start() takes no lock and allocates nothing: nothing to undo here. */
++static void lc_stop(struct seq_file *m, void *v)
++{
++}
++
++/* Emit one record: the header line, or one chain with its lock classes. */
++static int lc_show(struct seq_file *m, void *v)
++{
++      struct lock_chain *chain = v;
++      int idx;
++
++      if (v == SEQ_START_TOKEN) {
++              seq_printf(m, "all lock chains:\n");
++              return 0;
++      }
++
++      seq_printf(m, "irq_context: %d\n", chain->irq_context);
++      for (idx = 0; idx < chain->depth; idx++) {
++              struct lock_class *class = lock_chain_get_class(chain, idx);
++
++              /* One line per class: key address followed by its name. */
++              seq_printf(m, "[%p] ", class->key);
++              print_name(m, class);
++              seq_puts(m, "\n");
++      }
++      seq_puts(m, "\n");
++      return 0;
++}
++
++static const struct seq_operations lockdep_chains_ops = {
++      .start  = lc_start,
++      .next   = lc_next,
++      .stop   = lc_stop,
++      .show   = lc_show,
++};
++
++static int lockdep_chains_open(struct inode *inode, struct file *file)
++{
++      struct seq_file *m;
++      int res = seq_open(file, &lockdep_chains_ops);
++
++      if (res)
++              return res;
++
++      /* Stash the chain table, or NULL while no chain is recorded yet. */
++      m = file->private_data;
++      m->private = nr_lock_chains ? lock_chains : NULL;
++      return 0;
++}
++
++static const struct file_operations proc_lockdep_chains_operations = {
++      .open           = lockdep_chains_open,
++      .read           = seq_read,
++      .llseek         = seq_lseek,
++      .release        = seq_release,
++};
++
+ static void lockdep_stats_debug_show(struct seq_file *m)
+ {
+ #ifdef CONFIG_DEBUG_LOCKDEP
+@@ -294,5 +381,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
+ 
+       seq_printf(m, " dependency chains:             %11lu [max: %lu]\n",
+                       nr_lock_chains, MAX_LOCKDEP_CHAINS);
++      seq_printf(m, " dependency chain hlocks:       %11d [max: %lu]\n",
++                      atomic_read(&nr_chain_hlocks), MAX_LOCKDEP_CHAIN_HLOCKS);
+ 
+ #ifdef CONFIG_TRACE_IRQFLAGS
+@@ -661,6 +750,9 @@ static const struct file_operations proc_lock_stat_operations = {
+       entry = create_proc_entry("lockdep", S_IRUSR, NULL);
+       if (entry)
+               entry->proc_fops = &proc_lockdep_operations;
++      entry = create_proc_entry("lockdep_chains", S_IRUSR, NULL);
++   if (entry)
++              entry->proc_fops = &proc_lockdep_chains_operations;
+ 
+       entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL);
+       if (entry)
+
diff --git a/lustre/kernel_patches/series/2.6-rhel4.series b/lustre/kernel_patches/series/2.6-rhel4.series

index d058295..49fe38a 100644 (file)
--- a/lustre/kernel_patches/series/2.6-rhel4.series
+++ b/lustre/kernel_patches/series/2.6-rhel4.series
@@ -29,6 +29,7 @@ quota-deadlock-on-pagelock-core.patch
  quota-umount-race-fix.patch
  quota-deadlock-on-pagelock-ext3.patch
  export-nr_free_buffer_pages.patch
+2.6-rhel4-kgdb-ga.patch 
  vfs-keep-inode-hashed-for-clear-inode.patch
  modpost_external_module_updates_rhel4.patch
  mpt-fusion-downgrade-to-3_02_73-rhel4.patch
diff --git a/lustre/kernel_patches/series/2.6.18-vanilla.series b/lustre/kernel_patches/series/2.6.18-vanilla.series

index 9253a3e..a9b79b9 100644 (file)
--- a/lustre/kernel_patches/series/2.6.18-vanilla.series
+++ b/lustre/kernel_patches/series/2.6.18-vanilla.series
@@ -15,3 +15,4 @@ jbd-16tb-overflow-fixes.patch
  jbd-check-for-unmapped-buffer.patch
  jbd-stats-2.6-rhel5.patch
  export-nr_free_buffer_pages.patch
+kgdb-2.6.18-vanilla.patch
diff --git a/lustre/lclient/Makefile.am b/lustre/lclient/Makefile.am

new file mode 100644 (file)

index 0000000..a6e1548
--- /dev/null
+++ b/lustre/lclient/Makefile.am
@@ -0,0 +1 @@
+EXTRA_DIST=glimpse.c lcommon_cl.c
diff --git a/lustre/lclient/glimpse.c b/lustre/lclient/glimpse.c

new file mode 100644 (file)

index 0000000..78acee6
--- /dev/null
+++ b/lustre/lclient/glimpse.c
@@ -0,0 +1,253 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * glimpse code shared between vvp and liblustre (and other Lustre clients in
+ * the future).
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ *   Author: Oleg Drokin <oleg.drokin@sun.com>
+ */
+
+#include <libcfs/libcfs.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <obd.h>
+
+#ifdef __KERNEL__
+# include <lustre_dlm.h>
+# include <lustre_lite.h>
+# include <lustre_mdc.h>
+# include <linux/pagemap.h>
+# include <linux/file.h>
+#else
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <fcntl.h>
+# include <sysio.h>
+# ifdef HAVE_XTIO_H
+#  include <xtio.h>
+# endif
+# include <fs.h>
+# include <mount.h>
+# include <inode.h>
+# ifdef HAVE_FILE_H
+#  include <file.h>
+# endif
+# include <liblustre.h>
+#endif
+
+#include "cl_object.h"
+#include "lclient.h"
+#ifdef __KERNEL__
+# include "../llite/llite_internal.h"
+#else
+# include "../liblustre/llite_lib.h"
+#endif
+
+/*
+ * Lock descriptor template spanning the whole file, [0, CL_PAGE_EOF], in
+ * CLM_READ mode. The functions in this file copy it into a per-environment
+ * descriptor and then fill in cld_obj (the glimpse path also overrides
+ * cld_mode).
+ */
+static const struct cl_lock_descr whole_file = {
+        .cld_start = 0,
+        .cld_end   = CL_PAGE_EOF,
+        .cld_mode  = CLM_READ
+};
+
+/**
+ * Glimpses the size of \a inode through \a clob.
+ *
+ * Returns 0 without doing anything when LLIF_MDS_SIZE_LOCK is set in the
+ * inode flags or when the inode has no lli_smd. Otherwise a CLM_PHANTOM lock
+ * over the whole file is requested with CEF_ASYNC; if cl_wait() on it
+ * succeeds, the lvb is merged into the inode with cl_merge_lvb().
+ *
+ * \retval 0      nothing to glimpse, or glimpse succeeded
+ * \retval other  PTR_ERR() of the failed lock request, or cl_wait() result
+ */
+int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io,
+                    struct inode *inode, struct cl_object *clob)
+{
+        struct cl_lock_descr *descr = &ccc_env_info(env)->cti_descr;
+        struct cl_inode_info *lli   = cl_i2info(inode);
+        const struct lu_fid  *fid   = lu_object_fid(&clob->co_lu);
+        struct ccc_io        *cio   = ccc_env_io(env);
+        struct cl_lock       *lock;
+        int rc;
+
+        ENTRY;
+        if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
+                RETURN(0);
+
+        CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid));
+        if (lli->lli_smd == NULL) {
+                CDEBUG(D_DLMTRACE, "No objects for inode\n");
+                RETURN(0);
+        }
+
+        /*
+         * NOTE: this looks like DLM lock request, but it may not be one.
+         *       Due to CEF_ASYNC flag (translated to LDLM_FL_HAS_INTENT by
+         *       osc), this is glimpse request, that won't revoke any
+         *       conflicting DLM locks held. Instead, ll_glimpse_callback()
+         *       will be called on each client holding a DLM lock against
+         *       this file, and resulting size will be returned for each
+         *       stripe. DLM lock on [0, EOF] is acquired only if there were
+         *       no conflicting locks. If there were conflicting locks,
+         *       enqueuing or waiting fails with -ENAVAIL, but valid inode
+         *       attributes are returned anyway.
+         */
+        *descr = whole_file;
+        descr->cld_obj  = clob;
+        descr->cld_mode = CLM_PHANTOM;
+
+        /*
+         * The lockreq for glimpse should be mandatory, otherwise, osc may
+         * decide to use lockless.
+         */
+        io->ci_lockreq = CILR_MANDATORY;
+
+        /* cui_glimpse is raised only for the duration of the request. */
+        cio->cui_glimpse = 1;
+        lock = cl_lock_request(env, io, descr, CEF_ASYNC,
+                               "glimpse", cfs_current());
+        cio->cui_glimpse = 0;
+        if (IS_ERR(lock))
+                RETURN(PTR_ERR(lock));
+
+        rc = cl_wait(env, lock);
+        if (rc == 0) {
+                cl_merge_lvb(inode);
+                cl_unuse(env, lock);
+        }
+        cl_lock_release(env, lock, "glimpse", cfs_current());
+
+        RETURN(rc);
+}
+
+/**
+ * Obtains an environment and the per-environment cl_io for \a inode.
+ *
+ * \retval +1  regular file: \a envout and \a ioout are set, and the io's
+ *             ci_obj points to the inode's lli_clob
+ * \retval 0   not a regular file; output arguments are left untouched
+ * \retval -ve PTR_ERR() of the environment that cl_env_get() failed to return
+ */
+static int cl_io_get(struct inode *inode, struct lu_env **envout,
+                     struct cl_io **ioout, int *refcheck)
+{
+        struct cl_inode_info *lli  = cl_i2info(inode);
+        struct cl_object     *clob = lli->lli_clob;
+        struct lu_env        *env;
+        struct cl_io         *io;
+
+        if (!S_ISREG(cl_inode_mode(inode)))
+                return 0;
+
+        env = cl_env_get(refcheck);
+        if (IS_ERR(env))
+                return PTR_ERR(env);
+
+        io = &ccc_env_info(env)->cti_io;
+        io->ci_obj = clob;
+        *envout = env;
+        *ioout  = io;
+        return +1;
+}
+
+/**
+ * Glimpses the size of \a inode: sets up a CIT_MISC io on the inode's
+ * cl_object and runs cl_glimpse_lock() on it.
+ *
+ * Returns the cl_io_get() result when it is not positive, io->ci_result
+ * when cl_io_init() reports that there is nothing to do, the negative
+ * cl_io_init() error, or otherwise the result of cl_glimpse_lock().
+ */
+int cl_glimpse_size(struct inode *inode)
+{
+        /*
+         * We don't need ast_flags argument to cl_glimpse_size(), because
+         * osc_lock_enqueue() takes care of the possible deadlock that said
+         * argument was introduced to avoid.
+         */
+        /*
+         * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to
+         * cl_glimpse_size(), which doesn't make sense: glimpse locks are not
+         * blocking anyway.
+         */
+        struct lu_env *env;
+        struct cl_io  *io;
+        int            refcheck;
+        int            rc;
+
+        ENTRY;
+
+        rc = cl_io_get(inode, &env, &io, &refcheck);
+        if (rc <= 0)
+                RETURN(rc);
+
+        rc = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+        if (rc > 0) {
+                /*
+                 * nothing to do for this io. This currently happens when
+                 * stripe sub-object's are not yet created.
+                 */
+                rc = io->ci_result;
+        } else if (rc == 0) {
+                rc = cl_glimpse_lock(env, io, inode, io->ci_obj);
+        }
+
+        /* The io and the environment are released on every path from here. */
+        cl_io_fini(env, io);
+        cl_env_put(env, &refcheck);
+        RETURN(rc);
+}
+
+/**
+ * Refreshes the size attributes of \a inode from a lock found by
+ * cl_lock_peek() on the whole-file extent, merging its lvb into the inode.
+ *
+ * \retval 0        attributes merged, or the inode has no stripe objects
+ *                  (no lli_smd, or a zero stripe count), or it is not a
+ *                  regular file
+ * \retval -ENODATA cl_lock_peek() found no lock
+ * \retval other    error from cl_io_get() or cl_io_init(), or io->ci_result
+ *                  when cl_io_init() reports that there is nothing to do
+ */
+int cl_local_size(struct inode *inode)
+{
+        struct cl_inode_info    *lli = cl_i2info(inode);
+        struct lu_env           *env;
+        struct cl_io            *io;
+        struct cl_object        *clob;
+        struct cl_lock_descr    *descr;
+        struct cl_lock          *lock;
+        int                      result;
+        int                      refcheck;
+
+        ENTRY;
+
+        /*
+         * XXX layering violation.
+         *
+         * lli_smd may be NULL (cl_glimpse_lock() tests for that as well), so
+         * it has to be checked before the stripe count is read.
+         */
+        if (lli->lli_smd == NULL || lli->lli_smd->lsm_stripe_count == 0)
+                RETURN(0);
+
+        result = cl_io_get(inode, &env, &io, &refcheck);
+        if (result <= 0)
+                RETURN(result);
+
+        clob = io->ci_obj;
+        result = cl_io_init(env, io, CIT_MISC, clob);
+        if (result > 0)
+                result = io->ci_result;
+        else if (result == 0) {
+                descr = &ccc_env_info(env)->cti_descr;
+
+                *descr = whole_file;
+                descr->cld_obj = clob;
+                lock = cl_lock_peek(env, io, descr, "localsize", cfs_current());
+                if (lock != NULL) {
+                        cl_merge_lvb(inode);
+                        cl_unuse(env, lock);
+                        cl_lock_release(env, lock, "localsize", cfs_current());
+                        result = 0;
+                } else
+                        result = -ENODATA;
+        }
+        cl_io_fini(env, io);
+        cl_env_put(env, &refcheck);
+        RETURN(result);
+}
+
diff --git a/lustre/lclient/lcommon_cl.c b/lustre/lclient/lcommon_cl.c

new file mode 100644 (file)

index 0000000..6b56b4e
--- /dev/null
+++ b/lustre/lclient/lcommon_cl.c
@@ -0,0 +1,1188 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl code shared between vvp and liblustre (and other Lustre clients in the
+ * future).
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#ifdef __KERNEL__
+# include <libcfs/libcfs.h>
+# include <linux/fs.h>
+# include <linux/sched.h>
+# include <linux/mm.h>
+# include <linux/smp_lock.h>
+# include <linux/quotaops.h>
+# include <linux/highmem.h>
+# include <linux/pagemap.h>
+# include <linux/rbtree.h>
+#else /* __KERNEL__ */
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <fcntl.h>
+# include <sysio.h>
+# ifdef HAVE_XTIO_H
+#  include <xtio.h>
+# endif
+# include <fs.h>
+# include <mount.h>
+# include <inode.h>
+# ifdef HAVE_FILE_H
+#  include <file.h>
+# endif
+# include <liblustre.h>
+#endif
+
+#include <obd.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_ver.h>
+#include <lustre_mdc.h>
+#include <cl_object.h>
+
+#include <lclient.h>
+
+#ifdef __KERNEL__
+#include "../llite/llite_internal.h"
+#else
+#include "../liblustre/llite_lib.h"
+#endif
+
+const struct cl_req_operations ccc_req_ops;
+
+/*
+ * ccc_ prefix stands for "Common Client Code".
+ */
+
+static cfs_mem_cache_t *ccc_lock_kmem;
+static cfs_mem_cache_t *ccc_object_kmem;
+static cfs_mem_cache_t *ccc_thread_kmem;
+static cfs_mem_cache_t *ccc_session_kmem;
+static cfs_mem_cache_t *ccc_req_kmem;
+
+/*
+ * Descriptors of the slab caches declared above: each entry names a cache,
+ * the object size it serves and the variable that receives the cache
+ * pointer. The table ends with an entry whose ckd_cache is NULL and is
+ * handed to lu_kmem_init() / lu_kmem_fini() by ccc_global_init() /
+ * ccc_global_fini().
+ */
+static struct lu_kmem_descr ccc_caches[] = {
+        {
+                .ckd_cache = &ccc_lock_kmem,
+                .ckd_name  = "ccc_lock_kmem",
+                .ckd_size  = sizeof (struct ccc_lock)
+        },
+        {
+                .ckd_cache = &ccc_object_kmem,
+                .ckd_name  = "ccc_object_kmem",
+                .ckd_size  = sizeof (struct ccc_object)
+        },
+        {
+                .ckd_cache = &ccc_thread_kmem,
+                .ckd_name  = "ccc_thread_kmem",
+                .ckd_size  = sizeof (struct ccc_thread_info),
+        },
+        {
+                .ckd_cache = &ccc_session_kmem,
+                .ckd_name  = "ccc_session_kmem",
+                .ckd_size  = sizeof (struct ccc_session)
+        },
+        {
+                .ckd_cache = &ccc_req_kmem,
+                .ckd_name  = "ccc_req_kmem",
+                .ckd_size  = sizeof (struct ccc_req)
+        },
+        {
+                /* terminator */
+                .ckd_cache = NULL
+        }
+};
+
+/*****************************************************************************
+ *
+ * Vvp device and device type functions.
+ *
+ */
+
+/**
+ * lu_context_key constructor: allocates a ccc_thread_info from
+ * ccc_thread_kmem.
+ *
+ * \retval the new ccc_thread_info, or ERR_PTR(-ENOMEM) if allocation failed
+ */
+void *ccc_key_init(const struct lu_context *ctx,
+                          struct lu_context_key *key)
+{
+        struct ccc_thread_info *tinfo;
+
+        OBD_SLAB_ALLOC_PTR(tinfo, ccc_thread_kmem);
+        if (tinfo != NULL)
+                return tinfo;
+        return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * lu_context_key destructor: returns the ccc_thread_info passed in \a data
+ * to ccc_thread_kmem.
+ */
+void ccc_key_fini(const struct lu_context *ctx,
+                         struct lu_context_key *key, void *data)
+{
+        /* typed alias of \a data for the free macro below */
+        struct ccc_thread_info *info = data;
+        OBD_SLAB_FREE_PTR(info, ccc_thread_kmem);
+}
+
+/**
+ * lu_context_key constructor for session contexts: allocates a ccc_session
+ * from ccc_session_kmem.
+ *
+ * \retval the new ccc_session, or ERR_PTR(-ENOMEM) if allocation failed
+ */
+void *ccc_session_key_init(const struct lu_context *ctx,
+                                  struct lu_context_key *key)
+{
+        struct ccc_session *ses;
+
+        OBD_SLAB_ALLOC_PTR(ses, ccc_session_kmem);
+        if (ses != NULL)
+                return ses;
+        return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * lu_context_key destructor for session contexts: returns the ccc_session
+ * passed in \a data to ccc_session_kmem.
+ */
+void ccc_session_key_fini(const struct lu_context *ctx,
+                                 struct lu_context_key *key, void *data)
+{
+        /* typed alias of \a data for the free macro below */
+        struct ccc_session *session = data;
+        OBD_SLAB_FREE_PTR(session, ccc_session_kmem);
+}
+
+/*
+ * Context key tagged LCT_CL_THREAD whose value is a ccc_thread_info,
+ * created by ccc_key_init() and destroyed by ccc_key_fini().
+ */
+struct lu_context_key ccc_key = {
+        .lct_tags = LCT_CL_THREAD,
+        .lct_init = ccc_key_init,
+        .lct_fini = ccc_key_fini
+};
+
+/*
+ * Context key tagged LCT_SESSION whose value is a ccc_session, created by
+ * ccc_session_key_init() and destroyed by ccc_session_key_fini().
+ */
+struct lu_context_key ccc_session_key = {
+        .lct_tags = LCT_SESSION,
+        .lct_init = ccc_session_key_init,
+        .lct_fini = ccc_session_key_fini
+};
+
+
+/* type constructor/destructor: ccc_type_{init,fini,start,stop}(). */
+// LU_TYPE_INIT_FINI(ccc, &ccc_key, &ccc_session_key);
+
+/**
+ * Stacks \a d on top of \a next: records \a next as the lower device,
+ * propagates the site of \a d to it and runs its ldto_device_init() method.
+ * On success a reference on \a next is taken and registered under the
+ * "lu-stack" scope.
+ *
+ * \retval result of the lower device's ldto_device_init()
+ */
+int ccc_device_init(const struct lu_env *env, struct lu_device *d,
+                           const char *name, struct lu_device *next)
+{
+        struct ccc_device *cdev;
+        int                rc;
+        ENTRY;
+
+        cdev = lu2ccc_dev(d);
+        cdev->cdv_next = lu2cl_dev(next);
+
+        LASSERT(d->ld_site != NULL && next->ld_type != NULL);
+        next->ld_site = d->ld_site;
+
+        /* The lower device is initialized under its own type name. */
+        rc = next->ld_type->ldt_ops->ldto_device_init(
+                        env, next, next->ld_type->ldt_name, NULL);
+        if (rc != 0)
+                RETURN(rc);
+
+        lu_device_get(next);
+        lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init);
+        RETURN(rc);
+}
+
+/**
+ * Returns the device stacked below \a d (its cdv_next) as a lu_device.
+ * Nothing else is done here.
+ */
+struct lu_device *ccc_device_fini(const struct lu_env *env,
+                                         struct lu_device *d)
+{
+        struct ccc_device *cdev = lu2ccc_dev(d);
+
+        return cl2lu_dev(cdev->cdv_next);
+}
+
+/**
+ * Allocates a ccc_device of type \a t with the given operation vectors,
+ * together with a cl_site for it.
+ *
+ * \retval the embedded lu_device on success
+ * \retval ERR_PTR(-ENOMEM) if the device or the site cannot be allocated
+ * \retval ERR_PTR(rc) if cl_site_init() or lu_site_init_finish() fails; the
+ *         device is released through ccc_device_free() in that case
+ */
+struct lu_device *ccc_device_alloc(const struct lu_env *env,
+                                   struct lu_device_type *t,
+                                   struct lustre_cfg *cfg,
+                                   const struct lu_device_operations *luops,
+                                   const struct cl_device_operations *clops)
+{
+        struct ccc_device *cdev;
+        struct lu_device  *lud;
+        struct cl_site    *site;
+        int rc;
+        ENTRY;
+
+        OBD_ALLOC_PTR(cdev);
+        if (cdev == NULL)
+                RETURN(ERR_PTR(-ENOMEM));
+
+        lud = &cdev->cdv_cl.cd_lu_dev;
+        cl_device_init(&cdev->cdv_cl, t);
+        ccc2lu_dev(cdev)->ld_ops = luops;
+        cdev->cdv_cl.cd_ops = clops;
+
+        OBD_ALLOC_PTR(site);
+        if (site == NULL) {
+                rc = -ENOMEM;
+        } else {
+                rc = cl_site_init(site, &cdev->cdv_cl);
+                if (rc != 0) {
+                        /* The site never got attached: free it right here. */
+                        LASSERT(lud->ld_site == NULL);
+                        CERROR("Cannot init lu_site, rc %d.\n", rc);
+                        OBD_FREE_PTR(site);
+                } else {
+                        rc = lu_site_init_finish(&site->cs_lu);
+                }
+        }
+
+        if (rc == 0)
+                RETURN(lud);
+
+        ccc_device_free(env, lud);
+        RETURN(ERR_PTR(rc));
+}
+
+/**
+ * Releases \a d: finalizes and frees its site when one is attached,
+ * finalizes the cl_device and frees the ccc_device itself.
+ *
+ * \retval the device that was stacked below \a d
+ */
+struct lu_device *ccc_device_free(const struct lu_env *env,
+                                         struct lu_device *d)
+{
+        struct ccc_device *cdev  = lu2ccc_dev(d);
+        struct cl_site    *csite = lu2cl_site(d->ld_site);
+        /* Read before \a cdev is freed below. */
+        struct lu_device  *below = cl2lu_dev(cdev->cdv_next);
+
+        if (d->ld_site != NULL) {
+                cl_site_fini(csite);
+                OBD_FREE_PTR(csite);
+        }
+
+        cl_device_fini(lu2cl_dev(d));
+        OBD_FREE_PTR(cdev);
+
+        return below;
+}
+
+/**
+ * Allocates a ccc_req from ccc_req_kmem and adds it to \a req as the slice
+ * of \a dev, with ccc_req_ops as its operations.
+ *
+ * \retval 0 on success, -ENOMEM if the slice cannot be allocated
+ */
+int ccc_req_init(const struct lu_env *env, struct cl_device *dev,
+                        struct cl_req *req)
+{
+        struct ccc_req *creq;
+
+        OBD_SLAB_ALLOC_PTR(creq, ccc_req_kmem);
+        if (creq == NULL)
+                return -ENOMEM;
+
+        cl_req_slice_add(req, &creq->crq_cl, dev, &ccc_req_ops);
+        return 0;
+}
+
+/**
+ * An `emergency' environment used by ccc_inode_fini() when cl_env_get()
+ * fails. Access to this environment is serialized by ccc_inode_fini_guard
+ * mutex.
+ */
+static struct lu_env *ccc_inode_fini_env = NULL;
+
+/**
+ * A mutex serializing calls to ccc_inode_fini() under extreme memory
+ * pressure, when environments cannot be allocated.
+ */
+static DEFINE_MUTEX(ccc_inode_fini_guard);
+static int dummy_refcheck;
+
+/**
+ * Sets up the module-wide state of the common client code: the slab caches
+ * listed in ccc_caches, the device type \a device_type and the emergency
+ * environment ccc_inode_fini_env.
+ *
+ * On failure everything set up so far is undone, so the caller must not call
+ * ccc_global_fini() after an error.
+ *
+ * \retval 0 on success, negative error code otherwise
+ */
+int ccc_global_init(struct lu_device_type *device_type)
+{
+        int result;
+
+        result = lu_kmem_init(ccc_caches);
+        if (result != 0)
+                return result;
+
+        result = lu_device_type_init(device_type);
+        if (result != 0)
+                goto out_kmem;
+
+        ccc_inode_fini_env = cl_env_alloc(&dummy_refcheck,
+                                          LCT_REMEMBER|LCT_NOREF);
+        if (IS_ERR(ccc_inode_fini_env)) {
+                result = PTR_ERR(ccc_inode_fini_env);
+                /*
+                 * Do not leave an error pointer behind: ccc_global_fini()
+                 * only tests ccc_inode_fini_env against NULL.
+                 */
+                ccc_inode_fini_env = NULL;
+                goto out_device;
+        }
+
+        /* NOTE(review): meaning of the 0x4 cookie is not visible here. */
+        ccc_inode_fini_env->le_ctx.lc_cookie = 0x4;
+        return 0;
+
+out_device:
+        lu_device_type_fini(device_type);
+out_kmem:
+        lu_kmem_fini(ccc_caches);
+        return result;
+}
+
+/**
+ * Tears down what ccc_global_init() set up, in reverse order: the emergency
+ * environment (if one is held), the device type \a device_type and the slab
+ * caches listed in ccc_caches.
+ */
+void ccc_global_fini(struct lu_device_type *device_type)
+{
+        if (ccc_inode_fini_env != NULL) {
+                cl_env_put(ccc_inode_fini_env, &dummy_refcheck);
+                /* reset so that a repeated call does not put it twice */
+                ccc_inode_fini_env = NULL;
+        }
+        lu_device_type_fini(device_type);
+        lu_kmem_fini(ccc_caches);
+}
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+/**
+ * Allocates a ccc_object from ccc_object_kmem and initializes it as the top
+ * layer of its own, embedded, object header, using \a clops and \a luops as
+ * its cl_object and lu_object operations.
+ *
+ * \retval the embedded lu_object, or NULL if allocation failed
+ */
+struct lu_object *ccc_object_alloc(const struct lu_env *env,
+                                   const struct lu_object_header *_,
+                                   struct lu_device *dev,
+                                   const struct cl_object_operations *clops,
+                                   const struct lu_object_operations *luops)
+{
+        struct cl_object_header *hdr;
+        struct ccc_object       *cobj;
+        struct lu_object        *obj;
+
+        OBD_SLAB_ALLOC_PTR(cobj, ccc_object_kmem);
+        if (cobj == NULL)
+                return NULL;
+
+        obj = ccc2lu(cobj);
+        hdr = &cobj->cob_header;
+
+        /* The header lives inside the ccc_object itself. */
+        cl_object_header_init(hdr);
+        lu_object_init(obj, &hdr->coh_lu, dev);
+        lu_object_add_top(&hdr->coh_lu, obj);
+
+        cobj->cob_cl.co_ops = clops;
+        obj->lo_ops = luops;
+        return obj;
+}
+
+/**
+ * Sets the initial state of \a vob: remembers the inode carried by \a conf
+ * and resets the transient page counter. Always returns 0.
+ */
+int ccc_object_init0(const struct lu_env *env,
+                            struct ccc_object *vob,
+                            const struct cl_object_conf *conf)
+{
+        vob->cob_inode = conf->coc_inode;
+        vob->cob_transient_pages = 0;
+        return 0;
+}
+
+/**
+ * Initializes \a obj: asks the device stacked below to allocate its layer
+ * of the object, links that layer under \a obj and finishes with
+ * ccc_object_init0().
+ *
+ * \retval -ENOMEM if the lower layer cannot be allocated
+ * \retval result of ccc_object_init0() otherwise
+ */
+int ccc_object_init(const struct lu_env *env, struct lu_object *obj,
+                           const struct lu_object_conf *conf)
+{
+        struct ccc_device           *cdev = lu2ccc_dev(obj->lo_dev);
+        struct ccc_object           *cobj = lu2ccc(obj);
+        const struct cl_object_conf *cconf;
+        struct lu_device            *under;
+        struct lu_object            *below;
+
+        under = &cdev->cdv_next->cd_lu_dev;
+        below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under);
+        if (below == NULL)
+                return -ENOMEM;
+
+        cconf = lu2cl_conf(conf);
+        CFS_INIT_LIST_HEAD(&cobj->cob_pending_list);
+        lu_object_add(obj, below);
+        return ccc_object_init0(env, cobj, cconf);
+}
+
+/**
+ * Tears down a ccc object: finalizes the lu_object, then its header, and
+ * finally returns the containing ccc_object to the ccc_object_kmem slab.
+ */
+void ccc_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+        struct ccc_object *cob;
+
+        cob = lu2ccc(obj);
+
+        lu_object_fini(obj);
+        lu_object_header_fini(obj->lo_header);
+        OBD_SLAB_FREE_PTR(cob, ccc_object_kmem);
+}
+
+/**
+ * Allocates the ccc slice of \a lock from ccc_lock_kmem and attaches it to
+ * the lock with the operations \a lkops. The io argument ("_") is not used.
+ *
+ * \retval 0       slice allocated and added
+ * \retval -ENOMEM slab allocation failed
+ */
+int ccc_lock_init(const struct lu_env *env,
+                  struct cl_object *obj, struct cl_lock *lock,
+                  const struct cl_io *_,
+                  const struct cl_lock_operations *lkops)
+{
+        struct ccc_lock *slice;
+
+        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+        OBD_SLAB_ALLOC_PTR(slice, ccc_lock_kmem);
+        if (slice == NULL)
+                return -ENOMEM;
+
+        cl_lock_slice_add(lock, &slice->clk_cl, obj, lkops);
+        return 0;
+}
+
+/**
+ * Attribute-setting stub of the ccc layer: none of the arguments are used
+ * and success is reported unconditionally.
+ *
+ * \retval 0 always
+ */
+int ccc_attr_set(const struct lu_env *env, struct cl_object *obj,
+                 const struct cl_attr *attr, unsigned valid)
+{
+        return 0;
+}
+
+/**
+ * Copies the modification, access and change times of the inode backing
+ * \a obj into \a lvb. No other field of \a lvb is written.
+ *
+ * \retval 0 always
+ */
+int ccc_object_glimpse(const struct lu_env *env,
+                       const struct cl_object *obj, struct ost_lvb *lvb)
+{
+        struct inode *host = ccc_object_inode(obj);
+
+        ENTRY;
+        lvb->lvb_ctime = cl_inode_ctime(host);
+        lvb->lvb_atime = cl_inode_atime(host);
+        lvb->lvb_mtime = cl_inode_mtime(host);
+        RETURN(0);
+}
+
+
+
+/**
+ * Configuration-change hook of the ccc layer. Currently does nothing and
+ * always succeeds (see the TODO in the body).
+ *
+ * \retval 0 always
+ */
+int ccc_conf_set(const struct lu_env *env, struct cl_object *obj,
+                        const struct cl_object_conf *conf)
+{
+        /* TODO: destroy all pages attached to this object. */
+        return 0;
+}
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+
+/**
+ * Returns the cfs_page_t stored in the ccc page that contains \a slice.
+ * Thin wrapper around cl2vm_page(); \a env is not used.
+ */
+cfs_page_t *ccc_page_vmpage(const struct lu_env *env,
+                            const struct cl_page_slice *slice)
+{
+        return cl2vm_page(slice);
+}
+
+/**
+ * Checks the page behind \a slice against the locks of \a io.
+ *
+ * Only CIT_READ, CIT_WRITE and CIT_FAULT io is examined; any other io type
+ * yields 0. For those three types the result is -EBUSY when
+ * LL_FILE_GROUP_LOCKED is set in the flags of the io's file descriptor, or
+ * when cl_queue_match() finds, in io->ci_lockset.cls_done, a match for a
+ * CLM_READ descriptor covering exactly the page's index.
+ *
+ * \retval -EBUSY see above
+ * \retval 0      otherwise
+ */
+int ccc_page_is_under_lock(const struct lu_env *env,
+                           const struct cl_page_slice *slice,
+                           struct cl_io *io)
+{
+        struct ccc_io        *cio  = ccc_env_io(env);
+        struct cl_lock_descr *need = &ccc_env_info(env)->cti_descr;
+        struct cl_page       *pg   = slice->cpl_page;
+
+        ENTRY;
+
+        if (io->ci_type != CIT_READ && io->ci_type != CIT_WRITE &&
+            io->ci_type != CIT_FAULT)
+                RETURN(0);
+
+        if (cio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED)
+                RETURN(-EBUSY);
+
+        /* Describe a single-page read lock and look it up. */
+        need->cld_start = pg->cp_index;
+        need->cld_end   = pg->cp_index;
+        need->cld_obj   = pg->cp_obj;
+        need->cld_mode  = CLM_READ;
+        if (cl_queue_match(&io->ci_lockset.cls_done, need))
+                RETURN(-EBUSY);
+
+        RETURN(0);
+}
+
+/**
+ * Handler that is not expected to be reached: it calls LBUG()
+ * unconditionally. The trailing return only satisfies the int signature.
+ */
+int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice)
+{
+        /*
+         * Cached read?
+         */
+        LBUG();
+        return 0;
+}
+
+/**
+ * Consistency-check hook for transient pages. The body is empty, so no check
+ * is performed at present.
+ */
+void ccc_transient_page_verify(const struct cl_page *page)
+{
+}
+
+/**
+ * "own" hook for transient pages: only runs ccc_transient_page_verify() on
+ * the page behind \a slice. \a env and the io argument ("_") are not used.
+ */
+void ccc_transient_page_own(const struct lu_env *env,
+                                   const struct cl_page_slice *slice,
+                                   struct cl_io *_)
+{
+        ccc_transient_page_verify(slice->cpl_page);
+}
+
+/**
+ * "assume" hook for transient pages: only runs ccc_transient_page_verify()
+ * on the page behind \a slice. \a env and the io argument ("_") are not used.
+ */
+void ccc_transient_page_assume(const struct lu_env *env,
+                                      const struct cl_page_slice *slice,
+                                      struct cl_io *_)
+{
+        ccc_transient_page_verify(slice->cpl_page);
+}
+
+/**
+ * "unassume" hook for transient pages: only runs ccc_transient_page_verify()
+ * on the page behind \a slice. \a env and the io argument ("_") are not used.
+ */
+void ccc_transient_page_unassume(const struct lu_env *env,
+                                        const struct cl_page_slice *slice,
+                                        struct cl_io *_)
+{
+        ccc_transient_page_verify(slice->cpl_page);
+}
+
+/**
+ * "disown" hook for transient pages: only runs ccc_transient_page_verify()
+ * on the page behind \a slice. \a env and the io argument ("_") are not used.
+ */
+void ccc_transient_page_disown(const struct lu_env *env,
+                                      const struct cl_page_slice *slice,
+                                      struct cl_io *_)
+{
+        ccc_transient_page_verify(slice->cpl_page);
+}
+
+/**
+ * "discard" hook for transient pages: verifies the page behind \a slice and
+ * then deletes it with cl_page_delete(). The io argument ("_") is not used.
+ */
+void ccc_transient_page_discard(const struct lu_env *env,
+                                const struct cl_page_slice *slice,
+                                struct cl_io *_)
+{
+        struct cl_page *pg = slice->cpl_page;
+
+        ccc_transient_page_verify(pg);
+
+        /* For transient pages, remove it from the radix tree. */
+        cl_page_delete(env, pg);
+}
+
+/**
+ * "prep" hook for transient pages. None of the arguments are examined.
+ *
+ * \retval 0 always
+ */
+int ccc_transient_page_prep(const struct lu_env *env,
+                                   const struct cl_page_slice *slice,
+                                   struct cl_io *_)
+{
+        ENTRY;
+        /* transient page should always be sent. */
+        RETURN(0);
+}
+
+/*****************************************************************************
+ *
+ * Lock operations.
+ *
+ */
+
+/**
+ * Releases the ccc slice of a lock: returns the ccc_lock containing
+ * \a slice to the ccc_lock_kmem slab.
+ */
+void ccc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice)
+{
+        struct ccc_lock *owner;
+
+        CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+
+        owner = cl2ccc_lock(slice);
+        OBD_SLAB_FREE_PTR(owner, ccc_lock_kmem);
+}
+
+/**
+ * Enqueue hook of the ccc lock slice. Only checks the object invariant; the
+ * io argument ("_") and \a enqflags are not used.
+ *
+ * \retval 0 always
+ */
+int ccc_lock_enqueue(const struct lu_env *env,
+                     const struct cl_lock_slice *slice,
+                     struct cl_io *_, __u32 enqflags)
+{
+        CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+        return 0;
+}
+
+/**
+ * Unuse hook of the ccc lock slice. Only checks the object invariant.
+ *
+ * \retval 0 always
+ */
+int ccc_lock_unuse(const struct lu_env *env, const struct cl_lock_slice *slice)
+{
+        CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+        return 0;
+}
+
+/**
+ * Wait hook of the ccc lock slice. Only checks the object invariant.
+ *
+ * \retval 0 always
+ */
+int ccc_lock_wait(const struct lu_env *env, const struct cl_lock_slice *slice)
+{
+        CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+        return 0;
+}
+
+/**
+ * Implementation of the cl_lock_operations::clo_fits_into() method for the
+ * ccc layer. This function is executed every time an io finds an existing
+ * lock in the lock cache while creating a new lock, and has to decide
+ * whether the cached lock "fits" into the io.
+ *
+ * \param slice cached lock to be checked
+ *
+ * \param need  description of the lock being requested; only its mode is
+ *              examined here
+ *
+ * \param io    IO that wants a lock.
+ *
+ * \retval 1 the cached lock fits
+ * \retval 0 the cached lock does not fit
+ *
+ * \see lov_lock_fits_into().
+ */
+int ccc_lock_fits_into(const struct lu_env *env,
+                       const struct cl_lock_slice *slice,
+                       const struct cl_lock_descr *need,
+                       const struct cl_io *io)
+{
+        const struct cl_lock *cached = slice->cls_lock;
+        const struct ccc_io  *cio    = ccc_env_io(env);
+
+        ENTRY;
+
+        /*
+         * Work around DLM peculiarity: it assumes that glimpse
+         * (LDLM_FL_HAS_INTENT) lock is always LCK_PR, and returns a read
+         * lock when asked for LCK_PW lock with LDLM_FL_HAS_INTENT flag set.
+         * Make sure that glimpse doesn't get CLM_WRITE top-lock, so that it
+         * doesn't enqueue CLM_WRITE sub-locks.
+         */
+        if (cio->cui_glimpse)
+                RETURN(cached->cll_descr.cld_mode != CLM_WRITE);
+
+        /*
+         * Also, don't match incomplete write locks for read, otherwise read
+         * would enqueue missing sub-locks in the write mode.
+         *
+         * XXX this is a candidate for generic locking policy, to be moved
+         * into cl_lock_lookup().
+         */
+        if (need->cld_mode != cached->cll_descr.cld_mode)
+                RETURN(cached->cll_state >= CLS_ENQUEUED);
+
+        RETURN(1);
+}
+
+/**
+ * Implements cl_lock_operations::clo_state() method for the ccc layer,
+ * invoked whenever lock state changes. Transfers object attributes that
+ * might have been updated as a result of lock acquisition into the inode.
+ *
+ * Work is done only on a transition into CLS_HELD from a lower state. In
+ * that case the object attributes are fetched and the inode's mtime, atime
+ * and ctime are overwritten; i_size is set to the KMS only when the lock
+ * covers the whole file ([0, CL_PAGE_EOF]). A failure to fetch attributes
+ * is logged and otherwise ignored.
+ */
+void ccc_lock_state(const struct lu_env *env,
+                    const struct cl_lock_slice *slice,
+                    enum cl_lock_state state)
+{
+        struct cl_lock   *lock;
+        struct cl_object *obj;
+        struct inode     *inode;
+        struct cl_attr   *attr;
+
+        ENTRY;
+        lock = slice->cls_lock;
+
+        /*
+         * Refresh inode attributes when the lock is moving into CLS_HELD
+         * state, and only when this is a result of real enqueue, rather than
+         * of finding lock in the cache.
+         */
+        if (state == CLS_HELD && lock->cll_state < CLS_HELD) {
+                int rc;
+
+                obj   = slice->cls_obj;
+                inode = ccc_object_inode(obj);
+                attr  = &ccc_env_info(env)->cti_attr;
+
+                /* vmtruncate()->ll_truncate() first sets the i_size and then
+                 * the kms under both a DLM lock and the
+                 * ll_inode_size_lock().  If we don't get the
+                 * ll_inode_size_lock() here we can match the DLM lock and
+                 * reset i_size from the kms before the truncating path has
+                 * updated the kms.  generic_file_write can then trust the
+                 * stale i_size when doing appending writes and effectively
+                 * cancel the result of the truncate.  Getting the
+                 * ll_inode_size_lock() after the enqueue maintains the DLM
+                 * -> ll_inode_size_lock() acquiring order. */
+                cl_isize_lock(inode, 0);
+                cl_object_attr_lock(obj);
+                rc = cl_object_attr_get(env, obj, attr);
+                if (rc == 0) {
+                        /* Only a whole-file lock makes the KMS usable as
+                         * the file size. */
+                        if (lock->cll_descr.cld_start == 0 &&
+                            lock->cll_descr.cld_end == CL_PAGE_EOF) {
+                                cl_isize_write(inode, attr->cat_kms);
+                                CDEBUG(D_INODE, DFID" updating i_size %llu\n",
+                                       PFID(lu_object_fid(&obj->co_lu)),
+                                       (__u64)cl_isize_read(inode));
+                        }
+                        cl_inode_mtime(inode) = attr->cat_mtime;
+                        cl_inode_atime(inode) = attr->cat_atime;
+                        cl_inode_ctime(inode) = attr->cat_ctime;
+                } else
+                        CL_LOCK_DEBUG(D_ERROR, env, lock, "attr_get: %i\n", rc);
+                /* Release in the reverse order of acquisition. */
+                cl_object_attr_unlock(obj);
+                cl_isize_unlock(inode, 0);
+        }
+        EXIT;
+}
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+/**
+ * Finalization hook of the ccc io slice. Only checks the invariant of the
+ * object the io operates on.
+ */
+void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+        CLOBINVRNT(env, ios->cis_io->ci_obj,
+                   ccc_object_invariant(ios->cis_io->ci_obj));
+}
+
+/**
+ * Requests a single lock for \a io, expressed in page indices.
+ *
+ * The per-environment lock link (cui_link) is zeroed, filled with the mode,
+ * object and [\a start, \a end] index range, tagged with \a enqflags, and
+ * handed to cl_io_lock_add(). The return value of cl_io_lock_add() is not
+ * examined.
+ *
+ * \retval 0 always
+ */
+int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
+                          __u32 enqflags, enum cl_lock_mode mode,
+                          pgoff_t start, pgoff_t end)
+{
+        struct ccc_io        *cio  = ccc_env_io(env);
+        struct cl_object     *clob = io->ci_obj;
+        struct cl_lock_descr *need;
+
+        CLOBINVRNT(env, clob, ccc_object_invariant(clob));
+        ENTRY;
+
+        CDEBUG(D_VFSTRACE, "lock: %i [%lu, %lu]\n", mode, start, end);
+
+        /* Start from a clean link, then describe the one lock wanted. */
+        memset(&cio->cui_link, 0, sizeof(cio->cui_link));
+        cio->cui_link.cill_enq_flags = enqflags;
+
+        need = &cio->cui_link.cill_descr;
+        need->cld_obj   = clob;
+        need->cld_mode  = mode;
+        need->cld_start = start;
+        need->cld_end   = end;
+
+        cl_io_lock_add(env, io, &cio->cui_link);
+        RETURN(0);
+}
+
+/**
+ * Byte-offset flavour of ccc_io_one_lock_index(): converts \a start and
+ * \a end to page indices with cl_index() and forwards the request.
+ *
+ * \retval the result of ccc_io_one_lock_index()
+ */
+int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io,
+                    __u32 enqflags, enum cl_lock_mode mode,
+                    loff_t start, loff_t end)
+{
+        pgoff_t first = cl_index(io->ci_obj, start);
+        pgoff_t last  = cl_index(io->ci_obj, end);
+
+        return ccc_io_one_lock_index(env, io, enqflags, mode, first, last);
+}
+
+/**
+ * End-of-io hook of the ccc io slice. Only checks the invariant of the
+ * object the io operates on.
+ */
+void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+        struct cl_object *clob = ios->cis_io->ci_obj;
+
+        CLOBINVRNT(env, clob, ccc_object_invariant(clob));
+}
+
+/**
+ * Takes the locks used around size/attribute updates of \a obj: the inode
+ * size lock first (only when \a vfslock is non-zero), then the object
+ * attribute lock. Paired with ccc_object_size_unlock().
+ */
+static void ccc_object_size_lock(struct cl_object *obj, int vfslock)
+{
+        struct inode *inode = ccc_object_inode(obj);
+
+        if (vfslock)
+                cl_isize_lock(inode, 0);
+        cl_object_attr_lock(obj);
+}
+
+/**
+ * Releases the locks taken by ccc_object_size_lock(), in reverse order: the
+ * object attribute lock first, then the inode size lock (only when
+ * \a vfslock is non-zero). \a vfslock must match the value used when locking.
+ */
+static void ccc_object_size_unlock(struct cl_object *obj, int vfslock)
+{
+        struct inode *inode = ccc_object_inode(obj);
+
+        cl_object_attr_unlock(obj);
+        if (vfslock)
+                cl_isize_unlock(inode, 0);
+}
+
+/**
+ * Helper function that, if necessary, adjusts file size (inode->i_size) when
+ * position at the offset \a pos is accessed. File size can be arbitrarily
+ * stale on a Lustre client, but the client at least knows KMS. If the
+ * accessed position is inside [0, KMS], set file size to KMS (when it is
+ * currently smaller), otherwise glimpse file size.
+ *
+ * Locking: cl_isize_lock is used to serialize changes to inode size and to
+ * protect consistency between inode size and cl_object
+ * attributes. cl_object_size_lock() protects consistency between cl_attr's of
+ * top-object and sub-objects.
+ * NOTE(review): no cl_object_size_lock() is visible in this file; this
+ * presumably refers to cl_object_attr_lock(), taken through
+ * ccc_object_size_lock() - confirm.
+ *
+ * In page fault path cl_isize_lock cannot be taken, client has to live with
+ * the resulting races.
+ *
+ * \param pos     offset being accessed
+ * \param vfslock non-zero if the inode size lock may be taken
+ *
+ * \retval the result of cl_glimpse_lock() when \a pos is beyond KMS
+ * \retval otherwise the result of cl_object_attr_get()
+ */
+int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
+                  struct cl_io *io, loff_t pos, int vfslock)
+{
+        struct cl_attr *attr  = &ccc_env_info(env)->cti_attr;
+        struct inode   *inode = ccc_object_inode(obj);
+        loff_t kms;
+        int result;
+
+        /*
+         * Consistency guarantees: following possibilities exist for the
+         * relation between region being accessed and real file size at this
+         * moment:
+         *
+         *  (A): the region is completely inside of the file;
+         *
+         *  (B-x): x bytes of region are inside of the file, the rest is
+         *  outside;
+         *
+         *  (C): the region is completely outside of the file.
+         *
+         * This classification is stable under DLM lock already acquired by
+         * the caller, because to change the class, other client has to take
+         * DLM lock conflicting with our lock. Also, any updates to ->i_size
+         * by other threads on this client are serialized by
+         * ll_inode_size_lock(). This guarantees that short reads are handled
+         * correctly in the face of concurrent writes and truncates.
+         */
+        ccc_object_size_lock(obj, vfslock);
+        result = cl_object_attr_get(env, obj, attr);
+        if (result == 0) {
+                kms = attr->cat_kms;
+                if (pos > kms) {
+                        /*
+                         * A glimpse is necessary to determine whether we
+                         * return a short read (B) or some zeroes at the end
+                         * of the buffer (C)
+                         */
+                        /* Locks are dropped before the glimpse is issued. */
+                        ccc_object_size_unlock(obj, vfslock);
+                        return cl_glimpse_lock(env, io, inode, obj);
+                } else {
+                        /*
+                         * region is within kms and, hence, within real file
+                         * size (A). We need to increase i_size to cover the
+                         * read region so that generic_file_read() will do its
+                         * job, but that doesn't mean the kms size is
+                         * _correct_, it is only the _minimum_ size. If
+                         * someone does a stat they will get the correct size
+                         * which will always be >= the kms value here.
+                         * b=11081
+                         */
+                        /*
+                         * XXX in a page fault path, change inode size without
+                         * ll_inode_size_lock() held!  there is a race
+                         * condition with truncate path. (see ll_extent_lock)
+                         */
+                        /*
+                         * XXX i_size_write() is not used because it is not
+                         * safe to take the ll_inode_size_lock() due to a
+                         * potential lock inversion (bug 6077).  And since
+                         * it's not safe to use i_size_write() without a
+                         * covering mutex we do the assignment directly.  It
+                         * is not critical that the size be correct.
+                         */
+                        /* i_size is only ever grown here, never shrunk. */
+                        if (cl_isize_read(inode) < kms) {
+                                if (vfslock)
+                                        cl_isize_write(inode, kms);
+                                else
+                                        cl_isize_write_nolock(inode, kms);
+                        }
+                }
+        }
+        ccc_object_size_unlock(obj, vfslock);
+        return result;
+}
+
+/*****************************************************************************
+ *
+ * Transfer operations.
+ *
+ */
+
+/**
+ * Completion hook of the ccc request slice: returns the ccc_req containing
+ * \a slice to the ccc_req_kmem slab. \a ioret is not examined.
+ */
+void ccc_req_completion(const struct lu_env *env,
+                        const struct cl_req_slice *slice, int ioret)
+{
+        struct ccc_req *owner = cl2ccc_req(slice);
+
+        OBD_SLAB_FREE_PTR(owner, ccc_req_kmem);
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_attr_set() for ccc
+ * layer. ccc is responsible for
+ *
+ *    - o_[mac]time
+ *
+ *    - o_mode
+ *
+ *    - o_fid (filled with inode number?!)
+ *
+ *    - o_[ug]id
+ *
+ *    - o_generation
+ *
+ *    - and IO epoch (stored in o_easize),
+ *
+ *  and capability.
+ *
+ * The attributes actually copied from the inode are those present both in
+ * \a flags and in the set of flags this function allows (valid_flags).
+ */
+void ccc_req_attr_set(const struct lu_env *env,
+                      const struct cl_req_slice *slice,
+                      const struct cl_object *obj,
+                      struct cl_req_attr *attr, obd_valid flags)
+{
+        struct inode *inode;
+        struct obdo  *oa;
+        obd_flag      valid_flags;
+
+        oa = attr->cra_oa;
+        inode = ccc_object_inode(obj);
+        valid_flags = OBD_MD_FLTYPE|OBD_MD_FLATIME;
+
+        /* An all-ones \a flags value selects the capability lookup instead
+         * of widening the allowed flag set. OBD_MD_FLATIME below is already
+         * part of valid_flags. */
+        if (flags != (obd_valid)~0ULL)
+                valid_flags |= OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME;
+        else {
+                LASSERT(attr->cra_capa == NULL);
+                attr->cra_capa = cl_capa_lookup(inode,
+                                                slice->crs_req->crq_type);
+        }
+
+        /* For writes that ask for the epoch, store the inode's IO epoch in
+         * o_easize and allow ownership, fid and generation to be copied. */
+        if (slice->crs_req->crq_type == CRT_WRITE) {
+                if (flags & OBD_MD_FLEPOCH) {
+                        oa->o_valid |= OBD_MD_FLEPOCH;
+                        oa->o_easize = cl_i2info(inode)->lli_ioepoch;
+                        valid_flags |= OBD_MD_FLMTIME|OBD_MD_FLCTIME|
+                                OBD_MD_FLUID|OBD_MD_FLGID|
+                                OBD_MD_FLFID|OBD_MD_FLGENER;
+                }
+        }
+        obdo_from_inode(oa, inode, valid_flags & flags);
+}
+
+/**
+ * Request operations of the ccc layer: attribute filling and completion.
+ * Members not listed here are left zero-initialized.
+ */
+const struct cl_req_operations ccc_req_ops = {
+        .cro_attr_set   = ccc_req_attr_set,
+        .cro_completion = ccc_req_completion
+};
+
+/* Setattr helpers */
+/**
+ * Truncates the file behind \a inode to \a size by running a CIT_TRUNC io
+ * on the inode's cl_object, using the io embedded in the per-environment
+ * ccc info.
+ *
+ * \retval error from cl_env_get() if no environment could be obtained
+ * \retval the result of cl_io_loop() if the io was initialized
+ * \retval io->ci_result if cl_io_init() reported non-zero
+ */
+int cl_setattr_do_truncate(struct inode *inode, loff_t size,
+                           struct obd_capa *capa)
+{
+        struct lu_env *env;
+        struct cl_io  *io;
+        int            refcheck;
+        int            rc;
+
+        ENTRY;
+
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
+
+        io = &ccc_env_info(env)->cti_io;
+        io->ci_obj                = cl_i2info(inode)->lli_clob;
+        io->u.ci_truncate.tr_size = size;
+        io->u.ci_truncate.tr_capa = capa;
+
+        rc = cl_io_init(env, io, CIT_TRUNC, io->ci_obj);
+        rc = (rc == 0) ? cl_io_loop(env, io) : io->ci_result;
+
+        /* The io is finalized whether or not initialization succeeded. */
+        cl_io_fini(env, io);
+        cl_env_put(env, &refcheck);
+        RETURN(rc);
+}
+
+/**
+ * Sends the inode's type, timestamps, fid and generation to the data
+ * export via obd_setattr_rqset(), addressed by the object id and group of
+ * the inode's stripe metadata. \a capa is not used by this function.
+ *
+ * NOTE(review): the error message names obd_setattr_async while the call
+ * made is obd_setattr_rqset(); the string is left unchanged here - confirm
+ * whether it should be updated.
+ *
+ * \retval -ENOMEM if the obdo could not be allocated
+ * \retval otherwise the result of obd_setattr_rqset()
+ */
+int cl_setattr_ost(struct inode *inode, struct obd_capa *capa)
+{
+        struct cl_inode_info *lli   = cl_i2info(inode);
+        struct lov_stripe_md *lsm   = lli->lli_smd;
+        struct obd_info       oinfo = { { { 0 } } };
+        struct obdo          *oa;
+        obd_flag              flags;
+        int                   rc;
+
+        OBDO_ALLOC(oa);
+        if (oa == NULL)
+                return -ENOMEM;
+
+        oa->o_id    = lsm->lsm_object_id;
+        oa->o_gr    = lsm->lsm_object_gr;
+        oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
+
+        flags = OBD_MD_FLTYPE | OBD_MD_FLATIME |
+                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+                OBD_MD_FLFID | OBD_MD_FLGENER |
+                OBD_MD_FLGROUP;
+        obdo_from_inode(oa, inode, flags);
+
+        oinfo.oi_oa = oa;
+        oinfo.oi_md = lsm;
+
+        /* XXX: this looks unnecessary now. */
+        rc = obd_setattr_rqset(cl_i2sbi(inode)->ll_dt_exp, &oinfo, NULL);
+        if (rc)
+                CERROR("obd_setattr_async fails: rc=%d\n", rc);
+
+        OBDO_FREE(oa);
+        return rc;
+}
+
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+/** Returns the lu_device embedded in \a vdv (cdv_cl.cd_lu_dev). */
+struct lu_device *ccc2lu_dev(struct ccc_device *vdv)
+{
+        return &vdv->cdv_cl.cd_lu_dev;
+}
+
+/**
+ * Inverse of ccc2lu_dev(): maps an lu_device embedded as cdv_cl.cd_lu_dev
+ * back to its containing ccc_device.
+ */
+struct ccc_device *lu2ccc_dev(const struct lu_device *d)
+{
+        return container_of0(d, struct ccc_device, cdv_cl.cd_lu_dev);
+}
+
+/** Maps a cl_device embedded as cdv_cl back to its containing ccc_device. */
+struct ccc_device *cl2ccc_dev(const struct cl_device *d)
+{
+        return container_of0(d, struct ccc_device, cdv_cl);
+}
+
+/** Returns the lu_object embedded in \a vob (cob_cl.co_lu). */
+struct lu_object *ccc2lu(struct ccc_object *vob)
+{
+        return &vob->cob_cl.co_lu;
+}
+
+/**
+ * Inverse of ccc2lu(): maps an lu_object embedded as cob_cl.co_lu back to
+ * its containing ccc_object.
+ */
+struct ccc_object *lu2ccc(const struct lu_object *obj)
+{
+        return container_of0(obj, struct ccc_object, cob_cl.co_lu);
+}
+
+/** Maps a cl_object embedded as cob_cl back to its containing ccc_object. */
+struct ccc_object *cl2ccc(const struct cl_object *obj)
+{
+        return container_of0(obj, struct ccc_object, cob_cl);
+}
+
+/**
+ * Maps a lock slice embedded as clk_cl back to its containing ccc_lock.
+ *
+ * NOTE(review): uses container_of() where most sibling converters use
+ * container_of0() - confirm the difference is intentional.
+ */
+struct ccc_lock *cl2ccc_lock(const struct cl_lock_slice *slice)
+{
+        return container_of(slice, struct ccc_lock, clk_cl);
+}
+
+/**
+ * Maps an io slice embedded as cui_cl back to its containing ccc_io, and
+ * asserts that the result is the ccc_io belonging to \a env.
+ */
+struct ccc_io *cl2ccc_io(const struct lu_env *env,
+                         const struct cl_io_slice *slice)
+{
+        struct ccc_io *owner = container_of(slice, struct ccc_io, cui_cl);
+
+        LASSERT(owner == ccc_env_io(env));
+        return owner;
+}
+
+/** Maps a request slice embedded as crq_cl back to its containing ccc_req. */
+struct ccc_req *cl2ccc_req(const struct cl_req_slice *slice)
+{
+        return container_of0(slice, struct ccc_req, crq_cl);
+}
+
+/** Returns the cpg_page of the ccc page that contains \a slice. */
+cfs_page_t *cl2vm_page(const struct cl_page_slice *slice)
+{
+        return cl2ccc_page(slice)->cpg_page;
+}
+
+/*****************************************************************************
+ *
+ * Accessors.
+ *
+ */
+/**
+ * Invariant of a ccc object: its inode is a regular file (or has a zeroed
+ * mode), and the inode's lli_clob points back at \a obj.
+ *
+ * \retval non-zero the invariant holds
+ * \retval 0        the invariant is violated
+ */
+int ccc_object_invariant(const struct cl_object *obj)
+{
+        struct inode         *inode = ccc_object_inode(obj);
+        struct cl_inode_info *lli   = cl_i2info(inode);
+
+        return (S_ISREG(cl_inode_mode(inode)) ||
+                /* i_mode of unlinked inode is zeroed. */
+                cl_inode_mode(inode) == 0) && lli->lli_clob == obj;
+}
+
+/** Returns the inode recorded in the ccc_object containing \a obj. */
+struct inode *ccc_object_inode(const struct cl_object *obj)
+{
+        return cl2ccc(obj)->cob_inode;
+}
+
+/**
+ * Returns a pointer to the cl_page associated with \a vmpage, without
+ * acquiring an additional reference to the resulting page. This is an unsafe
+ * version of cl_vmpage_page() that can only be used under vmpage lock.
+ *
+ * The cl_page pointer is read from vmpage->private.
+ */
+struct cl_page *ccc_vmpage_page_transient(cfs_page_t *vmpage)
+{
+        KLASSERT(PageLocked(vmpage));
+        return (struct cl_page *)vmpage->private;
+}
+
+/**
+ * Initializes or updates CLIO part when new meta-data arrives from the
+ * server.
+ *
+ *     - allocates cl_object if necessary,
+ *     - updates layout, if object was already here.
+ *
+ * Inodes that are not regular files are left untouched.
+ *
+ * \retval 0 on success, or if \a inode is not a regular file
+ * \retval error from cl_env_get(), cl_object_find() or cl_conf_set()
+ */
+int cl_inode_init(struct inode *inode, struct lustre_md *md)
+{
+        struct lu_env        *env;
+        struct cl_inode_info *lli;
+        struct cl_object     *clob;
+        struct lu_site       *site;
+        struct lu_fid        *fid;
+        const struct cl_object_conf conf = {
+                .coc_inode = inode,
+                .u = {
+                        .coc_md    = md
+                }
+        };
+        int result = 0;
+        int refcheck;
+
+        /* LASSERT(inode->i_state & I_NEW); */
+        LASSERT(md->body->valid & OBD_MD_FLID);
+
+        if (!S_ISREG(cl_inode_mode(inode)))
+                return 0;
+
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                return PTR_ERR(env);
+
+        site = cl_i2sbi(inode)->ll_site;
+        lli  = cl_i2info(inode);
+        fid  = &lli->lli_fid;
+        LASSERT(fid_is_sane(fid));
+
+        if (lli->lli_clob == NULL) {
+                /* First time: look up/create the object on the top device. */
+                clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev),
+                                      fid, &conf);
+                if (!IS_ERR(clob)) {
+                        /*
+                         * No locking is necessary, as new inode is
+                         * locked by I_NEW bit.
+                         *
+                         * XXX not true for call from ll_update_inode().
+                         */
+                        lli->lli_clob = clob;
+                        lu_object_ref_add(&clob->co_lu, "inode", inode);
+                } else
+                        result = PTR_ERR(clob);
+        } else
+                /* Object already attached: push the new configuration. */
+                result = cl_conf_set(env, lli->lli_clob, &conf);
+        cl_env_put(env, &refcheck);
+
+        if (result != 0)
+                CERROR("Failure to initialize cl object "DFID": %d\n",
+                       PFID(fid), result);
+        return result;
+}
+
+/**
+ * Detaches and releases the cl_object attached to \a inode, if any: the
+ * object is killed, the "inode" reference tag and the reference itself are
+ * dropped, and lli_clob is cleared.
+ *
+ * If a fresh environment cannot be obtained, the shared ccc_inode_fini_env
+ * is used instead, serialized by the ccc_inode_fini_guard mutex.
+ */
+void cl_inode_fini(struct inode *inode)
+{
+        struct lu_env           *env;
+        struct cl_inode_info    *lli  = cl_i2info(inode);
+        struct cl_object        *clob = lli->lli_clob;
+        int refcheck;
+        int emergency;
+
+        if (clob != NULL) {
+                struct lu_object_header *head = clob->co_lu.lo_header;
+                void                    *cookie;
+
+                cookie = cl_env_reenter();
+                env = cl_env_get(&refcheck);
+                emergency = IS_ERR(env);
+                if (emergency) {
+                        /* Fall back to the shared emergency environment. */
+                        mutex_lock(&ccc_inode_fini_guard);
+                        LASSERT(ccc_inode_fini_env != NULL);
+                        cl_env_implant(ccc_inode_fini_env, &refcheck);
+                        env = ccc_inode_fini_env;
+                }
+                /*
+                 * cl_object cache is a slave to inode cache (which, in turn
+                 * is a slave to dentry cache), don't keep cl_object in memory
+                 * when its master is evicted.
+                 */
+                cl_object_kill(env, clob);
+                lu_object_ref_del(&clob->co_lu, "inode", inode);
+                /* XXX temporary: this is racy */
+                LASSERT(atomic_read(&head->loh_ref) == 1);
+                cl_object_put(env, clob);
+                lli->lli_clob = NULL;
+                /* Undo whichever way the environment was obtained above. */
+                if (emergency) {
+                        cl_env_unplant(ccc_inode_fini_env, &refcheck);
+                        mutex_unlock(&ccc_inode_fini_guard);
+                } else
+                        cl_env_put(env, &refcheck);
+                cl_env_reexit(cookie);
+        }
+}
diff --git a/lustre/ldlm/ldlm_extent.c b/lustre/ldlm/ldlm_extent.c

index ce51cd3..0858207 100644 (file)
--- a/lustre/ldlm/ldlm_extent.c
+++ b/lustre/ldlm/ldlm_extent.c
@@ -65,7 +65,7 @@ static void ldlm_extent_internal_policy_fixup(struct ldlm_lock *req,
          __u64 req_start = req->l_req_extent.start;
          __u64 req_end = req->l_req_extent.end;
          __u64 req_align, mask;
- 
+
          if (conflicting > 32 && (req_mode == LCK_PW || req_mode == LCK_CW)) {
                  if (req_end < req_start + LDLM_MAX_GROWN_EXTENT)
                          new_ex->end = min(req_start + LDLM_MAX_GROWN_EXTENT,
@@ -732,7 +732,7 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, int *flags, int first_enq,
                                   * break earlier because otherwise, we will go
                                   * to restart and ldlm_resource_unlink will be
                                   * called and it causes the interval node to be
-                                 * freed. Then we will fail at 
+                                 * freed. Then we will fail at
                                   * ldlm_extent_add_lock() */
                                  *flags &= ~(LDLM_FL_BLOCK_GRANTED | LDLM_FL_BLOCK_CONV |
                                              LDLM_FL_BLOCK_WAIT);
diff --git a/lustre/ldlm/ldlm_flock.c b/lustre/ldlm/ldlm_flock.c

index d38cfdf..1089022 100644 (file)
--- a/lustre/ldlm/ldlm_flock.c
+++ b/lustre/ldlm/ldlm_flock.c
@@ -112,7 +112,7 @@ ldlm_flock_destroy(struct ldlm_lock *lock, ldlm_mode_t mode, int flags)
                  /* client side - set a flag to prevent sending a CANCEL */
                  lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING;
  
-                /* when reaching here, it is under lock_res_and_lock(). Thus, 
+                /* when reaching here, it is under lock_res_and_lock(). Thus,
                     need call the nolock version of ldlm_lock_decref_internal*/
                  ldlm_lock_decref_internal_nolock(lock, mode);
          }
@@ -523,7 +523,7 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, int flags, void *data)
           * holding the lock even if app still believes it has it, since
           * server already dropped it anyway. Only for granted locks too. */
          lock_res_and_lock(lock);
-        if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) == 
+        if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) ==
              (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) {
                  unlock_res_and_lock(lock);
                  if (lock->l_req_mode == lock->l_granted_mode &&
diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h

index a6eb21c..5332b2f 100644 (file)
--- a/lustre/ldlm/ldlm_internal.h
+++ b/lustre/ldlm/ldlm_internal.h
@@ -45,19 +45,19 @@ extern struct list_head ldlm_cli_namespace_list;
  
  static inline atomic_t *ldlm_namespace_nr(ldlm_side_t client)
  {
-        return client == LDLM_NAMESPACE_SERVER ? 
+        return client == LDLM_NAMESPACE_SERVER ?
                  &ldlm_srv_namespace_nr : &ldlm_cli_namespace_nr;
  }
  
  static inline struct list_head *ldlm_namespace_list(ldlm_side_t client)
  {
-        return client == LDLM_NAMESPACE_SERVER ? 
+        return client == LDLM_NAMESPACE_SERVER ?
                  &ldlm_srv_namespace_list : &ldlm_cli_namespace_list;
  }
  
  static inline struct semaphore *ldlm_namespace_lock(ldlm_side_t client)
  {
-        return client == LDLM_NAMESPACE_SERVER ? 
+        return client == LDLM_NAMESPACE_SERVER ?
                  &ldlm_srv_namespace_lock : &ldlm_cli_namespace_lock;
  }
  
@@ -75,11 +75,11 @@ enum {
          LDLM_CANCEL_LRUR   = 1 << 3  /* Cancel locks from lru resize. */
  };
  
-int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync, 
+int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync,
                      int flags);
  int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels,
                            int count, int max, int cancel_flags, int flags);
-int ldlm_cancel_lru_estimate(struct ldlm_namespace *ns, int count, int max, 
+int ldlm_cancel_lru_estimate(struct ldlm_namespace *ns, int count, int max,
                               int flags);
  extern int ldlm_enqueue_min;
  int ldlm_get_enq_timeout(struct ldlm_lock *lock);
@@ -107,7 +107,7 @@ typedef enum {
          LDLM_WORK_BL_AST,
          LDLM_WORK_CP_AST,
          LDLM_WORK_REVOKE_AST
-} ldlm_desc_ast_t; 
+} ldlm_desc_ast_t;
  
  void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list);
  struct ldlm_lock *
diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c

index f6b0a4d..81d357b 100644 (file)
--- a/lustre/ldlm/ldlm_lib.c
+++ b/lustre/ldlm/ldlm_lib.c
@@ -1583,7 +1583,6 @@ static int target_recovery_thread(void *arg)
          unsigned long flags;
          struct lu_env env;
          struct ptlrpc_thread fake_svc_thread, *thread = &fake_svc_thread;
-        __u32 recov_ctx_tags = LCT_MD_THREAD;
          int rc = 0;
          ENTRY;
  
@@ -1594,7 +1593,7 @@ static int target_recovery_thread(void *arg)
          RECALC_SIGPENDING;
          SIGNAL_MASK_UNLOCK(current, flags);
  
-        rc = lu_context_init(&env.le_ctx, recov_ctx_tags);
+        rc = lu_context_init(&env.le_ctx, LCT_MD_THREAD);
          if (rc)
                  RETURN(rc);
  
diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c

index 1ea9a7b..27fa1af 100644 (file)
--- a/lustre/ldlm/ldlm_lock.c
+++ b/lustre/ldlm/ldlm_lock.c
@@ -414,7 +414,7 @@ int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock,
                  lock_res(oldres);
                  lock_res_nested(newres, LRT_NEW);
          } else {
-        lock_res(newres);
+                lock_res(newres);
                  lock_res_nested(oldres, LRT_NEW);
          }
          LASSERT(memcmp(new_resid, &oldres->lr_name,
@@ -619,7 +619,8 @@ int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode)
          lock = ldlm_handle2lock(lockh);
          if (lock != NULL) {
                  lock_res_and_lock(lock);
-                if (!(lock->l_flags & LDLM_FL_CBPENDING)) {
+                if (lock->l_readers != 0 || lock->l_writers != 0 ||
+                    !(lock->l_flags & LDLM_FL_CBPENDING)) {
                          ldlm_lock_addref_internal_nolock(lock, mode);
                          result = 0;
                  }
@@ -916,7 +917,8 @@ void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list)
  static struct ldlm_lock *search_queue(struct list_head *queue,
                                        ldlm_mode_t *mode,
                                        ldlm_policy_data_t *policy,
-                                      struct ldlm_lock *old_lock, int flags)
+                                      struct ldlm_lock *old_lock,
+                                      int flags, int unref)
  {
          struct ldlm_lock *lock;
          struct list_head *tmp;
@@ -938,7 +940,7 @@ static struct ldlm_lock *search_queue(struct list_head *queue,
                  if (lock->l_flags & LDLM_FL_CBPENDING &&
                      !(flags & LDLM_FL_CBPENDING))
                          continue;
-                if (lock->l_flags & LDLM_FL_CBPENDING &&
+                if (!unref && lock->l_flags & LDLM_FL_CBPENDING &&
                      lock->l_readers == 0 && lock->l_writers == 0)
                          continue;
  
@@ -965,7 +967,8 @@ static struct ldlm_lock *search_queue(struct list_head *queue,
                        policy->l_inodebits.bits))
                          continue;
  
-                if (lock->l_destroyed || (lock->l_flags & LDLM_FL_FAILED))
+                if (!unref &&
+                    (lock->l_destroyed || (lock->l_flags & LDLM_FL_FAILED)))
                          continue;
  
                  if ((flags & LDLM_FL_LOCAL_ONLY) &&
@@ -993,88 +996,6 @@ void ldlm_lock_allow_match(struct ldlm_lock *lock)
          unlock_res_and_lock(lock);
  }
  
-/**
- * Checks if requested extent lock is compatible with another owned lock.
- *
- * Checks if \a lock is compatible with a read or write lock
- * (specified by \a rw) for an extent [\a start , \a end].
- *
- * \param lock the already owned lock
- * \param rw OBD_BRW_READ if requested for reading,
- *           OBD_BRW_WRITE if requested for writing
- * \param start start of the requested extent
- * \param end end of the requested extent
- * \param cookie transparent parameter for passing locking context
- *
- * \post result == 1, *cookie == context, appropriate lock is referenced
- *
- * \retval 1 owned lock is reused for the request
- * \retval 0 no lock reused for the request
- *
- * \see ldlm_lock_fast_release
- */
-int ldlm_lock_fast_match(struct ldlm_lock *lock, int rw,
-                         obd_off start, obd_off end,
-                         void **cookie)
-{
-        LASSERT(rw == OBD_BRW_READ || rw == OBD_BRW_WRITE);
-
-        if (!lock)
-                return 0;
-
-        lock_res_and_lock(lock);
-        /* check if granted mode is compatible */
-        if (rw == OBD_BRW_WRITE &&
-            !(lock->l_granted_mode & (LCK_PW|LCK_GROUP)))
-                goto no_match;
-
-        /* does the lock cover the region we would like to access? */
-        if ((lock->l_policy_data.l_extent.start > start) ||
-            (lock->l_policy_data.l_extent.end < end))
-                goto no_match;
-
-        /* if we received a blocking callback and the lock is no longer
-         * referenced, don't use it */
-        if ((lock->l_flags & LDLM_FL_CBPENDING) &&
-            !lock->l_writers && !lock->l_readers)
-                goto no_match;
-
-        ldlm_lock_addref_internal_nolock(lock, rw == OBD_BRW_WRITE ?
-                                                        LCK_PW : LCK_PR);
-        unlock_res_and_lock(lock);
-        *cookie = (void *)lock;
-        return 1; /* avoid using rc for stack relief */
-
-no_match:
-        unlock_res_and_lock(lock);
-        return 0;
-}
-
-/**
- * Releases a reference to a lock taken in a "fast" way.
- *
- * Releases a read or write (specified by \a rw) lock
- * referenced by \a cookie.
- *
- * \param rw OBD_BRW_READ if requested for reading,
- *           OBD_BRW_WRITE if requested for writing
- * \param cookie transparent parameter for passing locking context
- *
- * \post appropriate lock is dereferenced
- *
- * \see ldlm_lock_fast_lock
- */
-void ldlm_lock_fast_release(void *cookie, int rw)
-{
-        struct ldlm_lock *lock = (struct ldlm_lock *)cookie;
-
-        LASSERT(lock != NULL);
-        LASSERT(rw == OBD_BRW_READ || rw == OBD_BRW_WRITE);
-        LASSERT(rw == OBD_BRW_READ ||
-                (lock->l_granted_mode & (LCK_PW | LCK_GROUP)));
-        ldlm_lock_decref_internal(lock, rw == OBD_BRW_WRITE ? LCK_PW : LCK_PR);
-}
-
  /* Can be called in two ways:
   *
   * If 'ns' is NULL, then lockh describes an existing lock that we want to look
@@ -1102,7 +1023,7 @@ void ldlm_lock_fast_release(void *cookie, int rw)
  ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, int flags,
                              const struct ldlm_res_id *res_id, ldlm_type_t type,
                              ldlm_policy_data_t *policy, ldlm_mode_t mode,
-                            struct lustre_handle *lockh)
+                            struct lustre_handle *lockh, int unref)
  {
          struct ldlm_resource *res;
          struct ldlm_lock *lock, *old_lock = NULL;
@@ -1128,15 +1049,18 @@ ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, int flags,
          LDLM_RESOURCE_ADDREF(res);
          lock_res(res);
  
-        lock = search_queue(&res->lr_granted, &mode, policy, old_lock, flags);
+        lock = search_queue(&res->lr_granted, &mode, policy, old_lock,
+                            flags, unref);
          if (lock != NULL)
                  GOTO(out, rc = 1);
          if (flags & LDLM_FL_BLOCK_GRANTED)
                  GOTO(out, rc = 0);
-        lock = search_queue(&res->lr_converting, &mode, policy, old_lock,flags);
+        lock = search_queue(&res->lr_converting, &mode, policy, old_lock,
+                            flags, unref);
          if (lock != NULL)
                  GOTO(out, rc = 1);
-        lock = search_queue(&res->lr_waiting, &mode, policy, old_lock, flags);
+        lock = search_queue(&res->lr_waiting, &mode, policy, old_lock,
+                            flags, unref);
          if (lock != NULL)
                  GOTO(out, rc = 1);
  
diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c

index 7891e7a..c4fb02d 100644 (file)
--- a/lustre/ldlm/ldlm_lockd.c
+++ b/lustre/ldlm/ldlm_lockd.c
@@ -2394,8 +2394,6 @@ EXPORT_SYMBOL(ldlm_lock2handle);
  EXPORT_SYMBOL(__ldlm_handle2lock);
  EXPORT_SYMBOL(ldlm_lock_get);
  EXPORT_SYMBOL(ldlm_lock_put);
-EXPORT_SYMBOL(ldlm_lock_fast_match);
-EXPORT_SYMBOL(ldlm_lock_fast_release);
  EXPORT_SYMBOL(ldlm_lock_match);
  EXPORT_SYMBOL(ldlm_lock_cancel);
  EXPORT_SYMBOL(ldlm_lock_addref);
@@ -2403,7 +2401,6 @@ EXPORT_SYMBOL(ldlm_lock_addref_try);
  EXPORT_SYMBOL(ldlm_lock_decref);
  EXPORT_SYMBOL(ldlm_lock_decref_and_cancel);
  EXPORT_SYMBOL(ldlm_lock_change_resource);
-EXPORT_SYMBOL(ldlm_lock_set_data);
  EXPORT_SYMBOL(ldlm_it2str);
  EXPORT_SYMBOL(ldlm_lock_dump);
  EXPORT_SYMBOL(ldlm_lock_dump_handle);
diff --git a/lustre/ldlm/ldlm_pool.c b/lustre/ldlm/ldlm_pool.c

index c870218..4d0b57e 100644 (file)
--- a/lustre/ldlm/ldlm_pool.c
+++ b/lustre/ldlm/ldlm_pool.c
@@ -38,7 +38,7 @@
   * Author: Yury Umanets <umka@clusterfs.com>
   */
  
-/* 
+/*
   * Idea of this code is rather simple. Each second, for each server namespace
   * we have SLV - server lock volume which is calculated on current number of
   * granted locks, grant speed for past period, etc - that is, locking load.
@@ -103,6 +103,8 @@
  # include <liblustre.h>
  #endif
  
+#include <cl_object.h>
+
  #include <obd_class.h>
  #include <obd_support.h>
  #include "ldlm_internal.h"
@@ -110,17 +112,17 @@
  #ifdef HAVE_LRU_RESIZE_SUPPORT
  
  /*
- * 50 ldlm locks for 1MB of RAM. 
+ * 50 ldlm locks for 1MB of RAM.
   */
  #define LDLM_POOL_HOST_L ((num_physpages >> (20 - CFS_PAGE_SHIFT)) * 50)
  
  /*
- * Maximal possible grant step plan in %. 
+ * Maximal possible grant step plan in %.
   */
  #define LDLM_POOL_MAX_GSP (30)
  
  /*
- * Minimal possible grant step plan in %. 
+ * Minimal possible grant step plan in %.
   */
  #define LDLM_POOL_MIN_GSP (1)
  
@@ -130,13 +132,13 @@
   */
  #define LDLM_POOL_GSP_STEP (4)
  
-/* 
- * LDLM_POOL_GSP% of all locks is default GP. 
+/*
+ * LDLM_POOL_GSP% of all locks is default GP.
   */
  #define LDLM_POOL_GP(L)   (((L) * LDLM_POOL_MAX_GSP) / 100)
  
-/* 
- * Max age for locks on clients. 
+/*
+ * Max age for locks on clients.
   */
  #define LDLM_POOL_MAX_AGE (36000)
  
@@ -158,7 +160,7 @@ static inline __u64 ldlm_pool_slv_max(__u32 L)
  {
          /*
           * Allow to have all locks for 1 client for 10 hrs.
-         * Formula is the following: limit * 10h / 1 client. 
+         * Formula is the following: limit * 10h / 1 client.
           */
          __u64 lim = L *  LDLM_POOL_MAX_AGE / 1;
          return lim;
@@ -191,7 +193,7 @@ static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl)
  }
  
  /**
- * Calculates suggested grant_step in % of available locks for passed 
+ * Calculates suggested grant_step in % of available locks for passed
   * \a period. This is later used in grant_plan calculations.
   */
  static inline int ldlm_pool_t2gsp(int t)
@@ -199,7 +201,7 @@ static inline int ldlm_pool_t2gsp(int t)
          /*
           * This yeilds 1% grant step for anything below LDLM_POOL_GSP_STEP
           * and up to 30% for anything higher than LDLM_POOL_GSP_STEP.
-         * 
+         *
           * How this will affect execution is the following:
           *
           * - for thread peroid 1s we will have grant_step 1% which good from
@@ -211,25 +213,25 @@ static inline int ldlm_pool_t2gsp(int t)
           *
           * - for thread period 10s (which is default) we will have 23% which
           * means that clients will have enough of room to take some new locks
-         * without getting some back. All locks from this 23% which were not 
+         * without getting some back. All locks from this 23% which were not
           * taken by clients in current period will contribute in SLV growing.
           * SLV growing means more locks cached on clients until limit or grant
           * plan is reached.
           */
-        return LDLM_POOL_MAX_GSP - 
-                (LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) / 
+        return LDLM_POOL_MAX_GSP -
+                (LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) /
                  (1 << (t / LDLM_POOL_GSP_STEP));
  }
  
  /**
   * Recalculates next grant limit on passed \a pl.
   *
- * \pre ->pl_lock is locked. 
+ * \pre ->pl_lock is locked.
   */
  static inline void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl)
  {
          int granted, grant_step, limit;
-        
+
          limit = ldlm_pool_get_limit(pl);
          granted = atomic_read(&pl->pl_granted);
  
@@ -241,7 +243,7 @@ static inline void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl)
  /**
   * Recalculates next SLV on passed \a pl.
   *
- * \pre ->pl_lock is locked. 
+ * \pre ->pl_lock is locked.
   */
  static inline void ldlm_pool_recalc_slv(struct ldlm_pool *pl)
  {
@@ -258,13 +260,13 @@ static inline void ldlm_pool_recalc_slv(struct ldlm_pool *pl)
          if (grant_usage <= 0)
                  grant_usage = 1;
  
-        /* 
-         * Find out SLV change factor which is the ratio of grant usage 
-         * from limit. SLV changes as fast as the ratio of grant plan 
-         * consumtion. The more locks from grant plan are not consumed 
-         * by clients in last interval (idle time), the faster grows 
+        /*
+         * Find out SLV change factor which is the ratio of grant usage
+         * from limit. SLV changes as fast as the ratio of grant plan
+         * consumption. The more locks from grant plan are not consumed
+         * by clients in last interval (idle time), the faster grows
           * SLV. And the opposite, the more grant plan is over-consumed
-         * (load time) the faster drops SLV. 
+         * (load time) the faster drops SLV.
           */
          slv_factor = (grant_usage * 100) / limit;
          if (2 * abs(granted - limit) > limit) {
@@ -286,7 +288,7 @@ static inline void ldlm_pool_recalc_slv(struct ldlm_pool *pl)
  /**
   * Recalculates next stats on passed \a pl.
   *
- * \pre ->pl_lock is locked. 
+ * \pre ->pl_lock is locked.
   */
  static inline void ldlm_pool_recalc_stats(struct ldlm_pool *pl)
  {
@@ -296,7 +298,7 @@ static inline void ldlm_pool_recalc_stats(struct ldlm_pool *pl)
          int grant_rate = atomic_read(&pl->pl_grant_rate);
          int cancel_rate = atomic_read(&pl->pl_cancel_rate);
  
-        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT, 
+        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT,
                              slv);
          lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
                              granted);
@@ -315,12 +317,12 @@ static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl)
  {
          struct obd_device *obd;
  
-        /* 
+        /*
           * Set new SLV in obd field for using it later without accessing the
           * pool. This is required to avoid race between sending reply to client
           * with new SLV and cleanup server stack in which we can't guarantee
           * that namespace is still alive. We know only that obd is alive as
-         * long as valid export is alive. 
+         * long as valid export is alive.
           */
          obd = ldlm_pl2ns(pl)->ns_obd;
          LASSERT(obd != NULL);
@@ -332,7 +334,7 @@ static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl)
  /**
   * Recalculates all pool fields on passed \a pl.
   *
- * \pre ->pl_lock is not locked. 
+ * \pre ->pl_lock is not locked.
   */
  static int ldlm_srv_pool_recalc(struct ldlm_pool *pl)
  {
@@ -344,22 +346,22 @@ static int ldlm_srv_pool_recalc(struct ldlm_pool *pl)
          if (recalc_interval_sec >= pl->pl_recalc_period) {
                  /*
                   * Recalc SLV after last period. This should be done
-                 * _before_ recalculating new grant plan. 
+                 * _before_ recalculating new grant plan.
                   */
                  ldlm_pool_recalc_slv(pl);
-                
+
                  /*
-                 * Make sure that pool informed obd of last SLV changes. 
+                 * Make sure that pool informed obd of last SLV changes.
                   */
                  ldlm_srv_pool_push_slv(pl);
  
                  /*
-                 * Update grant_plan for new period. 
+                 * Update grant_plan for new period.
                   */
                  ldlm_pool_recalc_grant_plan(pl);
  
                  pl->pl_recalc_time = cfs_time_current_sec();
-                lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, 
+                lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT,
                                      recalc_interval_sec);
          }
  
@@ -371,9 +373,9 @@ static int ldlm_srv_pool_recalc(struct ldlm_pool *pl)
   * This function is used on server side as main entry point for memory
   * preasure handling. It decreases SLV on \a pl according to passed
   * \a nr and \a gfp_mask.
- * 
+ *
   * Our goal here is to decrease SLV such a way that clients hold \a nr
- * locks smaller in next 10h. 
+ * locks smaller in next 10h.
   */
  static int ldlm_srv_pool_shrink(struct ldlm_pool *pl,
                                  int nr, unsigned int gfp_mask)
@@ -381,22 +383,22 @@ static int ldlm_srv_pool_shrink(struct ldlm_pool *pl,
          __u32 limit;
          ENTRY;
  
-        /* 
-         * VM is asking how many entries may be potentially freed. 
+        /*
+         * VM is asking how many entries may be potentially freed.
           */
          if (nr == 0)
                  RETURN(atomic_read(&pl->pl_granted));
  
-        /* 
+        /*
           * Client already canceled locks but server is already in shrinker
-         * and can't cancel anything. Let's catch this race. 
+         * and can't cancel anything. Let's catch this race.
           */
          if (atomic_read(&pl->pl_granted) == 0)
                  RETURN(0);
  
          spin_lock(&pl->pl_lock);
  
-        /* 
+        /*
           * We want shrinker to possibly cause cancelation of @nr locks from
           * clients or grant approximately @nr locks smaller next intervals.
           *
@@ -406,7 +408,7 @@ static int ldlm_srv_pool_shrink(struct ldlm_pool *pl,
           * interval pool will either increase SLV if locks load is not high
           * or will keep on same level or even decrease again, thus, shrinker
           * decreased SLV will affect next recalc intervals and this way will
-         * make locking load lower. 
+         * make locking load lower.
           */
          if (nr < pl->pl_server_lock_volume) {
                  pl->pl_server_lock_volume = pl->pl_server_lock_volume - nr;
@@ -415,15 +417,15 @@ static int ldlm_srv_pool_shrink(struct ldlm_pool *pl,
                  pl->pl_server_lock_volume = ldlm_pool_slv_min(limit);
          }
  
-        /* 
-         * Make sure that pool informed obd of last SLV changes. 
+        /*
+         * Make sure that pool informed obd of last SLV changes.
           */
          ldlm_srv_pool_push_slv(pl);
          spin_unlock(&pl->pl_lock);
  
-        /* 
+        /*
           * We did not really free any memory here so far, it only will be
-         * freed later may be, so that we return 0 to not confuse VM. 
+         * freed later may be, so that we return 0 to not confuse VM.
           */
          RETURN(0);
  }
@@ -435,7 +437,7 @@ static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit)
  {
          struct obd_device *obd;
          ENTRY;
-        
+
          obd = ldlm_pl2ns(pl)->ns_obd;
          LASSERT(obd != NULL && obd != LP_POISON);
          LASSERT(obd->obd_type != LP_POISON);
@@ -454,9 +456,9 @@ static void ldlm_cli_pool_pop_slv(struct ldlm_pool *pl)
  {
          struct obd_device *obd;
  
-        /* 
-         * Get new SLV and Limit from obd which is updated with comming 
-         * RPCs. 
+        /*
+         * Get new SLV and Limit from obd which is updated with coming
+         * RPCs.
           */
          obd = ldlm_pl2ns(pl)->ns_obd;
          LASSERT(obd != NULL);
@@ -484,29 +486,29 @@ static int ldlm_cli_pool_recalc(struct ldlm_pool *pl)
                  RETURN(0);
          }
  
-        /* 
-         * Make sure that pool knows last SLV and Limit from obd. 
+        /*
+         * Make sure that pool knows last SLV and Limit from obd.
           */
          ldlm_cli_pool_pop_slv(pl);
  
          pl->pl_recalc_time = cfs_time_current_sec();
-        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, 
+        lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT,
                              recalc_interval_sec);
          spin_unlock(&pl->pl_lock);
  
-        /* 
-         * Do not cancel locks in case lru resize is disabled for this ns. 
+        /*
+         * Do not cancel locks in case lru resize is disabled for this ns.
           */
          if (!ns_connect_lru_resize(ldlm_pl2ns(pl)))
                  RETURN(0);
  
-        /* 
+        /*
           * In the time of canceling locks on client we do not need to maintain
           * sharp timing, we only want to cancel locks asap according to new SLV.
           * It may be called when SLV has changed much, this is why we do not
-         * take into account pl->pl_recalc_time here. 
+         * take into account pl->pl_recalc_time here.
           */
-        RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LDLM_ASYNC, 
+        RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LDLM_ASYNC,
                                 LDLM_CANCEL_LRUR));
  }
  
@@ -519,30 +521,30 @@ static int ldlm_cli_pool_shrink(struct ldlm_pool *pl,
                                  int nr, unsigned int gfp_mask)
  {
          ENTRY;
-        
-        /* 
-         * Do not cancel locks in case lru resize is disabled for this ns. 
+
+        /*
+         * Do not cancel locks in case lru resize is disabled for this ns.
           */
          if (!ns_connect_lru_resize(ldlm_pl2ns(pl)))
                  RETURN(0);
  
-        /* 
-         * Make sure that pool knows last SLV and Limit from obd. 
+        /*
+         * Make sure that pool knows last SLV and Limit from obd.
           */
          ldlm_cli_pool_pop_slv(pl);
  
-        /* 
-         * Find out how many locks may be released according to shrink 
-         * policy. 
+        /*
+         * Find out how many locks may be released according to shrink
+         * policy.
           */
          if (nr == 0)
-                RETURN(ldlm_cancel_lru_estimate(ldlm_pl2ns(pl), 0, 0, 
+                RETURN(ldlm_cancel_lru_estimate(ldlm_pl2ns(pl), 0, 0,
                                                  LDLM_CANCEL_SHRINK));
  
-        /* 
-         * Cancel @nr locks accoding to shrink policy. 
+        /*
+         * Cancel @nr locks according to shrink policy.
           */
-        RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), nr, LDLM_SYNC, 
+        RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), nr, LDLM_SYNC,
                                 LDLM_CANCEL_SHRINK));
  }
  
@@ -575,7 +577,7 @@ int ldlm_pool_recalc(struct ldlm_pool *pl)
                  ldlm_pool_recalc_stats(pl);
  
                  /*
-                 * Zero out all rates and speed for the last period. 
+                 * Zero out all rates and speed for the last period.
                   */
                  atomic_set(&pl->pl_grant_rate, 0);
                  atomic_set(&pl->pl_cancel_rate, 0);
@@ -585,7 +587,7 @@ int ldlm_pool_recalc(struct ldlm_pool *pl)
  
          if (pl->pl_ops->po_recalc != NULL) {
                  count = pl->pl_ops->po_recalc(pl);
-                lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT, 
+                lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT,
                                      count);
                  return count;
          }
@@ -602,14 +604,14 @@ int ldlm_pool_shrink(struct ldlm_pool *pl, int nr,
                       unsigned int gfp_mask)
  {
          int cancel = 0;
-        
+
          if (pl->pl_ops->po_shrink != NULL) {
                  cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask);
                  if (nr > 0) {
-                        lprocfs_counter_add(pl->pl_stats, 
+                        lprocfs_counter_add(pl->pl_stats,
                                              LDLM_POOL_SHRINK_REQTD_STAT,
                                              nr);
-                        lprocfs_counter_add(pl->pl_stats, 
+                        lprocfs_counter_add(pl->pl_stats,
                                              LDLM_POOL_SHRINK_FREED_STAT,
                                              cancel);
                          CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks, "
@@ -779,10 +781,10 @@ static int ldlm_pool_proc_init(struct ldlm_pool *pl)
          lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT,
                               LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
                               "granted", "locks");
-        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT, 
+        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT,
                               LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
                               "grant", "locks");
-        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT, 
+        lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT,
                               LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV,
                               "cancel", "locks");
          lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT,
@@ -878,8 +880,8 @@ void ldlm_pool_fini(struct ldlm_pool *pl)
  {
          ENTRY;
          ldlm_pool_proc_fini(pl);
-        
-        /* 
+
+        /*
           * Pool should not be used after this point. We can't free it here as
           * it lives in struct ldlm_namespace, but still interested in catching
           * any abnormal using cases.
@@ -894,27 +896,26 @@ EXPORT_SYMBOL(ldlm_pool_fini);
   */
  void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock)
  {
-        /* 
+        /*
           * FLOCK locks are special in a sense that they are almost never
           * cancelled, instead special kind of lock is used to drop them.
           * also there is no LRU for flock locks, so no point in tracking
-         * them anyway. 
+         * them anyway.
           */
          if (lock->l_resource->lr_type == LDLM_FLOCK)
                  return;
          ENTRY;
-                
+
          atomic_inc(&pl->pl_granted);
          atomic_inc(&pl->pl_grant_rate);
          atomic_inc(&pl->pl_grant_speed);
  
          lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT);
- 
-        /* 
+        /*
           * Do not do pool recalc for client side as all locks which
-         * potentially may be canceled has already been packed into 
+         * potentially may be canceled has already been packed into
           * enqueue/cancel rpc. Also we do not want to run out of stack
-         * with too long call paths. 
+         * with too long call paths.
           */
          if (ns_is_server(ldlm_pl2ns(pl)))
                  ldlm_pool_recalc(pl);
@@ -938,7 +939,7 @@ void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock)
          atomic_dec(&pl->pl_granted);
          atomic_inc(&pl->pl_cancel_rate);
          atomic_dec(&pl->pl_grant_speed);
-        
+
          lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT);
  
          if (ns_is_server(ldlm_pl2ns(pl)))
@@ -950,7 +951,7 @@ EXPORT_SYMBOL(ldlm_pool_del);
  /**
   * Returns current \a pl SLV.
   *
- * \pre ->pl_lock is not locked. 
+ * \pre ->pl_lock is not locked.
   */
  __u64 ldlm_pool_get_slv(struct ldlm_pool *pl)
  {
@@ -965,7 +966,7 @@ EXPORT_SYMBOL(ldlm_pool_get_slv);
  /**
   * Sets passed \a slv to \a pl.
   *
- * \pre ->pl_lock is not locked. 
+ * \pre ->pl_lock is not locked.
   */
  void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv)
  {
@@ -978,7 +979,7 @@ EXPORT_SYMBOL(ldlm_pool_set_slv);
  /**
   * Returns current \a pl CLV.
   *
- * \pre ->pl_lock is not locked. 
+ * \pre ->pl_lock is not locked.
   */
  __u64 ldlm_pool_get_clv(struct ldlm_pool *pl)
  {
@@ -993,7 +994,7 @@ EXPORT_SYMBOL(ldlm_pool_get_clv);
  /**
   * Sets passed \a clv to \a pl.
   *
- * \pre ->pl_lock is not locked. 
+ * \pre ->pl_lock is not locked.
   */
  void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv)
  {
@@ -1041,16 +1042,17 @@ static struct shrinker *ldlm_pools_srv_shrinker;
  static struct shrinker *ldlm_pools_cli_shrinker;
  static struct completion ldlm_pools_comp;
  
-/* 
+/*
   * Cancel \a nr locks from all namespaces (if possible). Returns number of
   * cached locks after shrink is finished. All namespaces are asked to
   * cancel approximately equal amount of locks to keep balancing.
   */
-static int ldlm_pools_shrink(ldlm_side_t client, int nr, 
+static int ldlm_pools_shrink(ldlm_side_t client, int nr,
                               unsigned int gfp_mask)
  {
          int total = 0, cached = 0, nr_ns;
          struct ldlm_namespace *ns;
+        void *cookie;
  
          if (nr != 0 && !(gfp_mask & __GFP_FS))
                  return -1;
@@ -1058,15 +1060,18 @@ static int ldlm_pools_shrink(ldlm_side_t client, int nr,
          CDEBUG(D_DLMTRACE, "Request to shrink %d %s locks from all pools\n",
                 nr, client == LDLM_NAMESPACE_CLIENT ? "client" : "server");
  
-        /* 
-         * Find out how many resources we may release. 
+        cookie = cl_env_reenter();
+
+        /*
+         * Find out how many resources we may release.
           */
-        for (nr_ns = atomic_read(ldlm_namespace_nr(client)); 
-             nr_ns > 0; nr_ns--) 
+        for (nr_ns = atomic_read(ldlm_namespace_nr(client));
+             nr_ns > 0; nr_ns--)
          {
                  mutex_down(ldlm_namespace_lock(client));
                  if (list_empty(ldlm_namespace_list(client))) {
                          mutex_up(ldlm_namespace_lock(client));
+                        cl_env_reexit(cookie);
                          return 0;
                  }
                  ns = ldlm_namespace_first_locked(client);
@@ -1076,28 +1081,30 @@ static int ldlm_pools_shrink(ldlm_side_t client, int nr,
                  total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask);
                  ldlm_namespace_put(ns, 1);
          }
- 
-        if (nr == 0 || total == 0)
+
+        if (nr == 0 || total == 0) {
+                cl_env_reexit(cookie);
                  return total;
+        }
  
-        /* 
-         * Shrink at least ldlm_namespace_nr(client) namespaces. 
+        /*
+         * Shrink at least ldlm_namespace_nr(client) namespaces.
           */
-        for (nr_ns = atomic_read(ldlm_namespace_nr(client)); 
-             nr_ns > 0; nr_ns--) 
+        for (nr_ns = atomic_read(ldlm_namespace_nr(client));
+             nr_ns > 0; nr_ns--)
          {
                  int cancel, nr_locks;
  
-                /* 
-                 * Do not call shrink under ldlm_namespace_lock(client) 
+                /*
+                 * Do not call shrink under ldlm_namespace_lock(client)
                   */
                  mutex_down(ldlm_namespace_lock(client));
                  if (list_empty(ldlm_namespace_list(client))) {
                          mutex_up(ldlm_namespace_lock(client));
-                        /* 
+                        /*
                           * If list is empty, we can't return any @cached > 0,
                           * that probably would cause needless shrinker
-                         * call. 
+                         * call.
                           */
                          cached = 0;
                          break;
@@ -1106,13 +1113,14 @@ static int ldlm_pools_shrink(ldlm_side_t client, int nr,
                  ldlm_namespace_get(ns);
                  ldlm_namespace_move_locked(ns, client);
                  mutex_up(ldlm_namespace_lock(client));
-                
+
                  nr_locks = ldlm_pool_granted(&ns->ns_pool);
                  cancel = 1 + nr_locks * nr / total;
                  ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask);
                  cached += ldlm_pool_granted(&ns->ns_pool);
                  ldlm_namespace_put(ns, 1);
          }
+        cl_env_reexit(cookie);
          return cached;
  }
  
@@ -1132,16 +1140,16 @@ void ldlm_pools_recalc(ldlm_side_t client)
          struct ldlm_namespace *ns;
          int nr, equal = 0;
  
-        /* 
+        /*
           * No need to setup pool limit for client pools.
           */
          if (client == LDLM_NAMESPACE_SERVER) {
-                /* 
-                 * Check all modest namespaces first. 
+                /*
+                 * Check all modest namespaces first.
                   */
                  mutex_down(ldlm_namespace_lock(client));
-                list_for_each_entry(ns, ldlm_namespace_list(client), 
-                                    ns_list_chain) 
+                list_for_each_entry(ns, ldlm_namespace_list(client),
+                                    ns_list_chain)
                  {
                          if (ns->ns_appetite != LDLM_NAMESPACE_MODEST)
                                  continue;
@@ -1150,9 +1158,9 @@ void ldlm_pools_recalc(ldlm_side_t client)
                          if (l == 0)
                                  l = 1;
  
-                        /* 
+                        /*
                           * Set the modest pools limit equal to their avg granted
-                         * locks + 5%. 
+                         * locks + 5%.
                           */
                          l += dru(l * LDLM_POOLS_MODEST_MARGIN, 100);
                          ldlm_pool_setup(&ns->ns_pool, l);
@@ -1160,9 +1168,9 @@ void ldlm_pools_recalc(ldlm_side_t client)
                          nr_p++;
                  }
  
-                /* 
-                 * Make sure that modest namespaces did not eat more that 2/3 
-                 * of limit. 
+                /*
+                 * Make sure that modest namespaces did not eat more that 2/3
+                 * of limit.
                   */
                  if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) {
                          CWARN("\"Modest\" pools eat out 2/3 of server locks "
@@ -1172,25 +1180,25 @@ void ldlm_pools_recalc(ldlm_side_t client)
                          equal = 1;
                  }
  
-                /* 
-                 * The rest is given to greedy namespaces. 
+                /*
+                 * The rest is given to greedy namespaces.
                   */
-                list_for_each_entry(ns, ldlm_namespace_list(client), 
-                                    ns_list_chain) 
+                list_for_each_entry(ns, ldlm_namespace_list(client),
+                                    ns_list_chain)
                  {
                          if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY)
                                  continue;
  
                          if (equal) {
-                                /* 
+                                /*
                                   * In the case 2/3 locks are eaten out by
                                   * modest pools, we re-setup equal limit
-                                 * for _all_ pools. 
+                                 * for _all_ pools.
                                   */
                                  l = LDLM_POOL_HOST_L /
                                          atomic_read(ldlm_namespace_nr(client));
                          } else {
-                                /* 
+                                /*
                                   * All the rest of greedy pools will have
                                   * all locks in equal parts.
                                   */
@@ -1203,16 +1211,16 @@ void ldlm_pools_recalc(ldlm_side_t client)
                  mutex_up(ldlm_namespace_lock(client));
          }
  
-        /* 
-         * Recalc at least ldlm_namespace_nr(client) namespaces. 
+        /*
+         * Recalc at least ldlm_namespace_nr(client) namespaces.
           */
          for (nr = atomic_read(ldlm_namespace_nr(client)); nr > 0; nr--) {
-                /* 
+                /*
                   * Lock the list, get first @ns in the list, getref, move it
                   * to the tail, unlock and call pool recalc. This way we avoid
                   * calling recalc under @ns lock what is really good as we get
                   * rid of potential deadlock on client nodes when canceling
-                 * locks synchronously. 
+                 * locks synchronously.
                   */
                  mutex_down(ldlm_namespace_lock(client));
                  if (list_empty(ldlm_namespace_list(client))) {
@@ -1224,8 +1232,8 @@ void ldlm_pools_recalc(ldlm_side_t client)
                  ldlm_namespace_move_locked(ns, client);
                  mutex_up(ldlm_namespace_lock(client));
  
-                /* 
-                 * After setup is done - recalc the pool. 
+                /*
+                 * After setup is done - recalc the pool.
                   */
                  ldlm_pool_recalc(&ns->ns_pool);
                  ldlm_namespace_put(ns, 1);
@@ -1250,14 +1258,14 @@ static int ldlm_pools_thread_main(void *arg)
                  struct l_wait_info lwi;
  
                  /*
-                 * Recal all pools on this tick. 
+                 * Recal all pools on this tick.
                   */
                  ldlm_pools_recalc(LDLM_NAMESPACE_SERVER);
                  ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT);
-                
+
                  /*
                   * Wait until the next check time, or until we're
-                 * stopped. 
+                 * stopped.
                   */
                  lwi = LWI_TIMEOUT(cfs_time_seconds(LDLM_POOLS_THREAD_PERIOD),
                                    NULL, NULL);
@@ -1298,9 +1306,9 @@ static int ldlm_pools_thread_start(void)
          init_completion(&ldlm_pools_comp);
          cfs_waitq_init(&ldlm_pools_thread->t_ctl_waitq);
  
-        /* 
+        /*
           * CLONE_VM and CLONE_FILES just avoid a needless copy, because we
-         * just drop the VM and FILES in ptlrpc_daemonize() right away. 
+         * just drop the VM and FILES in ptlrpc_daemonize() right away.
           */
          rc = cfs_kernel_thread(ldlm_pools_thread_main, ldlm_pools_thread,
                                 CLONE_VM | CLONE_FILES);
@@ -1328,10 +1336,10 @@ static void ldlm_pools_thread_stop(void)
          ldlm_pools_thread->t_flags = SVC_STOPPING;
          cfs_waitq_signal(&ldlm_pools_thread->t_ctl_waitq);
  
-        /* 
+        /*
           * Make sure that pools thread is finished before freeing @thread.
           * This fixes possible race and oops due to accessing freed memory
-         * in pools thread. 
+         * in pools thread.
           */
          wait_for_completion(&ldlm_pools_comp);
          OBD_FREE_PTR(ldlm_pools_thread);
diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c

index 95ee14d..92068f6 100644 (file)
--- a/lustre/ldlm/ldlm_request.c
+++ b/lustre/ldlm/ldlm_request.c
@@ -161,8 +161,8 @@ static int ldlm_completion_tail(struct ldlm_lock *lock)
  }
  
  /**
- * Implementation of ->l_completion_ast() for a client that doesn't wait
- * until lock is granted. Suitable for locks enqueued through ptlrpcd or
+ * Implementation of ->l_completion_ast() for a client, that doesn't wait
+ * until lock is granted. Suitable for locks enqueued through ptlrpcd, or
   * other threads that cannot block for long.
   */
  int ldlm_completion_ast_async(struct ldlm_lock *lock, int flags, void *data)
@@ -183,6 +183,7 @@ int ldlm_completion_ast_async(struct ldlm_lock *lock, int flags, void *data)
          LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, "
                     "going forward");
          ldlm_lock_dump(D_OTHER, lock, 0);
+        ldlm_reprocess_all(lock->l_resource);
          RETURN(0);
  }
  
@@ -862,7 +863,9 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp,
          }
  
          LDLM_DEBUG(lock, "sending request");
+
          rc = ptlrpc_queue_wait(req);
+
          err = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, policy ? 1 : 0,
                                      einfo->ei_mode, flags, lvb, lvb_len,
                                      lvb_swabber, lockh, rc);
@@ -1119,7 +1122,7 @@ int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels,
  
                  ptlrpc_request_set_replen(req);
                  if (flags & LDLM_FL_ASYNC) {
-                        ptlrpcd_add_req(req);
+                        ptlrpcd_add_req(req, PSCOPE_OTHER);
                          sent = count;
                          GOTO(out, 0);
                  } else {
@@ -1165,7 +1168,6 @@ int ldlm_cli_update_pool(struct ptlrpc_request *req)
          __u64 old_slv, new_slv;
          __u32 new_limit;
          ENTRY;
-
          if (unlikely(!req->rq_import || !req->rq_import->imp_obd ||
                       !imp_connect_lru_resize(req->rq_import)))
          {
@@ -1330,15 +1332,15 @@ static ldlm_policy_res_t ldlm_cancel_shrink_policy(struct ldlm_namespace *ns,
                           */
                          page_nr = lock->l_weigh_ast(lock);
                  } else {
-                struct ldlm_extent *l_extent;
+                        struct ldlm_extent *l_extent;
  
-                /*
-                 * For all extent locks cost is 1 + number of pages in
-                 * their extent.
-                 */
-                l_extent = &lock->l_policy_data.l_extent;
+                        /*
+                         * For all extent locks cost is 1 + number of pages in
+                         * their extent.
+                         */
+                        l_extent = &lock->l_policy_data.l_extent;
                          page_nr = l_extent->end - l_extent->start;
-                do_div(page_nr, CFS_PAGE_SIZE);
+                        do_div(page_nr, CFS_PAGE_SIZE);
                  }
                  lock_cost = 1 + page_nr;
          } else {
@@ -2182,7 +2184,7 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock)
          aa = ptlrpc_req_async_args(req);
          aa->lock_handle = body->lock_handle[0];
          req->rq_interpret_reply = (ptlrpc_interpterer_t)replay_lock_interpret;
-        ptlrpcd_add_req(req);
+        ptlrpcd_add_req(req, PSCOPE_OTHER);
  
          RETURN(0);
  }
diff --git a/lustre/liblustre/Makefile.am b/lustre/liblustre/Makefile.am

index 21de87c..116d0c9 100644 (file)
--- a/lustre/liblustre/Makefile.am
+++ b/lustre/liblustre/Makefile.am
@@ -59,11 +59,12 @@ install-exec-hook:
  endif
  
  libllite_a_SOURCES = llite_lib.c llite_fid.c super.c namei.c rw.c file.c dir.c \
-                    lutil.c lutil.h llite_lib.h
+                    lutil.c lutil.h llite_lib.h llite_cl.c \
+                     ../lclient/lcommon_cl.c ../lclient/glimpse.c
  
  # for make rpms -- need cleanup
  liblustre_a_SOURCES = llite_lib.c llite_fid.c super.c namei.c rw.c file.c dir.c \
-                    llite_lib.h
+                    llite_lib.h llite_cl.c
  
  liblustre.a : $(LUSTRE_LIBS) $(LND_LIBS) $(LNET_LIBS) $(SYSIO_LIBS) $(QUOTA_LIBS)
         sh $(srcdir)/genlib.sh "$(SYSIO)" "$(LIBS)" "$(LND_LIBS)" "$(PTHREAD_LIBS)" "$(QUOTA_LIBS)" "$(CAP_LIBS)" "$(ZLIB)"
diff --git a/lustre/liblustre/dir.c b/lustre/liblustre/dir.c

index 57960a2..8999ae6 100644 (file)
--- a/lustre/liblustre/dir.c
+++ b/lustre/liblustre/dir.c
@@ -194,12 +194,12 @@ static int filldir(char *buf, int buflen,
          return 0;
  }
  
-/* 
+/*
   * TODO: much of the code here is similar/identical to llite ll_readdir().
   * These code can be factored out and shared in a common module.
   */
  
-ssize_t llu_iop_filldirentries(struct inode *dir, _SYSIO_OFF_T *basep, 
+ssize_t llu_iop_filldirentries(struct inode *dir, _SYSIO_OFF_T *basep,
                                char *buf, size_t nbytes)
  {
          struct llu_inode_info *lli = llu_i2info(dir);
@@ -237,9 +237,9 @@ ssize_t llu_iop_filldirentries(struct inode *dir, _SYSIO_OFF_T *basep,
                  struct lu_dirent  *ent;
  
                  if (!IS_ERR(page)) {
-                        /* 
+                        /*
                           * If page is empty (end of directoryis reached),
-                         * use this value. 
+                         * use this value.
                           */
                          __u64 hash = DIR_END_OFF;
                          __u64 next;
diff --git a/lustre/liblustre/file.c b/lustre/liblustre/file.c

index aca3fde..07b7ddd 100644 (file)
--- a/lustre/liblustre/file.c
+++ b/lustre/liblustre/file.c
@@ -137,10 +137,10 @@ void obdo_refresh_inode(struct inode *dst,
  
          if (valid & OBD_MD_FLATIME && src->o_atime > LTIME_S(st->st_atime))
                  LTIME_S(st->st_atime) = src->o_atime;
-        
+
          /* mtime is always updated with ctime, but can be set in past.
             As write and utime(2) may happen within 1 second, and utime's
-           mtime has a priority over write's one, leave mtime from mds 
+           mtime has a priority over write's one, leave mtime from mds
             for the same ctimes. */
          if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(st->st_ctime)) {
                  LTIME_S(st->st_ctime) = src->o_ctime;
@@ -340,10 +340,10 @@ int llu_sizeonmds_update(struct inode *inode, struct md_open_data *mod,
          struct obdo oa;
          int rc;
          ENTRY;
-        
+
          LASSERT(!(lli->lli_flags & LLIF_MDS_SIZE_LOCK));
          LASSERT(sbi->ll_lco.lco_flags & OBD_CONNECT_SOM);
-        
+
          rc = llu_inode_getattr(inode, &oa);
          if (rc == -ENOENT) {
                  oa.o_valid = 0;
@@ -356,7 +356,7 @@ int llu_sizeonmds_update(struct inode *inode, struct md_open_data *mod,
                         lli->lli_st_generation);
                  RETURN(rc);
          }
-        
+
          md_from_obdo(&op_data, &oa, oa.o_valid);
          memcpy(&op_data.op_handle, fh, sizeof(*fh));
          op_data.op_ioepoch = ioepoch;
@@ -387,7 +387,7 @@ int llu_md_close(struct obd_export *md_exp, struct inode *inode)
  
          op_data.op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET |
                                  ATTR_MTIME_SET | ATTR_CTIME_SET;
-        
+
          if (fd->fd_flags & FMODE_WRITE) {
                  struct llu_sb_info *sbi = llu_i2sbi(inode);
                  if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SOM) ||
@@ -400,11 +400,11 @@ int llu_md_close(struct obd_export *md_exp, struct inode *inode)
                           * are really changed.  */
                          op_data.op_flags |= MF_SOM_CHANGE;
  
-                        /* Pack Size-on-MDS attributes if we are in IO epoch and 
+                        /* Pack Size-on-MDS attributes if we are in IO epoch and
                           * attributes are valid. */
                          LASSERT(!(lli->lli_flags & LLIF_MDS_SIZE_LOCK));
-                        if (!llu_local_size(inode))
-                                op_data.op_attr.ia_valid |= 
+                        if (!cl_local_size(inode))
+                                op_data.op_attr.ia_valid |=
                                          OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
                  }
          }
@@ -513,71 +513,3 @@ _SYSIO_OFF_T llu_iop_pos(struct inode *ino, _SYSIO_OFF_T off)
  
          RETURN(off);
  }
-
-/* this isn't where truncate starts.  roughly:
- * llu_iop_{open,setattr}->llu_setattr_raw->llu_vmtruncate->llu_truncate
- * we grab the lock back in setattr_raw to avoid races. */
-static void llu_truncate(struct inode *inode, obd_flag flags)
-{
-        struct llu_inode_info *lli = llu_i2info(inode);
-        struct intnl_stat *st = llu_i2stat(inode);
-        struct obd_info oinfo = { { { 0 } } };
-        struct obdo oa = { 0 };
-        int rc;
-        ENTRY;
-        CDEBUG(D_VFSTRACE, "VFS Op:inode=%llu/%lu(%p) to %llu\n",
-               (long long)st->st_ino, lli->lli_st_generation, inode,
-               (long long)st->st_size);
-
-        if (!lli->lli_smd) {
-                CDEBUG(D_INODE, "truncate on inode %llu with no objects\n",
-                       (long long)st->st_ino);
-                EXIT;
-                return;
-        }
-
-        oinfo.oi_md = lli->lli_smd;
-        oinfo.oi_policy.l_extent.start = st->st_size;
-        oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
-        oinfo.oi_oa = &oa;
-        oa.o_id = lli->lli_smd->lsm_object_id;
-        oa.o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS;
-        oa.o_flags = flags; /* We don't actually want to copy inode flags */
- 
-        obdo_from_inode(&oa, inode,
-                        OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLATIME |
-                        OBD_MD_FLMTIME | OBD_MD_FLCTIME);
-
-        obd_adjust_kms(llu_i2obdexp(inode), lli->lli_smd, st->st_size, 1);
-
-        CDEBUG(D_INFO, "calling punch for "LPX64" (all bytes after %Lu)\n",
-               oa.o_id, (long long)st->st_size);
-
-        /* truncate == punch from new size to absolute end of file */
-        rc = obd_punch_rqset(llu_i2obdexp(inode), &oinfo, NULL);
-        if (rc)
-                CERROR("obd_truncate fails (%d) ino %llu\n",
-                       rc, (long long)st->st_ino);
-        else
-                obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
-                                          OBD_MD_FLATIME | OBD_MD_FLMTIME |
-                                          OBD_MD_FLCTIME);
-
-        EXIT;
-        return;
-} /* llu_truncate */
-
-int llu_vmtruncate(struct inode * inode, loff_t offset, obd_flag flags)
-{
-        llu_i2stat(inode)->st_size = offset;
-
-        /*
-         * llu_truncate() is only called from this
-         * point. llu_vmtruncate/llu_truncate split exists to mimic the
-         * structure of Linux VFS truncate code path.
-         */
-
-        llu_truncate(inode, flags);
-
-        return 0;
-}
diff --git a/lustre/liblustre/llite_cl.c b/lustre/liblustre/llite_cl.c

new file mode 100644 (file)

index 0000000..ed19dd3
--- /dev/null
+++ b/lustre/liblustre/llite_cl.c
@@ -0,0 +1,835 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ *   Copyright (c) 2007 Cluster File Systems, Inc.
+ *   Author: Nikita Danilov <nikita@clusterfs.com>
+ *
+ *   This file is part of Lustre, http://www.lustre.org.
+ *
+ *   Lustre is free software; you can redistribute it and/or
+ *   modify it under the terms of version 2 of the GNU General Public
+ *   License as published by the Free Software Foundation.
+ *
+ *   Lustre is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with Lustre; if not, write to the Free Software
+ *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/queue.h>
+#ifndef __CYGWIN__
+# include <sys/statvfs.h>
+#else
+# include <sys/statfs.h>
+#endif
+
+#include <sysio.h>
+#ifdef HAVE_XTIO_H
+#include <xtio.h>
+#endif
+#include <fs.h>
+#include <mount.h>
+#include <inode.h>
+#ifdef HAVE_FILE_H
+#include <file.h>
+#endif
+#include <liblustre.h>
+
+#include <obd.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <lustre_lite.h>
+#include <lustre_dlm.h>
+#include <lustre_ver.h>
+#include <lustre_mdc.h>
+#include <cl_object.h>
+
+#include "llite_lib.h"
+
+/*
+ * slp_ prefix stands for "Sysio Library Posix". It corresponds to historical
+ * "llu_" prefix.
+ */
+
+static int   slp_type_init     (struct lu_device_type *t);
+static void  slp_type_fini     (struct lu_device_type *t);
+
+static struct cl_page * slp_page_init(const struct lu_env *env,
+                                     struct cl_object *obj,
+                                     struct cl_page *page, cfs_page_t *vmpage);
+static int   slp_attr_get     (const struct lu_env *env, struct cl_object *obj,
+                               struct cl_attr *attr);
+
+static struct lu_device  *slp_device_alloc(const struct lu_env *env,
+                                           struct lu_device_type *t,
+                                           struct lustre_cfg *cfg);
+
+static int slp_io_init(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_io *io);
+static struct slp_io *cl2slp_io(const struct lu_env *env,
+                                const struct cl_io_slice *slice);
+
+
+static void llu_free_user_page(struct page *page);
+
+static const struct lu_object_operations      slp_lu_obj_ops;
+static const struct lu_device_operations      slp_lu_ops;
+static const struct cl_device_operations      slp_cl_ops;
+static const struct cl_io_operations          ccc_io_ops;
+static const struct lu_device_type_operations slp_device_type_ops;
+             //struct lu_device_type            slp_device_type;
+static const struct cl_page_operations        slp_page_ops;
+static const struct cl_page_operations        slp_transient_page_ops;
+static const struct cl_lock_operations        slp_lock_ops;
+
+
+/*****************************************************************************
+ *
+ * Slp device and device type functions.
+ *
+ */
+
+/*
+ * ->lct_init() method of slp_session_key: allocates a struct slp_session.
+ *
+ * Returns the freshly allocated session, or ERR_PTR(-ENOMEM) when the
+ * allocation fails.
+ */
+void *slp_session_key_init(const struct lu_context *ctx,
+                           struct lu_context_key *key)
+{
+        struct slp_session *slp_sess;
+
+        OBD_ALLOC_PTR(slp_sess);
+        if (slp_sess != NULL)
+                return slp_sess;
+
+        return ERR_PTR(-ENOMEM);
+}
+
+void slp_session_key_fini(const struct lu_context *ctx,
+                                 struct lu_context_key *key, void *data)
+{
+        struct slp_session *session = data;
+        OBD_FREE_PTR(session);
+}
+
+struct lu_context_key slp_session_key = {
+        .lct_tags = LCT_SESSION,
+        .lct_init = slp_session_key_init,
+        .lct_fini = slp_session_key_fini
+};
+
+/* type constructor/destructor: slp_type_{init,fini,start,stop}(). */
+LU_TYPE_INIT_FINI(slp, &ccc_key, &ccc_session_key, &slp_session_key);
+
+static struct lu_device *slp_device_alloc(const struct lu_env *env,
+                                          struct lu_device_type *t,
+                                          struct lustre_cfg *cfg)
+{
+        return ccc_device_alloc(env, t, cfg, &slp_lu_ops, &slp_cl_ops);
+}
+
+static int slp_lock_init(const struct lu_env *env,
+                         struct cl_object *obj, struct cl_lock *lock,
+                         const struct cl_io *io)
+{
+        return ccc_lock_init(env, obj, lock, io, &slp_lock_ops);
+}
+
+static const struct cl_object_operations slp_ops = {
+        .coo_page_init = slp_page_init,
+        .coo_lock_init = slp_lock_init,
+        .coo_io_init   = slp_io_init,
+        .coo_attr_get  = slp_attr_get,
+        .coo_attr_set  = ccc_attr_set,
+        .coo_conf_set  = ccc_conf_set,
+        .coo_glimpse   = ccc_object_glimpse
+};
+
+/*
+ * ->loo_object_print() method: hands the ccc object pointer, its inode
+ * pointer and the inode number/generation to the printer @p. The number is
+ * printed as 0 when no stat data is available, the generation as 0 when the
+ * object has no inode.
+ */
+static int slp_object_print(const struct lu_env *env, void *cookie,
+                            lu_printer_t p, const struct lu_object *o)
+{
+        struct ccc_object *cob   = lu2ccc(o);
+        struct inode      *inode = cob->cob_inode;
+        unsigned long      ino   = 0UL;
+        unsigned int       gen   = 0;
+
+        if (inode != NULL) {
+                struct intnl_stat *st = llu_i2stat(inode);
+
+                if (st != NULL)
+                        ino = (unsigned long)st->st_ino;
+                gen = (unsigned int)llu_i2info(inode)->lli_st_generation;
+        }
+
+        return (*p)(env, cookie, LUSTRE_SLP_NAME"-object@%p(%p:%lu/%u)",
+                    cob, inode, ino, gen);
+}
+
+static const struct lu_object_operations slp_lu_obj_ops = {
+        .loo_object_init      = ccc_object_init,
+        .loo_object_start     = NULL,
+        .loo_object_delete    = NULL,
+        .loo_object_release   = NULL,
+        .loo_object_free      = ccc_object_free,
+        .loo_object_print     = slp_object_print,
+        .loo_object_invariant = NULL
+};
+
+static struct lu_object *slp_object_alloc(const struct lu_env *env,
+                                          const struct lu_object_header *hdr,
+                                          struct lu_device *dev)
+{
+        return ccc_object_alloc(env, hdr, dev, &slp_ops, &slp_lu_obj_ops);
+}
+
+static const struct lu_device_operations slp_lu_ops = {
+        .ldo_object_alloc      = slp_object_alloc
+};
+
+static const struct cl_device_operations slp_cl_ops = {
+        .cdo_req_init = ccc_req_init
+};
+
+static const struct lu_device_type_operations slp_device_type_ops = {
+        .ldto_init = slp_type_init,
+        .ldto_fini = slp_type_fini,
+
+        .ldto_start = slp_type_start,
+        .ldto_stop  = slp_type_stop,
+
+        .ldto_device_alloc = slp_device_alloc,
+        .ldto_device_free  = ccc_device_free,
+        .ldto_device_init  = ccc_device_init,
+        .ldto_device_fini  = ccc_device_fini
+};
+
+struct lu_device_type slp_device_type = {
+        .ldt_tags     = LU_DEVICE_CL,
+        .ldt_name     = LUSTRE_SLP_NAME,
+        .ldt_ops      = &slp_device_type_ops,
+        .ldt_ctx_tags = LCT_CL_THREAD
+};
+
+/*
+ * Global initialization of the slp layer: passes slp_device_type to
+ * ccc_global_init() and returns whatever that call returns.
+ */
+int slp_global_init(void)
+{
+        return ccc_global_init(&slp_device_type);
+}
+
+void slp_global_fini(void)
+{
+        ccc_global_fini(&slp_device_type);
+}
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+/*
+ * ->coo_page_init() method: allocates the ccc_page slice for @page and
+ * records @vmpage in it. Only pages that are not CPT_CACHEABLE are handled;
+ * they get slp_transient_page_ops and are counted in cob_transient_pages.
+ * A CPT_CACHEABLE page triggers LBUG().
+ *
+ * Returns ERR_PTR(0) on success and ERR_PTR(-ENOMEM) when the slice cannot
+ * be allocated.
+ */
+static struct cl_page *slp_page_init(const struct lu_env *env,
+                                     struct cl_object *obj,
+                                     struct cl_page *page, cfs_page_t *vmpage)
+{
+        struct ccc_page *cpg;
+
+        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+        OBD_ALLOC_PTR(cpg);
+        if (cpg == NULL)
+                return ERR_PTR(-ENOMEM);
+
+        cpg->cpg_page = vmpage;
+        if (page->cp_type != CPT_CACHEABLE) {
+                struct ccc_object *clobj = cl2ccc(obj);
+
+                cl_page_slice_add(page, &cpg->cpg_cl, obj,
+                                  &slp_transient_page_ops);
+                clobj->cob_transient_pages++;
+        } else {
+                LBUG();
+        }
+        return ERR_PTR(0);
+}
+
+/*
+ * ->coo_io_init() method: adds the slp io slice (ccc_io_ops) to @io and, for
+ * read and write io, records the requested byte count in the per-environment
+ * ccc_io.
+ *
+ * Returns 1 for a zero-length read or write (see the Single Unix Spec quote
+ * below), and 0 in every other case.
+ */
+static int slp_io_init(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_io *io)
+{
+        struct ccc_io *vio    = ccc_env_io(env);
+        int            result = 0;
+
+        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+        cl_io_slice_add(io, &vio->cui_cl, obj, &ccc_io_ops);
+        if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) {
+                size_t count;
+
+                count = io->u.ci_rw.crw_count;
+                /* "If nbyte is 0, read() will return 0 and have no other
+                 *  results."  -- Single Unix Spec */
+                if (count == 0)
+                        result = 1;
+                else {
+                        vio->cui_tot_count = count;
+                        vio->cui_tot_nrsegs = 0;
+                }
+        }
+        return result;
+}
+
+static int slp_attr_get(const struct lu_env *env, struct cl_object *obj,
+                        struct cl_attr *attr)
+{
+        struct inode *inode = ccc_object_inode(obj);
+        struct intnl_stat *st = llu_i2stat(inode);
+
+        attr->cat_size = st->st_size;
+        attr->cat_blocks = st->st_blocks;
+        attr->cat_mtime  = st->st_mtime;
+        attr->cat_atime  = st->st_atime;
+        attr->cat_ctime  = st->st_ctime;
+        /* KMS is not known by this layer */
+        return 0; /* layers below have to fill in the rest */
+}
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+
+/*
+ * Releases a ccc_page: asserts that a user page is attached, frees that
+ * page with llu_free_user_page() and then frees @cp itself.
+ */
+static void slp_page_fini_common(struct ccc_page *cp)
+{
+        cfs_page_t *user_page;
+
+        user_page = cp->cpg_page;
+        LASSERT(user_page != NULL);
+
+        llu_free_user_page(user_page);
+        OBD_FREE_PTR(cp);
+}
+
+/*
+ * Completion handling shared by the read and write paths: detaches the
+ * cl_sync_io anchor from @cp and notifies it with @ioret. A page that has
+ * no anchor hits LBUG().
+ */
+static void slp_page_completion_common(const struct lu_env *env,
+                                       struct ccc_page *cp, int ioret)
+{
+        struct cl_sync_io *sync_io = cp->cpg_sync_io;
+
+        if (sync_io == NULL) {
+                LBUG();
+                return;
+        }
+
+        cp->cpg_sync_io = NULL;
+        cl_sync_io_note(sync_io, ioret);
+}
+
+static void slp_page_completion_read(const struct lu_env *env,
+                                     const struct cl_page_slice *slice,
+                                     int ioret)
+{
+        struct ccc_page *cp      = cl2ccc_page(slice);
+        ENTRY;
+
+        slp_page_completion_common(env, cp, ioret);
+
+        EXIT;
+}
+
+/*
+ * ->cpo_completion() method for CRT_WRITE: clears cpg_write_queued when the
+ * write succeeded, then runs the common completion path.
+ */
+static void slp_page_completion_write_common(const struct lu_env *env,
+                                             const struct cl_page_slice *slice,
+                                             int ioret)
+{
+        struct ccc_page *ccc_pg = cl2ccc_page(slice);
+
+        /*
+         * Only when ioret == 0 did the write succeed, so only then may
+         * this page be dropped from the pending-writing accounting.
+         */
+        if (ioret == 0)
+                ccc_pg->cpg_write_queued = 0;
+
+        slp_page_completion_common(env, ccc_pg, ioret);
+}
+
+static int slp_page_is_vmlocked(const struct lu_env *env,
+                                const struct cl_page_slice *slice)
+{
+        return -EBUSY;
+}
+
+/*
+ * ->cpo_fini() method for transient pages: releases the ccc_page and drops
+ * the owning object's transient page count.
+ */
+static void slp_transient_page_fini(const struct lu_env *env,
+                                    struct cl_page_slice *slice)
+{
+        struct ccc_page   *ccc_pg = cl2ccc_page(slice);
+        struct ccc_object *owner  = cl2ccc(slice->cpl_page->cp_obj);
+
+        /* owner is looked up first: slp_page_fini_common() frees ccc_pg */
+        slp_page_fini_common(ccc_pg);
+        owner->cob_transient_pages--;
+}
+
+
+static const struct cl_page_operations slp_transient_page_ops = {
+        .cpo_own           = ccc_transient_page_own,
+        .cpo_assume        = ccc_transient_page_assume,
+        .cpo_unassume      = ccc_transient_page_unassume,
+        .cpo_disown        = ccc_transient_page_disown,
+        .cpo_discard       = ccc_transient_page_discard,
+        .cpo_vmpage        = ccc_page_vmpage,
+        .cpo_is_vmlocked   = slp_page_is_vmlocked,
+        .cpo_fini          = slp_transient_page_fini,
+        .cpo_is_under_lock = ccc_page_is_under_lock,
+        .io = {
+                [CRT_READ] = {
+                        .cpo_completion  = slp_page_completion_read,
+                },
+                [CRT_WRITE] = {
+                        .cpo_completion  = slp_page_completion_write_common,
+                }
+        }
+};
+
+/*****************************************************************************
+ *
+ * Lock operations.
+ *
+ */
+
+/*
+ * clo_enqueue method of slp_lock_ops.
+ *
+ * Checks the object invariant, calls liblustre_wait_event(0) and returns 0.
+ * The io argument (named "_") and \a enqflags are not used.
+ */
+static int slp_lock_enqueue(const struct lu_env *env,
+                           const struct cl_lock_slice *slice,
+                           struct cl_io *_, __u32 enqflags)
+{
+        CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj));
+
+        liblustre_wait_event(0);
+        return 0;
+}
+
+/*
+ * Lock method table: the ccc_* implementations are used for everything
+ * except enqueue, which is slp_lock_enqueue().
+ */
+static const struct cl_lock_operations slp_lock_ops = {
+        .clo_fini      = ccc_lock_fini,
+        .clo_enqueue   = slp_lock_enqueue,
+        .clo_wait      = ccc_lock_wait,
+        .clo_unuse     = ccc_lock_unuse,
+        .clo_fits_into = ccc_lock_fits_into,
+};
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+/*
+ * Request one lock of \a mode over [\a start, \a end] for a read or write
+ * io through ccc_io_one_lock(), but only when the io is an append.
+ * For any other io nothing is locked and 0 is returned.
+ */
+static int slp_io_rw_lock(const struct lu_env *env, struct cl_io *io,
+                          enum cl_lock_mode mode, loff_t start, loff_t end)
+{
+        LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+        /* No lock without O_APPEND in liblustre */
+        if (io->u.ci_wr.wr_append)
+                return ccc_io_one_lock(env, io, 0, mode, start, end);
+
+        return 0;
+}
+
+/*
+ * cio_lock method for write io: computes the extent to lock and passes it
+ * to slp_io_rw_lock() with CLM_WRITE. An append covers [0, OBD_OBJECT_EOF];
+ * any other write covers exactly the bytes being written.
+ */
+static int slp_io_write_lock(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+        struct cl_io *io    = ios->cis_io;
+        loff_t        first = 0;
+        loff_t        last  = OBD_OBJECT_EOF;
+
+        if (!io->u.ci_wr.wr_append) {
+                first = io->u.ci_wr.wr.crw_pos;
+                last  = first + io->u.ci_wr.wr.crw_count - 1;
+        }
+
+        return slp_io_rw_lock(env, io, CLM_WRITE, first, last);
+}
+
+/* cio_iter_init method for CIT_TRUNC: does nothing and reports success. */
+static int slp_io_trunc_iter_init(const struct lu_env *env,
+                                  const struct cl_io_slice *ios)
+{
+        return 0;
+}
+
+/* cio_start method for CIT_TRUNC: does nothing and reports success. */
+static int slp_io_trunc_start(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+        return 0;
+}
+
+/*
+ * Allocate a struct page describing \a count bytes at \a offset within the
+ * page-sized window that starts at \a addr and has page index \a index.
+ *
+ * Returns the new descriptor with both list heads initialized, or NULL when
+ * the allocation fails. Release it with llu_free_user_page().
+ */
+static struct page *llu_get_user_page(int index, void *addr, int offset,
+                                      int count)
+{
+        struct page *page;
+
+        OBD_ALLOC_PTR(page);
+        if (page != NULL) {
+                page->index   = index;
+                page->addr    = addr;
+                page->_offset = offset;
+                page->_count  = count;
+
+                CFS_INIT_LIST_HEAD(&page->list);
+                CFS_INIT_LIST_HEAD(&page->_node);
+        }
+
+        return page;
+}
+
+/* Release a page descriptor obtained from llu_get_user_page(). */
+static void llu_free_user_page(struct page *page)
+{
+        OBD_FREE_PTR(page);
+}
+
+/*
+ * Queue page io for one contiguous user buffer and wait for it to finish.
+ *
+ * The range [pos, pos + count) is cut at page boundaries. For each piece a
+ * page descriptor pointing into \a buf is allocated, wrapped into a
+ * transient cl_page, owned on behalf of \a io, attached to the per-env sync
+ * anchor, added to io->ci_queue and clipped to the bytes transferred.
+ * Reads are cut short at st_size. The queue is then submitted as a read or
+ * a write, depending on io->ci_type, and the function waits for completion.
+ *
+ * \a group is updated as a side effect: lig_rwcount is advanced by the
+ * number of bytes queued and lig_rc receives the status of page
+ * preparation, submission and wait.
+ *
+ * Returns -EINVAL when the inode has no obd export. Otherwise returns the
+ * number of bytes queued, even when the io failed; in that case the error
+ * is only visible through group->lig_rc.
+ */
+static int llu_queue_pio(const struct lu_env *env, struct cl_io *io,
+                         struct llu_io_group *group,
+                         char *buf, size_t count, loff_t pos)
+{
+        struct cl_object *obj = io->ci_obj;
+        struct inode *inode = ccc_object_inode(obj);
+        struct intnl_stat *st = llu_i2stat(inode);
+        struct obd_export *exp = llu_i2obdexp(inode);
+        struct page *page;
+        int  rc = 0, npages = 0, ret_bytes = 0;
+        int local_lock;
+        struct cl_page *clp;
+        struct ccc_page *clup;
+        struct cl_2queue *queue;
+        struct cl_sync_io *anchor = &ccc_env_info(env)->cti_sync_io;
+        ENTRY;
+
+        if (!exp)
+                RETURN(-EINVAL);
+
+        /* Only referenced by the commented-out conditions below. */
+        local_lock = group->lig_params->lrp_lock_mode != LCK_NL;
+
+        queue = &io->ci_queue;
+        cl_2queue_init(queue);
+
+        /* prepare the pages array */
+        do {
+                unsigned long index, offset, bytes;
+
+                offset = (pos & ~CFS_PAGE_MASK);
+                index = pos >> CFS_PAGE_SHIFT;
+                bytes = CFS_PAGE_SIZE - offset;
+                if (bytes > count)
+                        bytes = count;
+
+                /* prevent read beyond file range */
+                if (/* local_lock && */
+                    io->ci_type == CIT_READ && pos + bytes >= st->st_size) {
+                        if (pos >= st->st_size)
+                                break;
+                        bytes = st->st_size - pos;
+                }
+
+                /* prepare page for this index */
+                page = llu_get_user_page(index, buf - offset, offset, bytes);
+                if (!page) {
+                        rc = -ENOMEM;
+                        break;
+                }
+
+                clp = cl_page_find(env, obj,
+                                   cl_index(obj, pos),
+                                   page, CPT_TRANSIENT);
+
+                if (IS_ERR(clp)) {
+                        rc = PTR_ERR(clp);
+                        break;
+                }
+
+                rc = cl_page_own(env, io, clp);
+                if (rc) {
+                        LASSERT(clp->cp_state == CPS_FREEING);
+                        cl_page_put(env, clp);
+                        break;
+                }
+
+                clup = cl2ccc_page(cl_page_at(clp, &slp_device_type));
+                clup->cpg_sync_io = anchor;
+                cl_2queue_add(queue, clp);
+
+                /* drop the reference count for cl_page_find, so that the page
+                 * will be freed in cl_2queue_fini. */
+                cl_page_put(env, clp);
+
+                cl_page_clip(env, clp, offset, offset+bytes);
+
+                npages++;
+                count -= bytes;
+                pos += bytes;
+                buf += bytes;
+
+                group->lig_rwcount += bytes;
+                ret_bytes += bytes;
+                /*
+                 * "page" is not advanced here: a fresh descriptor is
+                 * allocated at the top of every iteration.
+                 */
+        } while (count);
+
+        cl_sync_io_init(anchor, npages);
+        /* printk("Inited anchor with %d pages\n", npages); */
+
+        if (rc == 0) {
+                rc = cl_io_submit_rw(env, io,
+                                     io->ci_type == CIT_READ ? CRT_READ :
+                                                               CRT_WRITE,
+                                     queue);
+                if (rc == 0) {
+                        /* If some pages weren't sent for any reason, count
+                         * them as completed, to avoid infinite wait. */
+                        cl_page_list_for_each(clp, &queue->c2_qin) {
+                                CL_PAGE_DEBUG(D_ERROR, env, clp,
+                                              "not completed\n");
+                                cl_sync_io_note(anchor, +1);
+                        }
+                        /* wait for the IO to be finished. */
+                        rc = cl_sync_io_wait(env, io, &queue->c2_qout, anchor);
+                }
+        }
+
+        /* The status travels in the group; the return value is a byte count. */
+        group->lig_rc = rc;
+
+        cl_2queue_discard(env, io, queue);
+        cl_2queue_disown(env, io, queue);
+        cl_2queue_fini(env, queue);
+
+        RETURN(ret_bytes);
+}
+
+/*
+ * Allocate an io group and point it at \a params.
+ *
+ * \a inode and \a maxpages are accepted but not used. Returns the new group,
+ * or ERR_PTR(-ENOMEM) when the allocation fails. Release with put_io_group().
+ */
+static
+struct llu_io_group * get_io_group(struct inode *inode, int maxpages,
+                                   struct lustre_rw_params *params)
+{
+        struct llu_io_group *group;
+
+        OBD_ALLOC_PTR(group);
+        if (group != NULL) {
+                group->lig_params = params;
+                return group;
+        }
+
+        return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Upper estimate of the number of pages touched by an io of \a len bytes
+ * spread over \a iovlen segments: \a len rounded up to whole pages, plus
+ * 2 + iovlen - 1.
+ * NOTE(review): the extra terms presumably allow for unaligned ends and
+ * one page boundary per additional segment -- confirm.
+ */
+static int max_io_pages(ssize_t len, int iovlen)
+{
+        return (((len + CFS_PAGE_SIZE -1) / CFS_PAGE_SIZE) + 2 + iovlen - 1);
+}
+
+/*
+ * Free an io group allocated by get_io_group(). Only the group itself is
+ * freed; lig_params is not released here.
+ */
+void put_io_group(struct llu_io_group *group)
+{
+        OBD_FREE_PTR(group);
+}
+
+/*
+ * cio_start method for read and write io.
+ *
+ * Takes position and length from the io, allocates an io group, calls
+ * ccc_prep_size() and then feeds every non-empty iovec segment to
+ * llu_queue_pio(). After each segment io->ci_nob and the position are
+ * advanced; for writes st_size is extended when the position passes it.
+ * On success the group is appended to the session taken from the env.
+ *
+ * Returns 0 on success. On failure the group is released and a negative
+ * errno is returned: -EFAULT for a bad buffer pointer, -EFBIG for a write
+ * starting at or beyond lli_maxbytes, or the error from get_io_group(),
+ * ccc_prep_size() or llu_queue_pio().
+ *
+ * NOTE(review): "p" lives on this function's stack but iogroup->lig_params
+ * still points at it after the group is stored in the session. Confirm that
+ * nothing dereferences lig_params once this function has returned.
+ */
+static int slp_io_start(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+        struct ccc_io     *cio   = cl2ccc_io(env, ios);
+        struct cl_io      *io    = ios->cis_io;
+        struct cl_object  *obj   = io->ci_obj;
+        struct inode      *inode = ccc_object_inode(obj);
+        int     err, ret;
+        loff_t  pos;
+        size_t  cnt;
+        struct llu_io_group *iogroup;
+        struct lustre_rw_params p = {0};
+        int iovidx;
+        struct intnl_stat *st = llu_i2stat(inode);
+        struct llu_inode_info *lli = llu_i2info(inode);
+        struct llu_io_session *session = cl2slp_io(env, ios)->sio_session;
+        int write = io->ci_type == CIT_WRITE;
+        ENTRY;
+
+        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+        if (write) {
+                pos = io->u.ci_wr.wr.crw_pos;
+                cnt = io->u.ci_wr.wr.crw_count;
+        } else {
+                pos = io->u.ci_rd.rd.crw_pos;
+                cnt = io->u.ci_rd.rd.crw_count;
+        }
+        if (io->u.ci_wr.wr_append) {
+                p.lrp_lock_mode = LCK_PW;
+        } else {
+                p.lrp_brw_flags = OBD_BRW_SRVLOCK;
+                p.lrp_lock_mode = LCK_NL;
+        }
+
+        iogroup = get_io_group(inode, max_io_pages(cnt, cio->cui_nrsegs), &p);
+        if (IS_ERR(iogroup))
+                RETURN(PTR_ERR(iogroup));
+
+        err = ccc_prep_size(env, obj, io, pos + cnt - 1, 0);
+        if (err != 0)
+                GOTO(out, err);
+
+        CDEBUG(D_INODE,
+               "%s ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
+               write?"Write":"Read", (unsigned long)st->st_ino,
+               cnt, (__u64)pos, (__u64)st->st_size);
+
+        /*
+         * An append starts at the current end of file.
+         * XXX? Do we need to change io content too here?
+         * XXX What about if one write syscall writes at 2 different offsets?
+         */
+        if (write && io->u.ci_wr.wr_append)
+                pos = io->u.ci_wr.wr.crw_pos = st->st_size;
+
+        for (iovidx = 0; iovidx < cio->cui_nrsegs; iovidx++) {
+                char *buf = (char *) cio->cui_iov[iovidx].iov_base;
+                size_t count = cio->cui_iov[iovidx].iov_len;
+
+                if (!count)
+                        continue;
+                if (cnt < count)
+                        count = cnt;
+                if (IS_BAD_PTR(buf) || IS_BAD_PTR(buf + count)) {
+                        GOTO(out, err = -EFAULT);
+                }
+
+                if (io->ci_type == CIT_READ) {
+                        if (/* local_lock && */ pos >= st->st_size)
+                                break;
+                } else if (io->ci_type == CIT_WRITE) {
+                        if (pos >= lli->lli_maxbytes) {
+                                GOTO(out, err = -EFBIG);
+                        }
+                        if (pos + count >= lli->lli_maxbytes)
+                                count = lli->lli_maxbytes - pos;
+                } else {
+                        LBUG();
+                }
+
+                ret = llu_queue_pio(env, io, iogroup, buf, count, pos);
+                if (ret < 0) {
+                        GOTO(out, err = ret);
+                } else {
+                        io->ci_nob += ret;
+                        pos += ret;
+                        cnt -= ret;
+                        if (io->ci_type == CIT_WRITE) {
+                                /* XXX obd_adjust_kms(exp, lsm, pos, 0); */
+                                if (pos > st->st_size)
+                                        st->st_size = pos;
+                        }
+                        if (!cnt)
+                                break;
+                }
+        }
+        LASSERT(cnt == 0 || io->ci_type == CIT_READ); /* libsysio should guarantee this */
+
+        session->lis_groups[session->lis_ngroups++] = iogroup;
+
+        RETURN(0);
+out:
+        put_io_group(iogroup);
+        RETURN(err);
+}
+
+/*
+ * io method table, indexed by io type. Reads and writes share
+ * slp_io_start(); only writes have a cio_lock method. Truncate uses the two
+ * no-op slp_io_trunc_* methods, and CIT_MISC only has a finalizer.
+ * NOTE(review): the table carries a ccc_ prefix although its entries mix
+ * ccc_* and slp_* methods -- confirm the name is intended.
+ */
+static const struct cl_io_operations ccc_io_ops = {
+        .op = {
+                [CIT_READ] = {
+                        .cio_fini      = ccc_io_fini,
+                        .cio_start     = slp_io_start,
+                        .cio_end       = ccc_io_end
+                },
+                [CIT_WRITE] = {
+                        .cio_fini      = ccc_io_fini,
+                        .cio_lock      = slp_io_write_lock,
+                        .cio_start     = slp_io_start,
+                        .cio_end       = ccc_io_end
+                },
+                [CIT_TRUNC] = {
+                        .cio_fini       = ccc_io_fini,
+                        .cio_iter_init  = slp_io_trunc_iter_init,
+                        .cio_start      = slp_io_trunc_start
+                },
+                [CIT_MISC] = {
+                        .cio_fini   = ccc_io_fini
+                }
+        }
+};
+
+/*
+ * Return the slp_io kept in the environment \a env. \a slice does not
+ * select the result; it is only passed to cl2ccc_io(), whose return value
+ * is discarded.
+ */
+static struct slp_io *cl2slp_io(const struct lu_env *env,
+                                const struct cl_io_slice *slice)
+{
+        /* We call it just for assertion here */
+        cl2ccc_io(env, slice);
+
+        return slp_env_io(env);
+}
+
+/*****************************************************************************
+ *
+ * Temporary prototype thing: mirror obd-devices into cl devices.
+ *
+ */
+
+/*
+ * Set up the cl device stack for a mount.
+ *
+ * Gets an environment, calls cl_type_setup() for slp_device_type on top of
+ * the lu device of the data export's obd, and on success records the new
+ * device in sbi->ll_cl and its site in sbi->ll_site. The environment is
+ * released before returning.
+ *
+ * Returns 0 on success, or the negative errno reported by cl_env_get() or
+ * cl_type_setup().
+ */
+int cl_sb_init(struct llu_sb_info *sbi)
+{
+        struct cl_device  *cl;
+        struct lu_env     *env;
+        int rc = 0;
+        int refcheck;
+
+        /* Pairs with the RETURN() calls below, as in cl_sb_fini(). */
+        ENTRY;
+
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
+
+        cl = cl_type_setup(env, NULL, &slp_device_type,
+                           sbi->ll_dt_exp->exp_obd->obd_lu_dev);
+        if (IS_ERR(cl))
+                GOTO(out, rc = PTR_ERR(cl));
+
+        sbi->ll_cl = cl;
+        sbi->ll_site = cl2lu_dev(cl)->ld_site;
+out:
+        cl_env_put(env, &refcheck);
+        RETURN(rc);
+}
+
+/*
+ * Tear down what cl_sb_init() set up.
+ *
+ * If a cl device was recorded in \a sbi, its stack is finalized and
+ * sbi->ll_cl / sbi->ll_site are cleared. Device types are then stopped and
+ * the env cache is purged whether or not a device existed.
+ *
+ * Returns 0, or the negative errno from cl_env_get(); in the latter case
+ * nothing is torn down.
+ */
+int cl_sb_fini(struct llu_sb_info *sbi)
+{
+        struct lu_env *env;
+        int refcheck;
+
+        ENTRY;
+
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
+
+        if (sbi->ll_cl != NULL) {
+                cl_stack_fini(env, sbi->ll_cl);
+                sbi->ll_cl = NULL;
+                sbi->ll_site = NULL;
+        }
+        cl_env_put(env, &refcheck);
+        /*
+         * If the mount failed (sbi->ll_cl == NULL) and there are no other
+         * mounts, stop device types manually (this usually happens
+         * automatically when the last device is destroyed).
+         */
+        lu_types_stop();
+        cl_env_cache_purge(~0);
+        RETURN(0);
+}
diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c

index 31ee23b..232ce2b 100644 (file)
--- a/lustre/liblustre/llite_lib.c
+++ b/lustre/liblustre/llite_lib.c
@@ -67,6 +67,8 @@
  #include "lutil.h"
  #include "llite_lib.h"
  
+int slp_global_init(void);
+
  static int lllib_init(void)
  {
          if (liblustre_init_current("liblustre") ||
@@ -77,7 +79,8 @@ static int lllib_init(void)
              lmv_init() ||
              mdc_init() ||
              lov_init() ||
-            osc_init())
+            osc_init() ||
+            slp_global_init())
                  return -1;
  
          return _sysio_fssw_register("lustre", &llu_fssw_ops);
diff --git a/lustre/liblustre/llite_lib.h b/lustre/liblustre/llite_lib.h

index 57b98fc..931651e 100644 (file)
--- a/lustre/liblustre/llite_lib.h
+++ b/lustre/liblustre/llite_lib.h
@@ -47,7 +47,11 @@
  #include <sys/types.h>
  #include <sys/stat.h>
  
-/* This should not be "optimized" use ~0ULL because page->index is a long and 
+/* for struct cl_lock_descr and struct cl_io */
+#include <cl_object.h>
+#include <lclient.h>
+
+/* This should not be "optimized" use ~0ULL because page->index is a long and
   * 32-bit systems are therefore limited to 16TB in a mapping */
  #define PAGE_CACHE_MAXBYTES ((__u64)(~0UL) << CFS_PAGE_SHIFT)
  
@@ -70,6 +74,8 @@ struct llu_sb_info {
          struct obd_uuid          ll_mds_uuid;
          struct obd_uuid          ll_mds_peer_uuid;
          char                    *ll_instance;
+        struct lu_site           *ll_site;
+        struct cl_device         *ll_cl;
  };
  
  #define LL_SBI_NOLCK            0x1
@@ -109,8 +115,10 @@ struct llu_inode_info {
          /* not for stat, change it later */
          int                     lli_st_flags;
          unsigned long           lli_st_generation;
+        struct cl_object       *lli_clob;
  };
  
+
  static inline struct llu_sb_info *llu_fs2sbi(struct filesys *fs)
  {
          return (struct llu_sb_info*)(fs->fs_private);
@@ -218,8 +226,7 @@ int ll_parse_mount_target(const char *target, char **mgsnid,
  extern struct mount_option_s mount_option;
  
  /* super.c */
-void llu_update_inode(struct inode *inode, struct mdt_body *body,
-                      struct lov_stripe_md *lmm);
+void llu_update_inode(struct inode *inode, struct lustre_md *md);
  void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid);
  void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid);
  int ll_it_open_error(int phase, struct lookup_intent *it);
@@ -253,11 +260,7 @@ int llu_objects_destroy(struct ptlrpc_request *request, struct inode *dir);
  int llu_iop_read(struct inode *ino, struct ioctx *ioctxp);
  int llu_iop_write(struct inode *ino, struct ioctx *ioctxp);
  int llu_iop_iodone(struct ioctx *ioctxp);
-int llu_local_size(struct inode *inode);
  int llu_glimpse_size(struct inode *inode);
-int llu_extent_lock_cancel_cb(struct ldlm_lock *lock,
-                              struct ldlm_lock_desc *new, void *data,
-                              int flag);
  int llu_extent_lock(struct ll_file_data *fd, struct inode *inode,
                      struct lov_stripe_md *lsm, int mode,
                      ldlm_policy_data_t *policy, struct lustre_handle *lockh,
@@ -278,11 +281,11 @@ int llu_md_blocking_ast(struct ldlm_lock *lock,
                          void *data, int flag);
  
  /* dir.c */
-ssize_t llu_iop_filldirentries(struct inode *ino, _SYSIO_OFF_T *basep, 
+ssize_t llu_iop_filldirentries(struct inode *ino, _SYSIO_OFF_T *basep,
                                 char *buf, size_t nbytes);
  
  /* liblustre/llite_fid.c*/
-unsigned long llu_fid_build_ino(struct llu_sb_info *sbi, 
+unsigned long llu_fid_build_ino(struct llu_sb_info *sbi,
                                  struct lu_fid *fid);
  
  /* ext2 related */
@@ -306,6 +309,8 @@ static inline struct ext2_dirent *ext2_next_entry(struct ext2_dirent *p)
          return (struct ext2_dirent*)((char*) p + le16_to_cpu(p->rec_len));
  }
  
+int llu_merge_lvb(struct inode *inode);
+
  static inline void inode_init_lvb(struct inode *inode, struct ost_lvb *lvb)
  {
          struct intnl_stat *st = llu_i2stat(inode);
@@ -316,4 +321,91 @@ static inline void inode_init_lvb(struct inode *inode, struct ost_lvb *lvb)
          lvb->lvb_ctime = st->st_ctime;
  }
  
+/*
+ * Byte size of an llu_io_group followed by \a x per-page records, each made
+ * of a struct ll_async_page, a cfs_page_t and llap_cookie_size bytes.
+ * NOTE(review): struct llu_io_group below has no per-page members any more
+ * -- this macro looks like a leftover; confirm it still has users.
+ */
+#define LLU_IO_GROUP_SIZE(x) \
+        (sizeof(struct llu_io_group) + \
+         (sizeof(struct ll_async_page) + \
+          sizeof(cfs_page_t) + \
+          llap_cookie_size) * (x))
+
+/*
+ * Byte size of an llu_io_session followed by 2 * \a x pointer-sized slots
+ * for its trailing lis_groups[] array.
+ */
+#define LLU_IO_SESSION_SIZE(x)  \
+        (sizeof(struct llu_io_session) + (x) * 2 * sizeof(void *))
+
+/*
+ * One io session: a variable-length list of io groups for one inode.
+ * Allocate with LLU_IO_SESSION_SIZE() so that lis_groups[] has room.
+ */
+struct llu_io_session {
+        struct inode           *lis_inode;
+        int                     lis_cmd;
+        /* presumably the capacity of lis_groups[] -- TODO confirm */
+        int                     lis_max_groups;
+        /* number of lis_groups[] entries in use */
+        int                     lis_ngroups;
+        /* trailing array of groups; must stay the last member */
+        struct llu_io_group    *lis_groups[0];
+};
+
+/* Bookkeeping for one group of page io within an llu_io_session. */
+struct llu_io_group
+{
+        /* rw parameters the group was created with; not owned by the group */
+        struct lustre_rw_params *lig_params;
+        /* status of the group's io */
+        int                     lig_rc;
+        /* running total of bytes queued for the group */
+        __u64                   lig_rwcount;
+};
+
+struct llu_io_session;
+void put_io_group(struct llu_io_group *group);
+
+int cl_sb_init(struct llu_sb_info *sbi);
+int cl_sb_fini(struct llu_sb_info *sbi);
+int cl_inode_init(struct inode *inode, struct lustre_md *md);
+void cl_inode_fini(struct inode *inode);
+
+void llu_io_init(struct cl_io *io, struct inode *inode, int write);
+
+/* Per-io state of the slp layer: the io session the io belongs to. */
+struct slp_io {
+        struct llu_io_session *sio_session;
+};
+
+/* Value stored under slp_session_key in an environment's session context. */
+struct slp_session {
+        struct slp_io ss_ios;
+};
+
+/*
+ * Look up the slp_session registered under slp_session_key in the session
+ * context of \a env. Asserts that the lookup succeeded, so the result is
+ * never NULL.
+ */
+static inline struct slp_session *slp_env_session(const struct lu_env *env)
+{
+        extern struct lu_context_key slp_session_key;
+        struct slp_session *ses;
+        ses = lu_context_key_get(env->le_ses, &slp_session_key);
+        LASSERT(ses != NULL);
+        return ses;
+}
+/* Return the slp_io embedded in the slp_session of \a env. */
+static inline struct slp_io *slp_env_io(const struct lu_env *env)
+{
+        return &slp_env_session(env)->ss_ios;
+}
+
+/*
+ * lclient compat stuff: cl_* names mapped onto their liblustre (llu_*)
+ * counterparts. Inode mode and size are read from, and size is written to,
+ * the intnl_stat returned by llu_i2stat(). Both size writers do the same
+ * plain assignment.
+ */
+#define cl_inode_info llu_inode_info
+#define cl_i2info(info) llu_i2info(info)
+#define cl_inode_mode(inode) (llu_i2stat(inode)->st_mode)
+#define cl_i2sbi llu_i2sbi
+#define cl_isize_read(inode) (llu_i2stat(inode)->st_size)
+#define cl_isize_write(inode,kms) do{llu_i2stat(inode)->st_size = kms;}while(0)
+#define cl_isize_write_nolock(inode,kms) do{llu_i2stat(inode)->st_size = kms;}while(0)
+
+/* Size lock hook of the lclient compat layer; a no-op here. */
+static inline void cl_isize_lock(struct inode *inode, int lsmlock)
+{
+}
+
+/* Size unlock hook of the lclient compat layer; a no-op here. */
+static inline void cl_isize_unlock(struct inode *inode, int lsmlock)
+{
+}
+
+/* lclient compat wrapper: forwards to llu_merge_lvb() and returns its result. */
+static inline int cl_merge_lvb(struct inode *inode)
+{
+        return llu_merge_lvb(inode);
+}
+
+/* Inode timestamps, taken from the intnl_stat returned by llu_i2stat(). */
+#define cl_inode_atime(inode) (llu_i2stat(inode)->st_atime)
+#define cl_inode_ctime(inode) (llu_i2stat(inode)->st_ctime)
+#define cl_inode_mtime(inode) (llu_i2stat(inode)->st_mtime)
+
+/* Capability lookup of the lclient compat layer; always returns NULL here. */
+static inline struct obd_capa *cl_capa_lookup(struct inode *inode,
+                                              enum cl_req_type crt)
+{
+        return NULL;
+}
+
  #endif
diff --git a/lustre/liblustre/namei.c b/lustre/liblustre/namei.c

index 2857baf..cfd7100 100644 (file)
--- a/lustre/liblustre/namei.c
+++ b/lustre/liblustre/namei.c
@@ -212,7 +212,7 @@ static int pnode_revalidate_finish(struct ptlrpc_request *req,
          if (rc)
                  RETURN(rc);
  
-        llu_update_inode(inode, md.body, md.lsm);
+        llu_update_inode(inode, &md);
  
          RETURN(rc);
  }
@@ -381,7 +381,7 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset,
                          /* bug 2334: drop MDS lock before acquiring OST lock */
                          ll_intent_drop_lock(it);
  
-                        rc = llu_glimpse_size(inode);
+                        rc = cl_glimpse_size(inode);
                          if (rc) {
                                  I_RELE(inode);
                                  RETURN(rc);
diff --git a/lustre/liblustre/rw.c b/lustre/liblustre/rw.c

index 53bda1f..21d8e42 100644 (file)
--- a/lustre/liblustre/rw.c
+++ b/lustre/liblustre/rw.c
@@ -65,37 +65,6 @@
  
  #include "llite_lib.h"
  
-struct llu_io_group
-{
-        struct obd_io_group    *lig_oig;
-        struct inode           *lig_inode;
-        struct lustre_rw_params *lig_params;
-        int                     lig_maxpages;
-        int                     lig_npages;
-        __u64                   lig_rwcount;
-        struct ll_async_page   *lig_llaps;
-        cfs_page_t             *lig_pages;
-        void                   *lig_llap_cookies;
-};
-
-#define LLU_IO_GROUP_SIZE(x) \
-        (sizeof(struct llu_io_group) + \
-         (sizeof(struct ll_async_page) + \
-          sizeof(cfs_page_t) + \
-          llap_cookie_size) * (x))
-
-struct llu_io_session
-{
-        struct inode           *lis_inode;
-        int                     lis_cmd;
-        int                     lis_max_groups;
-        int                     lis_ngroups;
-        struct llu_io_group    *lis_groups[0];
-};
-#define LLU_IO_SESSION_SIZE(x)  \
-        (sizeof(struct llu_io_session) + (x) * 2 * sizeof(void *))
-
-
  typedef ssize_t llu_file_piov_t(const struct iovec *iovec, int iovlen,
                                  _SYSIO_OFF_T pos, ssize_t len,
                                  void *private);
@@ -177,7 +146,7 @@ int llu_extent_lock_cancel_cb(struct ldlm_lock *lock,
                  if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
                          LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
                                     lsm->lsm_oinfo[stripe]->loi_kms, kms);
-                lsm->lsm_oinfo[stripe]->loi_kms = kms;
+                loi_kms_set(lsm->lsm_oinfo[stripe], kms);
  iput:
                  I_RELE(inode);
                  break;
@@ -222,7 +191,7 @@ static int llu_glimpse_callback(struct ldlm_lock *lock, void *reqp)
          lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB);
          lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
  
-        LDLM_DEBUG(lock, "i_size: "LPU64" -> stripe number %u -> kms "LPU64,
+        LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64,
                     (__u64)llu_i2stat(inode)->st_size, stripe,lvb->lvb_size);
   iput:
          I_RELE(inode);
@@ -236,7 +205,7 @@ static int llu_glimpse_callback(struct ldlm_lock *lock, void *reqp)
          return rc;
  }
  
-static int llu_merge_lvb(struct inode *inode)
+int llu_merge_lvb(struct inode *inode)
  {
          struct llu_inode_info *lli = llu_i2info(inode);
          struct llu_sb_info *sbi = llu_i2sbi(inode);
@@ -259,81 +228,6 @@ static int llu_merge_lvb(struct inode *inode)
          RETURN(rc);
  }
  
-int llu_local_size(struct inode *inode)
-{
-        ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
-        struct llu_inode_info *lli = llu_i2info(inode);
-        struct llu_sb_info *sbi = llu_i2sbi(inode);
-        struct lustre_handle lockh = { 0 };
-        int flags = 0;
-        int rc;
-        ENTRY;
-
-        if (lli->lli_smd->lsm_stripe_count == 0)
-                RETURN(0);
-        
-        rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
-                       &policy, LCK_PR, &flags, inode, &lockh);
-        if (rc < 0)
-                RETURN(rc);
-        else if (rc == 0)
-                RETURN(-ENODATA);
-        
-        rc = llu_merge_lvb(inode);
-        obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR, &lockh);
-        RETURN(rc);
-}
-
-/* NB: lov_merge_size will prefer locally cached writes if they extend the
- * file (because it prefers KMS over RSS when larger) */
-int llu_glimpse_size(struct inode *inode)
-{
-        struct llu_inode_info *lli = llu_i2info(inode);
-        struct intnl_stat *st = llu_i2stat(inode);
-        struct llu_sb_info *sbi = llu_i2sbi(inode);
-        struct lustre_handle lockh = { 0 };
-        struct ldlm_enqueue_info einfo = { 0 };
-        struct obd_info oinfo = { { { 0 } } };
-        int rc;
-        ENTRY;
-
-        /* If size is cached on the mds, skip glimpse. */
-        if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
-                RETURN(0);
-
-        CDEBUG(D_DLMTRACE, "Glimpsing inode "LPU64"\n", (__u64)st->st_ino);
-
-        if (!lli->lli_smd) {
-                CDEBUG(D_DLMTRACE, "No objects for inode "LPU64"\n", 
-                       (__u64)st->st_ino);
-                RETURN(0);
-        }
-
-        einfo.ei_type = LDLM_EXTENT;
-        einfo.ei_mode = LCK_PR;
-        einfo.ei_cb_bl = osc_extent_blocking_cb;
-        einfo.ei_cb_cp = ldlm_completion_ast;
-        einfo.ei_cb_gl = llu_glimpse_callback;
-        einfo.ei_cbdata = inode;
-
-        oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
-        oinfo.oi_lockh = &lockh;
-        oinfo.oi_md = lli->lli_smd;
-        oinfo.oi_flags = LDLM_FL_HAS_INTENT;
-
-        rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
-        if (rc) {
-                CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
-                RETURN(rc > 0 ? -EIO : rc);
-        }
-
-        rc = llu_merge_lvb(inode);
-        CDEBUG(D_DLMTRACE, "glimpse: size: "LPU64", blocks: "LPU64"\n",
-               (__u64)st->st_size, (__u64)st->st_blocks);
-
-        RETURN(rc);
-}
-
  int llu_extent_lock(struct ll_file_data *fd, struct inode *inode,
                      struct lov_stripe_md *lsm, int mode,
                      ldlm_policy_data_t *policy, struct lustre_handle *lockh,
@@ -356,12 +250,12 @@ int llu_extent_lock(struct ll_file_data *fd, struct inode *inode,
                  RETURN(0);
  
          CDEBUG(D_DLMTRACE, "Locking inode %llu, start "LPU64" end "LPU64"\n",
-               (unsigned long long)st->st_ino, policy->l_extent.start,
+               (__u64)st->st_ino, policy->l_extent.start,
                 policy->l_extent.end);
  
          einfo.ei_type = LDLM_EXTENT;
          einfo.ei_mode = mode;
-        einfo.ei_cb_bl = osc_extent_blocking_cb;
+        einfo.ei_cb_bl = llu_extent_lock_cancel_cb;
          einfo.ei_cb_cp = ldlm_completion_ast;
          einfo.ei_cb_gl = llu_glimpse_callback;
          einfo.ei_cbdata = inode;
@@ -411,278 +305,6 @@ int llu_extent_unlock(struct ll_file_data *fd, struct inode *inode,
          RETURN(rc);
  }
  
-#define LLAP_MAGIC 12346789
-
-struct ll_async_page {
-        int             llap_magic;
-        void           *llap_cookie;
-        int             llap_queued;
-        cfs_page_t     *llap_page;
-        struct inode   *llap_inode;
-};
-
-static inline struct ll_async_page *llap_from_cookie(void *ptr)
-{
-        struct ll_async_page *ap = ptr;
-        LASSERT(ap->llap_magic == LLAP_MAGIC);
-        return ap;
-}
-
-static void llu_ap_fill_obdo(void *data, int cmd, struct obdo *oa)
-{
-        struct ll_async_page *llap;
-        struct inode *inode;
-        struct lov_stripe_md *lsm;
-        obd_flag valid_flags;
-        ENTRY;
-
-        llap = llap_from_cookie(data);
-        inode = llap->llap_inode;
-        lsm = llu_i2info(inode)->lli_smd;
-
-        oa->o_id = lsm->lsm_object_id;
-        oa->o_valid = OBD_MD_FLID;
-        valid_flags = OBD_MD_FLTYPE | OBD_MD_FLATIME;
-        if (cmd & OBD_BRW_WRITE)
-                valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME |
-                        OBD_MD_FLUID | OBD_MD_FLGID |
-                        OBD_MD_FLFID | OBD_MD_FLGENER;
-
-        obdo_from_inode(oa, inode, valid_flags);
-        EXIT;
-}
-
-static void llu_ap_update_obdo(void *data, int cmd, struct obdo *oa,
-                               obd_valid valid)
-{
-        struct ll_async_page *llap;
-        ENTRY;
-
-        llap = llap_from_cookie(data);
-        obdo_from_inode(oa, llap->llap_inode, valid);
-
-        EXIT;
-}
-
-/* called for each page in a completed rpc.*/
-static int llu_ap_completion(void *data, int cmd, struct obdo *oa, int rc)
-{
-        struct ll_async_page *llap;
-        cfs_page_t *page;
-        ENTRY;
-
-        llap = llap_from_cookie(data);
-        llap->llap_queued = 0;
-        page = llap->llap_page;
-
-        if (rc != 0) {
-                if (cmd & OBD_BRW_WRITE)
-                        CERROR("writeback error on page %p index %ld: %d\n",
-                               page, page->index, rc);
-        }
-        RETURN(0);
-}
-
-static struct obd_capa * llu_ap_lookup_capa(void *data, int cmd)
-{
-        return NULL;
-}
-
-static struct obd_async_page_ops llu_async_page_ops = {
-        .ap_make_ready =        NULL,
-        .ap_refresh_count =     NULL,
-        .ap_fill_obdo =         llu_ap_fill_obdo,
-        .ap_update_obdo =       llu_ap_update_obdo,
-        .ap_completion =        llu_ap_completion,
-        .ap_lookup_capa =       llu_ap_lookup_capa,
-};
-
-static int llu_queue_pio(int cmd, struct llu_io_group *group,
-                         char *buf, size_t count, loff_t pos)
-{
-        struct llu_inode_info *lli = llu_i2info(group->lig_inode);
-        struct intnl_stat *st = llu_i2stat(group->lig_inode);
-        struct lov_stripe_md *lsm = lli->lli_smd;
-        struct obd_export *exp = llu_i2obdexp(group->lig_inode);
-        cfs_page_t *pages = &group->lig_pages[group->lig_npages],*page = pages;
-        struct ll_async_page *llap = &group->lig_llaps[group->lig_npages];
-        void *llap_cookie = group->lig_llap_cookies +
-                llap_cookie_size * group->lig_npages;
-        int i, rc, npages = 0, ret_bytes = 0;
-        int local_lock;
-        ENTRY;
-
-        if (!exp)
-                RETURN(-EINVAL);
-
-        local_lock = group->lig_params->lrp_lock_mode != LCK_NL;
-        /* prepare the pages array */
-       do {
-                unsigned long index, offset, bytes;
-
-                offset = (pos & ~CFS_PAGE_MASK);
-                index = pos >> CFS_PAGE_SHIFT;
-                bytes = CFS_PAGE_SIZE - offset;
-                if (bytes > count)
-                        bytes = count;
-
-                /* prevent read beyond file range */
-                if (/* local_lock && */
-                    cmd == OBD_BRW_READ && pos + bytes >= st->st_size) {
-                        if (pos >= st->st_size)
-                                break;
-                        bytes = st->st_size - pos;
-                }
-
-                /* prepare page for this index */
-                page->index = index;
-                page->addr = buf - offset;
-
-                page->_offset = offset;
-                page->_count = bytes;
-
-                page++;
-                npages++;
-                count -= bytes;
-                pos += bytes;
-                buf += bytes;
-
-                group->lig_rwcount += bytes;
-                ret_bytes += bytes;
-        } while (count);
-
-        group->lig_npages += npages;
-
-        for (i = 0, page = pages; i < npages;
-             i++, page++, llap++, llap_cookie += llap_cookie_size){
-                llap->llap_magic = LLAP_MAGIC;
-                llap->llap_cookie = llap_cookie;
-                rc = obd_prep_async_page(exp, lsm, NULL, page,
-                                         (obd_off)page->index << CFS_PAGE_SHIFT,
-                                         &llu_async_page_ops,
-                                         llap, &llap->llap_cookie,
-                                         1 /* no cache in liblustre at all */,
-                                         NULL);
-                if (rc) {
-                        LASSERT(rc < 0);
-                        llap->llap_cookie = NULL;
-                        RETURN(rc);
-                }
-
-                CDEBUG(D_CACHE, "llap %p page %p group %p obj off "LPU64"\n",
-                       llap, page, llap->llap_cookie,
-                       (obd_off)pages->index << CFS_PAGE_SHIFT);
-                page->private = (unsigned long)llap;
-                llap->llap_page = page;
-                llap->llap_inode = group->lig_inode;
-
-                rc = obd_queue_group_io(exp, lsm, NULL, group->lig_oig,
-                                        llap->llap_cookie, cmd,
-                                        page->_offset, page->_count,
-                                        group->lig_params->lrp_brw_flags,
-                                        ASYNC_READY | ASYNC_URGENT |
-                                        ASYNC_COUNT_STABLE | ASYNC_GROUP_SYNC);
-                if (!local_lock && cmd == OBD_BRW_READ) {
-                        /*
-                         * In OST-side locking case short reads cannot be
-                         * detected properly.
-                         *
-                         * The root of the problem is that
-                         *
-                         * kms = lov_merge_size(lsm, 1);
-                         * if (end >= kms)
-                         *         glimpse_size(inode);
-                         * else
-                         *         st->st_size = kms;
-                         *
-                         * logic in the read code (both llite and liblustre)
-                         * only works correctly when client holds DLM lock on
-                         * [start, end]. Without DLM lock KMS can be
-                         * completely out of date, and client can either make
-                         * spurious short-read (missing concurrent write), or
-                         * return stale data (missing concurrent
-                         * truncate). For llite client this is fatal, because
-                         * incorrect data are cached and can be later sent
-                         * back to the server (vide bug 5047). This is hard to
-                         * fix by handling short-reads on the server, as there
-                         * is no easy way to communicate file size (or amount
-                         * of bytes read/written) back to the client,
-                         * _especially_ because OSC pages can be sliced and
-                         * dices into multiple RPCs arbitrary. Fortunately,
-                         * liblustre doesn't cache data and the worst case is
-                         * that we get race with concurrent write or truncate.
-                         */
-                }
-                if (rc) {
-                        LASSERT(rc < 0);
-                        RETURN(rc);
-                }
-
-                llap->llap_queued = 1;
-        }
-
-        RETURN(ret_bytes);
-}
-
-static
-struct llu_io_group * get_io_group(struct inode *inode, int maxpages,
-                                   struct lustre_rw_params *params)
-{
-        struct llu_io_group *group;
-        int rc;
-
-        if (!llap_cookie_size)
-                llap_cookie_size = obd_prep_async_page(llu_i2obdexp(inode),
-                                                       NULL, NULL, NULL, 0,
-                                                       NULL, NULL, NULL, 0,
-                                                       NULL);
-
-        OBD_ALLOC(group, LLU_IO_GROUP_SIZE(maxpages));
-        if (!group)
-                return ERR_PTR(-ENOMEM);
-
-        I_REF(inode);
-        group->lig_inode = inode;
-        group->lig_maxpages = maxpages;
-        group->lig_params = params;
-        group->lig_llaps = (struct ll_async_page *)(group + 1);
-        group->lig_pages = (cfs_page_t *)(&group->lig_llaps[maxpages]);
-        group->lig_llap_cookies = (void *)(&group->lig_pages[maxpages]);
-
-        rc = oig_init(&group->lig_oig);
-        if (rc) {
-                OBD_FREE(group, LLU_IO_GROUP_SIZE(maxpages));
-                return ERR_PTR(rc);
-        }
-
-        return group;
-}
-
-static int max_io_pages(ssize_t len, int iovlen)
-{
-        return (((len + CFS_PAGE_SIZE -1) / CFS_PAGE_SIZE) + 2 + iovlen - 1);
-}
-
-static
-void put_io_group(struct llu_io_group *group)
-{
-        struct lov_stripe_md *lsm = llu_i2info(group->lig_inode)->lli_smd;
-        struct obd_export *exp = llu_i2obdexp(group->lig_inode);
-        struct ll_async_page *llap = group->lig_llaps;
-        int i;
-
-        for (i = 0; i < group->lig_npages; i++, llap++) {
-                if (llap->llap_cookie)
-                        obd_teardown_async_page(exp, lsm, NULL,
-                                                llap->llap_cookie);
-        }
-
-        I_RELE(group->lig_inode);
-
-        oig_release(group->lig_oig);
-        OBD_FREE(group, LLU_IO_GROUP_SIZE(group->lig_maxpages));
-}
-
  static
  ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen,
                          _SYSIO_OFF_T pos, ssize_t len,
@@ -691,18 +313,11 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen,
          struct llu_io_session *session = (struct llu_io_session *) private;
          struct inode *inode = session->lis_inode;
          struct llu_inode_info *lli = llu_i2info(inode);
-        struct intnl_stat *st = llu_i2stat(inode);
-        struct ll_file_data *fd = lli->lli_file_data;
-        struct lustre_handle lockh = {0};
-        struct lov_stripe_md *lsm = lli->lli_smd;
-        struct obd_export *exp = NULL;
-        struct llu_io_group *iogroup;
-        struct lustre_rw_params p;
-        struct ost_lvb lvb;
-        __u64 kms;
-        int err, is_read, iovidx, ret;
-        int local_lock;
-        ssize_t ret_len = len;
+        int err;
+        struct lu_env *env;
+        struct cl_io  *io;
+        struct slp_io *sio;
+        int refcheck;
          ENTRY;
  
          /* in a large iov read/write we'll be repeatedly called.
@@ -710,126 +325,40 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen,
           */
          liblustre_wait_event(0);
  
-        exp = llu_i2obdexp(inode);
-        if (exp == NULL)
-                RETURN(-EINVAL);
-
          if (len == 0 || iovlen == 0)
                  RETURN(0);
  
          if (pos + len > lli->lli_maxbytes)
                  RETURN(-ERANGE);
  
-        lustre_build_lock_params(session->lis_cmd, lli->lli_open_flags,
-                                 lli->lli_sbi->ll_lco.lco_flags,
-                                 pos, len, &p);
-
-        iogroup = get_io_group(inode, max_io_pages(len, iovlen), &p);
-        if (IS_ERR(iogroup))
-                RETURN(PTR_ERR(iogroup));
-
-        local_lock = p.lrp_lock_mode != LCK_NL;
-
-        err = llu_extent_lock(fd, inode, lsm, p.lrp_lock_mode, &p.lrp_policy,
-                              &lockh, p.lrp_ast_flags);
-        if (err != ELDLM_OK)
-                GOTO(err_put, err);
-
-        is_read = (session->lis_cmd == OBD_BRW_READ);
-        if (is_read) {
-                /*
-                 * If OST-side locking is used, KMS can be completely out of
-                 * date, and, hence, cannot be used for short-read
-                 * detection. Rely in OST to handle short reads in that case.
-                 */
-                inode_init_lvb(inode, &lvb);
-                obd_merge_lvb(exp, lsm, &lvb, 1);
-                kms = lvb.lvb_size;
-                /* extent.end is last byte of the range */
-                if (p.lrp_policy.l_extent.end >= kms) {
-                        /* A glimpse is necessary to determine whether
-                         * we return a short read or some zeroes at
-                         * the end of the buffer
-                         *
-                         * In the case of OST-side locking KMS can be
-                         * completely out of date and short-reads maybe
-                         * mishandled. See llu_queue_pio() for more detailed
-                         * comment.
-                         */
-                        if ((err = llu_glimpse_size(inode))) {
-                                GOTO(err_unlock, err);
-                        }
-                } else {
-                        st->st_size = kms;
-                }
-        } else if (lli->lli_open_flags & O_APPEND) {
-                pos = st->st_size;
-        }
-
-        for (iovidx = 0; iovidx < iovlen; iovidx++) {
-                char *buf = (char *) iovec[iovidx].iov_base;
-                size_t count = iovec[iovidx].iov_len;
-
-                if (!count)
-                        continue;
-                if (len < count)
-                        count = len;
-                if (IS_BAD_PTR(buf) || IS_BAD_PTR(buf + count)) {
-                        GOTO(err_unlock, err = -EFAULT);
-                }
-
-                if (is_read) {
-                        if (/* local_lock && */ pos >= st->st_size)
-                                break;
-                } else {
-                        if (pos >= lli->lli_maxbytes) {
-                                GOTO(err_unlock, err = -EFBIG);
-                        }
-                        if (pos + count >= lli->lli_maxbytes)
-                                count = lli->lli_maxbytes - pos;
-                }
-
-                ret = llu_queue_pio(session->lis_cmd, iogroup, buf, count, pos);
-                if (ret < 0) {
-                        GOTO(err_unlock, err = ret);
-                } else {
-                        pos += ret;
-                        if (!is_read) {
-                                LASSERT(ret == count);
-                                obd_adjust_kms(exp, lsm, pos, 0);
-                                /* file size grow immediately */
-                                if (pos > st->st_size)
-                                        st->st_size = pos;
-                        }
-                        len -= ret;
-                        if (!len)
-                                break;
-                }
-        }
-        LASSERT(len == 0 || is_read); /* libsysio should guarantee this */
-
-        err = obd_trigger_group_io(exp, lsm, NULL, iogroup->lig_oig);
-        if (err)
-                GOTO(err_unlock, err);
-
-        err = oig_wait(iogroup->lig_oig);
-        if (err) {
-                CERROR("%s error: %s\n", is_read ? "read" : "write", strerror(-err));
-                GOTO(err_unlock, err);
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
+
+        io = &ccc_env_info(env)->cti_io;
+
+        if (cl_io_rw_init(env, io, session->lis_cmd == OBD_BRW_WRITE?CIT_WRITE:
+                                                                      CIT_READ,
+                          pos, len) == 0) {
+                struct ccc_io *cio;
+                sio = slp_env_io(env);
+                cio = ccc_env_io(env);
+                /* XXX this is not right: cio->cui_iov can be modified. */
+                cio->cui_iov = (struct iovec *)iovec;
+                cio->cui_nrsegs = iovlen;
+                sio->sio_session = session;
+                err = cl_io_loop(env, io);
+        } else {
+                /* XXX WTF? */
+                LBUG();
          }
+        cl_io_fini(env, io);
+        cl_env_put(env, &refcheck);
  
-        ret = llu_extent_unlock(fd, inode, lsm, p.lrp_lock_mode, &lockh);
-        if (ret)
-                CERROR("extent unlock error %d\n", ret);
+        if (err < 0)
+                RETURN(err);
  
-        session->lis_groups[session->lis_ngroups++] = iogroup;
-        RETURN(ret_len);
-
-err_unlock:
-        llu_extent_unlock(fd, inode, lsm, p.lrp_lock_mode, &lockh);
-err_put:
-        put_io_group(iogroup);
-        RETURN((ssize_t)err);
+        RETURN(len);
  }
  
  static
@@ -906,34 +435,91 @@ static int llu_file_rwx(struct inode *ino,
          RETURN(cc);
  }
  
+/*
+ * Zero @io and seed it from @inode: target object, O_NONBLOCK flag, lock
+ * request keyed on O_APPEND, and (only when @write is set) the append flag.
+ */
+void llu_io_init(struct cl_io *io, struct inode *inode, int write)
+{
+        struct llu_inode_info *lli = llu_i2info(inode);
+        int appending = lli->lli_open_flags & O_APPEND;
+
+        memset(io, 0, sizeof *io);
+
+        io->ci_obj = lli->lli_clob;
+        io->ci_lockreq = appending ? CILR_MANDATORY : CILR_NEVER;
+        io->u.ci_rw.crw_nonblock = lli->lli_open_flags & O_NONBLOCK;
+        if (write)
+                io->u.ci_wr.wr_append = appending;
+}
+
  int llu_iop_read(struct inode *ino,
                   struct ioctx *ioctx)
  {
-        /* BUG: 5972 */
          struct intnl_stat *st = llu_i2stat(ino);
+        struct lu_env *env;
+        struct cl_io  *io;
+        int refcheck;
+        int ret;
+
+        /* BUG: 5972 */
          st->st_atime = CURRENT_TIME;
  
-        return llu_file_rwx(ino, ioctx, 1);
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
+
+        io = &ccc_env_info(env)->cti_io;
+        llu_io_init(io, ino, 0);
+
+        ret = llu_file_rwx(ino, ioctx, 1);
+
+        cl_env_put(env, &refcheck);
+        return ret;
  }
  
  int llu_iop_write(struct inode *ino,
                    struct ioctx *ioctx)
  {
          struct intnl_stat *st = llu_i2stat(ino);
+        struct lu_env *env;
+        struct cl_io  *io;
+        int refcheck;
+        int ret;
+
          st->st_mtime = st->st_ctime = CURRENT_TIME;
  
-        return llu_file_rwx(ino, ioctx, 0);
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
+
+        io = &ccc_env_info(env)->cti_io;
+        llu_io_init(io, ino, 1);
+
+        ret = llu_file_rwx(ino, ioctx, 0);
+        cl_env_put(env, &refcheck);
+        return ret;
  }
  
  int llu_iop_iodone(struct ioctx *ioctx)
  {
          struct llu_io_session *session;
          struct llu_io_group *group;
-        int i, err = 0, rc = 0;
+        int i, rc = 0;
+        struct lu_env *env;
+        struct cl_io  *io;
+        int refcheck;
          ENTRY;
  
          liblustre_wait_event(0);
  
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
+
+        io = &ccc_env_info(env)->cti_io;
+        cl_io_fini(env, io);
+        cl_env_put(env, &refcheck);
          session = (struct llu_io_session *) ioctx->ioctx_private;
          LASSERT(session);
          LASSERT(!IS_ERR(session));
@@ -941,11 +527,8 @@ int llu_iop_iodone(struct ioctx *ioctx)
          for (i = 0; i < session->lis_ngroups; i++) {
                  group = session->lis_groups[i];
                  if (group) {
-                        if (!rc) {
-                                err = oig_wait(group->lig_oig);
-                                if (err)
-                                        rc = err;
-                        }
+                        if (!rc)
+                                rc = group->lig_rc;
                          if (!rc)
                                  ioctx->ioctx_cc += group->lig_rwcount;
                          put_io_group(group);
diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c

index 258de2e..ff23356 100644 (file)
--- a/lustre/liblustre/super.c
+++ b/lustre/liblustre/super.c
@@ -112,8 +112,7 @@ static void llu_fsop_gone(struct filesys *fs)
          ENTRY;
  
          list_del(&sbi->ll_conn_chain);
-        obd_unregister_lock_cancel_cb(sbi->ll_dt_exp,
-                                      llu_extent_lock_cancel_cb);
+        cl_sb_fini(sbi);
          obd_disconnect(sbi->ll_dt_exp);
          obd_disconnect(sbi->ll_md_exp);
  
@@ -146,15 +145,23 @@ static ldlm_mode_t llu_take_md_lock(struct inode *inode, __u64 bits,
          RETURN(rc);
  }
  
-void llu_update_inode(struct inode *inode, struct mdt_body *body,
-                      struct lov_stripe_md *lsm)
+void llu_update_inode(struct inode *inode, struct lustre_md *md)
  {
          struct llu_inode_info *lli = llu_i2info(inode);
+        struct mdt_body *body = md->body;
+        struct lov_stripe_md *lsm = md->lsm;
          struct intnl_stat *st = llu_i2stat(inode);
  
          LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0));
+
+        if (body->valid & OBD_MD_FLMODE)
+                st->st_mode = (st->st_mode & S_IFMT)|(body->mode & ~S_IFMT);
+        if (body->valid & OBD_MD_FLTYPE)
+                st->st_mode = (st->st_mode & ~S_IFMT)|(body->mode & S_IFMT);
+
          if (lsm != NULL) {
                  if (lli->lli_smd == NULL) {
+                        cl_inode_init(inode, md);
                          lli->lli_smd = lsm;
                          lli->lli_maxbytes = lsm->lsm_maxbytes;
                          if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES)
@@ -185,11 +192,7 @@ void llu_update_inode(struct inode *inode, struct mdt_body *body,
                  if (body->valid & OBD_MD_FLMTIME)
                          LTIME_S(st->st_mtime) = body->mtime;
          }
-        if (body->valid & OBD_MD_FLMODE)
-                st->st_mode = (st->st_mode & S_IFMT)|(body->mode & ~S_IFMT);
-        if (body->valid & OBD_MD_FLTYPE)
-                st->st_mode = (st->st_mode & ~S_IFMT)|(body->mode & S_IFMT);
-        if (S_ISREG(st->st_mode))
+       if (S_ISREG(st->st_mode))
                  st->st_blksize = min(2UL * PTLRPC_MAX_BRW_SIZE, LL_MAX_BLKSIZE);
          else
                  st->st_blksize = 4096;
@@ -204,13 +207,13 @@ void llu_update_inode(struct inode *inode, struct mdt_body *body,
          if (body->valid & OBD_MD_FLFLAGS)
                  lli->lli_st_flags = body->flags;
          if (body->valid & OBD_MD_FLSIZE) {
-                if ((llu_i2sbi(inode)->ll_lco.lco_flags & OBD_CONNECT_SOM) && 
+                if ((llu_i2sbi(inode)->ll_lco.lco_flags & OBD_CONNECT_SOM) &&
                      S_ISREG(st->st_mode) && lli->lli_smd) {
                          struct lustre_handle lockh;
                          ldlm_mode_t mode;
-                        
+
                          /* As it is possible a blocking ast has been processed
-                         * by this time, we need to check there is an UPDATE 
+                         * by this time, we need to check there is an UPDATE
                           * lock on the client and set LLIF_MDS_SIZE_LOCK holding
                           * it. */
                          mode = llu_take_md_lock(inode, MDS_INODELOCK_UPDATE,
@@ -223,7 +226,7 @@ void llu_update_inode(struct inode *inode, struct mdt_body *body,
                  } else {
                      st->st_size = body->size;
                  }
-                
+
                  if (body->valid & OBD_MD_FLBLOCKS)
                          st->st_blocks = body->blocks;
          }
@@ -503,7 +506,7 @@ static int llu_inode_revalidate(struct inode *inode)
                  }
  
  
-                llu_update_inode(inode, md.body, md.lsm);
+                llu_update_inode(inode, &md);
                  if (md.lsm != NULL && llu_i2info(inode)->lli_smd != md.lsm)
                          obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
                  ptlrpc_req_finished(req);
@@ -515,7 +518,7 @@ static int llu_inode_revalidate(struct inode *inode)
  
          /* ll_glimpse_size will prefer locally cached writes if they extend
           * the file */
-        RETURN(llu_glimpse_size(inode));
+        RETURN(cl_glimpse_size(inode));
  }
  
  static void copy_stat_buf(struct inode *ino, struct intnl_stat *b)
@@ -583,6 +586,8 @@ void llu_clear_inode(struct inode *inode)
                  obd_change_cbdata(sbi->ll_dt_exp, lli->lli_smd,
                                    null_if_equal, inode);
  
+        cl_inode_fini(inode);
+
          if (lli->lli_smd) {
                  obd_free_memmd(sbi->ll_dt_exp, &lli->lli_smd);
                  lli->lli_smd = NULL;
@@ -675,7 +680,7 @@ int llu_md_setattr(struct inode *inode, struct md_op_data *op_data,
           * to call vmtruncate in inode_setattr to update inode->i_size
           * (bug 6196) */
          inode_setattr(inode, &op_data->op_attr);
-        llu_update_inode(inode, md.body, md.lsm);
+        llu_update_inode(inode, &md);
          ptlrpc_req_finished(request);
  
          RETURN(rc);
@@ -729,7 +734,6 @@ static int llu_setattr_done_writing(struct inode *inode,
  int llu_setattr_raw(struct inode *inode, struct iattr *attr)
  {
          struct lov_stripe_md *lsm = llu_i2info(inode)->lli_smd;
-        struct llu_sb_info *sbi = llu_i2sbi(inode);
          struct intnl_stat *st = llu_i2stat(inode);
          int ia_valid = attr->ia_valid;
          struct md_op_data op_data = { { 0 } };
@@ -766,7 +770,7 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr)
          if ((attr->ia_valid & ATTR_CTIME) && !(attr->ia_valid & ATTR_MTIME)) {
                  /* To avoid stale mtime on mds, obtain it from ost and send
                     to mds. */
-                rc = llu_glimpse_size(inode);
+                rc = cl_glimpse_size(inode);
                  if (rc)
                          RETURN(rc);
  
@@ -833,71 +837,11 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr)
          }
  
          if (ia_valid & ATTR_SIZE) {
-                ldlm_policy_data_t policy = { .l_extent = {attr->ia_size,
-                                                           OBD_OBJECT_EOF} };
-                struct lustre_handle lockh = { 0, };
-                struct lustre_handle match_lockh = { 0, };
-
-                int err;
-                int flags = LDLM_FL_TEST_LOCK; /* for assertion check below */
-                int lock_mode;
-                obd_flag obd_flags;
-
-                /* check that there are no matching locks */
-                LASSERT(obd_match(sbi->ll_dt_exp, lsm, LDLM_EXTENT, &policy,
-                                  LCK_PW, &flags, inode, &match_lockh) <= 0);
-
-                /* XXX when we fix the AST intents to pass the discard-range
-                 * XXX extent, make ast_flags always LDLM_AST_DISCARD_DATA
-                 * XXX here. */
-                flags = (attr->ia_size == 0) ? LDLM_AST_DISCARD_DATA : 0;
-
-                if (sbi->ll_lco.lco_flags & OBD_CONNECT_TRUNCLOCK) {
-                        lock_mode = LCK_NL;
-                        obd_flags = OBD_FL_TRUNCLOCK;
-                        CDEBUG(D_INODE, "delegating locking to the OST");
-                } else {
-                        lock_mode = LCK_PW;
-                        obd_flags = 0;
-                }
-
-                /* with lock_mode == LK_NL no lock is taken. */
-                rc = llu_extent_lock(NULL, inode, lsm, lock_mode, &policy,
-                                     &lockh, flags);
-                if (rc != ELDLM_OK) {
-                        if (rc > 0)
-                                GOTO(out, rc = -ENOLCK);
-                        GOTO(out, rc);
-                }
-                rc = llu_vmtruncate(inode, attr->ia_size, obd_flags);
-
-                /* unlock now as we don't mind others file lockers racing with
-                 * the mds updates below? */
-                err = llu_extent_unlock(NULL, inode, lsm, lock_mode, &lockh);
-                if (err) {
-                        CERROR("llu_extent_unlock failed: %d\n", err);
-                        if (!rc)
-                                rc = err;
-                }
+                rc = cl_setattr_do_truncate(inode, attr->ia_size, NULL);
          } else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) {
-                struct obd_info oinfo = { { { 0 } } };
-                struct obdo oa;
-
-                CDEBUG(D_INODE, "set mtime on OST inode %llu to "CFS_TIME_T"\n",
-                       (long long)st->st_ino, LTIME_S(attr->ia_mtime));
-                oa.o_id = lsm->lsm_object_id;
-                oa.o_gr = lsm->lsm_object_gr;
-                oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
-
-                obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME |
-                                            OBD_MD_FLMTIME | OBD_MD_FLCTIME);
-
-                oinfo.oi_oa = &oa;
-                oinfo.oi_md = lsm;
-
-                rc = obd_setattr_rqset(sbi->ll_dt_exp, &oinfo, NULL);
-                if (rc)
-                        CERROR("obd_setattr_async fails: rc=%d\n", rc);
+                CDEBUG(D_INODE, "set mtime on OST inode %llu to %lu\n",
+                       (long long unsigned)st->st_ino, LTIME_S(attr->ia_mtime));
+                rc = cl_setattr_ost(inode, NULL);
          }
          EXIT;
  out:
@@ -976,7 +920,7 @@ static int llu_iop_symlink_raw(struct pnode *pno, const char *tgt)
          if (llu_i2stat(dir)->st_nlink >= EXT2_LINK_MAX)
                  RETURN(err);
  
-        llu_prep_md_op_data(&op_data, dir, NULL, name, len, 0, 
+        llu_prep_md_op_data(&op_data, dir, NULL, name, len, 0,
                              LUSTRE_OPC_SYMLINK);
  
          err = md_create(sbi->ll_md_exp, &op_data, tgt, strlen(tgt) + 1,
@@ -1135,7 +1079,7 @@ static int llu_iop_link_raw(struct pnode *old, struct pnode *new)
          LASSERT(dir);
  
          liblustre_wait_event(0);
-        llu_prep_md_op_data(&op_data, src, dir, name, namelen, 0, 
+        llu_prep_md_op_data(&op_data, src, dir, name, namelen, 0,
                              LUSTRE_OPC_ANY);
          rc = md_link(llu_i2sbi(src)->ll_md_exp, &op_data, &request);
          ptlrpc_req_finished(request);
@@ -1162,7 +1106,7 @@ static int llu_iop_unlink_raw(struct pnode *pno)
          LASSERT(target);
  
          liblustre_wait_event(0);
-        llu_prep_md_op_data(&op_data, dir, NULL, name, len, 0, 
+        llu_prep_md_op_data(&op_data, dir, NULL, name, len, 0,
                              LUSTRE_OPC_ANY);
          rc = md_unlink(llu_i2sbi(dir)->ll_md_exp, &op_data, &request);
          if (!rc)
@@ -1190,7 +1134,7 @@ static int llu_iop_rename_raw(struct pnode *old, struct pnode *new)
          LASSERT(tgt);
  
          liblustre_wait_event(0);
-        llu_prep_md_op_data(&op_data, src, tgt, NULL, 0, 0, 
+        llu_prep_md_op_data(&op_data, src, tgt, NULL, 0, 0,
                              LUSTRE_OPC_ANY);
          rc = md_rename(llu_i2sbi(src)->ll_md_exp, &op_data,
                         oldname, oldnamelen, newname, newnamelen,
@@ -1337,7 +1281,7 @@ static int llu_iop_mkdir_raw(struct pnode *pno, mode_t mode)
          if (st->st_nlink >= EXT2_LINK_MAX)
                  RETURN(err);
  
-        llu_prep_md_op_data(&op_data, dir, NULL, name, len, 0, 
+        llu_prep_md_op_data(&op_data, dir, NULL, name, len, 0,
                              LUSTRE_OPC_MKDIR);
  
          err = md_create(llu_i2sbi(dir)->ll_md_exp, &op_data, NULL, 0,
@@ -1364,7 +1308,7 @@ static int llu_iop_rmdir_raw(struct pnode *pno)
                 (long long)llu_i2stat(dir)->st_ino,
                 llu_i2info(dir)->lli_st_generation, dir);
  
-        llu_prep_md_op_data(&op_data, dir, NULL, name, len, S_IFDIR, 
+        llu_prep_md_op_data(&op_data, dir, NULL, name, len, S_IFDIR,
                              LUSTRE_OPC_ANY);
          rc = md_unlink(llu_i2sbi(dir)->ll_md_exp, &op_data, &request);
          ptlrpc_req_finished(request);
@@ -1466,9 +1410,9 @@ static int llu_file_flock(struct inode *ino,
  
                  if (lmv->desc.ld_tgt_count < 1)
                          RETURN(rc = -ENODEV);
-                
+
                  if (lmv->tgts[0].ltd_exp != NULL)
-                        rc = ldlm_cli_enqueue(lmv->tgts[0].ltd_exp, NULL, &einfo, &res_id, 
+                        rc = ldlm_cli_enqueue(lmv->tgts[0].ltd_exp, NULL, &einfo, &res_id,
                                                &flock, &flags, NULL, 0, NULL, &lockh, 0);
                  else
                          rc = -ENODEV;
@@ -1720,7 +1664,7 @@ static int llu_lov_dir_setstripe(struct inode *ino, unsigned long arg)
          struct lov_user_md lum, *lump = (struct lov_user_md *)arg;
          int rc = 0;
  
-        llu_prep_md_op_data(&op_data, ino, NULL, NULL, 0, 0, 
+        llu_prep_md_op_data(&op_data, ino, NULL, NULL, 0, 0,
                              LUSTRE_OPC_ANY);
  
          LASSERT(sizeof(lum) == sizeof(*lump));
@@ -1826,7 +1770,6 @@ static int llu_lov_setstripe_ea_info(struct inode *ino, int flags,
          if (rc)
                  GOTO(out, rc);
  
-        llu_update_inode(ino, md.body, md.lsm);
          lli->lli_smd = lli2->lli_smd;
          lli2->lli_smd = NULL;
  
@@ -1842,6 +1785,8 @@ static int llu_lov_setstripe_ea_info(struct inode *ino, int flags,
          rc = llu_file_release(ino);
   out:
          ino->i_private = lli;
+        if (!rc)
+                llu_update_inode(ino, &md);
          if (lli2)
                  OBD_FREE(lli2, sizeof(struct llu_inode_info));
          if (req != NULL)
@@ -1965,14 +1910,14 @@ struct inode *llu_iget(struct filesys *fs, struct lustre_md *md)
                          I_RELE(inode);
                  }
                  else {
-                        llu_update_inode(inode, md->body, md->lsm);
+                        llu_update_inode(inode, md);
                          return inode;
                  }
          }
  
          inode = llu_new_inode(fs, &fid);
          if (inode)
-                llu_update_inode(inode, md->body, md->lsm);
+                llu_update_inode(inode, md);
  
          return inode;
  }
@@ -2153,8 +2098,6 @@ llu_fsswop_mount(const char *source,
          sbi->ll_dt_exp = class_conn2export(&dt_conn);
          sbi->ll_lco.lco_flags = ocd.ocd_connect_flags;
  
-        err = obd_register_lock_cancel_cb(sbi->ll_dt_exp,
-                                          llu_extent_lock_cancel_cb);
          if (err) {
                  CERROR("cannot register lock cancel callback: rc = %d\n", err);
                  GOTO(out_dt, err);
@@ -2212,6 +2155,8 @@ llu_fsswop_mount(const char *source,
                  goto out_inode;
          }
  
+        cl_sb_init(sbi);
+
          ptlrpc_req_finished(request);
  
          CDEBUG(D_SUPER, "LibLustre: %s mounted successfully!\n", source);
@@ -2224,8 +2169,6 @@ out_inode:
  out_request:
          ptlrpc_req_finished(request);
  out_lock_cn_cb:
-        obd_unregister_lock_cancel_cb(sbi->ll_dt_exp,
-                                      llu_extent_lock_cancel_cb);
  out_dt:
          obd_disconnect(sbi->ll_dt_exp);
  out_md:
diff --git a/lustre/liblustre/tests/sanity.c b/lustre/liblustre/tests/sanity.c

index acb2385..070e3c6 100644 (file)
--- a/lustre/liblustre/tests/sanity.c
+++ b/lustre/liblustre/tests/sanity.c
@@ -1102,8 +1102,8 @@ int t52(char *name)
                          close(fd);
                          t_unlink(file);
                          return -1;
-                }       
-                atime = statbuf.st_atime; 
+                }
+                atime = statbuf.st_atime;
          }
          close(fd);
          t_unlink(file);
@@ -1117,26 +1117,26 @@ int t53(char *name)
          struct utimbuf times;   /* struct. buffer for utime() */
          struct stat stat_buf;   /* struct buffer to hold file info. */
          time_t mtime, atime;
- 
+
          ENTER("mtime/atime should be updated by utime() call");
          snprintf(file, MAX_PATH_LENGTH, "%s/test_t53_file", lustre_path);
  
          t_echo_create(file, "check mtime/atime update by utime() call");
- 
+
          /* Initialize the modification and access time in the times arg */
          times.actime = NEW_TIME+10;
          times.modtime = NEW_TIME;
- 
+
          /* file modification/access time */
          utime(file, &times);
- 
+
          if (stat(file, &stat_buf) < 0) {
                  printf("stat(2) of %s failed, error:%d %s\n",
-                        file, errno, strerror(errno)); 
+                        file, errno, strerror(errno));
          }
          mtime = stat_buf.st_mtime;
          atime = stat_buf.st_atime;
- 
+
          if ((mtime == NEW_TIME) && (atime == NEW_TIME + 10)) {
                  t_unlink(file);
                  LEAVE();
@@ -1144,7 +1144,7 @@ int t53(char *name)
  
          printf("mod time %ld, expected %ld\n", mtime, (long)NEW_TIME);
          printf("acc time %ld, expected %ld\n", atime, (long)NEW_TIME + 10);
- 
+
          t_unlink(file);
          return (-1);
  }
@@ -1170,7 +1170,7 @@ int t54(char *name)
          lock.l_whence = 0;
          lock.l_len    = 1;
          if ((err = t_fcntl(fd, F_SETLKW, &lock)) != 0) {
-                fprintf(stderr, "fcntl returned: %d (%s)\n", 
+                fprintf(stderr, "fcntl returned: %d (%s)\n",
                          err, strerror(err));
                  close(fd);
                  t_unlink(file);
@@ -1203,7 +1203,7 @@ int t55(char *name)
          ENTER("setstripe/getstripe");
          snprintf(path, MAX_PATH_LENGTH, "%s/test_t55", lustre_path);
          snprintf(file, MAX_PATH_LENGTH, "%s/test_t55/file_t55", lustre_path);
-      
+
          buflen = sizeof(struct lov_user_md);
          buflen += STRIPE_COUNT * sizeof(struct lov_user_ost_data);
          lum = (struct lov_user_md *)malloc(buflen);
@@ -1232,7 +1232,7 @@ int t55(char *name)
                  free(lum);
                  return -1;
          }
-        
+
          lum->lmm_magic = LOV_USER_MAGIC;
          lum->lmm_stripe_count = STRIPE_COUNT;
          rc = ioctl(fd, LL_IOC_LOV_GETSTRIPE, lum);
@@ -1255,7 +1255,7 @@ int t55(char *name)
                  printf("lmm_stripe_count:   %u\n", (int)lum->lmm_stripe_count);
                  printf("lmm_stripe_size:    %u\n",      lum->lmm_stripe_size);
                  printf("lmm_stripe_pattern: %x\n",      lum->lmm_pattern);
-        
+
                  for (index = 0; index < lum->lmm_stripe_count; index++) {
                          lo = lum->lmm_objects + index;
                          printf("object %d:\n", index);
@@ -1292,7 +1292,7 @@ int t55(char *name)
          }
          fd = open(file, O_RDWR, 0644);
          if (fd < 0) {
-                printf("failed to open(%s): rc = %d (%s)\n", 
+                printf("failed to open(%s): rc = %d (%s)\n",
                         file, fd, strerror(errno));
                  t_unlink(file);
                  t_rmdir(path);
@@ -1321,7 +1321,7 @@ int t55(char *name)
                  printf("lmm_stripe_count:   %u\n", (int)lum->lmm_stripe_count);
                  printf("lmm_stripe_size:    %u\n",      lum->lmm_stripe_size);
                  printf("lmm_stripe_pattern: %x\n",      lum->lmm_pattern);
-        
+
                  for (index = 0; index < lum->lmm_stripe_count; index++) {
                          lo = lum->lmm_objects + index;
                          printf("object %d:\n", index);
diff --git a/lustre/llite/Makefile.in b/lustre/llite/Makefile.in

index 8d02c85..b06f901 100644 (file)
--- a/lustre/llite/Makefile.in
+++ b/lustre/llite/Makefile.in
@@ -3,6 +3,8 @@ lustre-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o
  lustre-objs += llite_fid.o rw.o lproc_llite.o namei.o symlink.o llite_mmap.o
  lustre-objs += xattr.o remote_perm.o llite_rmtacl.o llite_capa.o
  lustre-objs += rw26.o super25.o statahead.o
+lustre-objs += ../lclient/glimpse.o ../lclient/lcommon_cl.o
+lustre-objs += vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o
  
  llite_lloop-objs := lloop.o
  
diff --git a/lustre/llite/autoMakefile.am b/lustre/llite/autoMakefile.am

index 2473676..d5d1c10 100644 (file)
--- a/lustre/llite/autoMakefile.am
+++ b/lustre/llite/autoMakefile.am
@@ -40,4 +40,5 @@ endif
  
  DIST_SOURCES := $(lustre-objs:.o=.c) llite_internal.h rw26.c super25.c 
  DIST_SOURCES += $(llite_lloop-objs:.o=.c)
+DIST_SOURCES += vvp_internal.h
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
diff --git a/lustre/llite/dcache.c b/lustre/llite/dcache.c

index 2627a45..95e0694 100644 (file)
--- a/lustre/llite/dcache.c
+++ b/lustre/llite/dcache.c
@@ -216,7 +216,7 @@ int ll_drop_dentry(struct dentry *dentry)
                  spin_lock(&dcache_lock);
                  return 1;
          }
-        /* disconected dentry can not be find without lookup, because we 
+        /* disconected dentry can not be find without lookup, because we
           * not need his to unhash or mark invalid. */
          if (dentry->d_flags & DCACHE_DISCONNECTED) {
                  unlock_dentry(dentry);
@@ -309,7 +309,7 @@ int ll_revalidate_it_finish(struct ptlrpc_request *request,
          if (!request)
                  RETURN(0);
  
-        if (it_disposition(it, DISP_LOOKUP_NEG)) 
+        if (it_disposition(it, DISP_LOOKUP_NEG))
                  RETURN(-ENOENT);
  
          rc = ll_prep_inode(&de->d_inode, request, NULL);
@@ -346,7 +346,7 @@ void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft)
          struct lookup_intent *it = *itp;
  #ifdef HAVE_VFS_INTENT_PATCHES
          if (it) {
-                LASSERTF(it->it_magic == INTENT_MAGIC, 
+                LASSERTF(it->it_magic == INTENT_MAGIC,
                           "%p has bad intent magic: %x\n",
                           it, it->it_magic);
          }
@@ -505,8 +505,8 @@ revalidate_finish:
                  GOTO(out, rc = 0);
          }
  
-        if ((it->it_op & IT_OPEN) && de->d_inode && 
-            !S_ISREG(de->d_inode->i_mode) && 
+        if ((it->it_op & IT_OPEN) && de->d_inode &&
+            !S_ISREG(de->d_inode->i_mode) &&
              !S_ISDIR(de->d_inode->i_mode)) {
                  ll_release_openhandle(de, it);
          }
diff --git a/lustre/llite/file.c b/lustre/llite/file.c

index e8978f3..35c1d87 100644 (file)
--- a/lustre/llite/file.c
+++ b/lustre/llite/file.c
@@ -49,7 +49,8 @@
  #include "llite_internal.h"
  #include <lustre/ll_fiemap.h>
  
-/* also used by llite/special.c:ll_special_open() */
+#include "cl_object.h"
+
  struct ll_file_data *ll_file_data_get(void)
  {
          struct ll_file_data *fd;
@@ -237,10 +238,12 @@ int ll_md_close(struct obd_export *md_exp, struct inode *inode,
  
          /* clear group lock, if present */
          if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
+#if 0 /* XXX */
                  struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
                  fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
                  rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP,
                                        &fd->fd_cwlockh);
+#endif
          }
  
          /* Let's see if we have good enough OPEN lock on the file and if
@@ -506,6 +509,7 @@ int ll_file_open(struct inode *inode, struct file *file)
          if (fd == NULL)
                  RETURN(-ENOMEM);
  
+        fd->fd_file = file;
          if (S_ISDIR(inode->i_mode)) {
  again:
                  spin_lock(&lli->lli_lock);
@@ -743,281 +747,7 @@ int ll_inode_getattr(struct inode *inode, struct obdo *obdo)
          RETURN(0);
  }
  
-static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock)
-{
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct lov_stripe_md *lsm = lli->lli_smd;
-        struct obd_export *exp = ll_i2dtexp(inode);
-        struct {
-                char name[16];
-                struct ldlm_lock *lock;
-        } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock };
-        __u32 stripe, vallen = sizeof(stripe);
-        struct lov_oinfo *loinfo;
-        int rc;
-        ENTRY;
-
-        if (lsm->lsm_stripe_count == 1)
-                GOTO(check, stripe = 0);
-
-        /* get our offset in the lov */
-        rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe, lsm);
-        if (rc != 0) {
-                CERROR("obd_get_info: rc = %d\n", rc);
-                RETURN(rc);
-        }
-        LASSERT(stripe < lsm->lsm_stripe_count);
-
-check:
-        loinfo = lsm->lsm_oinfo[stripe];
-        if (!osc_res_name_eq(loinfo->loi_id, loinfo->loi_gr,
-                            &lock->l_resource->lr_name)){
-                LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64,
-                           loinfo->loi_id, loinfo->loi_gr);
-                RETURN(-ELDLM_NO_LOCK_DATA);
-        }
-
-        RETURN(stripe);
-}
-
-/* Get extra page reference to ensure it is not going away */
-void ll_pin_extent_cb(void *data)
-{
-        struct page *page = data;
-
-        page_cache_get(page);
-
-        return;
-}
-
-/* Flush the page from page cache for an extent as its canceled.
- * Page to remove is delivered as @data.
- *
- * No one can dirty the extent until we've finished our work and they cannot
- * enqueue another lock.  The DLM protects us from ll_file_read/write here,
- * but other kernel actors could have pages locked.
- *
- * If @discard is set, there is no need to write the page if it is dirty.
- *
- * Called with the DLM lock held. */
-int ll_page_removal_cb(void *data, int discard)
-{
-        int rc;
-        struct page *page = data;
-        struct address_space *mapping;
-
-        ENTRY;
-
-        /* We have page reference already from ll_pin_page */
-        lock_page(page);
-
-        /* Already truncated by somebody */
-        if (!page->mapping)
-                GOTO(out, rc = 0);
-        mapping = page->mapping;
-
-        ll_teardown_mmaps(mapping,
-                          (__u64)page->index << PAGE_CACHE_SHIFT,
-                          ((__u64)page->index<<PAGE_CACHE_SHIFT)|
-                                                              ~PAGE_CACHE_MASK);
-        LL_CDEBUG_PAGE(D_PAGE, page, "removing page\n");
-
-        if (!discard && clear_page_dirty_for_io(page)) {
-                LASSERT(page->mapping);
-                rc = ll_call_writepage(page->mapping->host, page);
-                /* either waiting for io to complete or reacquiring
-                 * the lock that the failed writepage released */
-                lock_page(page);
-                wait_on_page_writeback(page);
-                if (rc != 0) {
-                        CERROR("writepage inode %lu(%p) of page %p "
-                               "failed: %d\n", mapping->host->i_ino,
-                               mapping->host, page, rc);
-                        if (rc == -ENOSPC)
-                                set_bit(AS_ENOSPC, &mapping->flags);
-                        else
-                                set_bit(AS_EIO, &mapping->flags);
-                }
-                set_bit(AS_EIO, &mapping->flags);
-        }
-        if (page->mapping != NULL) {
-                struct ll_async_page *llap = llap_cast_private(page);
-                /* checking again to account for writeback's lock_page() */
-                LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n");
-                if (llap)
-                        ll_ra_accounting(llap, page->mapping);
-                ll_truncate_complete_page(page);
-        }
-        EXIT;
-out:
-        LASSERT(!PageWriteback(page));
-        unlock_page(page);
-        page_cache_release(page);
-
-        return 0;
-}
-
-int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
-                             void *data, int flag)
-{
-        struct inode *inode;
-        struct ll_inode_info *lli;
-        struct lov_stripe_md *lsm;
-        int stripe;
-        __u64 kms;
-
-        ENTRY;
-
-        if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
-                LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
-                LBUG();
-        }
-
-        inode = ll_inode_from_lock(lock);
-        if (inode == NULL)
-                RETURN(0);
-        lli = ll_i2info(inode);
-        if (lli == NULL)
-                GOTO(iput, 0);
-        if (lli->lli_smd == NULL)
-                GOTO(iput, 0);
-        lsm = lli->lli_smd;
-
-        stripe = ll_lock_to_stripe_offset(inode, lock);
-        if (stripe < 0)
-                GOTO(iput, 0);
-
-        lov_stripe_lock(lsm);
-        lock_res_and_lock(lock);
-        kms = ldlm_extent_shift_kms(lock,
-                                    lsm->lsm_oinfo[stripe]->loi_kms);
-
-        if (lsm->lsm_oinfo[stripe]->loi_kms != kms)
-                LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
-                           lsm->lsm_oinfo[stripe]->loi_kms, kms);
-        lsm->lsm_oinfo[stripe]->loi_kms = kms;
-        unlock_res_and_lock(lock);
-        lov_stripe_unlock(lsm);
-        ll_queue_done_writing(inode, 0);
-        EXIT;
-iput:
-        iput(inode);
-
-        return 0;
-}
-
-#if 0
-int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data)
-{
-        /* XXX ALLOCATE - 160 bytes */
-        struct inode *inode = ll_inode_from_lock(lock);
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct lustre_handle lockh = { 0 };
-        struct ost_lvb *lvb;
-        int stripe;
-        ENTRY;
-
-        if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED |
-                     LDLM_FL_BLOCK_CONV)) {
-                LBUG(); /* not expecting any blocked async locks yet */
-                LDLM_DEBUG(lock, "client-side async enqueue returned a blocked "
-                           "lock, returning");
-                ldlm_lock_dump(D_OTHER, lock, 0);
-                ldlm_reprocess_all(lock->l_resource);
-                RETURN(0);
-        }
-
-        LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed");
-
-        stripe = ll_lock_to_stripe_offset(inode, lock);
-        if (stripe < 0)
-                goto iput;
-
-        if (lock->l_lvb_len) {
-                struct lov_stripe_md *lsm = lli->lli_smd;
-                __u64 kms;
-                lvb = lock->l_lvb_data;
-                lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size;
-
-                lock_res_and_lock(lock);
-                ll_inode_size_lock(inode, 1);
-                kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size);
-                kms = ldlm_extent_shift_kms(NULL, kms);
-                if (lsm->lsm_oinfo[stripe].loi_kms != kms)
-                        LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64,
-                                   lsm->lsm_oinfo[stripe].loi_kms, kms);
-                lsm->lsm_oinfo[stripe].loi_kms = kms;
-                ll_inode_size_unlock(inode, 1);
-                unlock_res_and_lock(lock);
-        }
-
-iput:
-        iput(inode);
-        wake_up(&lock->l_waitq);
-
-        ldlm_lock2handle(lock, &lockh);
-        ldlm_lock_decref(&lockh, LCK_PR);
-        RETURN(0);
-}
-#endif
-
-static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp)
-{
-        struct ptlrpc_request *req = reqp;
-        struct inode *inode = ll_inode_from_lock(lock);
-        struct ll_inode_info *lli;
-        struct lov_stripe_md *lsm;
-        struct ost_lvb *lvb;
-        int rc, stripe;
-        ENTRY;
-
-        if (inode == NULL)
-                GOTO(out, rc = -ELDLM_NO_LOCK_DATA);
-        lli = ll_i2info(inode);
-        if (lli == NULL)
-                GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
-        lsm = lli->lli_smd;
-        if (lsm == NULL)
-                GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
-
-        /* First, find out which stripe index this lock corresponds to. */
-        stripe = ll_lock_to_stripe_offset(inode, lock);
-        if (stripe < 0)
-                GOTO(iput, rc = -ELDLM_NO_LOCK_DATA);
-
-        req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK);
-        req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
-                             sizeof(*lvb));
-        rc = req_capsule_server_pack(&req->rq_pill);
-        if (rc) {
-                CERROR("lustre_pack_reply: %d\n", rc);
-                GOTO(iput, rc);
-        }
-
-        lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB);
-        lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms;
-        lvb->lvb_mtime = LTIME_S(inode->i_mtime);
-        lvb->lvb_atime = LTIME_S(inode->i_atime);
-        lvb->lvb_ctime = LTIME_S(inode->i_ctime);
-
-        LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64
-                   " atime "LPU64", mtime "LPU64", ctime "LPU64,
-                   i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime,
-                   lvb->lvb_atime, lvb->lvb_ctime);
- iput:
-        iput(inode);
-
- out:
-        /* These errors are normal races, so we don't want to fill the console
-         * with messages by calling ptlrpc_error() */
-        if (rc == -ELDLM_NO_LOCK_DATA)
-                lustre_pack_reply(req, 1, NULL, NULL);
-
-        req->rq_status = rc;
-        return rc;
-}
-
-static int ll_merge_lvb(struct inode *inode)
+int ll_merge_lvb(struct inode *inode)
  {
          struct ll_inode_info *lli = ll_i2info(inode);
          struct ll_sb_info *sbi = ll_i2sbi(inode);
@@ -1040,824 +770,324 @@ static int ll_merge_lvb(struct inode *inode)
          RETURN(rc);
  }
  
-int ll_local_size(struct inode *inode)
-{
-        ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } };
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct ll_sb_info *sbi = ll_i2sbi(inode);
-        struct lustre_handle lockh = { 0 };
-        int flags = 0;
-        int rc;
-        ENTRY;
-
-        if (lli->lli_smd->lsm_stripe_count == 0)
-                RETURN(0);
-
-        rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT,
-                       &policy, LCK_PR, &flags, inode, &lockh);
-        if (rc < 0)
-                RETURN(rc);
-        else if (rc == 0)
-                RETURN(-ENODATA);
-
-        rc = ll_merge_lvb(inode);
-        obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR, &lockh);
-        RETURN(rc);
-}
-
  int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm,
                       lstat_t *st)
  {
-        struct lustre_handle lockh = { 0 };
-        struct ldlm_enqueue_info einfo = { 0 };
-        struct obd_info oinfo = { { { 0 } } };
-        struct ost_lvb lvb;
-        int rc;
-
-        ENTRY;
-
-        einfo.ei_type = LDLM_EXTENT;
-        einfo.ei_mode = LCK_PR;
-        einfo.ei_cb_bl = osc_extent_blocking_cb;
-        einfo.ei_cb_cp = ldlm_completion_ast;
-        einfo.ei_cb_gl = ll_glimpse_callback;
-        einfo.ei_cbdata = NULL;
-
-        oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
-        oinfo.oi_lockh = &lockh;
-        oinfo.oi_md = lsm;
-        oinfo.oi_flags = LDLM_FL_HAS_INTENT;
-
-        rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
-        if (rc == -ENOENT)
-                RETURN(rc);
-        if (rc != 0) {
-                CERROR("obd_enqueue returned rc %d, "
-                       "returning -EIO\n", rc);
-                RETURN(rc > 0 ? -EIO : rc);
-        }
-
-        lov_stripe_lock(lsm);
-        memset(&lvb, 0, sizeof(lvb));
-        obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0);
-        st->st_size = lvb.lvb_size;
-        st->st_blocks = lvb.lvb_blocks;
-        st->st_mtime = lvb.lvb_mtime;
-        st->st_atime = lvb.lvb_atime;
-        st->st_ctime = lvb.lvb_ctime;
-        lov_stripe_unlock(lsm);
-
-        RETURN(rc);
+        /* XXX */
+        return -ENOSYS;
  }
  
-/* NB: obd_merge_lvb will prefer locally cached writes if they extend the
- * file (because it prefers KMS over RSS when larger) */
-int ll_glimpse_size(struct inode *inode, int ast_flags)
+void ll_io_init(struct cl_io *io, const struct file *file, int write)
  {
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct ll_sb_info *sbi = ll_i2sbi(inode);
-        struct lustre_handle lockh = { 0 };
-        struct ldlm_enqueue_info einfo = { 0 };
-        struct obd_info oinfo = { { { 0 } } };
-        int rc;
+        struct inode *inode     = file->f_dentry->d_inode;
+        struct ll_sb_info *sbi  = ll_i2sbi(inode);
+        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+
+        LASSERT(fd != NULL);
+        memset(io, 0, sizeof *io);
+        io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
+        if (write)
+                io->u.ci_wr.wr_append = file->f_flags & O_APPEND;
+        io->ci_obj     = ll_i2info(inode)->lli_clob;
+        io->ci_lockreq = CILR_MAYBE;
+        if (fd->fd_flags & LL_FILE_IGNORE_LOCK || sbi->ll_flags & LL_SBI_NOLCK)
+                io->ci_lockreq = CILR_NEVER;
+        else if (file->f_flags & O_APPEND)
+                io->ci_lockreq = CILR_MANDATORY;
+}
+
+static ssize_t ll_file_io_generic(const struct lu_env *env,
+                struct ccc_io_args *args, struct file *file,
+                enum cl_io_type iot, loff_t *ppos, size_t count)
+{
+        struct cl_io       *io;
+        ssize_t             result;
          ENTRY;
  
-        if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
-                RETURN(0);
+        io = &ccc_env_info(env)->cti_io;
+        ll_io_init(io, file, iot == CIT_WRITE);
  
-        CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino);
+        if (iot == CIT_READ)
+                io->u.ci_rd.rd_is_sendfile = args->cia_is_sendfile;
  
-        if (!lli->lli_smd) {
-                CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino);
-                RETURN(0);
-        }
-
-        /* NOTE: this looks like DLM lock request, but it may not be one. Due
-         *       to LDLM_FL_HAS_INTENT flag, this is glimpse request, that
-         *       won't revoke any conflicting DLM locks held. Instead,
-         *       ll_glimpse_callback() will be called on each client
-         *       holding a DLM lock against this file, and resulting size
-         *       will be returned for each stripe. DLM lock on [0, EOF] is
-         *       acquired only if there were no conflicting locks. */
-        einfo.ei_type = LDLM_EXTENT;
-        einfo.ei_mode = LCK_PR;
-        einfo.ei_cb_bl = osc_extent_blocking_cb;
-        einfo.ei_cb_cp = ldlm_completion_ast;
-        einfo.ei_cb_gl = ll_glimpse_callback;
-        einfo.ei_cbdata = inode;
-
-        oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
-        oinfo.oi_lockh = &lockh;
-        oinfo.oi_md = lli->lli_smd;
-        oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT;
-
-        rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo);
-        if (rc == -ENOENT)
-                RETURN(rc);
-        if (rc != 0) {
-                CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc);
-                RETURN(rc > 0 ? -EIO : rc);
+        if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
+                struct vvp_io *vio = vvp_env_io(env);
+                struct ccc_io *cio = ccc_env_io(env);
+                if (cl_io_is_sendfile(io)) {
+                        vio->u.read.cui_actor = args->cia_actor;
+                        vio->u.read.cui_target = args->cia_target;
+                } else {
+                        cio->cui_iov = args->cia_iov;
+                        cio->cui_nrsegs = args->cia_nrsegs;
+#ifndef HAVE_FILE_WRITEV
+                        cio->cui_iocb = args->cia_iocb;
+#endif
+                }
+                cio->cui_fd  = LUSTRE_FPRIVATE(file);
+                result = cl_io_loop(env, io);
+        } else
+                /* cl_io_rw_init() handled IO */
+                result = io->ci_result;
+        if (io->ci_nob > 0) {
+                result = io->ci_nob;
+                *ppos = io->u.ci_wr.wr.crw_pos;
          }
-
-        rc = ll_merge_lvb(inode);
-
-        CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n",
-               i_size_read(inode), (unsigned long long)inode->i_blocks);
-
-        RETURN(rc);
+        cl_io_fini(env, io);
+        RETURN(result);
  }
  
-int ll_extent_lock(struct ll_file_data *fd, struct inode *inode,
-                   struct lov_stripe_md *lsm, int mode,
-                   ldlm_policy_data_t *policy, struct lustre_handle *lockh,
-                   int ast_flags)
-{
-        struct ll_sb_info *sbi = ll_i2sbi(inode);
-        struct ost_lvb lvb;
-        struct ldlm_enqueue_info einfo = { 0 };
-        struct obd_info oinfo = { { { 0 } } };
-        int rc;
-        ENTRY;
-
-        LASSERT(!lustre_handle_is_used(lockh));
-        LASSERT(lsm != NULL);
-
-        /* XXX phil: can we do this?  won't it screw the file size up? */
-        if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
-            (sbi->ll_flags & LL_SBI_NOLCK))
-                RETURN(0);
-
-        CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n",
-               inode->i_ino, policy->l_extent.start, policy->l_extent.end);
  
-        einfo.ei_type = LDLM_EXTENT;
-        einfo.ei_mode = mode;
-        einfo.ei_cb_bl = osc_extent_blocking_cb;
-        einfo.ei_cb_cp = ldlm_completion_ast;
-        einfo.ei_cb_gl = ll_glimpse_callback;
-        einfo.ei_cbdata = inode;
-
-        oinfo.oi_policy = *policy;
-        oinfo.oi_lockh = lockh;
-        oinfo.oi_md = lsm;
-        oinfo.oi_flags = ast_flags;
+/*
+ * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
+ */
+static int ll_file_get_iov_count(const struct iovec *iov,
+                                 unsigned long *nr_segs, size_t *count)
+{
+        size_t cnt = 0;
+        unsigned long seg;
  
-        rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo, NULL);
-        *policy = oinfo.oi_policy;
-        if (rc > 0)
-                rc = -EIO;
+        for (seg = 0; seg < *nr_segs; seg++) {
+                const struct iovec *iv = &iov[seg];
  
-        ll_inode_size_lock(inode, 1);
-        inode_init_lvb(inode, &lvb);
-        obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
-
-        if (policy->l_extent.start == 0 &&
-            policy->l_extent.end == OBD_OBJECT_EOF) {
-                /* vmtruncate()->ll_truncate() first sets the i_size and then
-                 * the kms under both a DLM lock and the
-                 * ll_inode_size_lock().  If we don't get the
-                 * ll_inode_size_lock() here we can match the DLM lock and
-                 * reset i_size from the kms before the truncating path has
-                 * updated the kms.  generic_file_write can then trust the
-                 * stale i_size when doing appending writes and effectively
-                 * cancel the result of the truncate.  Getting the
-                 * ll_inode_size_lock() after the enqueue maintains the DLM
-                 * -> ll_inode_size_lock() acquiring order. */
-                i_size_write(inode, lvb.lvb_size);
-                CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n",
-                       inode->i_ino, i_size_read(inode));
-        }
-
-        if (rc == 0) {
-                LTIME_S(inode->i_mtime) = lvb.lvb_mtime;
-                LTIME_S(inode->i_atime) = lvb.lvb_atime;
-                LTIME_S(inode->i_ctime) = lvb.lvb_ctime;
+                /*
+                 * If any segment has a negative length, or the cumulative
+                 * length ever wraps negative then return -EINVAL.
+                 */
+                cnt += iv->iov_len;
+                if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
+                        return -EINVAL;
+                if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
+                        continue;
+                if (seg == 0)
+                        return -EFAULT;
+                *nr_segs = seg;
+                cnt -= iv->iov_len;   /* This segment is no good */
+                break;
          }
-        ll_inode_size_unlock(inode, 1);
-
-        RETURN(rc);
+        *count = cnt;
+        return 0;
  }
  
-int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode,
-                     struct lov_stripe_md *lsm, int mode,
-                     struct lustre_handle *lockh)
+#ifdef HAVE_FILE_READV
+static ssize_t ll_file_readv(struct file *file, const struct iovec *iov,
+                              unsigned long nr_segs, loff_t *ppos)
  {
-        struct ll_sb_info *sbi = ll_i2sbi(inode);
-        int rc;
+        struct lu_env      *env;
+        struct ccc_io_args *args;
+        size_t              count;
+        ssize_t             result;
+        int                 refcheck;
          ENTRY;
  
-        /* XXX phil: can we do this?  won't it screw the file size up? */
-        if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) ||
-            (sbi->ll_flags & LL_SBI_NOLCK))
-                RETURN(0);
-
-        rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh);
+        result = ll_file_get_iov_count(iov, &nr_segs, &count);
+        if (result)
+                RETURN(result);
  
-        RETURN(rc);
-}
-
-static void ll_set_file_contended(struct inode *inode)
-{
-        struct ll_inode_info *lli = ll_i2info(inode);
-        cfs_time_t now = cfs_time_current();
-
-        spin_lock(&lli->lli_lock);
-        lli->lli_contention_time = now;
-        lli->lli_flags |= LLIF_CONTENDED;
-        spin_unlock(&lli->lli_lock);
-}
-
-void ll_clear_file_contended(struct inode *inode)
-{
-        struct ll_inode_info *lli = ll_i2info(inode);
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
  
-        spin_lock(&lli->lli_lock);
-        lli->lli_flags &= ~LLIF_CONTENDED;
-        spin_unlock(&lli->lli_lock);
+        args = &vvp_env_info(env)->vti_args;
+        args->cia_is_sendfile = 0;
+        args->cia_iov = (struct iovec *)iov;
+        args->cia_nrsegs = nr_segs;
+        result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count);
+        cl_env_put(env, &refcheck);
+        RETURN(result);
  }
  
-static int ll_is_file_contended(struct file *file)
+static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
+                            loff_t *ppos)
  {
-        struct inode *inode = file->f_dentry->d_inode;
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct ll_sb_info *sbi = ll_i2sbi(inode);
-        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
+        struct lu_env *env;
+        struct iovec  *local_iov;
+        ssize_t        result;
+        int            refcheck;
          ENTRY;
  
-        if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) {
-                CDEBUG(D_INFO, "the server does not support SRVLOCK feature,"
-                       " osc connect flags = 0x"LPX64"\n",
-                       sbi->ll_lco.lco_flags);
-                RETURN(0);
-        }
-        if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK))
-                RETURN(1);
-        if (lli->lli_flags & LLIF_CONTENDED) {
-                cfs_time_t cur_time = cfs_time_current();
-                cfs_time_t retry_time;
-
-                retry_time = cfs_time_add(
-                        lli->lli_contention_time,
-                        cfs_time_seconds(sbi->ll_contention_time));
-                if (cfs_time_after(cur_time, retry_time)) {
-                        ll_clear_file_contended(inode);
-                        RETURN(0);
-                }
-                RETURN(1);
-        }
-        RETURN(0);
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
+
+        local_iov = &vvp_env_info(env)->vti_local_iov;
+        local_iov->iov_base = (void __user *)buf;
+        local_iov->iov_len = count;
+        result = ll_file_readv(file, local_iov, 1, ppos);
+        cl_env_put(env, &refcheck);
+        RETURN(result);
  }
  
-static int ll_file_get_tree_lock(struct ll_lock_tree *tree, struct file *file,
-                                 const char *buf, size_t count,
-                                 loff_t start, loff_t end, int rw)
-{
-        int append;
-        int tree_locked = 0;
-        int rc;
-        struct inode * inode = file->f_dentry->d_inode;
+#else
+static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+                                unsigned long nr_segs, loff_t pos)
+{
+        struct lu_env      *env;
+        struct ccc_io_args *args;
+        size_t              count;
+        ssize_t             result;
+        int                 refcheck;
          ENTRY;
  
-        append = (rw == OBD_BRW_WRITE) && (file->f_flags & O_APPEND);
+        result = ll_file_get_iov_count(iov, &nr_segs, &count);
+        if (result)
+                RETURN(result);
  
-        if (append || !ll_is_file_contended(file)) {
-                struct ll_lock_tree_node *node;
-                int ast_flags;
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
  
-                ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION;
-                if (file->f_flags & O_NONBLOCK)
-                        ast_flags |= LDLM_FL_BLOCK_NOWAIT;
-                node = ll_node_from_inode(inode, start, end,
-                                          (rw == OBD_BRW_WRITE) ? LCK_PW : LCK_PR);
-                if (IS_ERR(node)) {
-                        rc = PTR_ERR(node);
-                        GOTO(out, rc);
-                }
-                tree->lt_fd = LUSTRE_FPRIVATE(file);
-                rc = ll_tree_lock(tree, node, buf, count, ast_flags);
-                if (rc == 0)
-                        tree_locked = 1;
-                else if (rc == -EUSERS)
-                        ll_set_file_contended(inode);
-                else
-                        GOTO(out, rc);
-        }
-        RETURN(tree_locked);
-out:
-        return rc;
+        args = &vvp_env_info(env)->vti_args;
+        args->cia_is_sendfile = 0;
+        args->cia_iov = (struct iovec *)iov;
+        args->cia_nrsegs = nr_segs;
+        args->cia_iocb = iocb;
+        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
+                                    &iocb->ki_pos, count);
+        cl_env_put(env, &refcheck);
+        RETURN(result);
  }
  
-/**
- * Checks if requested extent lock is compatible with a lock under a page.
- *
- * Checks if the lock under \a page is compatible with a read or write lock
- * (specified by \a rw) for an extent [\a start , \a end].
- *
- * \param page the page under which lock is considered
- * \param rw OBD_BRW_READ if requested for reading,
- *           OBD_BRW_WRITE if requested for writing
- * \param start start of the requested extent
- * \param end end of the requested extent
- * \param cookie transparent parameter for passing locking context
- *
- * \post result == 1, *cookie == context, appropriate lock is referenced or
- * \post result == 0
- *
- * \retval 1 owned lock is reused for the request
- * \retval 0 no lock reused for the request
- *
- * \see ll_release_short_lock
- */
-static int ll_reget_short_lock(struct page *page, int rw,
-                               obd_off start, obd_off end,
-                               void **cookie)
+static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
+                            loff_t *ppos)
  {
-        struct ll_async_page *llap;
-        struct obd_export *exp;
-        struct inode *inode = page->mapping->host;
-
+        struct lu_env *env;
+        struct iovec  *local_iov;
+        struct kiocb  *kiocb;
+        ssize_t        result;
+        int            refcheck;
          ENTRY;
  
-        exp = ll_i2dtexp(inode);
-        if (exp == NULL)
-                RETURN(0);
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
  
-        llap = llap_cast_private(page);
-        if (llap == NULL)
-                RETURN(0);
+        local_iov = &vvp_env_info(env)->vti_local_iov;
+        kiocb = &vvp_env_info(env)->vti_kiocb;
+        local_iov->iov_base = (void __user *)buf;
+        local_iov->iov_len = count;
+        init_sync_kiocb(kiocb, file);
+        kiocb->ki_pos = *ppos;
+        kiocb->ki_left = count;
  
-        RETURN(obd_reget_short_lock(exp, ll_i2info(inode)->lli_smd,
-                                    &llap->llap_cookie, rw, start, end,
-                                    cookie));
-}
+        result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos);
+        *ppos = kiocb->ki_pos;
  
-/**
- * Releases a reference to a lock taken in a "fast" way.
- *
- * Releases a read or a write (specified by \a rw) lock
- * referenced by \a cookie.
- *
- * \param inode inode to which data belong
- * \param end end of the locked extent
- * \param rw OBD_BRW_READ if requested for reading,
- *           OBD_BRW_WRITE if requested for writing
- * \param cookie transparent parameter for passing locking context
- *
- * \post appropriate lock is dereferenced
- *
- * \see ll_reget_short_lock
- */
-static void ll_release_short_lock(struct inode *inode, obd_off end,
-                                  void *cookie, int rw)
-{
-        struct obd_export *exp;
-        int rc;
-
-        exp = ll_i2dtexp(inode);
-        if (exp == NULL)
-                return;
-
-        rc = obd_release_short_lock(exp, ll_i2info(inode)->lli_smd, end,
-                                    cookie, rw);
-        if (rc < 0)
-                CERROR("unlock failed (%d)\n", rc);
+        cl_env_put(env, &refcheck);
+        RETURN(result);
  }
+#endif
  
-/**
- * Checks if requested extent lock is compatible
- * with a lock under a page in page cache.
- *
- * Checks if a lock under some \a page is compatible with a read or write lock
- * (specified by \a rw) for an extent [\a start , \a end].
- *
- * \param file the file under which lock is considered
- * \param rw OBD_BRW_READ if requested for reading,
- *           OBD_BRW_WRITE if requested for writing
- * \param ppos start of the requested extent
- * \param end end of the requested extent
- * \param cookie transparent parameter for passing locking context
- * \param buf userspace buffer for the data
- *
- * \post result == 1, *cookie == context, appropriate lock is referenced
- * \post retuls == 0
- *
- * \retval 1 owned lock is reused for the request
- * \retval 0 no lock reused for the request
- *
- * \see ll_file_put_fast_lock
+/*
+ * Write to a file (through the page cache).
   */
-static inline int ll_file_get_fast_lock(struct file *file,
-                                        obd_off ppos, obd_off end,
-                                        char *buf, void **cookie, int rw)
-{
-        int rc = 0;
-        struct page *page;
-
+#ifdef HAVE_FILE_WRITEV
+static ssize_t ll_file_writev(struct file *file, const struct iovec *iov,
+                              unsigned long nr_segs, loff_t *ppos)
+{
+        struct lu_env      *env;
+        struct ccc_io_args *args;
+        size_t              count;
+        ssize_t             result;
+        int                 refcheck;
          ENTRY;
  
-        if (!ll_region_mapped((unsigned long)buf, end - ppos)) {
-                page = find_lock_page(file->f_dentry->d_inode->i_mapping,
-                                      ppos >> CFS_PAGE_SHIFT);
-                if (page) {
-                        if (ll_reget_short_lock(page, rw, ppos, end, cookie))
-                                rc = 1;
+        result = ll_file_get_iov_count(iov, &nr_segs, &count);
+        if (result)
+                RETURN(result);
  
-                        unlock_page(page);
-                        page_cache_release(page);
-                }
-        }
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
  
-        RETURN(rc);
-}
-
-/**
- * Releases a reference to a lock taken in a "fast" way.
- *
- * Releases a read or a write (specified by \a rw) lock
- * referenced by \a cookie.
- *
- * \param inode inode to which data belong
- * \param end end of the locked extent
- * \param rw OBD_BRW_READ if requested for reading,
- *           OBD_BRW_WRITE if requested for writing
- * \param cookie transparent parameter for passing locking context
- *
- * \post appropriate lock is dereferenced
- *
- * \see ll_file_get_fast_lock
- */
-static inline void ll_file_put_fast_lock(struct inode *inode, obd_off end,
-                                         void *cookie, int rw)
-{
-        ll_release_short_lock(inode, end, cookie, rw);
+        args = &vvp_env_info(env)->vti_args;
+        args->cia_iov = (struct iovec *)iov;
+        args->cia_nrsegs = nr_segs;
+        result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count);
+        cl_env_put(env, &refcheck);
+        RETURN(result);
  }
  
-enum ll_lock_style {
-        LL_LOCK_STYLE_NOLOCK   = 0,
-        LL_LOCK_STYLE_FASTLOCK = 1,
-        LL_LOCK_STYLE_TREELOCK = 2
-};
-
-/**
- * Checks if requested extent lock is compatible with a lock
- * under a page cache page.
- *
- * Checks if the lock under \a page is compatible with a read or write lock
- * (specified by \a rw) for an extent [\a start , \a end].
- *
- * \param file file under which I/O is processed
- * \param rw OBD_BRW_READ if requested for reading,
- *           OBD_BRW_WRITE if requested for writing
- * \param ppos start of the requested extent
- * \param end end of the requested extent
- * \param cookie transparent parameter for passing locking context
- *           (only used with LL_LOCK_STYLE_FASTLOCK)
- * \param tree lock tree (only used with LL_LOCK_STYLE_TREELOCK)
- * \param buf userspace buffer for the data
- *
- * \retval LL_LOCK_STYLE_FASTLOCK owned lock is reused through fast lock
- * \retval LL_LOCK_STYLE_TREELOCK got a lock through tree lock
- * \retval LL_LOCK_STYLE_NOLOCK got no lock
- *
- * \see ll_file_put_lock
- */
-static inline int ll_file_get_lock(struct file *file, obd_off ppos,
-                                   obd_off end, char *buf, void **cookie,
-                                   struct ll_lock_tree *tree, int rw)
+static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
+                             loff_t *ppos)
  {
-        int rc;
-
+        struct lu_env    *env;
+        struct iovec     *local_iov;
+        ssize_t           result;
+        int               refcheck;
          ENTRY;
  
-        if (ll_file_get_fast_lock(file, ppos, end, buf, cookie, rw))
-                RETURN(LL_LOCK_STYLE_FASTLOCK);
-
-        rc = ll_file_get_tree_lock(tree, file, buf, ppos - end, ppos, end, rw);
-        /* rc: 1 for tree lock, 0 for no lock, <0 for error */
-        switch (rc) {
-        case 1:
-                RETURN(LL_LOCK_STYLE_TREELOCK);
-        case 0:
-                RETURN(LL_LOCK_STYLE_NOLOCK);
-        }
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
  
-        /* an error happened if we reached this point, rc = -errno here */
-        RETURN(rc);
-}
+        local_iov = &vvp_env_info(env)->vti_local_iov;
+        local_iov->iov_base = (void __user *)buf;
+        local_iov->iov_len = count;
  
-/**
- * Drops the lock taken by ll_file_get_lock.
- *
- * Releases a read or a write (specified by \a rw) lock
- * referenced by \a tree or \a cookie.
- *
- * \param inode inode to which data belong
- * \param end end of the locked extent
- * \param lockstyle facility through which the lock was taken
- * \param rw OBD_BRW_READ if requested for reading,
- *           OBD_BRW_WRITE if requested for writing
- * \param cookie transparent parameter for passing locking context
- *           (only used with LL_LOCK_STYLE_FASTLOCK)
- * \param tree lock tree (only used with LL_LOCK_STYLE_TREELOCK)
- *
- * \post appropriate lock is dereferenced
- *
- * \see ll_file_get_lock
- */
-static inline void ll_file_put_lock(struct inode *inode, obd_off end,
-                                    enum ll_lock_style lock_style,
-                                    void *cookie, struct ll_lock_tree *tree,
-                                    int rw)
-
-{
-        switch (lock_style) {
-        case LL_LOCK_STYLE_TREELOCK:
-                ll_tree_unlock(tree);
-                break;
-        case LL_LOCK_STYLE_FASTLOCK:
-                ll_file_put_fast_lock(inode, end, cookie, rw);
-                break;
-        default:
-                CERROR("invalid locking style (%d)\n", lock_style);
-        }
+        result = ll_file_writev(file, local_iov, 1, ppos);
+        cl_env_put(env, &refcheck);
+        RETURN(result);
  }
  
-static ssize_t ll_file_read(struct file *file, char *buf, size_t count,
-                            loff_t *ppos)
+#else /* AIO stuff */
+static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+                                 unsigned long nr_segs, loff_t pos)
  {
-        struct inode *inode = file->f_dentry->d_inode;
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct lov_stripe_md *lsm = lli->lli_smd;
-        struct ll_sb_info *sbi = ll_i2sbi(inode);
-        struct ll_lock_tree tree;
-        struct ost_lvb lvb;
-        struct ll_ra_read bead;
-        int ra = 0;
-        obd_off end;
-        ssize_t retval, chunk, sum = 0;
-        int lock_style;
-        void *cookie;
-
-        __u64 kms;
+        struct lu_env      *env;
+        struct ccc_io_args *args;
+        size_t              count;
+        ssize_t             result;
+        int                 refcheck;
          ENTRY;
-        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
-               inode->i_ino, inode->i_generation, inode, count, *ppos);
-        /* "If nbyte is 0, read() will return 0 and have no other results."
-         *                      -- Single Unix Spec */
-        if (count == 0)
-                RETURN(0);
-
-        ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count);
-
-        if (!lsm) {
-                /* Read on file with no objects should return zero-filled
-                 * buffers up to file size (we can get non-zero sizes with
-                 * mknod + truncate, then opening file for read. This is a
-                 * common pattern in NFS case, it seems). Bug 6243 */
-                int notzeroed;
-                /* Since there are no objects on OSTs, we have nothing to get
-                 * lock on and so we are forced to access inode->i_size
-                 * unguarded */
-
-                /* Read beyond end of file */
-                if (*ppos >= i_size_read(inode))
-                        RETURN(0);
-
-                if (count > i_size_read(inode) - *ppos)
-                        count = i_size_read(inode) - *ppos;
-                /* Make sure to correctly adjust the file pos pointer for
-                 * EFAULT case */
-                notzeroed = clear_user(buf, count);
-                count -= notzeroed;
-                *ppos += count;
-                if (!count)
-                        RETURN(-EFAULT);
-                RETURN(count);
-        }
-repeat:
-        if (sbi->ll_max_rw_chunk != 0) {
-                /* first, let's know the end of the current stripe */
-                end = *ppos;
-                obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END, &end);
-
-                /* correct, the end is beyond the request */
-                if (end > *ppos + count - 1)
-                        end = *ppos + count - 1;
-
-                /* and chunk shouldn't be too large even if striping is wide */
-                if (end - *ppos > sbi->ll_max_rw_chunk)
-                        end = *ppos + sbi->ll_max_rw_chunk - 1;
-        } else {
-                end = *ppos + count - 1;
-        }
-
-        lock_style = ll_file_get_lock(file, (obd_off)(*ppos), end,
-                                      buf, &cookie, &tree, OBD_BRW_READ);
-        if (lock_style < 0)
-                GOTO(out, retval = lock_style);
-
-        ll_inode_size_lock(inode, 1);
-        /*
-         * Consistency guarantees: following possibilities exist for the
-         * relation between region being read and real file size at this
-         * moment:
-         *
-         *  (A): the region is completely inside of the file;
-         *
-         *  (B-x): x bytes of region are inside of the file, the rest is
-         *  outside;
-         *
-         *  (C): the region is completely outside of the file.
-         *
-         * This classification is stable under DLM lock acquired by
-         * ll_tree_lock() above, because to change class, other client has to
-         * take DLM lock conflicting with our lock. Also, any updates to
-         * ->i_size by other threads on this client are serialized by
-         * ll_inode_size_lock(). This guarantees that short reads are handled
-         * correctly in the face of concurrent writes and truncates.
-         */
-        inode_init_lvb(inode, &lvb);
-        obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1);
-        kms = lvb.lvb_size;
-        if (*ppos + count - 1 > kms) {
-                /* A glimpse is necessary to determine whether we return a
-                 * short read (B) or some zeroes at the end of the buffer (C) */
-                ll_inode_size_unlock(inode, 1);
-                retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
-                if (retval) {
-                        if (lock_style != LL_LOCK_STYLE_NOLOCK)
-                                ll_file_put_lock(inode, end, lock_style,
-                                                 cookie, &tree, OBD_BRW_READ);
-                        goto out;
-                }
-        } else {
-                /* region is within kms and, hence, within real file size (A).
-                 * We need to increase i_size to cover the read region so that
-                 * generic_file_read() will do its job, but that doesn't mean
-                 * the kms size is _correct_, it is only the _minimum_ size.
-                 * If someone does a stat they will get the correct size which
-                 * will always be >= the kms value here.  b=11081 */
-                if (i_size_read(inode) < kms)
-                        i_size_write(inode, kms);
-                ll_inode_size_unlock(inode, 1);
-        }
-
-        chunk = end - *ppos + 1;
-        CDEBUG(D_INODE,"Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
-               inode->i_ino, chunk, *ppos, i_size_read(inode));
-
-        if (lock_style != LL_LOCK_STYLE_NOLOCK) {
-                /* turn off the kernel's read-ahead */
-                file->f_ra.ra_pages = 0;
-
-                /* initialize read-ahead window once per syscall */
-                if (ra == 0) {
-                        ra = 1;
-                        bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
-                        bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
-                        ll_ra_read_in(file, &bead);
-                }
  
-                /* BUG: 5972 */
-                file_accessed(file);
-                retval = generic_file_read(file, buf, chunk, ppos);
-                ll_file_put_lock(inode, end, lock_style, cookie, &tree,
-                                 OBD_BRW_READ);
-        } else {
-                retval = ll_file_lockless_io(file, buf, chunk, ppos, READ);
-        }
+        result = ll_file_get_iov_count(iov, &nr_segs, &count);
+        if (result)
+                RETURN(result);
  
-        ll_rw_stats_tally(sbi, current->pid, file, chunk, 0);
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
  
-        if (retval > 0) {
-                buf += retval;
-                count -= retval;
-                sum += retval;
-                if (retval == chunk && count > 0)
-                        goto repeat;
-        }
-
- out:
-        if (ra != 0)
-                ll_ra_read_ex(file, &bead);
-        retval = (sum > 0) ? sum : retval;
-        RETURN(retval);
+        args = &vvp_env_info(env)->vti_args;
+        args->cia_iov = (struct iovec *)iov;
+        args->cia_nrsegs = nr_segs;
+        args->cia_iocb = iocb;
+        result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
+                                  &iocb->ki_pos, count);
+        cl_env_put(env, &refcheck);
+        RETURN(result);
  }
  
-/*
- * Write to a file (through the page cache).
- */
  static ssize_t ll_file_write(struct file *file, const char *buf, size_t count,
                               loff_t *ppos)
  {
-        struct inode *inode = file->f_dentry->d_inode;
-        struct ll_sb_info *sbi = ll_i2sbi(inode);
-        struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
-        struct ll_lock_tree tree;
-        loff_t maxbytes = ll_file_maxbytes(inode);
-        loff_t lock_start, lock_end, end;
-        ssize_t retval, chunk, sum = 0;
-        int tree_locked;
+        struct lu_env *env;
+        struct iovec  *local_iov;
+        struct kiocb  *kiocb;
+        ssize_t        result;
+        int            refcheck;
          ENTRY;
  
-        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
-               inode->i_ino, inode->i_generation, inode, count, *ppos);
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
  
-        SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
+        local_iov = &vvp_env_info(env)->vti_local_iov;
+        kiocb = &vvp_env_info(env)->vti_kiocb;
+        local_iov->iov_base = (void __user *)buf;
+        local_iov->iov_len = count;
+        init_sync_kiocb(kiocb, file);
+        kiocb->ki_pos = *ppos;
+        kiocb->ki_left = count;
  
-        /* POSIX, but surprised the VFS doesn't check this already */
-        if (count == 0)
-                RETURN(0);
-
-        /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't
-         * called on the file, don't fail the below assertion (bug 2388). */
-        if (file->f_flags & O_LOV_DELAY_CREATE &&
-            ll_i2info(inode)->lli_smd == NULL)
-                RETURN(-EBADF);
-
-        LASSERT(ll_i2info(inode)->lli_smd != NULL);
-
-        down(&ll_i2info(inode)->lli_write_sem);
-
-repeat:
-        chunk = 0; /* just to fix gcc's warning */
-        end = *ppos + count - 1;
-
-        if (file->f_flags & O_APPEND) {
-                lock_start = 0;
-                lock_end = OBD_OBJECT_EOF;
-        } else if (sbi->ll_max_rw_chunk != 0) {
-                /* first, let's know the end of the current stripe */
-                end = *ppos;
-                obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END,
-                                (obd_off *)&end);
-
-                /* correct, the end is beyond the request */
-                if (end > *ppos + count - 1)
-                        end = *ppos + count - 1;
-
-                /* and chunk shouldn't be too large even if striping is wide */
-                if (end - *ppos > sbi->ll_max_rw_chunk)
-                        end = *ppos + sbi->ll_max_rw_chunk - 1;
-                lock_start = *ppos;
-                lock_end = end;
-        } else {
-                lock_start = *ppos;
-                lock_end = *ppos + count - 1;
-        }
-
-        tree_locked = ll_file_get_tree_lock(&tree, file, buf, count,
-                                            lock_start, lock_end, OBD_BRW_WRITE);
-        if (tree_locked < 0)
-                GOTO(out, retval = tree_locked);
-
-        /* This is ok, g_f_w will overwrite this under i_sem if it races
-         * with a local truncate, it just makes our maxbyte checking easier.
-         * The i_size value gets updated in ll_extent_lock() as a consequence
-         * of the [0,EOF] extent lock we requested above. */
-        if (file->f_flags & O_APPEND) {
-                *ppos = i_size_read(inode);
-                end = *ppos + count - 1;
-        }
+        result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos);
+        *ppos = kiocb->ki_pos;
  
-        if (*ppos >= maxbytes) {
-                send_sig(SIGXFSZ, current, 0);
-                GOTO(out_unlock, retval = -EFBIG);
-        }
-        if (end > maxbytes - 1)
-                end = maxbytes - 1;
-
-        /* generic_file_write handles O_APPEND after getting i_mutex */
-        chunk = end - *ppos + 1;
-        CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n",
-               inode->i_ino, chunk, *ppos);
-        if (tree_locked)
-                retval = generic_file_write(file, buf, chunk, ppos);
-        else
-                retval = ll_file_lockless_io(file, (char*)buf, chunk,
-                                             ppos, WRITE);
-        ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1);
-
-out_unlock:
-        if (tree_locked)
-                ll_tree_unlock(&tree);
-
-out:
-        if (retval > 0) {
-                buf += retval;
-                count -= retval;
-                sum += retval;
-                if (retval == chunk && count > 0)
-                        goto repeat;
-        }
-
-        up(&ll_i2info(inode)->lli_write_sem);
-
-        retval = (sum > 0) ? sum : retval;
-        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
-                           retval > 0 ? retval : 0);
-        RETURN(retval);
+        cl_env_put(env, &refcheck);
+        RETURN(result);
  }
+#endif
+
  
  /*
   * Send file content (through pagecache) somewhere with helper
@@ -1865,100 +1095,28 @@ out:
  static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count,
                                  read_actor_t actor, void *target)
  {
-        struct inode *inode = in_file->f_dentry->d_inode;
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct lov_stripe_md *lsm = lli->lli_smd;
-        struct ll_lock_tree tree;
-        struct ll_lock_tree_node *node;
-        struct ost_lvb lvb;
-        struct ll_ra_read bead;
-        int rc;
-        ssize_t retval;
-        __u64 kms;
+        struct lu_env      *env;
+        struct ccc_io_args *args;
+        ssize_t             result;
+        int                 refcheck;
          ENTRY;
-        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n",
-               inode->i_ino, inode->i_generation, inode, count, *ppos);
-
-        /* "If nbyte is 0, read() will return 0 and have no other results."
-         *                      -- Single Unix Spec */
-        if (count == 0)
-                RETURN(0);
-
-        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count);
-        /* turn off the kernel's read-ahead */
-        in_file->f_ra.ra_pages = 0;
-
-        /* File with no objects, nothing to lock */
-        if (!lsm)
-                RETURN(generic_file_sendfile(in_file, ppos,count,actor,target));
-
-        node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR);
-        if (IS_ERR(node))
-                RETURN(PTR_ERR(node));
-
-        tree.lt_fd = LUSTRE_FPRIVATE(in_file);
-        rc = ll_tree_lock(&tree, node, NULL, count,
-                          in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0);
-        if (rc != 0)
-                RETURN(rc);
-
-        ll_clear_file_contended(inode);
-        ll_inode_size_lock(inode, 1);
-        /*
-         * Consistency guarantees: following possibilities exist for the
-         * relation between region being read and real file size at this
-         * moment:
-         *
-         *  (A): the region is completely inside of the file;
-         *
-         *  (B-x): x bytes of region are inside of the file, the rest is
-         *  outside;
-         *
-         *  (C): the region is completely outside of the file.
-         *
-         * This classification is stable under DLM lock acquired by
-         * ll_tree_lock() above, because to change class, other client has to
-         * take DLM lock conflicting with our lock. Also, any updates to
-         * ->i_size by other threads on this client are serialized by
-         * ll_inode_size_lock(). This guarantees that short reads are handled
-         * correctly in the face of concurrent writes and truncates.
-         */
-        inode_init_lvb(inode, &lvb);
-        obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1);
-        kms = lvb.lvb_size;
-        if (*ppos + count - 1 > kms) {
-                /* A glimpse is necessary to determine whether we return a
-                 * short read (B) or some zeroes at the end of the buffer (C) */
-                ll_inode_size_unlock(inode, 1);
-                retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
-                if (retval)
-                        goto out;
-        } else {
-                /* region is within kms and, hence, within real file size (A) */
-                i_size_write(inode, kms);
-                ll_inode_size_unlock(inode, 1);
-        }
  
-        CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n",
-               inode->i_ino, count, *ppos, i_size_read(inode));
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
  
-        bead.lrr_start = *ppos >> CFS_PAGE_SHIFT;
-        bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
-        ll_ra_read_in(in_file, &bead);
-        /* BUG: 5972 */
-        file_accessed(in_file);
-        retval = generic_file_sendfile(in_file, ppos, count, actor, target);
-        ll_ra_read_ex(in_file, &bead);
-
- out:
-        ll_tree_unlock(&tree);
-        RETURN(retval);
+        args = &vvp_env_info(env)->vti_args;
+        args->cia_is_sendfile = 1;
+        args->cia_target = target;
+        args->cia_actor = actor;
+        result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
+        cl_env_put(env, &refcheck);
+        RETURN(result);
  }
  
  static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
                                 unsigned long arg)
  {
-        struct ll_inode_info *lli = ll_i2info(inode);
          struct obd_export *exp = ll_i2dtexp(inode);
          struct ll_recreate_obj ucreatp;
          struct obd_trans_info oti = { 0 };
@@ -1979,8 +1137,8 @@ static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
          if (oa == NULL)
                  RETURN(-ENOMEM);
  
-        down(&lli->lli_size_sem);
-        lsm = lli->lli_smd;
+        ll_inode_size_lock(inode, 0);
+        lsm = ll_i2info(inode)->lli_smd;
          if (lsm == NULL)
                  GOTO(out, rc = -ENOENT);
          lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) *
@@ -2004,7 +1162,7 @@ static int ll_lov_recreate_obj(struct inode *inode, struct file *file,
          OBD_FREE(lsm2, lsm_size);
          GOTO(out, rc);
  out:
-        up(&lli->lli_size_sem);
+        ll_inode_size_unlock(inode, 0);
          OBDO_FREE(oa);
          return rc;
  }
@@ -2012,16 +1170,15 @@ out:
  int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
                               int flags, struct lov_user_md *lum, int lum_size)
  {
-        struct ll_inode_info *lli = ll_i2info(inode);
          struct lov_stripe_md *lsm;
          struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags};
          int rc = 0;
          ENTRY;
  
-        down(&lli->lli_size_sem);
-        lsm = lli->lli_smd;
+        ll_inode_size_lock(inode, 0);
+        lsm = ll_i2info(inode)->lli_smd;
          if (lsm) {
-                up(&lli->lli_size_sem);
+                ll_inode_size_unlock(inode, 0);
                  CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n",
                         inode->i_ino);
                  RETURN(-EEXIST);
@@ -2039,7 +1196,7 @@ int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file,
          ll_release_openhandle(file->f_dentry, &oit);
  
   out:
-        up(&lli->lli_size_sem);
+        ll_inode_size_unlock(inode, 0);
          ll_intent_release(&oit);
          RETURN(rc);
  out_req_free:
@@ -2251,61 +1408,15 @@ static int ll_lov_getstripe(struct inode *inode, unsigned long arg)
  static int ll_get_grouplock(struct inode *inode, struct file *file,
                              unsigned long arg)
  {
-        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
-        ldlm_policy_data_t policy = { .l_extent = { .start = 0,
-                                                    .end = OBD_OBJECT_EOF}};
-        struct lustre_handle lockh = { 0 };
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct lov_stripe_md *lsm = lli->lli_smd;
-        int flags = 0, rc;
-        ENTRY;
-
-        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
-                RETURN(-EINVAL);
-        }
-
-        policy.l_extent.gid = arg;
-        if (file->f_flags & O_NONBLOCK)
-                flags = LDLM_FL_BLOCK_NOWAIT;
-
-        rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags);
-        if (rc)
-                RETURN(rc);
-
-        fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK;
-        fd->fd_gid = arg;
-        memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh));
-
-        RETURN(0);
+        /* XXX: acquiring a group lock is not implemented here. */
+        return -ENOSYS;
  }
  
  static int ll_put_grouplock(struct inode *inode, struct file *file,
                              unsigned long arg)
  {
-        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct lov_stripe_md *lsm = lli->lli_smd;
-        int rc;
-        ENTRY;
-
-        if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
-                /* Ugh, it's already unlocked. */
-                RETURN(-EINVAL);
-        }
-
-        if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */
-                RETURN(-EINVAL);
-
-        fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK);
-
-        rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh);
-        if (rc)
-                RETURN(rc);
-
-        fd->fd_gid = 0;
-        memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh));
-
-        RETURN(0);
+        /* XXX: releasing a group lock is not implemented here. */
+        return -ENOSYS;
  }
  
  #if LUSTRE_FIX >= 50
@@ -2745,8 +1856,6 @@ error:
  loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
  {
          struct inode *inode = file->f_dentry->d_inode;
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct lov_stripe_md *lsm = lli->lli_smd;
          loff_t retval;
          ENTRY;
          retval = offset + ((origin == 2) ? i_size_read(inode) :
@@ -2762,11 +1871,9 @@ loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
                  if (file->f_flags & O_NONBLOCK)
                          nonblock = LDLM_FL_BLOCK_NOWAIT;
  
-                if (lsm != NULL) {
-                        rc = ll_glimpse_size(inode, nonblock);
-                        if (rc != 0)
-                                RETURN(rc);
-                }
+                rc = cl_glimpse_size(inode);
+                if (rc != 0)
+                        RETURN(rc);
  
                  ll_inode_size_lock(inode, 0);
                  offset += i_size_read(inode);
@@ -3121,9 +2228,9 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it)
          if (ll_i2info(inode)->lli_smd == NULL)
                  GOTO(out, rc = 0);
  
-        /* ll_glimpse_size will prefer locally cached writes if they extend
+        /* cl_glimpse_size will prefer locally cached writes if they extend
           * the file */
-        rc = ll_glimpse_size(inode, 0);
+        rc = cl_glimpse_size(inode);
          EXIT;
  out:
          ptlrpc_req_finished(req);
@@ -3260,10 +2367,24 @@ check_capabilities:
  }
  #endif
  
+#ifdef HAVE_FILE_READV
+#define READ_METHOD readv
+#define READ_FUNCTION ll_file_readv
+#define WRITE_METHOD writev
+#define WRITE_FUNCTION ll_file_writev
+#else
+#define READ_METHOD aio_read
+#define READ_FUNCTION ll_file_aio_read
+#define WRITE_METHOD aio_write
+#define WRITE_FUNCTION ll_file_aio_write
+#endif
+
  /* -o localflock - only provides locally consistent flock locks */
  struct file_operations ll_file_operations = {
          .read           = ll_file_read,
+        .READ_METHOD    = READ_FUNCTION,
          .write          = ll_file_write,
+        .WRITE_METHOD   = WRITE_FUNCTION,
          .ioctl          = ll_file_ioctl,
          .open           = ll_file_open,
          .release        = ll_file_release,
@@ -3275,7 +2396,9 @@ struct file_operations ll_file_operations = {
  
  struct file_operations ll_file_operations_flock = {
          .read           = ll_file_read,
+        .READ_METHOD    = READ_FUNCTION,
          .write          = ll_file_write,
+        .WRITE_METHOD   = WRITE_FUNCTION,
          .ioctl          = ll_file_ioctl,
          .open           = ll_file_open,
          .release        = ll_file_release,
@@ -3292,7 +2415,9 @@ struct file_operations ll_file_operations_flock = {
  /* These are for -o noflock - to return ENOSYS on flock calls */
  struct file_operations ll_file_operations_noflock = {
          .read           = ll_file_read,
+        .READ_METHOD    = READ_FUNCTION,
          .write          = ll_file_write,
+        .WRITE_METHOD   = WRITE_FUNCTION,
          .ioctl          = ll_file_ioctl,
          .open           = ll_file_open,
          .release        = ll_file_release,
diff --git a/lustre/llite/llite_capa.c b/lustre/llite/llite_capa.c

index ab515b2..818008a 100644 (file)
--- a/lustre/llite/llite_capa.c
+++ b/lustre/llite/llite_capa.c
@@ -339,10 +339,11 @@ struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc)
          struct obd_capa *ocapa;
          int found = 0;
  
-        if ((ll_i2sbi(inode)->ll_flags & LL_SBI_OSS_CAPA) == 0)
-                return NULL;
          ENTRY;
  
+        if ((ll_i2sbi(inode)->ll_flags & LL_SBI_OSS_CAPA) == 0)
+                RETURN(NULL);
+
          LASSERT(opc == CAPA_OPC_OSS_WRITE || opc == CAPA_OPC_OSS_RW ||
                  opc == CAPA_OPC_OSS_TRUNC);
  
@@ -393,7 +394,7 @@ struct obd_capa *ll_mdscapa_get(struct inode *inode)
          ENTRY;
  
          LASSERT(inode != NULL);
-        
+
          if ((ll_i2sbi(inode)->ll_flags & LL_SBI_MDS_CAPA) == 0)
                  RETURN(NULL);
  
@@ -630,7 +631,7 @@ void ll_clear_inode_capas(struct inode *inode)
          ocapa = lli->lli_mds_capa;
          if (ocapa)
                  ll_delete_capa(ocapa);
-                
+
          list_for_each_entry_safe(ocapa, tmp, &lli->lli_oss_capas,
                                   u.cli.lli_list)
                  ll_delete_capa(ocapa);
diff --git a/lustre/llite/llite_close.c b/lustre/llite/llite_close.c

index 1c973b6..53b8514 100644 (file)
--- a/lustre/llite/llite_close.c
+++ b/lustre/llite/llite_close.c
@@ -46,38 +46,39 @@
  #include <lustre_lite.h>
  #include "llite_internal.h"
  
-/* record that a write is in flight */
-void llap_write_pending(struct inode *inode, struct ll_async_page *llap)
+/** records that a write is in flight */
+void vvp_write_pending(struct ccc_object *club, struct ccc_page *page)
  {
-        struct ll_inode_info *lli = ll_i2info(inode);
+        struct ll_inode_info *lli = ll_i2info(club->cob_inode);
  
          ENTRY;
          spin_lock(&lli->lli_lock);
          lli->lli_flags |= LLIF_SOM_DIRTY;
-        if (llap && list_empty(&llap->llap_pending_write))
-                list_add(&llap->llap_pending_write, 
-                         &lli->lli_pending_write_llaps);
+        if (page != NULL && list_empty(&page->cpg_pending_linkage))
+                list_add(&page->cpg_pending_linkage, &club->cob_pending_list);
          spin_unlock(&lli->lli_lock);
          EXIT;
  }
  
-/* record that a write has completed */
-int llap_write_complete(struct inode *inode, struct ll_async_page *llap)
+/** records that a write has completed */
+void vvp_write_complete(struct ccc_object *club, struct ccc_page *page)
  {
-        struct ll_inode_info *lli = ll_i2info(inode);
+        struct ll_inode_info *lli = ll_i2info(club->cob_inode);
          int rc = 0;
-        
+
          ENTRY;
          spin_lock(&lli->lli_lock);
-        if (llap && !list_empty(&llap->llap_pending_write)) {
-                list_del_init(&llap->llap_pending_write);
+        if (page != NULL && !list_empty(&page->cpg_pending_linkage)) {
+                list_del_init(&page->cpg_pending_linkage);
                  rc = 1;
          }
          spin_unlock(&lli->lli_lock);
-        RETURN(rc);
+        if (rc)
+                ll_queue_done_writing(club->cob_inode, 0);
+        EXIT;
  }
  
-/* Queue DONE_WRITING if 
+/** Queues DONE_WRITING if
   * - done writing is allowed;
   * - inode has no no dirty pages; */
  void ll_queue_done_writing(struct inode *inode, unsigned long flags)
@@ -94,7 +95,7 @@ void ll_queue_done_writing(struct inode *inode, unsigned long flags)
                  if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
                           CWARN("ino %lu/%u(flags %lu) som valid it just after "
                                 "recovery\n",
-                               inode->i_ino, inode->i_generation, 
+                               inode->i_ino, inode->i_generation,
                                 lli->lli_flags);
                  /* DONE_WRITING is allowed and inode has no dirty page. */
                  spin_lock(&lcq->lcq_lock);
@@ -118,8 +119,8 @@ void ll_queue_done_writing(struct inode *inode, unsigned long flags)
          spin_unlock(&lli->lli_lock);
  }
  
-/* Close epoch and send Size-on-MDS attribute update if possible. 
- * Call this under @lli->lli_lock spinlock. */
+/** Closes epoch and sends Size-on-MDS attribute update if possible.  Call
+ * this under ll_inode_info::lli_lock spinlock. */
  void ll_epoch_close(struct inode *inode, struct md_op_data *op_data,
                      struct obd_client_handle **och, unsigned long flags)
  {
@@ -140,7 +141,7 @@ void ll_epoch_close(struct inode *inode, struct md_op_data *op_data,
                          inode = igrab(inode);
                          LASSERT(inode);
                          GOTO(out, 0);
-                } 
+                }
                  if (flags & LLIF_DONE_WRITING) {
                          /* Some pages are still dirty, it is early to send
                           * DONE_WRITE. Wait untill all pages will be flushed
@@ -190,7 +191,7 @@ void ll_epoch_close(struct inode *inode, struct md_op_data *op_data,
                        "recovery\n",
                        inode->i_ino, inode->i_generation, lli->lli_flags);
  
-        if (!ll_local_size(inode)) {
+        if (!cl_local_size(inode)) {
                  /* Send Size-on-MDS Attributes if valid. Atime is sent along
                   * with all the attributes. */
                  op_data->op_attr.ia_valid |= ATTR_MTIME_SET | ATTR_CTIME_SET |
@@ -209,7 +210,7 @@ int ll_sizeonmds_update(struct inode *inode, struct md_open_data *mod,
          struct obdo *oa;
          int rc;
          ENTRY;
-        
+
          /* LASSERT(!(lli->lli_flags & LLIF_MDS_SIZE_LOCK)); */
          /* After recovery that can be valid. */
          if (lli->lli_flags & LLIF_MDS_SIZE_LOCK)
@@ -252,7 +253,8 @@ out:
          return rc;
  }
  
-/* Send a DONE_WRITING rpc, pack Size-on-MDS attributes into it, if possible */
+/** Sends a DONE_WRITING rpc, packs Size-on-MDS attributes into it, if
+ * possible */
  static void ll_done_writing(struct inode *inode)
  {
          struct obd_client_handle *och = NULL;
@@ -261,7 +263,7 @@ static void ll_done_writing(struct inode *inode)
          ENTRY;
  
          LASSERT(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM);
-        
+
          OBD_ALLOC_PTR(op_data);
          if (op_data == NULL) {
                  CERROR("can't allocate op_data\n");
@@ -273,12 +275,12 @@ static void ll_done_writing(struct inode *inode)
          /* If there is no @och, we do not do D_W yet. */
          if (och == NULL)
                  GOTO(out, 0);
-        
+
          ll_pack_inode2opdata(inode, op_data, &och->och_fh);
  
          rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, och->och_mod);
          if (rc == -EAGAIN) {
-                /* MDS has instructed us to obtain Size-on-MDS attribute from 
+                /* MDS has instructed us to obtain Size-on-MDS attribute from
                   * OSTs and send setattr to back to MDS. */
                  rc = ll_sizeonmds_update(inode, och->och_mod,
                                           &och->och_fh, op_data->op_ioepoch);
@@ -322,7 +324,7 @@ static int ll_close_thread(void *arg)
                  snprintf(name, sizeof(name) - 1, "ll_close");
                  cfs_daemonize(name);
          }
-        
+
          complete(&lcq->lcq_comp);
  
          while (1) {
diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h

index 07e2976..2c6153b 100644 (file)
--- a/lustre/llite/llite_internal.h
+++ b/lustre/llite/llite_internal.h
@@ -44,6 +44,10 @@
  #include <lustre_disk.h>  /* for s2sbi */
  #include <lustre_eacl.h>
  
+/* for struct cl_lock_descr and struct cl_io */
+#include <cl_object.h>
+#include <lclient.h>
+
  #ifndef FMODE_EXEC
  #define FMODE_EXEC 0
  #endif
@@ -127,7 +131,6 @@ struct ll_inode_info {
           * Open handle data are needed for the recovery to reconstruct
           * the inode state on the MDS. XXX: recovery is not ready yet. */
          struct obd_client_handle *lli_pending_och;
-        atomic_t                lli_mmap_cnt;
  
          /* for writepage() only to communicate to fsync */
          int                     lli_async_rc;
@@ -171,12 +174,13 @@ struct ll_inode_info {
           * dir statahead.
           */
          pid_t                   lli_opendir_pid;
-        /* 
+        /*
           * since parent-child threads can share the same @file struct,
           * "opendir_key" is the token when dir close for case of parent exit
           * before child -- it is me should cleanup the dir readahead. */
          void                   *lli_opendir_key;
          struct ll_statahead_info *lli_sai;
+        struct cl_object       *lli_clob;
  };
  
  /*
@@ -220,28 +224,31 @@ enum ra_stat {
          _NR_RA_STAT,
  };
  
-#define LL_RA_STAT      _NR_RA_STAT
-#define LL_RA_STAT_STRINGS           {                                  \
-        [RA_STAT_HIT]               = "hits",                           \
-        [RA_STAT_MISS]              = "misses",                         \
-        [RA_STAT_DISTANT_READPAGE]  = "readpage not consecutive",       \
-        [RA_STAT_MISS_IN_WINDOW]    = "miss inside window",             \
-        [RA_STAT_FAILED_GRAB_PAGE]  = "failed grab_cache_page",         \
-        [RA_STAT_FAILED_MATCH]      = "failed lock match",              \
-        [RA_STAT_DISCARDED]         = "read but discarded",             \
-        [RA_STAT_ZERO_LEN]          = "zero length file",               \
-        [RA_STAT_ZERO_WINDOW]       = "zero size window",               \
-        [RA_STAT_EOF]               = "read-ahead to EOF",              \
-        [RA_STAT_MAX_IN_FLIGHT]     = "hit max r-a issue",              \
-        [RA_STAT_WRONG_GRAB_PAGE]   = "wrong page from grab_cache_page",\
-} 
-
  struct ll_ra_info {
          atomic_t                  ra_cur_pages;
          unsigned long             ra_max_pages;
          unsigned long             ra_max_read_ahead_whole_pages;
  };
  
+/* ra_io_arg is filled in at the beginning of ll_readahead with
+ * ras_lock; the following ll_read_ahead_pages then reads RA pages
+ * according to this arg. All the items in this structure are
+ * counted by page index.
+ */
+struct ra_io_arg {
+        unsigned long ria_start;  /* start offset of read-ahead*/
+        unsigned long ria_end;    /* end offset of read-ahead*/
+        /* If a stride read pattern is detected, ria_stoff is where the
+         * stride read started. Note: for normal read-ahead the value
+         * here is meaningless and will not be accessed. */
+        pgoff_t ria_stoff;
+        /* ria_length and ria_pages are the length and the pages length in
+         * stride I/O mode. They are also used, when reading ahead pages,
+         * to check whether this is a stride I/O read-ahead. */
+        unsigned long ria_length;
+        unsigned long ria_pages;
+};
+
  /* LL_HIST_MAX=32 causes an overflow */
  #define LL_HIST_MAX 28
  #define LL_HIST_START 12 /* buckets start at 2^12 = 4k */
@@ -267,7 +274,7 @@ struct ll_rw_process_info {
          loff_t                    rw_offset;
          size_t                    rw_smallest_extent;
          size_t                    rw_largest_extent;
-        struct file               *rw_last_file;
+        struct ll_file_data      *rw_last_file;
  };
  
  enum stats_track_type {
@@ -323,30 +330,10 @@ struct eacl_table {
          struct list_head et_entries[EE_HASHES];
  };
  
-/* percpu data structure for lustre lru page list */
-struct ll_pglist_data {
-        spinlock_t                llpd_lock; /* lock to protect llpg_list */
-        struct list_head          llpd_list; /* all pages (llap_pglist_item) */
-        unsigned long             llpd_gen;  /* generation # of this list */
-        unsigned long             llpd_count; /* How many pages in this list */
-        atomic_t                  llpd_sample_count;
-        unsigned long             llpd_reblnc_count;
-        /* the pages in this list shouldn't be over this number */
-        unsigned long             llpd_budget; 
-        int                       llpd_cpu;
-        /* which page the pglist data is in */
-        struct page              *llpd_page; 
-
-        /* stats */
-        unsigned long             llpd_hit;
-        unsigned long             llpd_miss;
-        unsigned long             llpd_cross;
-};
-
  struct ll_sb_info {
          struct list_head          ll_list;
-        /* this protects pglist(only ll_async_page_max) and ra_info.  
-         * It isn't safe to grab from interrupt contexts. */
+        /* this protects pglist and ra_info.  It isn't safe to
+         * grab from interrupt contexts */
          spinlock_t                ll_lock;
          spinlock_t                ll_pp_extent_lock; /* Lock for pp_extent entries */
          spinlock_t                ll_process_lock; /* Lock for ll_rw_process_info */
@@ -365,23 +352,11 @@ struct ll_sb_info {
  
          struct lprocfs_stats     *ll_stats; /* lprocfs stats counter */
  
-        /* reblnc lock protects llpd_budget */
-        spinlock_t                ll_async_page_reblnc_lock;
-        unsigned long             ll_async_page_reblnc_count;
-        unsigned long             ll_async_page_sample_max;
-        /* I defined this array here rather than in ll_pglist_data
-         * because it is always accessed by only one cpu. -jay */
-        unsigned long            *ll_async_page_sample;
          unsigned long             ll_async_page_max;
-        unsigned long             ll_async_page_clock_hand;
-        lcounter_t                ll_async_page_count;
-        struct ll_pglist_data   **ll_pglist;
+        unsigned long             ll_async_page_count;
  
          struct lprocfs_stats     *ll_ra_stats;
  
-        unsigned                  ll_contention_time; /* seconds */
-        unsigned                  ll_lockless_truncate_enable; /* true/false */
-
          struct ll_ra_info         ll_ra_info;
          unsigned int              ll_namelen;
          struct file_operations   *ll_fop;
@@ -394,6 +369,8 @@ struct ll_sb_info {
           * >0 - max. chunk to be read/written w/o lock re-acquiring */
          unsigned long             ll_max_rw_chunk;
  
+        struct lu_site           *ll_site;
+        struct cl_device         *ll_cl;
          /* Statistics */
          struct ll_rw_extents_info ll_rw_extents_info;
          int                       ll_extent_process_count;
@@ -425,68 +402,6 @@ struct ll_sb_info {
  
  #define LL_DEFAULT_MAX_RW_CHUNK      (32 * 1024 * 1024)
  
-#define LL_PGLIST_DATA_CPU(sbi, cpu) ((sbi)->ll_pglist[cpu])
-#define LL_PGLIST_DATA(sbi)          LL_PGLIST_DATA_CPU(sbi, smp_processor_id())
-
-static inline struct ll_pglist_data *ll_pglist_cpu_lock(
-                struct ll_sb_info *sbi, 
-                int cpu)
-{
-        spin_lock(&sbi->ll_pglist[cpu]->llpd_lock);
-        return LL_PGLIST_DATA_CPU(sbi, cpu);
-}
-
-static inline void ll_pglist_cpu_unlock(struct ll_sb_info *sbi, int cpu)
-{
-        spin_unlock(&sbi->ll_pglist[cpu]->llpd_lock);
-}
-
-static inline struct ll_pglist_data *ll_pglist_double_lock(
-                struct ll_sb_info *sbi, 
-                int cpu, struct ll_pglist_data **pd_cpu)
-{
-        int current_cpu = get_cpu();
-
-        if (cpu == current_cpu) {
-                ll_pglist_cpu_lock(sbi, cpu);
-        } else if (current_cpu < cpu) {
-                ll_pglist_cpu_lock(sbi, current_cpu);
-                ll_pglist_cpu_lock(sbi, cpu);
-        } else {
-                ll_pglist_cpu_lock(sbi, cpu);
-                ll_pglist_cpu_lock(sbi, current_cpu);
-        }
-
-        if (pd_cpu)
-                *pd_cpu = LL_PGLIST_DATA_CPU(sbi, cpu);
-
-        return LL_PGLIST_DATA(sbi);
-}
-
-static inline void ll_pglist_double_unlock(struct ll_sb_info *sbi, int cpu)
-{
-        int current_cpu = smp_processor_id();
-        if (cpu == current_cpu) {
-                ll_pglist_cpu_unlock(sbi, cpu);
-        } else {
-                ll_pglist_cpu_unlock(sbi, cpu);
-                ll_pglist_cpu_unlock(sbi, current_cpu);
-        }
-        put_cpu();
-}
-
-static inline struct ll_pglist_data *ll_pglist_lock(struct ll_sb_info *sbi)
-{
-        ll_pglist_cpu_lock(sbi, get_cpu());
-        return LL_PGLIST_DATA(sbi);
-}
-
-static inline void ll_pglist_unlock(struct ll_sb_info *sbi)
-{
-        ll_pglist_cpu_unlock(sbi, smp_processor_id());
-        put_cpu();
-}
-
  struct ll_ra_read {
          pgoff_t             lrr_start;
          pgoff_t             lrr_count;
@@ -557,12 +472,12 @@ struct ll_readahead_state {
           * protected by ->ras_lock.
           */
          struct list_head ras_read_beads;
-        /* 
+        /*
           * The following 3 items are used for detecting the stride I/O
-         * mode. 
-        * In stride I/O mode, 
-         * ...............|-----data-----|****gap*****|--------|******|.... 
-         *    offset      |-stride_pages-|-stride_gap-| 
+         * mode.
+        * In stride I/O mode,
+         * ...............|-----data-----|****gap*****|--------|******|....
+         *    offset      |-stride_pages-|-stride_gap-|
           * ras_stride_offset = offset;
           * ras_stride_length = stride_pages + stride_gap;
           * ras_stride_pages = stride_pages;
@@ -571,7 +486,7 @@ struct ll_readahead_state {
          unsigned long ras_stride_length;
          unsigned long ras_stride_pages;
          pgoff_t ras_stride_offset;
-        /* 
+        /*
           * number of consecutive stride request count, and it is similar as
           * ras_consecutive_requests, but used for stride I/O mode.
           * Note: only more than 2 consecutive stride request are detected,
@@ -592,6 +507,7 @@ struct ll_file_data {
          unsigned long fd_gid;
          struct ll_file_dir fd_dir;
          __u32 fd_flags;
+        struct file *fd_file;
  };
  
  struct lov_stripe_md;
@@ -618,48 +534,6 @@ void ll_i2gids(__u32 *suppgids, struct inode *i1,struct inode *i2);
  
  extern cfs_mem_cache_t *ll_async_page_slab;
  extern size_t ll_async_page_slab_size;
-struct ll_async_page {
-        int              llap_magic;
-         /* only trust these if the page lock is providing exclusion */
-        unsigned int     llap_write_queued:1,
-                         llap_defer_uptodate:1,
-                         llap_origin:3,
-                         llap_ra_used:1,
-                         llap_ignore_quota:1,
-                         llap_nocache:1,
-                         llap_lockless_io_page:1,
-                         llap_reserved:7;
-        unsigned int     llap_pglist_cpu:16;
-        void            *llap_cookie;
-        struct page     *llap_page;
-        struct list_head llap_pending_write;
-        struct list_head llap_pglist_item;
-        /* checksum for paranoid I/O debugging */
-        __u32            llap_checksum;
-};
-
-static inline struct ll_async_page *llap_from_cookie(void *ptr)
-{
-        struct ll_async_page *ap = ptr;
-        LASSERT(ap->llap_magic == LLAP_MAGIC);
-        return ap;
-}
-
-/*
- * enumeration of llap_from_page() call-sites. Used to export statistics in
- * /proc/fs/lustre/llite/fsN/dump_page_cache.
- */
-enum {
-        LLAP_ORIGIN_UNKNOWN = 0,
-        LLAP_ORIGIN_READPAGE,
-        LLAP_ORIGIN_READAHEAD,
-        LLAP_ORIGIN_COMMIT_WRITE,
-        LLAP_ORIGIN_WRITEPAGE,
-        LLAP_ORIGIN_REMOVEPAGE,
-        LLAP_ORIGIN_LOCKLESS_IO,
-        LLAP__ORIGIN_MAX,
-};
-extern char *llap_origins[];
  
  #ifdef HAVE_REGISTER_CACHE
  #include <linux/cache_def.h>
@@ -738,22 +612,17 @@ void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry);
  /* llite/rw.c */
  int ll_prepare_write(struct file *, struct page *, unsigned from, unsigned to);
  int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to);
-int ll_writepage(struct page *page);
-void ll_inode_fill_obdo(struct inode *inode, int cmd, struct obdo *oa);
-int ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc);
-int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction);
-struct ll_async_page *llap_from_page(struct page *page, unsigned origin);
-extern struct cache_definition ll_cache_definition;
+int ll_writepage(struct page *page, struct writeback_control *wbc);
  void ll_removepage(struct page *page);
  int ll_readpage(struct file *file, struct page *page);
-struct ll_async_page *llap_cast_private(struct page *page);
  void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras);
-void ll_ra_accounting(struct ll_async_page *llap,struct address_space *mapping);
  void ll_truncate(struct inode *inode);
  int ll_file_punch(struct inode *, loff_t, int);
  ssize_t ll_file_lockless_io(struct file *, char *, size_t, loff_t *, int);
  void ll_clear_file_contended(struct inode*);
  int ll_sync_page_range(struct inode *, struct address_space *, loff_t, size_t);
+int ll_readahead(const struct lu_env *env, struct cl_io *io, struct ll_readahead_state *ras,
+                 struct address_space *mapping, struct cl_page_list *queue, int flags);
  
  /* llite/file.c */
  extern struct file_operations ll_file_operations;
@@ -764,19 +633,11 @@ extern int ll_inode_revalidate_it(struct dentry *, struct lookup_intent *);
  extern int ll_have_md_lock(struct inode *inode, __u64 bits);
  extern ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits,
                                     struct lustre_handle *lockh);
-int ll_region_mapped(unsigned long addr, size_t count);
-int ll_extent_lock(struct ll_file_data *, struct inode *,
-                   struct lov_stripe_md *, int mode, ldlm_policy_data_t *,
-                   struct lustre_handle *, int ast_flags);
-int ll_extent_unlock(struct ll_file_data *, struct inode *,
-                     struct lov_stripe_md *, int mode, struct lustre_handle *);
  int ll_file_open(struct inode *inode, struct file *file);
  int ll_file_release(struct inode *inode, struct file *file);
  int ll_lsm_getattr(struct obd_export *, struct lov_stripe_md *, struct obdo *);
-int ll_local_size(struct inode *inode);
  int ll_glimpse_ioctl(struct ll_sb_info *sbi,
                       struct lov_stripe_md *lsm, lstat_t *st);
-int ll_glimpse_size(struct inode *inode, int ast_flags);
  int ll_local_open(struct file *file,
                    struct lookup_intent *it, struct ll_file_data *fd,
                    struct obd_client_handle *och);
@@ -793,8 +654,9 @@ int ll_md_setattr(struct inode *inode, struct md_op_data *op_data,
                    struct md_open_data **mod);
  void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data,
                            struct lustre_handle *fh);
-extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct file
-                               *file, size_t count, int rw);
+extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
+                              struct ll_file_data *file, loff_t pos,
+                              size_t count, int rw);
  int ll_getattr_it(struct vfsmount *mnt, struct dentry *de,
                 struct lookup_intent *it, struct kstat *stat);
  int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat);
@@ -808,15 +670,12 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
                               struct ptlrpc_request **request);
  int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump,
                       int set_default);
-int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmm, 
+int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmm,
                       int *lmm_size, struct ptlrpc_request **request);
  int ll_fsync(struct file *file, struct dentry *dentry, int data);
  int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap,
                int num_bytes);
-void ll_pin_extent_cb(void *data);
-int ll_page_removal_cb(void *data, int discard);
-int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new,
-                             void *data, int flag);
+int ll_merge_lvb(struct inode *inode);
  
  /* llite/dcache.c */
  /* llite/namei.c */
@@ -873,8 +732,6 @@ int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req,
                    struct super_block *);
  void lustre_dump_dentry(struct dentry *, int recur);
  void lustre_dump_inode(struct inode *);
-struct ll_async_page *llite_pglist_next_llap(struct list_head *head,
-                                             struct list_head *list);
  int ll_obd_statfs(struct inode *inode, void *arg);
  int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize);
  int ll_process_config(struct lustre_cfg *lcfg);
@@ -912,9 +769,96 @@ struct ll_close_queue {
          atomic_t                lcq_stop;
  };
  
-void llap_write_pending(struct inode *inode, struct ll_async_page *llap);
-int llap_write_complete(struct inode *inode, struct ll_async_page *llap);
-int ll_inode_dirty(struct inode *inode, unsigned long flags);
+struct vvp_thread_info {
+        struct ost_lvb       vti_lvb;
+        struct cl_2queue     vti_queue;
+        struct iovec         vti_local_iov;
+        struct ccc_io_args   vti_args;
+        struct ra_io_arg     vti_ria;
+        struct kiocb         vti_kiocb;
+};
+
+struct ccc_object *cl_inode2ccc(struct inode *inode);
+
+static inline struct vvp_thread_info *vvp_env_info(const struct lu_env *env)
+{
+        extern struct lu_context_key vvp_key;
+        struct vvp_thread_info      *info;
+
+        info = lu_context_key_get(&env->le_ctx, &vvp_key);
+        LASSERT(info != NULL);
+        return info;
+}
+
+void vvp_write_pending (struct ccc_object *club, struct ccc_page *page);
+void vvp_write_complete(struct ccc_object *club, struct ccc_page *page);
+
+struct vvp_io {
+        union {
+                struct {
+                        read_actor_t      cui_actor;
+                        void             *cui_target;
+                } read;
+                struct vvp_fault_io {
+                        /**
+                         * Inode modification time that is checked across DLM
+                         * lock request.
+                         */
+                        time_t                 ft_mtime;
+                        struct vm_area_struct *ft_vma;
+                        /**
+                         * Virtual address at which fault occurred.
+                         */
+                        unsigned long          ft_address;
+                        /**
+                         * Fault type, as to be supplied to filemap_nopage().
+                         */
+                        int                   *ft_type;
+                } fault;
+        } u;
+        /**
+         * Read-ahead state used by read and page-fault IO contexts.
+         */
+        struct ll_ra_read    cui_bead;
+        /**
+         * Set when cui_bead has been initialized.
+         */
+        int                  cui_ra_window_set;
+        /**
+         * If IO was created directly in low level method like
+         * ->prepare_write(), this field stores the number of method calls
+         * that constitute this IO. This field is decremented by ll_cl_fini(),
+         * and cl_io is destroyed, when it reaches 0. When oneshot IO
+         * completes, this field is set to -1.
+         */
+
+        int                  cui_oneshot;
+        /**
+         * Partially truncated page, that vvp_io_trunc_start() keeps locked
+         * across truncate.
+         */
+        struct cl_page      *cui_partpage;
+};
+
+struct vvp_session {
+        struct vvp_io vs_ios;
+};
+
+static inline struct vvp_session *vvp_env_session(const struct lu_env *env)
+{
+        extern struct lu_context_key vvp_session_key;
+        struct vvp_session *ses;
+
+        ses = lu_context_key_get(env->le_ses, &vvp_session_key);
+        LASSERT(ses != NULL);
+        return ses;
+}
+
+static inline struct vvp_io *vvp_env_io(const struct lu_env *env)
+{
+        return &vvp_env_session(env)->vs_ios;
+}
+
  void ll_queue_done_writing(struct inode *inode, unsigned long flags);
  void ll_close_thread_shutdown(struct ll_close_queue *lcq);
  int ll_close_thread_start(struct ll_close_queue **lcq_ret);
@@ -934,10 +878,9 @@ int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last);
  int ll_file_mmap(struct file * file, struct vm_area_struct * vma);
  struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start,
                                                __u64 end, ldlm_mode_t mode);
-int ll_tree_lock(struct ll_lock_tree *tree,
-                 struct ll_lock_tree_node *first_node,
-                 const char *buf, size_t count, int ast_flags);
-int ll_tree_unlock(struct ll_lock_tree *tree);
+void policy_from_vma(ldlm_policy_data_t *policy,
+                struct vm_area_struct *vma, unsigned long addr, size_t count);
+struct vm_area_struct *our_vma(unsigned long addr, size_t count);
  
  #define    ll_s2sbi(sb)        (s2lsi(sb)->lsi_llsbi)
  
@@ -1042,6 +985,28 @@ void ll_truncate_free_capa(struct obd_capa *ocapa);
  void ll_clear_inode_capas(struct inode *inode);
  void ll_print_capa_stat(struct ll_sb_info *sbi);
  
+/* llite/llite_cl.c */
+extern struct lu_device_type vvp_device_type;
+
+/**
+ * Common IO arguments for various VFS I/O interfaces.
+ */
+
+int cl_sb_init(struct super_block *sb);
+int cl_sb_fini(struct super_block *sb);
+int cl_inode_init(struct inode *inode, struct lustre_md *md);
+void cl_inode_fini(struct inode *inode);
+
+enum cl_lock_mode  vvp_mode_from_vma(struct vm_area_struct *vma);
+void ll_io_init(struct cl_io *io, const struct file *file, int write);
+
+void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+                struct ll_readahead_state *ras, unsigned long index,
+                unsigned hit);
+void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len);
+int ll_is_file_contended(struct file *file);
+void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which);
+
  /* llite/llite_rmtacl.c */
  #ifdef CONFIG_FS_POSIX_ACL
  obd_valid rce_ops2valid(int ops);
@@ -1150,7 +1115,7 @@ int ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup)
           * "IT_GETATTR" for the first time, and the subsequent "IT_GETATTR"
           * will bypass interacting with statahead thread for checking:
           * "lld_sa_generation == lli_sai->sai_generation"
-         */ 
+         */
          if (ldd && lli->lli_sai &&
              ldd->lld_sa_generation == lli->lli_sai->sai_generation)
                  return -EAGAIN;
@@ -1188,36 +1153,66 @@ enum llioc_iter {
   * Parameters:
   *  @magic: Dynamic ioctl call routine will feed this vaule with the pointer
   *      returned to ll_iocontrol_register.  Callback functions should use this
- *      data to check the potential collasion of ioctl cmd. If collasion is 
+ *      data to check the potential collision of ioctl cmd. If collision is
   *      found, callback function should return LLIOC_CONT.
   *  @rcp: The result of ioctl command.
   *
   *  Return values:
- *      If @magic matches the pointer returned by ll_iocontrol_data, the 
+ *      If @magic matches the pointer returned by ll_iocontrol_data, the
   *      callback should return LLIOC_STOP; return LLIOC_STOP otherwise.
   */
-typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode, 
+typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode,
                  struct file *file, unsigned int cmd, unsigned long arg,
                  void *magic, int *rcp);
  
-enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file, 
+enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file,
                  unsigned int cmd, unsigned long arg, int *rcp);
  
  /* export functions */
-/* Register ioctl block dynamatically for a regular file. 
+/* Register ioctl block dynamically for a regular file.
   *
   * @cmd: the array of ioctl command set
   * @count: number of commands in the @cmd
- * @cb: callback function, it will be called if an ioctl command is found to 
+ * @cb: callback function, it will be called if an ioctl command is found to
   *      belong to the command list @cmd.
   *
   * Return vaule:
- *      A magic pointer will be returned if success; 
- *      otherwise, NULL will be returned. 
+ *      A magic pointer will be returned if success;
+ *      otherwise, NULL will be returned.
   * */
  void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd);
  void ll_iocontrol_unregister(void *magic);
  
  #endif
  
+/* lclient compat stuff */
+#define cl_inode_info ll_inode_info
+#define cl_i2info(info) ll_i2info(info)
+#define cl_inode_mode(inode) ((inode)->i_mode)
+#define cl_i2sbi ll_i2sbi
+#define cl_isize_read(inode) i_size_read(inode)
+#define cl_isize_write(inode,kms) i_size_write(inode, kms)
+#define cl_isize_write_nolock(inode,kms) do {(inode)->i_size=(kms);}while(0)
+
+static inline void cl_isize_lock(struct inode *inode, int lsmlock)
+{
+        ll_inode_size_lock(inode, lsmlock);
+}
+
+static inline void cl_isize_unlock(struct inode *inode, int lsmlock)
+{
+        ll_inode_size_unlock(inode, lsmlock);
+}
+
+static inline int cl_merge_lvb(struct inode *inode)
+{
+        return ll_merge_lvb(inode);
+}
+
+#define cl_inode_atime(inode) LTIME_S((inode)->i_atime)
+#define cl_inode_ctime(inode) LTIME_S((inode)->i_ctime)
+#define cl_inode_mtime(inode) LTIME_S((inode)->i_mtime)
+
+struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt);
+
  #endif /* LLITE_INTERNAL_H */
diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c

index ca9a7f8..3ed9c85 100644 (file)
--- a/lustre/llite/llite_lib.c
+++ b/lustre/llite/llite_lib.c
@@ -53,8 +53,8 @@
  #include <lustre_disk.h>
  #include <lustre_param.h>
  #include <lustre_log.h>
+#include <cl_object.h>
  #include <obd_cksum.h>
-#include <lustre_cache.h>
  #include "llite_internal.h"
  
  cfs_mem_cache_t *ll_file_data_slab;
@@ -69,65 +69,6 @@ extern struct address_space_operations ll_dir_aops;
  #define log2(n) ffz(~(n))
  #endif
  
-static inline void ll_pglist_fini(struct ll_sb_info *sbi)
-{
-        struct page *page;
-        int i;
-
-        if (sbi->ll_pglist == NULL)
-                return;
-
-        for_each_possible_cpu(i) {
-                page = sbi->ll_pglist[i]->llpd_page;
-                if (page) {
-                        sbi->ll_pglist[i] = NULL;
-                        __free_page(page);
-                }
-        }
-
-        OBD_FREE(sbi->ll_pglist, sizeof(void *)*num_possible_cpus());
-        sbi->ll_pglist = NULL;
-}
-
-static inline int ll_pglist_init(struct ll_sb_info *sbi)
-{
-        struct ll_pglist_data *pd;
-        unsigned long budget;
-        int i, color = 0;
-        ENTRY;
-
-        OBD_ALLOC(sbi->ll_pglist, sizeof(void *) * num_possible_cpus());
-        if (sbi->ll_pglist == NULL)
-                RETURN(-ENOMEM);
-
-        budget = sbi->ll_async_page_max / num_online_cpus();
-        for_each_possible_cpu(i) {
-                struct page *page = alloc_pages_node(cpu_to_node(i),
-                                                    GFP_KERNEL, 0);
-                if (page == NULL) {
-                        ll_pglist_fini(sbi);
-                        RETURN(-ENOMEM);
-                }
-
-                if (color + L1_CACHE_ALIGN(sizeof(*pd)) > PAGE_SIZE)
-                        color = 0;
-
-                pd = (struct ll_pglist_data *)(page_address(page) + color);
-                memset(pd, 0, sizeof(*pd));
-                spin_lock_init(&pd->llpd_lock);
-                INIT_LIST_HEAD(&pd->llpd_list);
-                if (cpu_online(i))
-                        pd->llpd_budget = budget;
-                pd->llpd_cpu = i;
-                pd->llpd_page = page;
-                atomic_set(&pd->llpd_sample_count, 0);
-                sbi->ll_pglist[i] = pd;
-                color += L1_CACHE_ALIGN(sizeof(*pd));
-        }
-
-        RETURN(0);
-}
-
  static struct ll_sb_info *ll_init_sbi(void)
  {
          struct ll_sb_info *sbi = NULL;
@@ -141,10 +82,6 @@ static struct ll_sb_info *ll_init_sbi(void)
          if (!sbi)
                  RETURN(NULL);
  
-        OBD_ALLOC(sbi->ll_async_page_sample, sizeof(long)*num_possible_cpus());
-        if (sbi->ll_async_page_sample == NULL)
-                GOTO(out, 0);
-
          spin_lock_init(&sbi->ll_lock);
          spin_lock_init(&sbi->ll_lco.lco_lock);
          spin_lock_init(&sbi->ll_pp_extent_lock);
@@ -162,20 +99,11 @@ static struct ll_sb_info *ll_init_sbi(void)
          } else {
                  sbi->ll_async_page_max = (pages / 4) * 3;
          }
-        lcounter_init(&sbi->ll_async_page_count);
-        spin_lock_init(&sbi->ll_async_page_reblnc_lock);
-        sbi->ll_async_page_sample_max = 64 * num_online_cpus();
-        sbi->ll_async_page_reblnc_count = 0;
-        sbi->ll_async_page_clock_hand = 0;
-        if (ll_pglist_init(sbi))
-                GOTO(out, 0);
  
          sbi->ll_ra_info.ra_max_pages = min(pages / 32,
                                             SBI_DEFAULT_READAHEAD_MAX);
          sbi->ll_ra_info.ra_max_read_ahead_whole_pages =
                                             SBI_DEFAULT_READAHEAD_WHOLE_MAX;
-        sbi->ll_contention_time = SBI_DEFAULT_CONTENTION_SECONDS;
-        sbi->ll_lockless_truncate_enable = SBI_DEFAULT_LOCKLESS_TRUNCATE_ENABLE;
          INIT_LIST_HEAD(&sbi->ll_conn_chain);
          INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list);
  
@@ -208,14 +136,6 @@ static struct ll_sb_info *ll_init_sbi(void)
          sbi->ll_sa_max = LL_SA_RPC_DEF;
  
          RETURN(sbi);
-
-out:
-        if (sbi->ll_async_page_sample)
-                OBD_FREE(sbi->ll_async_page_sample,
-                         sizeof(long) * num_possible_cpus());
-        ll_pglist_fini(sbi);
-        OBD_FREE(sbi, sizeof(*sbi));
-        RETURN(NULL);
  }
  
  void ll_free_sbi(struct super_block *sb)
@@ -224,13 +144,9 @@ void ll_free_sbi(struct super_block *sb)
          ENTRY;
  
          if (sbi != NULL) {
-                ll_pglist_fini(sbi);
                  spin_lock(&ll_sb_lock);
                  list_del(&sbi->ll_list);
                  spin_unlock(&ll_sb_lock);
-                lcounter_destroy(&sbi->ll_async_page_count);
-                OBD_FREE(sbi->ll_async_page_sample,
-                         sizeof(long) * num_possible_cpus());
                  OBD_FREE(sbi, sizeof(*sbi));
          }
          EXIT;
@@ -523,45 +439,12 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
          sbi->ll_lco.lco_flags = data->ocd_connect_flags;
          spin_unlock(&sbi->ll_lco.lco_lock);
  
-        err = obd_register_page_removal_cb(sbi->ll_dt_exp,
-                                           ll_page_removal_cb,
-                                           ll_pin_extent_cb);
-        if (err) {
-                CERROR("cannot register page removal callback: rc = %d\n",err);
-                GOTO(out_dt, err);
-        }
-        err = obd_register_lock_cancel_cb(sbi->ll_dt_exp,
-                                          ll_extent_lock_cancel_cb);
-        if (err) {
-                CERROR("cannot register lock cancel callback: rc = %d\n", err);
-                GOTO(out_page_rm_cb, err);
-        }
-
          err = ll_init_ea_size(sbi->ll_md_exp, sbi->ll_dt_exp);;
          if (err) {
                  CERROR("cannot set max EA and cookie sizes: rc = %d\n", err);
                  GOTO(out_lock_cn_cb, err);
          }
  
-        err = obd_prep_async_page(sbi->ll_dt_exp, NULL, NULL, NULL,
-                                  0, NULL, NULL, NULL, 0, NULL);
-        if (err < 0) {
-                LCONSOLE_ERROR_MSG(0x151, "There are no OST's in this "
-                                   "filesystem. There must be at least one "
-                                   "active OST for a client to start.\n");
-                GOTO(out_lock_cn_cb, err);
-        }
-
-        if (!ll_async_page_slab) {
-                ll_async_page_slab_size =
-                        size_round(sizeof(struct ll_async_page)) + err;
-                ll_async_page_slab = cfs_mem_cache_create("ll_async_page",
-                                                          ll_async_page_slab_size,
-                                                          0, 0);
-                if (!ll_async_page_slab)
-                        GOTO(out_lock_cn_cb, err = -ENOMEM);
-        }
-
          fid_zero(&sbi->ll_root_fid);
          err = md_getstatus(sbi->ll_md_exp, &sbi->ll_root_fid, &oc);
          if (err) {
@@ -585,7 +468,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
          else if (sbi->ll_flags & LL_SBI_ACL)
                  valid |= OBD_MD_FLACL;
  
-        err = md_getattr(sbi->ll_md_exp, &sbi->ll_root_fid, oc, valid, 0, 
+        err = md_getattr(sbi->ll_md_exp, &sbi->ll_root_fid, oc, valid, 0,
                           &request);
          if (oc)
                  free_capa(oc);
@@ -607,7 +490,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
          md_free_lustre_md(sbi->ll_md_exp, &lmd);
          ptlrpc_req_finished(request);
  
-        if (root == NULL || is_bad_inode(root)) {
+        if (root == NULL || IS_ERR(root)) {
                  if (lmd.lsm)
                          obd_free_memmd(sbi->ll_dt_exp, &lmd.lsm);
  #ifdef CONFIG_FS_POSIX_ACL
@@ -616,8 +499,10 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
                          lmd.posix_acl = NULL;
                  }
  #endif
+                err = IS_ERR(root) ? PTR_ERR(root) : -EBADF;
+                root = NULL;
                  CERROR("lustre_lite: bad iget4 for root\n");
-                GOTO(out_root, err = -EBADF);
+                GOTO(out_root, err);
          }
  
          err = ll_close_thread_start(&sbi->ll_lcq);
@@ -637,6 +522,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt)
          err = obd_set_info_async(sbi->ll_dt_exp, sizeof(KEY_CHECKSUM),
                                   KEY_CHECKSUM, sizeof(checksum), &checksum,
                                   NULL);
+        cl_sb_init(sb);
  
          sb->s_root = d_alloc_root(root);
          if (data != NULL)
@@ -660,11 +546,6 @@ out_root:
          if (root)
                  iput(root);
  out_lock_cn_cb:
-        obd_unregister_lock_cancel_cb(sbi->ll_dt_exp,
-                                      ll_extent_lock_cancel_cb);
-out_page_rm_cb:
-        obd_unregister_page_removal_cb(sbi->ll_dt_exp,
-                                       ll_page_removal_cb);
          obd_fid_fini(sbi->ll_dt_exp);
  out_dt:
          obd_disconnect(sbi->ll_dt_exp);
@@ -1119,6 +1000,8 @@ out_free:
  } /* ll_fill_super */
  
  
+void lu_context_keys_dump(void);
+
  void ll_put_super(struct super_block *sb)
  {
          struct config_llog_instance cfg;
@@ -1154,6 +1037,8 @@ void ll_put_super(struct super_block *sb)
                  }
          }
  
+        cl_sb_fini(sb);
+
          if (sbi->ll_lcq) {
                  /* Only if client_common_fill_super succeeded */
                  client_common_put_super(sb);
@@ -1171,6 +1056,9 @@ void ll_put_super(struct super_block *sb)
  
          lustre_common_put_super(sb);
  
+        cl_env_cache_purge(~0);
+        lu_context_keys_dump();
+
          LCONSOLE_WARN("client %s umount complete\n", ll_instance);
  
          cfs_module_put();
@@ -1178,32 +1066,6 @@ void ll_put_super(struct super_block *sb)
          EXIT;
  } /* client_put_super */
  
-#if defined(HAVE_REGISTER_CACHE) || defined(HAVE_SHRINKER_CACHE)
-
-#if defined(HAVE_CACHE_RETURN_INT)
-static int
-#else
-static void
-#endif
-ll_shrink_cache(int priority, unsigned int gfp_mask)
-{
-        struct ll_sb_info *sbi;
-        int count = 0;
-
-        list_for_each_entry(sbi, &ll_super_blocks, ll_list)
-                count += llap_shrink_cache(sbi, priority);
-
-#if defined(HAVE_CACHE_RETURN_INT)
-        return count;
-#endif
-}
-
-struct cache_definition ll_cache_definition = {
-        .name = "llap_cache",
-        .shrink = ll_shrink_cache
-};
-#endif /* HAVE_REGISTER_CACHE || HAVE_SHRINKER_CACHE */
-
  struct inode *ll_inode_from_lock(struct ldlm_lock *lock)
  {
          struct inode *inode = NULL;
@@ -1270,14 +1132,6 @@ void ll_clear_inode(struct inode *inode)
          if (lli->lli_mds_read_och)
                  ll_md_real_close(inode, FMODE_READ);
  
-        if (lli->lli_smd) {
-                obd_change_cbdata(sbi->ll_dt_exp, lli->lli_smd,
-                                  null_if_equal, inode);
-
-                obd_free_memmd(sbi->ll_dt_exp, &lli->lli_smd);
-                lli->lli_smd = NULL;
-        }
-
          if (lli->lli_symlink_name) {
                  OBD_FREE(lli->lli_symlink_name,
                           strlen(lli->lli_symlink_name) + 1);
@@ -1307,6 +1161,17 @@ void ll_clear_inode(struct inode *inode)
          spin_unlock(&sbi->ll_deathrow_lock);
  #endif
          ll_clear_inode_capas(inode);
+        /*
+         * XXX This has to be done before lsm is freed below, because
+         * cl_object still uses inode lsm.
+         */
+        cl_inode_fini(inode);
+
+        if (lli->lli_smd) {
+                obd_free_memmd(sbi->ll_dt_exp, &lli->lli_smd);
+                lli->lli_smd = NULL;
+        }
+
  
          EXIT;
  }
@@ -1396,86 +1261,25 @@ static int ll_setattr_done_writing(struct inode *inode,
          RETURN(rc);
  }
  
-static int ll_setattr_do_truncate(struct inode *inode, loff_t new_size)
+static int ll_setattr_do_truncate(struct inode *inode, loff_t size)
  {
-        struct ll_sb_info *sbi = ll_i2sbi(inode);
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct lov_stripe_md *lsm = lli->lli_smd;
+        struct obd_capa *capa = ll_osscapa_get(inode, CAPA_OPC_OSS_TRUNC);
          int rc;
-        ldlm_policy_data_t policy = { .l_extent = {new_size,
-                                                   OBD_OBJECT_EOF } };
-        struct lustre_handle lockh = { 0 };
-        int local_lock = 0; /* 0 - no local lock;
-                             * 1 - lock taken by lock_extent;
-                             * 2 - by obd_match*/
-        int ast_flags;
-        int err;
-        ENTRY;
  
-        UNLOCK_INODE_MUTEX(inode);
-        UP_WRITE_I_ALLOC_SEM(inode);
-
-        if (sbi->ll_lockless_truncate_enable &&
-            (sbi->ll_lco.lco_flags & OBD_CONNECT_TRUNCLOCK)) {
-                ast_flags = LDLM_FL_BLOCK_GRANTED;
-                rc = obd_match(sbi->ll_dt_exp, lsm, LDLM_EXTENT,
-                               &policy, LCK_PW, &ast_flags, inode, &lockh);
-                if (rc > 0) {
-                        local_lock = 2;
-                        rc = 0;
-                } else if (rc == 0) {
-                        rc = ll_file_punch(inode, new_size, 1);
-                }
-        } else {
-                /* XXX when we fix the AST intents to pass the discard-range
-                 * XXX extent, make ast_flags always LDLM_AST_DISCARD_DATA
-                 * XXX here. */
-                ast_flags = (new_size == 0) ? LDLM_AST_DISCARD_DATA : 0;
-                rc = ll_extent_lock(NULL, inode, lsm, LCK_PW, &policy,
-                                    &lockh, ast_flags);
-                if (likely(rc == 0))
-                        local_lock = 1;
-        }
-
-        LOCK_INODE_MUTEX(inode);
-        DOWN_WRITE_I_ALLOC_SEM(inode);
-
-        if (likely(rc == 0)) {
-                /* Only ll_inode_size_lock is taken at this level.
-                 * lov_stripe_lock() is grabbed by ll_truncate() only over
-                 * call to obd_adjust_kms().  If vmtruncate returns 0, then
-                 * ll_truncate dropped ll_inode_size_lock() */
-                ll_inode_size_lock(inode, 0);
-                if (!local_lock) {
-                        spin_lock(&lli->lli_lock);
-                        lli->lli_flags |= LLIF_SRVLOCK;
-                        spin_unlock(&lli->lli_lock);
-                }
-                rc = vmtruncate(inode, new_size);
-                if (!local_lock) {
-                        spin_lock(&lli->lli_lock);
-                        lli->lli_flags &= ~LLIF_SRVLOCK;
-                        spin_unlock(&lli->lli_lock);
-                }
-                if (rc != 0) {
-                        LASSERT(atomic_read(&lli->lli_size_sem.count) <= 0);
-                        ll_inode_size_unlock(inode, 0);
-                }
-        }
+        rc = cl_setattr_do_truncate(inode, size, capa);
+        ll_truncate_free_capa(capa);
+        return rc;
+}
  
-        if (local_lock) {
-                if (local_lock == 2)
-                        err = obd_cancel(sbi->ll_dt_exp, lsm, LCK_PW, &lockh);
-                else
-                        err = ll_extent_unlock(NULL, inode, lsm, LCK_PW, &lockh);
-                if (unlikely(err != 0)){
-                        CERROR("extent unlock failed: err=%d,"
-                               " unlock method =%d\n", err, local_lock);
-                        if (rc == 0)
-                                rc = err;
-                }
-        }
-        RETURN(rc);
+static int ll_setattr_ost(struct inode *inode)
+{
+        struct obd_capa *capa = ll_mdscapa_get(inode);
+        int rc;
+
+        rc = cl_setattr_ost(inode, capa);
+        capa_put(capa);
+
+        return rc;
  }
  
  /* If this inode has objects allocated to it (lsm != NULL), then the OST
@@ -1495,7 +1299,6 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
  {
          struct ll_inode_info *lli = ll_i2info(inode);
          struct lov_stripe_md *lsm = lli->lli_smd;
-        struct ll_sb_info *sbi = ll_i2sbi(inode);
          struct md_op_data *op_data = NULL;
          struct md_open_data *mod = NULL;
          int ia_valid = attr->ia_valid;
@@ -1539,7 +1342,7 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
          if ((attr->ia_valid & ATTR_CTIME) && !(attr->ia_valid & ATTR_MTIME)) {
                  /* To avoid stale mtime on mds, obtain it from ost and send
                     to mds. */
-                rc = ll_glimpse_size(inode, 0);
+                rc = cl_glimpse_size(inode);
                  if (rc)
                          RETURN(rc);
  
@@ -1584,48 +1387,12 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr)
                  GOTO(out, rc = 0);
          }
  
-        /* We really need to get our PW lock before we change inode->i_size.
-         * If we don't we can race with other i_size updaters on our node, like
-         * ll_file_read.  We can also race with i_size propogation to other
-         * nodes through dirtying and writeback of final cached pages.  This
-         * last one is especially bad for racing o_append users on other
-         * nodes. */
-        if (ia_valid & ATTR_SIZE) {
+        if (ia_valid & ATTR_SIZE)
                  rc = ll_setattr_do_truncate(inode, attr->ia_size);
-        } else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) {
-                obd_flag flags;
-                struct obd_info oinfo = { { { 0 } } };
-                struct obdo *oa;
-
+        else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) {
                  CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n",
                         inode->i_ino, LTIME_S(attr->ia_mtime));
-
-                OBDO_ALLOC(oa);
-                if (oa) {
-                        oa->o_id = lsm->lsm_object_id;
-                        oa->o_gr = lsm->lsm_object_gr;
-                        oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
-
-                        flags = OBD_MD_FLTYPE | OBD_MD_FLATIME |
-                                OBD_MD_FLMTIME | OBD_MD_FLCTIME |
-                                OBD_MD_FLFID | OBD_MD_FLGENER |
-                                OBD_MD_FLGROUP;
-
-                        obdo_from_inode(oa, inode, flags);
-
-                        oinfo.oi_oa = oa;
-                        oinfo.oi_md = lsm;
-                        oinfo.oi_capa = ll_mdscapa_get(inode);
-
-                        /* XXX: this looks unnecessary now. */
-                        rc = obd_setattr_rqset(sbi->ll_dt_exp, &oinfo, NULL);
-                        capa_put(oinfo.oi_capa);
-                        if (rc)
-                                CERROR("obd_setattr_async fails: rc=%d\n", rc);
-                        OBDO_FREE(oa);
-                } else {
-                        rc = -ENOMEM;
-                }
+                rc = ll_setattr_ost(inode);
          }
          EXIT;
  out:
@@ -1815,9 +1582,11 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md)
                          }
                          CDEBUG(D_INODE, "adding lsm %p to inode %lu/%u(%p)\n",
                                 lsm, inode->i_ino, inode->i_generation, inode);
-                        /* ll_inode_size_lock() requires it is only called
-                         * with lli_smd != NULL or lock_lsm == 0 or we can
-                         * race between lock/unlock.  bug 9547 */
+                        cl_inode_init(inode, md);
+                        /* ll_inode_size_lock() must only be called
+                         * with lli_smd != NULL or lock_lsm == 0,
+                         * otherwise lock/unlock can race.
+                         * bug 9547 */
                          lli->lli_smd = lsm;
                          lli->lli_maxbytes = lsm->lsm_maxbytes;
                          if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES)
@@ -1835,8 +1604,10 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md)
                                          dump_lsm(D_ERROR, lsm);
                                          LBUG();
                                  }
-                        } else
+                        } else {
+                                cl_inode_init(inode, md);
                                  ll_replace_lsm(inode, lsm);
+                        }
                  }
                  if (lli->lli_smd != lsm)
                          obd_free_memmd(ll_i2dtexp(inode), &lsm);
@@ -2233,7 +2004,7 @@ int ll_prep_inode(struct inode **inode,
  {
          struct ll_sb_info *sbi = NULL;
          struct lustre_md md;
-        int rc = 0;
+        int rc;
          ENTRY;
  
          LASSERT(*inode || sb);
@@ -2257,8 +2028,8 @@ int ll_prep_inode(struct inode **inode,
                   */
                  LASSERT(fid_is_sane(&md.body->fid1));
  
-                *inode = ll_iget(sb, ll_fid_build_ino(sbi, &md.body->fid1), &md);
-                if (*inode == NULL || is_bad_inode(*inode)) {
+                *inode = ll_iget(sb, ll_fid_build_ino(sbi, &md.body->fid1),&md);
+                if (*inode == NULL || IS_ERR(*inode)) {
                          if (md.lsm)
                                  obd_free_memmd(sbi->ll_dt_exp, &md.lsm);
  #ifdef CONFIG_FS_POSIX_ACL
@@ -2267,7 +2038,8 @@ int ll_prep_inode(struct inode **inode,
                                  md.posix_acl = NULL;
                          }
  #endif
-                        rc = -ENOMEM;
+                        rc = IS_ERR(*inode) ? PTR_ERR(*inode) : -ENOMEM;
+                        *inode = NULL;
                          CERROR("new_inode -fatal: rc %d\n", rc);
                          GOTO(out, rc);
                  }
@@ -2280,33 +2052,6 @@ out:
          RETURN(rc);
  }
  
-char *llap_origins[] = {
-        [LLAP_ORIGIN_UNKNOWN] = "--",
-        [LLAP_ORIGIN_READPAGE] = "rp",
-        [LLAP_ORIGIN_READAHEAD] = "ra",
-        [LLAP_ORIGIN_COMMIT_WRITE] = "cw",
-        [LLAP_ORIGIN_WRITEPAGE] = "wp",
-        [LLAP_ORIGIN_LOCKLESS_IO] = "ls"
-};
-
-struct ll_async_page *llite_pglist_next_llap(struct list_head *head,
-                                             struct list_head *list)
-{
-        struct ll_async_page *llap;
-        struct list_head *pos;
-
-        list_for_each(pos, list) {
-                if (pos == head)
-                        return NULL;
-                llap = list_entry(pos, struct ll_async_page, llap_pglist_item);
-                if (llap->llap_page == NULL)
-                        continue;
-                return llap;
-        }
-        LBUG();
-        return NULL;
-}
-
  int ll_obd_statfs(struct inode *inode, void *arg)
  {
          struct ll_sb_info *sbi = NULL;
diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c

index f17a90d..4126a83 100644 (file)
--- a/lustre/llite/llite_mmap.c
+++ b/lustre/llite/llite_mmap.c
@@ -69,210 +69,10 @@
                 vma->vm_file->f_dentry->d_inode->i_ino,                       \
                 vma->vm_file->f_dentry->d_iname, ## arg);                     \
  
-
-struct ll_lock_tree_node {
-        rb_node_t               lt_node;
-        struct list_head        lt_locked_item;
-        __u64                   lt_oid;
-        ldlm_policy_data_t      lt_policy;
-        struct lustre_handle    lt_lockh;
-        ldlm_mode_t             lt_mode;
-        struct inode           *lt_inode;
-};
-
-int lt_get_mmap_locks(struct ll_lock_tree *tree,
-                      unsigned long addr, size_t count);
-
  struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
                         int *type);
  
-struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start,
-                                              __u64 end, ldlm_mode_t mode)
-{
-        struct ll_lock_tree_node *node;
-
-        OBD_ALLOC(node, sizeof(*node));
-        if (node == NULL)
-                RETURN(ERR_PTR(-ENOMEM));
-
-        node->lt_inode = inode;
-        node->lt_oid = ll_i2info(inode)->lli_smd->lsm_object_id;
-        node->lt_policy.l_extent.start = start;
-        node->lt_policy.l_extent.end = end;
-        memset(&node->lt_lockh, 0, sizeof(node->lt_lockh));
-        INIT_LIST_HEAD(&node->lt_locked_item);
-        node->lt_mode = mode;
-
-        return node;
-}
-
-int lt_compare(struct ll_lock_tree_node *one, struct ll_lock_tree_node *two)
-{
-        /* To avoid multiple fs deadlock */
-        if (one->lt_inode->i_sb->s_dev < two->lt_inode->i_sb->s_dev)
-                return -1;
-        if (one->lt_inode->i_sb->s_dev > two->lt_inode->i_sb->s_dev)
-                return 1;
-
-        if (one->lt_oid < two->lt_oid)
-                return -1;
-        if (one->lt_oid > two->lt_oid)
-                return 1;
-
-        if (one->lt_policy.l_extent.end < two->lt_policy.l_extent.start)
-                return -1;
-        if (one->lt_policy.l_extent.start > two->lt_policy.l_extent.end)
-                return 1;
-
-        return 0; /* they are the same object and overlap */
-}
-
-static void lt_merge(struct ll_lock_tree_node *dst,
-                     struct ll_lock_tree_node *src)
-{
-        dst->lt_policy.l_extent.start = min(dst->lt_policy.l_extent.start,
-                                            src->lt_policy.l_extent.start);
-        dst->lt_policy.l_extent.end = max(dst->lt_policy.l_extent.end,
-                                          src->lt_policy.l_extent.end);
-
-        /* XXX could be a real call to the dlm to find superset modes */
-        if (src->lt_mode == LCK_PW && dst->lt_mode != LCK_PW)
-                dst->lt_mode = LCK_PW;
-}
-
-static void lt_insert(struct ll_lock_tree *tree,
-                      struct ll_lock_tree_node *node)
-{
-        struct ll_lock_tree_node *walk;
-        rb_node_t **p, *parent;
-        ENTRY;
-
-restart:
-        p = &tree->lt_root.rb_node;
-        parent = NULL;
-        while (*p) {
-                parent = *p;
-                walk = rb_entry(parent, struct ll_lock_tree_node, lt_node);
-                switch (lt_compare(node, walk)) {
-                case -1:
-                        p = &(*p)->rb_left;
-                        break;
-                case 1:
-                        p = &(*p)->rb_right;
-                        break;
-                case 0:
-                        lt_merge(node, walk);
-                        rb_erase(&walk->lt_node, &tree->lt_root);
-                        OBD_FREE(walk, sizeof(*walk));
-                        goto restart;
-                        break;
-                default:
-                        LBUG();
-                        break;
-                }
-        }
-        rb_link_node(&node->lt_node, parent, p);
-        rb_insert_color(&node->lt_node, &tree->lt_root);
-        EXIT;
-}
-
-static struct ll_lock_tree_node *lt_least_node(struct ll_lock_tree *tree)
-{
-        rb_node_t *rbnode;
-        struct ll_lock_tree_node *node = NULL;
-
-        for ( rbnode = tree->lt_root.rb_node; rbnode != NULL;
-              rbnode = rbnode->rb_left) {
-                if (rbnode->rb_left == NULL) {
-                        node = rb_entry(rbnode, struct ll_lock_tree_node,
-                                        lt_node);
-                        break;
-                }
-        }
-        RETURN(node);
-}
-
-int ll_tree_unlock(struct ll_lock_tree *tree)
-{
-        struct ll_lock_tree_node *node;
-        struct list_head *pos, *n;
-        struct inode *inode;
-        int rc = 0;
-        ENTRY;
-
-        list_for_each_safe(pos, n, &tree->lt_locked_list) {
-                node = list_entry(pos, struct ll_lock_tree_node,
-                                  lt_locked_item);
-
-                inode = node->lt_inode;
-                rc = ll_extent_unlock(tree->lt_fd, inode,
-                                      ll_i2info(inode)->lli_smd, node->lt_mode,
-                                      &node->lt_lockh);
-                if (rc != 0) {
-                        /* XXX better message */
-                        CERROR("couldn't unlock %d\n", rc);
-                }
-                list_del(&node->lt_locked_item);
-                OBD_FREE(node, sizeof(*node));
-        }
-
-        while ((node = lt_least_node(tree))) {
-                rb_erase(&node->lt_node, &tree->lt_root);
-                OBD_FREE(node, sizeof(*node));
-        }
-
-        RETURN(rc);
-}
-
-int ll_tree_lock(struct ll_lock_tree *tree,
-                 struct ll_lock_tree_node *first_node,
-                 const char *buf, size_t count, int ast_flags)
-{
-        struct ll_lock_tree_node *node;
-        int rc = 0;
-        ENTRY;
-
-        tree->lt_root.rb_node = NULL;
-        INIT_LIST_HEAD(&tree->lt_locked_list);
-        if (first_node != NULL)
-                lt_insert(tree, first_node);
-
-        /* To avoid such subtle deadlock case: client1 try to read file1 to
-         * mmapped file2, on the same time, client2 try to read file2 to
-         * mmapped file1.*/
-        rc = lt_get_mmap_locks(tree, (unsigned long)buf, count);
-        if (rc)
-                GOTO(out, rc);
-
-        while ((node = lt_least_node(tree))) {
-                struct inode *inode = node->lt_inode;
-                rc = ll_extent_lock(tree->lt_fd, inode,
-                                    ll_i2info(inode)->lli_smd, node->lt_mode,
-                                    &node->lt_policy, &node->lt_lockh,
-                                    ast_flags);
-                if (rc != 0)
-                        GOTO(out, rc);
-
-                rb_erase(&node->lt_node, &tree->lt_root);
-                list_add_tail(&node->lt_locked_item, &tree->lt_locked_list);
-        }
-        RETURN(rc);
-out:
-        ll_tree_unlock(tree);
-        RETURN(rc);
-}
-
-static ldlm_mode_t mode_from_vma(struct vm_area_struct *vma)
-{
-        /* we only want to hold PW locks if the mmap() can generate
-         * writes back to the file and that only happens in shared
-         * writable vmas */
-        if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
-                return LCK_PW;
-        return LCK_PR;
-}
-
-static void policy_from_vma(ldlm_policy_data_t *policy,
+void policy_from_vma(ldlm_policy_data_t *policy,
                              struct vm_area_struct *vma, unsigned long addr,
                              size_t count)
  {
@@ -282,7 +82,7 @@ static void policy_from_vma(ldlm_policy_data_t *policy,
                                 ~CFS_PAGE_MASK;
  }
  
-static struct vm_area_struct * our_vma(unsigned long addr, size_t count)
+struct vm_area_struct * our_vma(unsigned long addr, size_t count)
  {
          struct mm_struct *mm = current->mm;
          struct vm_area_struct *vma, *ret = NULL;
@@ -305,56 +105,19 @@ static struct vm_area_struct * our_vma(unsigned long addr, size_t count)
          RETURN(ret);
  }
  
-int ll_region_mapped(unsigned long addr, size_t count)
-{
-        return !!our_vma(addr, count);
-}
-
-int lt_get_mmap_locks(struct ll_lock_tree *tree,
-                      unsigned long addr, size_t count)
-{
-        struct vm_area_struct *vma;
-        struct ll_lock_tree_node *node;
-        ldlm_policy_data_t policy;
-        struct inode *inode;
-        ENTRY;
-
-        if (count == 0)
-                RETURN(0);
-
-        /* we need to look up vmas on page aligned addresses */
-        count += addr & (~CFS_PAGE_MASK);
-        addr &= CFS_PAGE_MASK;
-
-        while ((vma = our_vma(addr, count)) != NULL) {
-                LASSERT(vma->vm_file);
-
-                inode = vma->vm_file->f_dentry->d_inode;
-                policy_from_vma(&policy, vma, addr, count);
-                node = ll_node_from_inode(inode, policy.l_extent.start,
-                                          policy.l_extent.end,
-                                          mode_from_vma(vma));
-                if (IS_ERR(node)) {
-                        CERROR("not enough mem for lock_tree_node!\n");
-                        RETURN(-ENOMEM);
-                }
-                lt_insert(tree, node);
-
-                if (vma->vm_end - addr >= count)
-                        break;
-                count -= vma->vm_end - addr;
-                addr = vma->vm_end;
-        }
-        RETURN(0);
-}
-
  /**
- * Page fault handler.
+ * Lustre implementation of a vm_operations_struct::nopage() method, called by
+ * VM to serve a page fault (both in kernel and user space).
+ *
+ * This function sets up CIT_FAULT cl_io that does the job.
   *
   * \param vma - is virtiual area struct related to page fault
   * \param address - address when hit fault
   * \param type - of fault
   *
+ * XXX newer 2.6 kernels provide vm_operations_struct::fault() method with
+ * slightly different semantics instead.
+ *
   * \return allocated and filled page for address
   * \retval NOPAGE_SIGBUS if page not exist on this address
   * \retval NOPAGE_OOM not have memory for allocate new page
@@ -362,151 +125,113 @@ int lt_get_mmap_locks(struct ll_lock_tree *tree,
  struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
                         int *type)
  {
-        struct file *filp = vma->vm_file;
-        struct ll_file_data *fd = LUSTRE_FPRIVATE(filp);
-        struct inode *inode = filp->f_dentry->d_inode;
-        struct lustre_handle lockh = { 0 };
-        ldlm_policy_data_t policy;
-        ldlm_mode_t mode;
-        struct page *page = NULL;
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct lov_stripe_md *lsm;
-        struct ost_lvb lvb;
-        __u64 kms, old_mtime;
-        unsigned long pgoff, size, rand_read, seq_read;
-        int rc = 0;
-        ENTRY;
-
-        if (lli->lli_smd == NULL) {
-                CERROR("No lsm on fault?\n");
-                RETURN(NULL);
-        }
-
-        ll_clear_file_contended(inode);
-
-        /* start and end the lock on the first and last bytes in the page */
-        policy_from_vma(&policy, vma, address, CFS_PAGE_SIZE);
+        struct file       *file  = vma->vm_file;
+        struct inode      *inode = file->f_dentry->d_inode;
+        struct lu_env     *env;
+        struct cl_io      *io;
+        struct page       *page  = NULL;
+        struct cl_env_nest nest;
+        int result;
  
-        CDEBUG(D_MMAP, "nopage vma %p inode %lu, locking ["LPU64", "LPU64"]\n",
-               vma, inode->i_ino, policy.l_extent.start, policy.l_extent.end);
-
-        mode = mode_from_vma(vma);
-        old_mtime = LTIME_S(inode->i_mtime);
-
-        lsm = lli->lli_smd;
-        rc = ll_extent_lock(fd, inode, lsm, mode, &policy,
-                            &lockh, LDLM_FL_CBPENDING);
-        if (rc != 0)
-                RETURN(NULL);
-
-        if (vma->vm_flags & VM_EXEC && LTIME_S(inode->i_mtime) != old_mtime)
-                CWARN("binary changed. inode %lu\n", inode->i_ino);
-
-        lov_stripe_lock(lsm);
-        inode_init_lvb(inode, &lvb);
-        obd_merge_lvb(ll_i2dtexp(inode), lsm, &lvb, 1);
-        kms = lvb.lvb_size;
-
-        pgoff = ((address - vma->vm_start) >> CFS_PAGE_SHIFT) + vma->vm_pgoff;
-        size = (kms + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
-
-        if (pgoff >= size) {
-                lov_stripe_unlock(lsm);
-                ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
-        } else {
-                /* XXX change inode size without ll_inode_size_lock() held!
-                 *     there is a race condition with truncate path. (see
-                 *     ll_extent_lock) */
-                /* XXX i_size_write() is not used because it is not safe to
-                 *     take the ll_inode_size_lock() due to a potential lock
-                 *     inversion (bug 6077).  And since it's not safe to use
-                 *     i_size_write() without a covering mutex we do the
-                 *     assignment directly.  It is not critical that the
-                 *     size be correct. */
-                /* region is within kms and, hence, within real file size (A).
-                 * We need to increase i_size to cover the read region so that
-                 * generic_file_read() will do its job, but that doesn't mean
-                 * the kms size is _correct_, it is only the _minimum_ size.
-                 * If someone does a stat they will get the correct size which
-                 * will always be >= the kms value here.  b=11081 */
-                if (i_size_read(inode) < kms) {
-                        inode->i_size = kms;
-                        CDEBUG(D_INODE, "ino=%lu, updating i_size %llu\n",
-                               inode->i_ino, i_size_read(inode));
-                }
-                lov_stripe_unlock(lsm);
-        }
+        ENTRY;
  
-        /* If mapping is writeable, adjust kms to cover this page,
-         * but do not extend kms beyond actual file size.
-         * policy.l_extent.end is set to the end of the page by policy_from_vma
-         * bug 10919 */
-        lov_stripe_lock(lsm);
-        if (mode == LCK_PW)
-                obd_adjust_kms(ll_i2dtexp(inode), lsm,
-                               min_t(loff_t, policy.l_extent.end + 1,
-                               i_size_read(inode)), 0);
-        lov_stripe_unlock(lsm);
-
-        /* disable VM_SEQ_READ and use VM_RAND_READ to make sure that
-         * the kernel will not read other pages not covered by ldlm in
-         * filemap_nopage. we do our readahead in ll_readpage.
+        /*
+         * vm_operations_struct::nopage() can be called when lustre IO is
+         * already active for the current thread, e.g., when doing read/write
+         * against user level buffer mapped from Lustre buffer. To avoid
+         * stomping on existing context, optionally force an allocation of a new
+         * one.
           */
-        rand_read = vma->vm_flags & VM_RAND_READ;
-        seq_read = vma->vm_flags & VM_SEQ_READ;
-        vma->vm_flags &= ~ VM_SEQ_READ;
-        vma->vm_flags |= VM_RAND_READ;
-
-        page = filemap_nopage(vma, address, type);
-        if (page != NOPAGE_SIGBUS && page != NOPAGE_OOM)
-                LL_CDEBUG_PAGE(D_PAGE, page, "got addr %lu type %lx\n", address,
-                               (long)type);
-        else
-                CDEBUG(D_PAGE, "got addr %lu type %lx - SIGBUS\n",  address,
-                               (long)type);
-
-        vma->vm_flags &= ~VM_RAND_READ;
-        vma->vm_flags |= (rand_read | seq_read);
-
-        ll_extent_unlock(fd, inode, ll_i2info(inode)->lli_smd, mode, &lockh);
+        env = cl_env_nested_get(&nest);
+        if (!IS_ERR(env)) {
+                pgoff_t pg_offset;
+                const unsigned long writable = VM_SHARED|VM_WRITE;
+                unsigned long ra_flags;
+                struct cl_fault_io *fio;
+
+                io = &ccc_env_info(env)->cti_io;
+                io->ci_obj = ll_i2info(inode)->lli_clob;
+                LASSERT(io->ci_obj != NULL);
+
+                fio = &io->u.ci_fault;
+                pg_offset = (address - vma->vm_start) >> PAGE_SHIFT;
+                fio->ft_index      = pg_offset + vma->vm_pgoff;
+                fio->ft_writable   = (vma->vm_flags&writable) == writable;
+                fio->ft_executable = vma->vm_flags&VM_EXEC;
+
+                /*
+                 * disable VM_SEQ_READ and use VM_RAND_READ to make sure that
+                 * the kernel will not read other pages not covered by ldlm in
+                 * filemap_nopage. we do our readahead in ll_readpage.
+                 */
+                ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ);
+                vma->vm_flags &= ~VM_SEQ_READ;
+                vma->vm_flags |= VM_RAND_READ;
+
+                CDEBUG(D_INFO, "vm_flags: %lx (%lu %i %i)\n", vma->vm_flags,
+                       fio->ft_index, fio->ft_writable, fio->ft_executable);
+
+                if (cl_io_init(env, io, CIT_FAULT, io->ci_obj) == 0) {
+                        struct vvp_io *vio = vvp_env_io(env);
+                        struct ccc_io *cio = ccc_env_io(env);
+
+                        LASSERT(cio->cui_cl.cis_io == io);
+
+                        vio->u.fault.ft_vma     = vma;
+                        vio->u.fault.ft_address = address;
+                        vio->u.fault.ft_type    = type;
+                        cio->cui_fd = LUSTRE_FPRIVATE(file);
+
+                        result = cl_io_loop(env, io);
+                        if (result == 0) {
+                                LASSERT(fio->ft_page != NULL);
+                                page = cl_page_vmpage(env, fio->ft_page);
+                        } else if (result == -EFAULT) {
+                                page = NOPAGE_SIGBUS;
+                        } else if (result == -ENOMEM) {
+                                page = NOPAGE_OOM;
+                        }
+                } else
+                        result = io->ci_result;
+
+                vma->vm_flags &= ~VM_RAND_READ;
+                vma->vm_flags |= ra_flags;
+
+                cl_io_fini(env, io);
+                cl_env_nested_put(&nest, env);
+        }
          RETURN(page);
  }
  
-/* To avoid cancel the locks covering mmapped region for lock cache pressure,
- * we track the mapped vma count by lli_mmap_cnt.
- * ll_vm_open():  when first vma is linked, split locks from lru.
- * ll_vm_close(): when last vma is unlinked, join all this file's locks to lru.
- *
- * XXX we don't check the if the region of vma/lock for performance.
+/**
+ *  To avoid cancelling locks that cover an mmapped region under lock cache
+ *  pressure, we track the mapped vma count in ccc_object::cob_mmap_cnt.
   */
  static void ll_vm_open(struct vm_area_struct * vma)
  {
-        struct inode *inode = vma->vm_file->f_dentry->d_inode;
-        struct ll_inode_info *lli = ll_i2info(inode);
-        ENTRY;
+        struct inode *inode    = vma->vm_file->f_dentry->d_inode;
+        struct ccc_object *vob = cl_inode2ccc(inode);
  
+        ENTRY;
          LASSERT(vma->vm_file);
-
-        spin_lock(&lli->lli_lock);
-        LASSERT(atomic_read(&lli->lli_mmap_cnt) >= 0);
-
-        atomic_inc(&lli->lli_mmap_cnt);
-                spin_unlock(&lli->lli_lock);
+        LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
+        atomic_inc(&vob->cob_mmap_cnt);
+        EXIT;
  }
  
+/**
+ * Dual to ll_vm_open().
+ */
  static void ll_vm_close(struct vm_area_struct *vma)
  {
-        struct inode *inode = vma->vm_file->f_dentry->d_inode;
-        struct ll_inode_info *lli = ll_i2info(inode);
-        ENTRY;
+        struct inode      *inode = vma->vm_file->f_dentry->d_inode;
+        struct ccc_object *vob   = cl_inode2ccc(inode);
  
+        ENTRY;
          LASSERT(vma->vm_file);
-
-        spin_lock(&lli->lli_lock);
-        LASSERT(atomic_read(&lli->lli_mmap_cnt) > 0);
-
-        atomic_dec(&lli->lli_mmap_cnt);
-                spin_unlock(&lli->lli_lock);
+        atomic_dec(&vob->cob_mmap_cnt);
+        LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0);
+        EXIT;
  }
  
  #ifndef HAVE_FILEMAP_POPULATE
@@ -570,7 +295,7 @@ int ll_file_mmap(struct file * file, struct vm_area_struct * vma)
                  vma->vm_ops = &ll_file_vm_ops;
                  vma->vm_ops->open(vma);
                  /* update the inode's size and mtime */
-                rc = ll_glimpse_size(file->f_dentry->d_inode, 0);
+                rc = cl_glimpse_size(file->f_dentry->d_inode);
          }
  
          RETURN(rc);
diff --git a/lustre/llite/lloop.c b/lustre/llite/lloop.c

index 3241307..05026f1 100644 (file)
--- a/lustre/llite/lloop.c
+++ b/lustre/llite/lloop.c
@@ -134,8 +134,8 @@ struct lloop_device {
          loff_t             lo_offset;
          loff_t             lo_sizelimit;
          int                lo_flags;
-        int                (*ioctl)(struct lloop_device *, int cmd, 
-                                 unsigned long arg); 
+        int                (*ioctl)(struct lloop_device *, int cmd,
+                                 unsigned long arg);
  
          struct file *      lo_backing_file;
          struct block_device *lo_device;
@@ -241,8 +241,8 @@ static int do_bio_filebacked(struct lloop_device *lo, struct bio *bio)
          oinfo.oi_md = lsm;
          opc = cmd & OBD_BRW_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW;
          oinfo.oi_capa = ll_osscapa_get(inode, opc);
-        ret = obd_brw(cmd, ll_i2dtexp(inode), &oinfo, 
-                      (obd_count)(i - bio->bi_idx), 
+        ret = obd_brw(cmd, ll_i2dtexp(inode), &oinfo,
+                      (obd_count)(i - bio->bi_idx),
                        lo->lo_requests[0].lrd_pages, NULL);
          capa_put(oinfo.oi_capa);
          if (ret == 0)
@@ -470,7 +470,7 @@ static int loop_set_fd(struct lloop_device *lo, struct file *unused,
          return error;
  }
  
-static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev, 
+static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev,
                         int count)
  {
          struct file *filp = lo->lo_backing_file;
@@ -532,7 +532,7 @@ static int lo_release(struct inode *inode, struct file *file)
  }
  
  /* lloop device node's ioctl function. */
-static int lo_ioctl(struct inode *inode, struct file *unused, 
+static int lo_ioctl(struct inode *inode, struct file *unused,
          unsigned int cmd, unsigned long arg)
  {
          struct lloop_device *lo = inode->i_bdev->bd_disk->private_data;
@@ -556,7 +556,7 @@ static int lo_ioctl(struct inode *inode, struct file *unused,
  
                  if (put_user(ino, (__u64 *)arg))
                          err = -EFAULT;
-                break; 
+                break;
          }
  
          default:
@@ -575,13 +575,13 @@ static struct block_device_operations lo_fops = {
          .ioctl =        lo_ioctl,
  };
  
-/* dynamic iocontrol callback. 
- * This callback is registered in lloop_init and will be called by 
- * ll_iocontrol_call. 
- * This is a llite regular file ioctl function. It takes the responsibility 
- * of attaching a file, and detaching a file by a lloop's device numner. 
+/* dynamic iocontrol callback.
+ * This callback is registered in lloop_init and will be called by
+ * ll_iocontrol_call.
+ * This is a llite regular file ioctl function. It takes the responsibility
+ * of attaching a file, and detaching a file by a lloop's device number.
   */
-static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file, 
+static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file,
                  unsigned int cmd, unsigned long arg,
                  void *magic, int *rcp)
  {
@@ -611,7 +611,7 @@ static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file,
                                          lo_free = lo;
                                  continue;
                          }
-                        if (lo->lo_backing_file->f_dentry->d_inode == 
+                        if (lo->lo_backing_file->f_dentry->d_inode ==
                              file->f_dentry->d_inode)
                                  break;
                  }
@@ -641,7 +641,7 @@ static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file,
  
          case LL_IOC_LLOOP_DETACH_BYDEV: {
                  int minor;
-                
+
                  dev = old_decode_dev(arg);
                  if (MAJOR(dev) != lloop_major)
                          GOTO(out, err = -EINVAL);
diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c

index c0eda8c..be49e62 100644 (file)
--- a/lustre/llite/lproc_llite.c
+++ b/lustre/llite/lproc_llite.c
@@ -47,7 +47,7 @@ struct proc_dir_entry *proc_lustre_fs_root;
  
  #ifdef LPROCFS
  /* /proc/lustre/llite mount point registration */
-struct file_operations llite_dump_pgcache_fops;
+extern struct file_operations vvp_dump_pgcache_file_ops;
  struct file_operations ll_rw_extents_stats_fops;
  struct file_operations ll_rw_extents_stats_pp_fops;
  struct file_operations ll_rw_offset_stats_fops;
@@ -212,6 +212,19 @@ static int ll_rd_sb_uuid(char *page, char **start, off_t off, int count,
          return snprintf(page, count, "%s\n", ll_s2sbi(sb)->ll_sb_uuid.uuid);
  }
  
+static int ll_rd_site_stats(char *page, char **start, off_t off,
+                            int count, int *eof, void *data)
+{
+        struct super_block *sb = data;
+
+        /*
+         * See description of statistical counters in struct cl_site, and
+         * struct lu_site.
+         */
+        return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site),
+                                   page, count);
+}
+
  static int ll_rd_max_readahead_mb(char *page, char **start, off_t off,
                                     int count, int *eof, void *data)
  {
@@ -318,8 +331,7 @@ static int ll_wr_max_cached_mb(struct file *file, const char *buffer,
  {
          struct super_block *sb = data;
          struct ll_sb_info *sbi = ll_s2sbi(sb);
-        unsigned long budget;
-        int mult, rc, pages_number, cpu;
+        int mult, rc, pages_number;
  
          mult = 1 << (20 - CFS_PAGE_SHIFT);
          rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult);
@@ -340,46 +352,9 @@ static int ll_wr_max_cached_mb(struct file *file, const char *buffer,
                  /* Not set up yet, don't call llap_shrink_cache */
                  return count;
  
-        spin_lock(&sbi->ll_async_page_reblnc_lock);
-        budget = sbi->ll_async_page_max / num_online_cpus();
-        for_each_online_cpu(cpu)
-                LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_budget = budget;
-        spin_unlock(&sbi->ll_async_page_reblnc_lock);
-
-        if (lcounter_read(&sbi->ll_async_page_count) >= sbi->ll_async_page_max)
-                llap_shrink_cache(sbi, -1);
-
          return count;
  }
  
-static int ll_rd_pgcache_bnlc(char *page, char **start, off_t off,
-                          int count, int *eof, void *data)
-{
-        struct super_block *sb = data;
-        struct ll_sb_info *sbi = ll_s2sbi(sb);
-        struct ll_pglist_data *pd;
-        unsigned long total_budget = 0;
-        int n = 0, cpu;
-
-        n += snprintf(page +n, count - n,
-                "cpu\tpage count\tbudget\t\treblnc count\tgen\thit\tmiss\tcross\n");
-        for_each_online_cpu(cpu) {
-                pd = LL_PGLIST_DATA_CPU(sbi, cpu);
-                n += snprintf(page + n, count - n,
-                              "%d\t%-8lu\t%-8lu\t%-8lu\t%lu\t%lu\t%lu\t%lu\n",
-                              cpu, pd->llpd_count, pd->llpd_budget,
-                              pd->llpd_reblnc_count, pd->llpd_gen,
-                              pd->llpd_hit, pd->llpd_miss, pd->llpd_cross);
-                total_budget += pd->llpd_budget;
-        }
-        n += snprintf(page + n, count - n,
-                "Total budget: %lu, page max: %lu, rebalance cnt: %lu\n",
-                total_budget, sbi->ll_async_page_max,
-                sbi->ll_async_page_reblnc_count);
-        *eof = 1;
-        return n;
-}
-
  static int ll_rd_checksum(char *page, char **start, off_t off,
                            int count, int *eof, void *data)
  {
@@ -558,51 +533,11 @@ static int ll_rd_statahead_stats(char *page, char **start, off_t off,
                          sbi->ll_sa_miss);
  }
  
-static int ll_rd_contention_time(char *page, char **start, off_t off,
-                                 int count, int *eof, void *data)
-{
-        struct super_block *sb = data;
-
-        *eof = 1;
-        return snprintf(page, count, "%u\n", ll_s2sbi(sb)->ll_contention_time);
-
-}
-
-static int ll_wr_contention_time(struct file *file, const char *buffer,
-                                 unsigned long count, void *data)
-{
-        struct super_block *sb = data;
-        struct ll_sb_info *sbi = ll_s2sbi(sb);
-
-        return lprocfs_write_helper(buffer, count,&sbi->ll_contention_time) ?:
-                count;
-}
-
-static int ll_rd_lockless_truncate(char *page, char **start, off_t off,
-                                   int count, int *eof, void *data)
-{
-        struct super_block *sb = data;
-
-        *eof = 1;
-        return snprintf(page, count, "%u\n",
-                        ll_s2sbi(sb)->ll_lockless_truncate_enable);
-}
-
-static int ll_wr_lockless_truncate(struct file *file, const char *buffer,
-                                   unsigned long count, void *data)
-{
-        struct super_block *sb = data;
-        struct ll_sb_info *sbi = ll_s2sbi(sb);
-
-        return lprocfs_write_helper(buffer, count,
-                                    &sbi->ll_lockless_truncate_enable)
-                                    ?: count;
-}
-
  static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
          { "uuid",         ll_rd_sb_uuid,          0, 0 },
          //{ "mntpt_path",   ll_rd_path,             0, 0 },
          { "fstype",       ll_rd_fstype,           0, 0 },
+        { "site",         ll_rd_site_stats,       0, 0 },
          { "blocksize",    ll_rd_blksize,          0, 0 },
          { "kbytestotal",  ll_rd_kbytestotal,      0, 0 },
          { "kbytesfree",   ll_rd_kbytesfree,       0, 0 },
@@ -616,7 +551,6 @@ static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
          { "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb,
                                       ll_wr_max_read_ahead_whole_mb, 0 },
          { "max_cached_mb",    ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 },
-        { "pgcache_balance",ll_rd_pgcache_bnlc, 0, 0 },
          { "checksum_pages",   ll_rd_checksum, ll_wr_checksum, 0 },
          { "max_rw_chunk",     ll_rd_max_rw_chunk, ll_wr_max_rw_chunk, 0 },
          { "stats_track_pid",  ll_rd_track_pid, ll_wr_track_pid, 0 },
@@ -624,9 +558,6 @@ static struct lprocfs_vars lprocfs_llite_obd_vars[] = {
          { "stats_track_gid",  ll_rd_track_gid, ll_wr_track_gid, 0 },
          { "statahead_max",    ll_rd_statahead_max, ll_wr_statahead_max, 0 },
          { "statahead_stats",  ll_rd_statahead_stats, 0, 0 },
-        { "contention_seconds", ll_rd_contention_time, ll_wr_contention_time, 0},
-        { "lockless_truncate", ll_rd_lockless_truncate,
-                               ll_wr_lockless_truncate, 0},
          { 0 }
  };
  
@@ -706,6 +637,22 @@ void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count)
  }
  EXPORT_SYMBOL(ll_stats_ops_tally);
  
+static const char *ra_stat_string[] = {
+        [RA_STAT_HIT] = "hits",
+        [RA_STAT_MISS] = "misses",
+        [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive",
+        [RA_STAT_MISS_IN_WINDOW] = "miss inside window",
+        [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page",
+        [RA_STAT_FAILED_MATCH] = "failed lock match",
+        [RA_STAT_DISCARDED] = "read but discarded",
+        [RA_STAT_ZERO_LEN] = "zero length file",
+        [RA_STAT_ZERO_WINDOW] = "zero size window",
+        [RA_STAT_EOF] = "read-ahead to EOF",
+        [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue",
+        [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page",
+};
+
+
  int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
                                  struct super_block *sb, char *osc, char *mdc)
  {
@@ -715,7 +662,6 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
          struct obd_device *obd;
          char name[MAX_STRING_SIZE + 1], *ptr;
          int err, id, len, rc;
-        static const char *ra_stats_string[] = LL_RA_STAT_STRINGS;
          ENTRY;
  
          memset(lvars, 0, sizeof(lvars));
@@ -744,9 +690,8 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
                  RETURN(err);
          }
  
-
          rc = lprocfs_seq_create(sbi->ll_proc_root, "dump_page_cache", 0444,
-                                &llite_dump_pgcache_fops, sbi);
+                                &vvp_dump_pgcache_file_ops, sbi);
          if (rc)
                  CWARN("Error adding the dump_page_cache file\n");
  
@@ -789,14 +734,14 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent,
          if (err)
                  GOTO(out, err);
  
-        sbi->ll_ra_stats = lprocfs_alloc_stats(LL_RA_STAT,
+        sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string),
                                                 LPROCFS_STATS_FLAG_PERCPU);
          if (sbi->ll_ra_stats == NULL)
                  GOTO(out, err = -ENOMEM);
  
-        for (id = 0; id < LL_RA_STAT; id++)
+        for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++)
                  lprocfs_counter_init(sbi->ll_ra_stats, id, 0,
-                                     ra_stats_string[id], "pages");
+                                     ra_stat_string[id], "pages");
          err = lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats",
                                       sbi->ll_ra_stats);
          if (err)
@@ -863,224 +808,6 @@ void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi)
  }
  #undef MAX_STRING_SIZE
  
-#define seq_page_flag(seq, page, flag, has_flags) do {                  \
-                if (test_bit(PG_##flag, &(page)->flags)) {              \
-                        if (!has_flags)                                 \
-                                has_flags = 1;                          \
-                        else                                            \
-                                seq_putc(seq, '|');                     \
-                        seq_puts(seq, #flag);                           \
-                }                                                       \
-        } while(0);
-
-static void *llite_dump_pgcache_seq_start(struct seq_file *seq, loff_t *pos)
-{
-        struct ll_async_page *dummy_llap = seq->private;
-
-        if (dummy_llap->llap_magic == 2)
-                return NULL;
-
-        return (void *)1;
-}
-
-static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v)
-{
-        struct ll_async_page *llap, *dummy_llap = seq->private;
-        struct ll_sb_info *sbi = dummy_llap->llap_cookie;
-        struct ll_pglist_data *pd;
-        int cpu = dummy_llap->llap_pglist_cpu;
-
-        /* 2.4 doesn't seem to have SEQ_START_TOKEN, so we implement
-         * it in our own state */
-        if (dummy_llap->llap_magic == 0) {
-                seq_printf(seq, "gener |  llap  cookie  origin wq du wb | page "
-                                "inode index count [ page flags ]\n");
-                return 0;
-        }
-
-        pd = ll_pglist_cpu_lock(sbi, cpu);
-        llap = llite_pglist_next_llap(&pd->llpd_list,
-                                      &dummy_llap->llap_pglist_item);
-        if (llap != NULL)  {
-                int has_flags = 0, i;
-                struct page *page = llap->llap_page;
-                unsigned long gen = 0UL;
-
-                LASSERTF(llap->llap_origin < LLAP__ORIGIN_MAX, "%u\n",
-                         llap->llap_origin);
-
-                for_each_online_cpu(i)
-                         gen += LL_PGLIST_DATA_CPU(sbi, i)->llpd_gen;
-
-                seq_printf(seq," %5lu | %p %p %s %s %s %s | %p %lu/%u(%p) "
-                           "%lu %u [",
-                           gen,
-                           llap, llap->llap_cookie,
-                           llap_origins[llap->llap_origin],
-                           llap->llap_write_queued ? "wq" : "- ",
-                           llap->llap_defer_uptodate ? "du" : "- ",
-                           PageWriteback(page) ? "wb" : "-",
-                           page, page->mapping->host->i_ino,
-                           page->mapping->host->i_generation,
-                           page->mapping->host, page->index,
-                           page_count(page));
-                seq_page_flag(seq, page, locked, has_flags);
-                seq_page_flag(seq, page, error, has_flags);
-                seq_page_flag(seq, page, referenced, has_flags);
-                seq_page_flag(seq, page, uptodate, has_flags);
-                seq_page_flag(seq, page, dirty, has_flags);
-#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,12))
-                seq_page_flag(seq, page, highmem, has_flags);
-#endif
-                seq_page_flag(seq, page, writeback, has_flags);
-                if (!has_flags)
-                        seq_puts(seq, "-]\n");
-                else
-                        seq_puts(seq, "]\n");
-        }
-        ll_pglist_cpu_unlock(sbi, cpu);
-
-        return 0;
-}
-
-static void *llite_dump_pgcache_seq_next(struct seq_file *seq, void *v,
-                                         loff_t *pos)
-{
-        struct ll_async_page *llap, *dummy_llap = seq->private;
-        struct ll_sb_info *sbi = dummy_llap->llap_cookie;
-        struct ll_pglist_data *pd, *next;
-        int cpu = dummy_llap->llap_pglist_cpu;
-
-        /* bail if we just displayed the banner */
-        if (dummy_llap->llap_magic == 0) {
-                dummy_llap->llap_magic = 1;
-                return dummy_llap;
-        }
-
-        /* we've just displayed the llap that is after us in the list.
-         * we advance to a position beyond it, returning null if there
-         * isn't another llap in the list beyond that new position. */
-        pd = ll_pglist_cpu_lock(sbi, cpu);
-        llap = llite_pglist_next_llap(&pd->llpd_list,
-                        &dummy_llap->llap_pglist_item);
-        list_del_init(&dummy_llap->llap_pglist_item);
-        if (llap) {
-                list_add(&dummy_llap->llap_pglist_item,&llap->llap_pglist_item);
-                llap = llite_pglist_next_llap(&pd->llpd_list,
-                                &dummy_llap->llap_pglist_item);
-        }
-        if (llap == NULL) {
-                int i = cpu + 1;
-                for (next = NULL; i < num_possible_cpus(); i++, next = NULL) {
-                        next = ll_pglist_cpu_lock(sbi, i);
-                        if (!list_empty(&next->llpd_list))
-                                break;
-                        ll_pglist_cpu_unlock(sbi, i);
-                }
-                if (next != NULL) {
-                        list_move(&dummy_llap->llap_pglist_item,
-                                  &next->llpd_list);
-                        dummy_llap->llap_pglist_cpu = i;
-                        ll_pglist_cpu_unlock(sbi, cpu);
-                        llap = llite_pglist_next_llap(&next->llpd_list,
-                                        &dummy_llap->llap_pglist_item);
-                        LASSERT(llap);
-                        cpu = i;
-                }
-        }
-        ll_pglist_cpu_unlock(sbi, cpu);
-
-        ++*pos;
-        if (llap == NULL) {
-                dummy_llap->llap_magic = 2;
-                return NULL;
-        }
-        return dummy_llap;
-}
-
-static void null_stop(struct seq_file *seq, void *v)
-{
-}
-
-struct seq_operations llite_dump_pgcache_seq_sops = {
-        .start = llite_dump_pgcache_seq_start,
-        .stop = null_stop,
-        .next = llite_dump_pgcache_seq_next,
-        .show = llite_dump_pgcache_seq_show,
-};
-
-/* we're displaying llaps in a list_head list.  we don't want to hold a lock
- * while we walk the entire list, and we don't want to have to seek into
- * the right position in the list as an app advances with many syscalls.  we
- * allocate a dummy llap and hang it off file->private.  its position in
- * the list records where the app is currently displaying.  this way our
- * seq .start and .stop don't actually do anything.  .next returns null
- * when the dummy hits the end of the list which eventually leads to .release
- * where we tear down.  this kind of displaying is super-racey, so we put
- * a generation counter on the list so the output shows when the list
- * changes between reads.
- */
-static int llite_dump_pgcache_seq_open(struct inode *inode, struct file *file)
-{
-        struct proc_dir_entry *dp = PDE(inode);
-        struct ll_async_page *dummy_llap;
-        struct seq_file *seq;
-        struct ll_sb_info *sbi = dp->data;
-        struct ll_pglist_data *pd;
-        int rc = -ENOMEM;
-
-        LPROCFS_ENTRY_AND_CHECK(dp);
-
-        OBD_ALLOC_PTR_WAIT(dummy_llap);
-        if (dummy_llap == NULL)
-                GOTO(out, rc);
-        dummy_llap->llap_page = NULL;
-        dummy_llap->llap_cookie = sbi;
-        dummy_llap->llap_magic = 0;
-        dummy_llap->llap_pglist_cpu = 0;
-
-        rc = seq_open(file, &llite_dump_pgcache_seq_sops);
-        if (rc) {
-                OBD_FREE(dummy_llap, sizeof(*dummy_llap));
-                GOTO(out, rc);
-        }
-        seq = file->private_data;
-        seq->private = dummy_llap;
-
-        pd = ll_pglist_cpu_lock(sbi, 0);
-        list_add(&dummy_llap->llap_pglist_item, &pd->llpd_list);
-        ll_pglist_cpu_unlock(sbi, 0);
-
-out:
-        if (rc)
-                LPROCFS_EXIT();
-        return rc;
-}
-
-static int llite_dump_pgcache_seq_release(struct inode *inode,
-                                          struct file *file)
-{
-        struct seq_file *seq = file->private_data;
-        struct ll_async_page *dummy_llap = seq->private;
-        struct ll_sb_info *sbi = dummy_llap->llap_cookie;
-        int cpu = dummy_llap->llap_pglist_cpu;
-
-        ll_pglist_cpu_lock(sbi, cpu);
-        if (!list_empty(&dummy_llap->llap_pglist_item))
-                list_del_init(&dummy_llap->llap_pglist_item);
-        ll_pglist_cpu_unlock(sbi, cpu);
-        OBD_FREE(dummy_llap, sizeof(*dummy_llap));
-
-        return lprocfs_seq_release(inode, file);
-}
-
-struct file_operations llite_dump_pgcache_fops = {
-        .owner   = THIS_MODULE,
-        .open    = llite_dump_pgcache_seq_open,
-        .read    = seq_read,
-        .release = llite_dump_pgcache_seq_release,
-};
-
  #define pct(a,b) (b ? a * 100 / b : 0)
  
  static void ll_display_extents_info(struct ll_rw_extents_info *io_extents,
@@ -1248,8 +975,9 @@ static ssize_t ll_rw_extents_stats_seq_write(struct file *file, const char *buf,
  
  LPROC_SEQ_FOPS(ll_rw_extents_stats);
  
-void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct file
-                               *file, size_t count, int rw)
+void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid,
+                       struct ll_file_data *file, loff_t pos,
+                       size_t count, int rw)
  {
          int i, cur = -1;
          struct ll_rw_process_info *process;
@@ -1298,9 +1026,8 @@ void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct file
          for (i = 0; i < LL_PROCESS_HIST_MAX; i++) {
                  if (process[i].rw_pid == pid) {
                          if (process[i].rw_last_file != file) {
-                                process[i].rw_range_start = file->f_pos;
-                                process[i].rw_last_file_pos =
-                                                        file->f_pos + count;
+                                process[i].rw_range_start = pos;
+                                process[i].rw_last_file_pos = pos + count;
                                  process[i].rw_smallest_extent = count;
                                  process[i].rw_largest_extent = count;
                                  process[i].rw_offset = 0;
@@ -1308,7 +1035,7 @@ void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct file
                                  spin_unlock(&sbi->ll_process_lock);
                                  return;
                          }
-                        if (process[i].rw_last_file_pos != file->f_pos) {
+                        if (process[i].rw_last_file_pos != pos) {
                                  *off_count =
                                      (*off_count + 1) % LL_OFFSET_HIST_MAX;
                                  offset[*off_count].rw_op = process[i].rw_op;
@@ -1324,17 +1051,17 @@ void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct file
                                  offset[*off_count].rw_offset =
                                          process[i].rw_offset;
                                  process[i].rw_op = rw;
-                                process[i].rw_range_start = file->f_pos;
+                                process[i].rw_range_start = pos;
                                  process[i].rw_smallest_extent = count;
                                  process[i].rw_largest_extent = count;
-                                process[i].rw_offset = file->f_pos -
+                                process[i].rw_offset = pos -
                                          process[i].rw_last_file_pos;
                          }
                          if(process[i].rw_smallest_extent > count)
                                  process[i].rw_smallest_extent = count;
                          if(process[i].rw_largest_extent < count)
                                  process[i].rw_largest_extent = count;
-                        process[i].rw_last_file_pos = file->f_pos + count;
+                        process[i].rw_last_file_pos = pos + count;
                          spin_unlock(&sbi->ll_process_lock);
                          return;
                  }
@@ -1342,8 +1069,8 @@ void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct file
          *process_count = (*process_count + 1) % LL_PROCESS_HIST_MAX;
          process[*process_count].rw_pid = pid;
          process[*process_count].rw_op = rw;
-        process[*process_count].rw_range_start = file->f_pos;
-        process[*process_count].rw_last_file_pos = file->f_pos + count;
+        process[*process_count].rw_range_start = pos;
+        process[*process_count].rw_last_file_pos = pos + count;
          process[*process_count].rw_smallest_extent = count;
          process[*process_count].rw_largest_extent = count;
          process[*process_count].rw_offset = 0;
diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c

index a7d87c6..0933e2f 100644 (file)
--- a/lustre/llite/namei.c
+++ b/lustre/llite/namei.c
@@ -123,22 +123,28 @@ struct inode *ll_iget(struct super_block *sb, ino_t hash,
          if (inode) {
                  lli = ll_i2info(inode);
                  if (inode->i_state & I_NEW) {
+                        int rc;
+
                          ll_read_inode2(inode, md);
-                        unlock_new_inode(inode);
-                } else {
-                        if (!(inode->i_state & (I_FREEING | I_CLEAR)))
+                        rc = cl_inode_init(inode, md);
+                        if (rc != 0) {
+                                md->lsm = NULL;
+                                make_bad_inode(inode);
+                                unlock_new_inode(inode);
+                                iput(inode);
+                                inode = ERR_PTR(rc);
+                        } else
+                                unlock_new_inode(inode);
+                } else if (!(inode->i_state & (I_FREEING | I_CLEAR)))
                                  ll_update_inode(inode, md);
-                }
-                CDEBUG(D_VFSTRACE, "got inode: %lu/%u(%p) for "DFID"\n",
-                       inode->i_ino, inode->i_generation, inode,
-                       PFID(&lli->lli_fid));
+                CDEBUG(D_VFSTRACE, "got inode: %p for "DFID"\n",
+                       inode, PFID(&md->body->fid1));
          }
-
          RETURN(inode);
  }
  
  static void ll_drop_negative_dentry(struct inode *dir)
-{ 
+{
          struct dentry *dentry, *tmp_alias, *tmp_subdir;
  
          spin_lock(&ll_lookup_lock);
@@ -438,7 +444,7 @@ int ll_lookup_it_finish(struct ptlrpc_request *request,
                     2.4 and
                     vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6
                     Everybody else who needs correct file size would call
-                   ll_glimpse_size or some equivalent themselves anyway.
+                   cl_glimpse_size or some equivalent themselves anyway.
                     Also see bug 7198. */
  
                  ll_dops_init(*de, 1);
@@ -461,7 +467,7 @@ int ll_lookup_it_finish(struct ptlrpc_request *request,
                     might get picked up later when UPDATE lock will appear */
                  if (ll_have_md_lock(parent, MDS_INODELOCK_UPDATE)) {
                          spin_lock(&dcache_lock);
-                        ll_d_add(*de, inode);
+                        ll_d_add(*de, NULL);
                          spin_unlock(&dcache_lock);
                  } else {
                          (*de)->d_inode = NULL;
@@ -996,7 +1002,7 @@ static void ll_get_child_fid(struct inode * dir, struct qstr *name,
                               struct lu_fid *fid)
  {
          struct dentry *parent, *child;
-        
+
          parent = list_entry(dir->i_dentry.next, struct dentry, d_alias);
          child = d_lookup(parent, name);
          if (child) {
@@ -1013,7 +1019,7 @@ static int ll_rmdir_generic(struct inode *dir, struct dentry *dparent,
          struct md_op_data *op_data;
          int rc;
          ENTRY;
-        
+
          CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n",
                 name->len, name->name, dir->i_ino, dir->i_generation, dir);
  
@@ -1084,7 +1090,7 @@ int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir)
  
          if (body->valid & OBD_MD_FLCOOKIE) {
                  oa->o_valid |= OBD_MD_FLCOOKIE;
-                oti.oti_logcookies = 
+                oti.oti_logcookies =
                          req_capsule_server_sized_get(&request->rq_pill,
                                                       &RMF_LOGCOOKIES,
                                                     sizeof(struct llog_cookie) *
diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c

index 6a2be0a..4fb44d1 100644 (file)
--- a/lustre/llite/rw.c
+++ b/lustre/llite/rw.c
@@ -56,6 +56,8 @@
  #include <linux/mm.h>
  #include <linux/pagemap.h>
  #include <linux/smp_lock.h>
+/* current_is_kswapd() */
+#include <linux/swap.h>
  
  #define DEBUG_SUBSYSTEM S_LLITE
  
@@ -65,107 +67,6 @@
  #include "llite_internal.h"
  #include <linux/lustre_compat25.h>
  
-#ifndef list_for_each_prev_safe
-#define list_for_each_prev_safe(pos, n, head) \
-        for (pos = (head)->prev, n = pos->prev; pos != (head); \
-                pos = n, n = pos->prev )
-#endif
-
-cfs_mem_cache_t *ll_async_page_slab = NULL;
-size_t ll_async_page_slab_size = 0;
-
-/* SYNCHRONOUS I/O to object storage for an inode */
-static int ll_brw(int cmd, struct inode *inode, struct obdo *oa,
-                  struct page *page, int flags)
-{
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct lov_stripe_md *lsm = lli->lli_smd;
-        struct obd_info oinfo = { { { 0 } } };
-        struct brw_page pg;
-        int opc, rc;
-        ENTRY;
-
-        pg.pg = page;
-        pg.off = ((obd_off)page->index) << CFS_PAGE_SHIFT;
-
-        if ((cmd & OBD_BRW_WRITE) && (pg.off+CFS_PAGE_SIZE>i_size_read(inode)))
-                pg.count = i_size_read(inode) % CFS_PAGE_SIZE;
-        else
-                pg.count = CFS_PAGE_SIZE;
-
-        LL_CDEBUG_PAGE(D_PAGE, page, "%s %d bytes ino %lu at "LPU64"/"LPX64"\n",
-                       cmd & OBD_BRW_WRITE ? "write" : "read", pg.count,
-                       inode->i_ino, pg.off, pg.off);
-        if (pg.count == 0) {
-                CERROR("ZERO COUNT: ino %lu: size %p:%Lu(%p:%Lu) idx %lu off "
-                       LPU64"\n", inode->i_ino, inode, i_size_read(inode),
-                       page->mapping->host, i_size_read(page->mapping->host),
-                       page->index, pg.off);
-        }
-
-        pg.flag = flags;
-
-        if (cmd & OBD_BRW_WRITE)
-                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_WRITE,
-                                   pg.count);
-        else
-                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_READ,
-                                   pg.count);
-        oinfo.oi_oa = oa;
-        oinfo.oi_md = lsm;
-        /* NB partial write, so we might not have CAPA_OPC_OSS_READ capa */
-        opc = cmd & OBD_BRW_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW;
-        oinfo.oi_capa = ll_osscapa_get(inode, opc);
-        rc = obd_brw(cmd, ll_i2dtexp(inode), &oinfo, 1, &pg, NULL);
-        capa_put(oinfo.oi_capa);
-        if (rc == 0)
-                obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS);
-        else if (rc != -EIO)
-                CERROR("error from obd_brw: rc = %d\n", rc);
-        RETURN(rc);
-}
-
-int ll_file_punch(struct inode * inode, loff_t new_size, int srvlock)
-{
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct obd_info oinfo = { { { 0 } } };
-        struct obdo oa;
-        int rc;
-
-        ENTRY;
-        CDEBUG(D_INFO, "calling punch for "LPX64" (new size %Lu=%#Lx)\n",
-               lli->lli_smd->lsm_object_id, i_size_read(inode), i_size_read(inode));
-
-        oinfo.oi_md = lli->lli_smd;
-        oinfo.oi_policy.l_extent.start = new_size;
-        oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF;
-        oinfo.oi_oa = &oa;
-        oa.o_id = lli->lli_smd->lsm_object_id;
-        oa.o_gr = lli->lli_smd->lsm_object_gr;
-        oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
-        if (srvlock) {
-                /* set OBD_MD_FLFLAGS in o_valid, only if we
-                 * set OBD_FL_TRUNCLOCK, otherwise ost_punch
-                 * and filter_setattr get confused, see the comment
-                 * in ost_punch */
-                oa.o_flags = OBD_FL_TRUNCLOCK;
-                oa.o_valid |= OBD_MD_FLFLAGS;
-        }
-        obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |
-                        OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME |
-                        OBD_MD_FLFID | OBD_MD_FLGENER);
-
-        oinfo.oi_capa = ll_osscapa_get(inode, CAPA_OPC_OSS_TRUNC);
-        rc = obd_punch_rqset(ll_i2dtexp(inode), &oinfo, NULL);
-        ll_truncate_free_capa(oinfo.oi_capa);
-        if (rc)
-                CERROR("obd_truncate fails (%d) ino %lu\n", rc, inode->i_ino);
-        else
-                obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS |
-                              OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME);
-        RETURN(rc);
-}
-
  /* this isn't where truncate starts.   roughly:
   * sys_truncate->ll_setattr_raw->vmtruncate->ll_truncate. setattr_raw grabs
   * DLM lock on [size, EOF], i_mutex, ->lli_size_sem, and WRITE_I_ALLOC_SEM to
@@ -175,7 +76,6 @@ int ll_file_punch(struct inode * inode, loff_t new_size, int srvlock)
  void ll_truncate(struct inode *inode)
  {
          struct ll_inode_info *lli = ll_i2info(inode);
-        int srvlock = !!(lli->lli_flags & LLIF_SRVLOCK);
          loff_t new_size;
          ENTRY;
          CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) to %Lu=%#Lx\n",inode->i_ino,
@@ -183,7 +83,7 @@ void ll_truncate(struct inode *inode)
                 i_size_read(inode));
  
          ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_TRUNC, 1);
-        if (lli->lli_size_sem_owner != current) {
+        if (lli->lli_size_sem_owner != cfs_current()) {
                  EXIT;
                  return;
          }
@@ -193,29 +93,7 @@ void ll_truncate(struct inode *inode)
                         inode->i_ino);
                  GOTO(out_unlock, 0);
          }
-
-        LASSERT(atomic_read(&lli->lli_size_sem.count) <= 0);
-
-        if (!srvlock) {
-                struct ost_lvb lvb;
-                int rc;
-
-                /* XXX I'm pretty sure this is a hack to paper
-                 * over a more fundamental race condition. */
-                lov_stripe_lock(lli->lli_smd);
-                inode_init_lvb(inode, &lvb);
-                rc = obd_merge_lvb(ll_i2dtexp(inode), lli->lli_smd, &lvb, 0);
-                if (lvb.lvb_size == i_size_read(inode) && rc == 0) {
-                        CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64
-                               ",%Lu=%#Lx\n", lli->lli_smd->lsm_object_id,
-                               i_size_read(inode), i_size_read(inode));
-                        lov_stripe_unlock(lli->lli_smd);
-                        GOTO(out_unlock, 0);
-                }
-                obd_adjust_kms(ll_i2dtexp(inode), lli->lli_smd,
-                               i_size_read(inode), 1);
-                lov_stripe_unlock(lli->lli_smd);
-        }
+        LASSERT_SEM_LOCKED(&lli->lli_size_sem);
  
          if (unlikely((ll_i2sbi(inode)->ll_flags & LL_SBI_CHECKSUM) &&
                       (i_size_read(inode) & ~CFS_PAGE_MASK))) {
@@ -224,6 +102,7 @@ void ll_truncate(struct inode *inode)
                                                    i_size_read(inode) >>
                                                    CFS_PAGE_SHIFT);
                  if (page != NULL) {
+#if 0 /* XXX */
                          struct ll_async_page *llap = llap_cast_private(page);
                          if (llap != NULL) {
                                  char *kaddr = kmap_atomic(page, KM_USER0);
@@ -236,15 +115,12 @@ void ll_truncate(struct inode *inode)
                                  kunmap_atomic(kaddr, KM_USER0);
                          }
                          page_cache_release(page);
+#endif
                  }
          }
  
          new_size = i_size_read(inode);
          ll_inode_size_unlock(inode, 0);
-        if (!srvlock)
-                ll_file_punch(inode, new_size, 0);
-        else
-                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LOCKLESS_TRUNC, 1);
  
          EXIT;
          return;
@@ -253,848 +129,231 @@ void ll_truncate(struct inode *inode)
          ll_inode_size_unlock(inode, 0);
  } /* ll_truncate */
  
-int ll_prepare_write(struct file *file, struct page *page, unsigned from,
-                     unsigned to)
-{
-        struct inode *inode = page->mapping->host;
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct lov_stripe_md *lsm = lli->lli_smd;
-        obd_off offset = ((obd_off)page->index) << CFS_PAGE_SHIFT;
-        struct obd_info oinfo = { { { 0 } } };
-        struct brw_page pga;
-        struct obdo oa;
-        struct ost_lvb lvb;
-        int rc = 0;
-        ENTRY;
-
-        LASSERT(PageLocked(page));
-        (void)llap_cast_private(page); /* assertion */
-
-        /* Check to see if we should return -EIO right away */
-        pga.pg = page;
-        pga.off = offset;
-        pga.count = CFS_PAGE_SIZE;
-        pga.flag = 0;
-
-        oa.o_mode = inode->i_mode;
-        oa.o_id = lsm->lsm_object_id;
-        oa.o_gr = lsm->lsm_object_gr;
-        oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE |
-                     OBD_MD_FLTYPE | OBD_MD_FLGROUP;
-        obdo_from_inode(&oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER);
-
-        oinfo.oi_oa = &oa;
-        oinfo.oi_md = lsm;
-        rc = obd_brw(OBD_BRW_CHECK, ll_i2dtexp(inode), &oinfo, 1, &pga, NULL);
-        if (rc)
-                RETURN(rc);
-
-        if (PageUptodate(page)) {
-                LL_CDEBUG_PAGE(D_PAGE, page, "uptodate\n");
-                RETURN(0);
-        }
-
-        /* We're completely overwriting an existing page, so _don't_ set it up
-         * to date until commit_write */
-        if (from == 0 && to == CFS_PAGE_SIZE) {
-                LL_CDEBUG_PAGE(D_PAGE, page, "full page write\n");
-                POISON_PAGE(page, 0x11);
-                RETURN(0);
-        }
-
-        /* If are writing to a new page, no need to read old data.  The extent
-         * locking will have updated the KMS, and for our purposes here we can
-         * treat it like i_size. */
-        lov_stripe_lock(lsm);
-        inode_init_lvb(inode, &lvb);
-        obd_merge_lvb(ll_i2dtexp(inode), lsm, &lvb, 1);
-        lov_stripe_unlock(lsm);
-        if (lvb.lvb_size <= offset) {
-                char *kaddr = kmap_atomic(page, KM_USER0);
-                LL_CDEBUG_PAGE(D_PAGE, page, "kms "LPU64" <= offset "LPU64"\n",
-                               lvb.lvb_size, offset);
-                memset(kaddr, 0, CFS_PAGE_SIZE);
-                kunmap_atomic(kaddr, KM_USER0);
-                GOTO(prepare_done, rc = 0);
-        }
-
-        /* XXX could be an async ocp read.. read-ahead? */
-        rc = ll_brw(OBD_BRW_READ, inode, &oa, page, 0);
-        if (rc == 0) {
-                /* bug 1598: don't clobber blksize */
-                oa.o_valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLKSZ);
-                obdo_refresh_inode(inode, &oa, oa.o_valid);
-        }
-
-        EXIT;
- prepare_done:
-        if (rc == 0)
-                SetPageUptodate(page);
-
-        return rc;
-}
-
  /**
- * make page ready for ASYNC write
- * \param data - pointer to llap cookie
- * \param cmd - is OBD_BRW_* macroses
- *
- * \retval 0 is page successfully prepared to send
- * \retval -EAGAIN is page not need to send
+ * Initializes common cl-data at the typical address_space operation entry
+ * point.
   */
-static int ll_ap_make_ready(void *data, int cmd)
+static int ll_cl_init(struct file *file, struct page *vmpage,
+                      struct lu_env **env,
+                      struct cl_io **io, struct cl_page **page, int *refcheck)
  {
-        struct ll_async_page *llap;
-        struct page *page;
-        ENTRY;
-
-        llap = llap_from_cookie(data);
-        page = llap->llap_page;
-
-        /* we're trying to write, but the page is locked.. come back later */
-        if (TryLockPage(page))
-                RETURN(-EAGAIN);
-
-        LASSERTF(!(cmd & OBD_BRW_READ) || !PageWriteback(page),
-                "cmd %x page %p ino %lu index %lu fl %lx\n", cmd, page,
-                 page->mapping->host->i_ino, page->index, page->flags);
-
-        /* if we left PageDirty we might get another writepage call
-         * in the future.  list walkers are bright enough
-         * to check page dirty so we can leave it on whatever list
-         * its on.  XXX also, we're called with the cli list so if
-         * we got the page cache list we'd create a lock inversion
-         * with the removepage path which gets the page lock then the
-         * cli lock */
-        LASSERTF(!PageWriteback(page),"cmd %x page %p ino %lu index %lu\n", cmd, page,
-                 page->mapping->host->i_ino, page->index);
-        if(!clear_page_dirty_for_io(page)) {
-               unlock_page(page);
-               RETURN(-EAGAIN);
-       }
-
-        /* This actually clears the dirty bit in the radix tree.*/
-        set_page_writeback(page);
-
-        LL_CDEBUG_PAGE(D_PAGE, page, "made ready\n");
-        page_cache_get(page);
-
-        RETURN(0);
+        struct lu_env    *_env;
+        struct cl_io     *_io;
+        struct cl_page   *_page;
+        struct cl_object *clob;
+
+        int result;
+
+        *env  = NULL;
+        *io   = NULL;
+        *page = NULL;
+
+        clob = ll_i2info(vmpage->mapping->host)->lli_clob;
+        LASSERT(clob != NULL);
+
+        _env = cl_env_get(refcheck);
+        if (!IS_ERR(env)) {
+                struct ccc_io *cio = ccc_env_io(_env);
+
+                *env = _env;
+                *io  = _io = cio->cui_cl.cis_io;
+                if (_io != NULL) {
+                        LASSERT(_io->ci_state == CIS_IO_GOING);
+                        LASSERT(cio->cui_fd == LUSTRE_FPRIVATE(file));
+                        _page = cl_page_find(_env, clob, vmpage->index, vmpage,
+                                             CPT_CACHEABLE);
+                        if (!IS_ERR(_page)) {
+                                *page = _page;
+                                lu_ref_add(&_page->cp_reference, "cl_io", _io);
+                                result = 0;
+                        } else
+                                result = PTR_ERR(_page);
+                } else
+                        /*
+                         * This is for a case where operation can be called
+                         * either with or without cl_io created by the upper
+                         * layer (e.g., ->prepare_write() called directly from
+                         * loop-back driver).
+                         */
+                        result = -EALREADY;
+        } else
+                result = PTR_ERR(_env);
+        CDEBUG(D_VFSTRACE, "%lu@"DFID" -> %i %p %p %p\n",
+               vmpage->index, PFID(lu_object_fid(&clob->co_lu)), result,
+               *env, *io, *page);
+        return result;
  }
  
-/* We have two reasons for giving llite the opportunity to change the
- * write length of a given queued page as it builds the RPC containing
- * the page:
- *
- * 1) Further extending writes may have landed in the page cache
- *    since a partial write first queued this page requiring us
- *    to write more from the page cache.  (No further races are possible, since
- *    by the time this is called, the page is locked.)
- * 2) We might have raced with truncate and want to avoid performing
- *    write RPCs that are just going to be thrown away by the
- *    truncate's punch on the storage targets.
- *
- * The kms serves these purposes as it is set at both truncate and extending
- * writes.
+/**
+ * Dual to ll_cl_init(): puts \a page, finishes a one-shot cl_io once its
+ * cui_oneshot count drops to zero, and releases \a env.
   */
-static int ll_ap_refresh_count(void *data, int cmd)
-{
-        struct ll_inode_info *lli;
-        struct ll_async_page *llap;
-        struct lov_stripe_md *lsm;
-        struct page *page;
-        struct inode *inode;
-        struct ost_lvb lvb;
-        __u64 kms;
-        ENTRY;
-
-        /* readpage queues with _COUNT_STABLE, shouldn't get here. */
-        LASSERT(cmd != OBD_BRW_READ);
-
-        llap = llap_from_cookie(data);
-        page = llap->llap_page;
-        inode = page->mapping->host;
-        lli = ll_i2info(inode);
-        lsm = lli->lli_smd;
-
-        lov_stripe_lock(lsm);
-        inode_init_lvb(inode, &lvb);
-        obd_merge_lvb(ll_i2dtexp(inode), lsm, &lvb, 1);
-        kms = lvb.lvb_size;
-        lov_stripe_unlock(lsm);
-
-        /* catch race with truncate */
-        if (((__u64)page->index << CFS_PAGE_SHIFT) >= kms)
-                return 0;
-
-        /* catch sub-page write at end of file */
-        if (((__u64)page->index << CFS_PAGE_SHIFT) + CFS_PAGE_SIZE > kms)
-                return kms % CFS_PAGE_SIZE;
-
-        return CFS_PAGE_SIZE;
-}
-
-void ll_inode_fill_obdo(struct inode *inode, int cmd, struct obdo *oa)
-{
-        struct lov_stripe_md *lsm;
-        obd_flag valid_flags;
-
-        lsm = ll_i2info(inode)->lli_smd;
-
-        oa->o_id = lsm->lsm_object_id;
-        oa->o_gr = lsm->lsm_object_gr;
-        oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
-        valid_flags = OBD_MD_FLTYPE | OBD_MD_FLATIME;
-        if (cmd & OBD_BRW_WRITE) {
-                oa->o_valid |= OBD_MD_FLEPOCH;
-                oa->o_easize = ll_i2info(inode)->lli_ioepoch;
-
-                valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME |
-                        OBD_MD_FLUID | OBD_MD_FLGID |
-                        OBD_MD_FLFID | OBD_MD_FLGENER;
-        }
-
-        obdo_from_inode(oa, inode, valid_flags);
-}
-
-static void ll_ap_fill_obdo(void *data, int cmd, struct obdo *oa)
-{
-        struct ll_async_page *llap;
-        ENTRY;
-
-        llap = llap_from_cookie(data);
-        ll_inode_fill_obdo(llap->llap_page->mapping->host, cmd, oa);
-
-        EXIT;
-}
-
-static void ll_ap_update_obdo(void *data, int cmd, struct obdo *oa,
-                              obd_valid valid)
-{
-        struct ll_async_page *llap;
-        ENTRY;
-
-        llap = llap_from_cookie(data);
-        obdo_from_inode(oa, llap->llap_page->mapping->host, valid);
-
-        EXIT;
-}
-
-static struct obd_capa *ll_ap_lookup_capa(void *data, int cmd)
-{
-        int opc = cmd & OBD_BRW_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW;
-        struct ll_async_page *llap = llap_from_cookie(data);
-
-        return ll_osscapa_get(llap->llap_page->mapping->host, opc);
-}
-
-static struct obd_async_page_ops ll_async_page_ops = {
-        .ap_make_ready =        ll_ap_make_ready,
-        .ap_refresh_count =     ll_ap_refresh_count,
-        .ap_fill_obdo =         ll_ap_fill_obdo,
-        .ap_update_obdo =       ll_ap_update_obdo,
-        .ap_completion =        ll_ap_completion,
-        .ap_lookup_capa =       ll_ap_lookup_capa,
-};
-
-struct ll_async_page *llap_cast_private(struct page *page)
-{
-        struct ll_async_page *llap = (struct ll_async_page *)page_private(page);
-
-        LASSERTF(llap == NULL || llap->llap_magic == LLAP_MAGIC,
-                 "page %p private %lu gave magic %d which != %d\n",
-                 page, page_private(page), llap->llap_magic, LLAP_MAGIC);
-
-        return llap;
-}
-
-/* Try to reap @target pages in the specific @cpu's async page list.
- *
- * There is an llap attached onto every page in lustre, linked off @sbi.
- * We add an llap to the list so we don't lose our place during list walking.
- * If llaps in the list are being moved they will only move to the end
- * of the LRU, and we aren't terribly interested in those pages here (we
- * start at the beginning of the list where the least-used llaps are. */
-static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, 
-        int cpu, int target)
+static void ll_cl_fini(struct lu_env *env,
+                       struct cl_io *io, struct cl_page *page, int *refcheck)
  {
-        struct ll_async_page *llap, dummy_llap = { .llap_magic = 0xd11ad11a };
-        struct ll_pglist_data *pd;
-        struct list_head *head;
-        int count = 0;
-
-        pd = ll_pglist_cpu_lock(sbi, cpu);
-        head = &pd->llpd_list;
-        list_add(&dummy_llap.llap_pglist_item, head);
-        while (count < target) {
-                struct page *page;
-                int keep;
-
-                if (unlikely(need_resched())) {
-                        ll_pglist_cpu_unlock(sbi, cpu);
-                        cond_resched();
-                        ll_pglist_cpu_lock(sbi, cpu);
-                }
-
-                llap = llite_pglist_next_llap(head, 
-                        &dummy_llap.llap_pglist_item);
-                list_del_init(&dummy_llap.llap_pglist_item);
-                if (llap == NULL)
-                        break;
-
-                page = llap->llap_page;
-                LASSERT(page != NULL);
-
-                list_add(&dummy_llap.llap_pglist_item, &llap->llap_pglist_item);
-
-                /* Page needs/undergoing IO */
-                if (TryLockPage(page)) {
-                        LL_CDEBUG_PAGE(D_PAGE, page, "can't lock\n");
-                        continue;
-                }
-
-               keep = (llap->llap_write_queued || PageDirty(page) ||
-                      PageWriteback(page) || (!PageUptodate(page) &&
-                      llap->llap_origin != LLAP_ORIGIN_READAHEAD));
-
-                LL_CDEBUG_PAGE(D_PAGE, page,"%s LRU page: %s%s%s%s%s origin %s\n",
-                               keep ? "keep" : "drop",
-                               llap->llap_write_queued ? "wq " : "",
-                               PageDirty(page) ? "pd " : "",
-                               PageUptodate(page) ? "" : "!pu ",
-                               PageWriteback(page) ? "wb" : "",
-                               llap->llap_defer_uptodate ? "" : "!du",
-                               llap_origins[llap->llap_origin]);
-
-                /* If page is dirty or undergoing IO don't discard it */
-                if (keep) {
-                        unlock_page(page);
-                        continue;
-                }
-
-                page_cache_get(page);
-                ll_pglist_cpu_unlock(sbi, cpu);
-
-                if (page->mapping != NULL) {
-                        ll_teardown_mmaps(page->mapping,
-                                         (__u64)page->index << CFS_PAGE_SHIFT,
-                                         ((__u64)page->index << CFS_PAGE_SHIFT)|
-                                          ~CFS_PAGE_MASK);
-                        if (!PageDirty(page) && !page_mapped(page)) {
-                                ll_ra_accounting(llap, page->mapping);
-                                ll_truncate_complete_page(page);
-                                ++count;
-                        } else {
-                                LL_CDEBUG_PAGE(D_PAGE, page, "Not dropping page"
-                                                             " because it is "
-                                                             "%s\n",
-                                                              PageDirty(page)?
-                                                              "dirty":"mapped");
+        if (page != NULL) {
+                lu_ref_del(&page->cp_reference, "cl_io", io);
+                cl_page_put(env, page);
+        }
+        if (env != NULL) {
+                struct vvp_io *vio;
+
+                vio = vvp_env_io(env);
+                LASSERT(vio->cui_oneshot >= 0);
+                if (vio->cui_oneshot > 0) {
+                        if (--vio->cui_oneshot == 0) {
+                                cl_io_end(env, io);
+                                cl_io_unlock(env, io);
+                                cl_io_iter_fini(env, io);
+                                cl_io_fini(env, io);
+                                /* to trigger assertion above, if ll_cl_fini()
+                                 * is called against freed io. */
+                                vio->cui_oneshot = -1;
                          }
+                        /* additional reference on env was acquired by io,
+                         * disable refcheck */
+                        refcheck = NULL;
                  }
-                unlock_page(page);
-                page_cache_release(page);
-
-                ll_pglist_cpu_lock(sbi, cpu);
-        }
-        list_del(&dummy_llap.llap_pglist_item);
-        ll_pglist_cpu_unlock(sbi, cpu);
-
-        CDEBUG(D_CACHE, "shrank %d, expected %d however. \n", count, target);
-        return count;
-}
-
-
-/* Try to shrink the page cache for the @sbi filesystem by 1/@shrink_fraction.
- *
- * At first, this code calculates total pages wanted by @shrink_fraction, then
- * it deduces how many pages should be reaped from each cpu in proportion as 
- * their own # of page count(llpd_count).
- */
-int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction)
-{
-        unsigned long total, want, percpu_want, count = 0;
-        int cpu, nr_cpus;
-
-        total = lcounter_read(&sbi->ll_async_page_count);
-        if (total == 0)
-                return 0;
-
-#ifdef HAVE_SHRINKER_CACHE
-        want = shrink_fraction;
-        if (want == 0)
-                return total;
-#else
-        /* There can be a large number of llaps (600k or more in a large
-         * memory machine) so the VM 1/6 shrink ratio is likely too much.
-         * Since we are freeing pages also, we don't necessarily want to
-         * shrink so much.  Limit to 40MB of pages + llaps per call. */
-        if (shrink_fraction <= 0)
-                want = total - sbi->ll_async_page_max + 32*num_online_cpus();
-        else
-                want = (total + shrink_fraction - 1) / shrink_fraction;
-#endif
-
-        if (want > 40 << (20 - CFS_PAGE_SHIFT))
-                want = 40 << (20 - CFS_PAGE_SHIFT);
-
-        CDEBUG(D_CACHE, "shrinking %lu of %lu pages (1/%d)\n",
-               want, total, shrink_fraction);
-
-        nr_cpus = num_possible_cpus();
-        cpu = sbi->ll_async_page_clock_hand;
-        /* we at most do one round */
-        do {
-                int c;
-
-                cpu = (cpu + 1) % nr_cpus;
-                c = LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_count;
-                if (!cpu_online(cpu))
-                        percpu_want = c;
-                else
-                        percpu_want = want / ((total / (c + 1)) + 1);
-                if (percpu_want == 0)
-                        continue;
-
-                count += llap_shrink_cache_internal(sbi, cpu, percpu_want);
-                if (count >= want)
-                        sbi->ll_async_page_clock_hand = cpu;
-        } while (cpu != sbi->ll_async_page_clock_hand);
-
-        CDEBUG(D_CACHE, "shrank %lu/%lu and left %lu unscanned\n",
-               count, want, total);
-
-#ifdef HAVE_SHRINKER_CACHE
-        return lcounter_read(&sbi->ll_async_page_count);
-#else
-        return count;
-#endif
+                cl_env_put(env, refcheck);
+        } else
+                LASSERT(io == NULL);
  }
  
-/* Rebalance the async page queue len for each cpu. We hope that the cpu
- * which do much IO job has a relative longer queue len.
- * This function should be called with preempt disabled.
+/**
+ * Initializes one-shot cl_io for the case when loop driver calls
+ * ->{prepare,commit}_write() methods directly.
   */
-static inline int llap_async_cache_rebalance(struct ll_sb_info *sbi)
+static int ll_prepare_loop(struct lu_env *env, struct cl_io *io,
+                           struct file *file, struct page *vmpage,
+                           unsigned from, unsigned to)
  {
-        unsigned long sample = 0, *cpu_sample, bias, slice;
-        struct ll_pglist_data *pd;
-        cpumask_t mask;
-        int cpu, surplus;
-        int w1 = 7, w2 = 3, base = (w1 + w2); /* weight value */
-        atomic_t *pcnt;
-
-        if (!spin_trylock(&sbi->ll_async_page_reblnc_lock)) {
-                /* someone else is doing the job */
-                return 1;
-        }
-
-        pcnt = &LL_PGLIST_DATA(sbi)->llpd_sample_count;
-        if (!atomic_read(pcnt)) {
-                /* rare case, somebody else has gotten this job done */
-                spin_unlock(&sbi->ll_async_page_reblnc_lock);
-                return 1;
-        }
-
-        sbi->ll_async_page_reblnc_count++;
-        cpu_sample = sbi->ll_async_page_sample;
-        memset(cpu_sample, 0, num_possible_cpus() * sizeof(unsigned long));
-        for_each_online_cpu(cpu) {
-                pcnt = &LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_sample_count;
-                cpu_sample[cpu] = atomic_read(pcnt);
-                atomic_set(pcnt, 0);
-                sample += cpu_sample[cpu];
-        }
+        struct vvp_io *vio;
+        struct ccc_io *cio;
+        int result;
+        loff_t pos;
  
-        cpus_clear(mask);
-        surplus = sbi->ll_async_page_max;
-        slice = surplus / sample + 1;
-        sample /= num_online_cpus();
-        bias = sample >> 4;
-        for_each_online_cpu(cpu) {
-                pd = LL_PGLIST_DATA_CPU(sbi, cpu);
-                if (labs((long int)sample - cpu_sample[cpu]) > bias) {
-                        unsigned long budget = pd->llpd_budget;
-                        /* weighted original queue length and expected queue
-                         * length to avoid thrashing. */
-                        pd->llpd_budget = (budget * w1) / base +
-                                        (slice * cpu_sample[cpu]) * w2 / base;
-                        cpu_set(cpu, mask);
-                }
-                surplus -= pd->llpd_budget;
-        }
-        surplus /= cpus_weight(mask) ?: 1;
-        for_each_cpu_mask(cpu, mask)
-                LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_budget += surplus;
-        spin_unlock(&sbi->ll_async_page_reblnc_lock);
-
-        /* TODO: do we really need to call llap_shrink_cache_internal 
-         * for every cpus with its page_count greater than budget?
-         * for_each_cpu_mask(cpu, mask) 
-         *      ll_shrink_cache_internal(...) 
+        vio = vvp_env_io(env);
+        cio = ccc_env_io(env);
+        ll_io_init(io, file, 1);
+        pos = ((loff_t)vmpage->index << CFS_PAGE_SHIFT) + from;
+        /*
+         * Create IO and quickly drive it through CIS_{INIT,IT_STARTED,LOCKED}
+         * states. DLM locks are not taken for vio->cui_oneshot IO---we cannot
+         * take DLM locks here, because page is already locked. With new
+         * ->write_{begin,end}() address_space operations lustre might be
+         * luckier.
           */
-
-        return 0;
-}
-
-static struct ll_async_page *llap_from_page_with_lockh(struct page *page,
-                                                       unsigned origin,
-                                                       struct lustre_handle *lockh)
-{
-        struct ll_async_page *llap;
-        struct obd_export *exp;
-        struct inode *inode = page->mapping->host;
-        struct ll_sb_info *sbi;
-        struct ll_pglist_data *pd;
-        int rc, cpu, target;
-        ENTRY;
-
-        if (!inode) {
-                static int triggered;
-
-                if (!triggered) {
-                        LL_CDEBUG_PAGE(D_ERROR, page, "Bug 10047. Wrong anon "
-                                       "page received\n");
-                        libcfs_debug_dumpstack(NULL);
-                        triggered = 1;
-                }
-                RETURN(ERR_PTR(-EINVAL));
-        }
-        sbi = ll_i2sbi(inode);
-        LASSERT(ll_async_page_slab);
-        LASSERTF(origin < LLAP__ORIGIN_MAX, "%u\n", origin);
-
-        llap = llap_cast_private(page);
-        if (llap != NULL) {
-                /* move to end of LRU list, except when page is just about to
-                 * die */
-                if (origin != LLAP_ORIGIN_REMOVEPAGE) {
-                        int old_cpu = llap->llap_pglist_cpu;
-                        struct ll_pglist_data *old_pd;
-
-                        pd = ll_pglist_double_lock(sbi, old_cpu, &old_pd);
-                        pd->llpd_hit++;
-                        while (old_cpu != llap->llap_pglist_cpu) {
-                                /* rarely case, someone else is touching this
-                                 * page too. */
-                                ll_pglist_double_unlock(sbi, old_cpu);
-                                old_cpu = llap->llap_pglist_cpu;
-                                pd=ll_pglist_double_lock(sbi, old_cpu, &old_pd);
-                        }
-
-                        list_move(&llap->llap_pglist_item,
-                                  &pd->llpd_list);
-                        old_pd->llpd_gen++;
-                        if (pd->llpd_cpu != old_cpu) {
-                                pd->llpd_count++;
-                                old_pd->llpd_count--;
-                                old_pd->llpd_gen++;
-                                llap->llap_pglist_cpu = pd->llpd_cpu;
-                                pd->llpd_cross++;
-                        }
-                        ll_pglist_double_unlock(sbi, old_cpu);
-                }
-                GOTO(out, llap);
-        }
-
-        exp = ll_i2dtexp(page->mapping->host);
-        if (exp == NULL)
-                RETURN(ERR_PTR(-EINVAL));
-
-        /* limit the number of lustre-cached pages */
-        cpu = get_cpu();
-        pd = LL_PGLIST_DATA(sbi);
-        target = pd->llpd_count - pd->llpd_budget;
-        if (target > 0) {
-                rc = 0;
-                atomic_inc(&pd->llpd_sample_count);
-                if (atomic_read(&pd->llpd_sample_count) > 
-                    sbi->ll_async_page_sample_max) {
-                        pd->llpd_reblnc_count++;
-                        rc = llap_async_cache_rebalance(sbi);
-                        if (rc == 0)
-                                target = pd->llpd_count - pd->llpd_budget;
-                }
-                /* if rc equals 1, it means other cpu is doing the rebalance
-                 * job, and our budget # would be modified when we read it. 
-                 * Furthermore, it is much likely being increased because
-                 * we have already reached the rebalance threshold. In this
-                 * case, we skip to shrink cache here. */
-                if ((rc == 0) && target > 0)
-                        llap_shrink_cache_internal(sbi, cpu, target + 32);
-        }
-        put_cpu();
-
-        OBD_SLAB_ALLOC(llap, ll_async_page_slab, CFS_ALLOC_STD,
-                       ll_async_page_slab_size);
-        if (llap == NULL)
-                RETURN(ERR_PTR(-ENOMEM));
-        llap->llap_magic = LLAP_MAGIC;
-        llap->llap_cookie = (void *)llap + size_round(sizeof(*llap));
-
-        /* XXX: for bug 11270 - check for lockless origin here! */
-        if (origin == LLAP_ORIGIN_LOCKLESS_IO)
-                llap->llap_nocache = 1;
-
-        rc = obd_prep_async_page(exp, ll_i2info(inode)->lli_smd, NULL, page,
-                                 (obd_off)page->index << CFS_PAGE_SHIFT,
-                                 &ll_async_page_ops, llap, &llap->llap_cookie,
-                                 llap->llap_nocache, lockh);
-        if (rc) {
-                OBD_SLAB_FREE(llap, ll_async_page_slab,
-                              ll_async_page_slab_size);
-                RETURN(ERR_PTR(rc));
-        }
-
-        CDEBUG(D_CACHE, "llap %p page %p cookie %p obj off "LPU64"\n", llap,
-               page, llap->llap_cookie, (obd_off)page->index << CFS_PAGE_SHIFT);
-        /* also zeroing the PRIVBITS low order bitflags */
-        __set_page_ll_data(page, llap);
-        llap->llap_page = page;
-
-        lcounter_inc(&sbi->ll_async_page_count);
-        pd = ll_pglist_lock(sbi);
-        list_add_tail(&llap->llap_pglist_item, &pd->llpd_list);
-        INIT_LIST_HEAD(&llap->llap_pending_write);
-        pd->llpd_count++;
-        pd->llpd_gen++;
-        pd->llpd_miss++;
-        llap->llap_pglist_cpu = pd->llpd_cpu;
-        ll_pglist_unlock(sbi);
-
- out:
-        if (unlikely(sbi->ll_flags & LL_SBI_CHECKSUM)) {
-                __u32 csum;
-                char *kaddr = kmap_atomic(page, KM_USER0);
-                csum = init_checksum(OSC_DEFAULT_CKSUM);
-                csum = compute_checksum(csum, kaddr, CFS_PAGE_SIZE,
-                                        OSC_DEFAULT_CKSUM);
-                kunmap_atomic(kaddr, KM_USER0);
-                if (origin == LLAP_ORIGIN_READAHEAD ||
-                    origin == LLAP_ORIGIN_READPAGE ||
-                    origin == LLAP_ORIGIN_LOCKLESS_IO) {
-                        llap->llap_checksum = 0;
-                } else if (origin == LLAP_ORIGIN_COMMIT_WRITE ||
-                           llap->llap_checksum == 0) {
-                        llap->llap_checksum = csum;
-                        CDEBUG(D_PAGE, "page %p cksum %x\n", page, csum);
-                } else if (llap->llap_checksum == csum) {
-                        /* origin == LLAP_ORIGIN_WRITEPAGE */
-                        CDEBUG(D_PAGE, "page %p cksum %x confirmed\n",
-                               page, csum);
-                } else {
-                        /* origin == LLAP_ORIGIN_WRITEPAGE */
-                        LL_CDEBUG_PAGE(D_ERROR, page, "old cksum %x != new "
-                                       "%x!\n", llap->llap_checksum, csum);
+        result = cl_io_rw_init(env, io, CIT_WRITE, pos, to - from);
+        if (result == 0) {
+                cio->cui_fd = LUSTRE_FPRIVATE(file);
+                vio->cui_oneshot = 1;
+                result = cl_io_iter_init(env, io);
+                if (result == 0) {
+                        result = cl_io_lock(env, io);
+                        if (result == 0)
+                                result = cl_io_start(env, io);
                  }
-        }
-
-        llap->llap_origin = origin;
-        RETURN(llap);
+        } else
+                result = io->ci_result;
+        return result;
  }
  
-struct ll_async_page *llap_from_page(struct page *page,
-                                     unsigned origin)
-{
-        return llap_from_page_with_lockh(page, origin, NULL);
-}
-
-static int queue_or_sync_write(struct obd_export *exp, struct inode *inode,
-                               struct ll_async_page *llap,
-                               unsigned to, obd_flag async_flags)
+/**
+ * ->prepare_write() address space operation called by generic_file_write()
+ * for every page during write.
+ */
+int ll_prepare_write(struct file *file, struct page *vmpage, unsigned from,
+                     unsigned to)
  {
-        unsigned long size_index = i_size_read(inode) >> CFS_PAGE_SHIFT;
-        struct obd_io_group *oig;
-        struct ll_sb_info *sbi = ll_i2sbi(inode);
-        int rc, noquot = llap->llap_ignore_quota ? OBD_BRW_NOQUOTA : 0;
+        struct lu_env    *env;
+        struct cl_io     *io;
+        struct cl_page   *page;
+        int result;
+        int refcheck;
          ENTRY;
  
-        /* _make_ready only sees llap once we've unlocked the page */
-        llap->llap_write_queued = 1;
-        rc = obd_queue_async_io(exp, ll_i2info(inode)->lli_smd, NULL,
-                                llap->llap_cookie, OBD_BRW_WRITE | noquot,
-                                0, 0, 0, async_flags);
-        if (rc == 0) {
-                LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, "write queued\n");
-                GOTO(out, 0);
-        }
-
-        llap->llap_write_queued = 0;
-        /* Do not pass llap here as it is sync write. */
-        llap_write_pending(inode, NULL);
-
-        rc = oig_init(&oig);
-        if (rc)
-                GOTO(out, rc);
-
-        /* make full-page requests if we are not at EOF (bug 4410) */
-        if (to != CFS_PAGE_SIZE && llap->llap_page->index < size_index) {
-                LL_CDEBUG_PAGE(D_PAGE, llap->llap_page,
-                               "sync write before EOF: size_index %lu, to %d\n",
-                               size_index, to);
-                to = CFS_PAGE_SIZE;
-        } else if (to != CFS_PAGE_SIZE && llap->llap_page->index == size_index){
-                int size_to = i_size_read(inode) & ~CFS_PAGE_MASK;
-                LL_CDEBUG_PAGE(D_PAGE, llap->llap_page,
-                               "sync write at EOF: size_index %lu, to %d/%d\n",
-                               size_index, to, size_to);
-                if (to < size_to)
-                        to = size_to;
-        }
-
-        /* compare the checksum once before the page leaves llite */
-        if (unlikely((sbi->ll_flags & LL_SBI_CHECKSUM) &&
-                     llap->llap_checksum != 0)) {
-                __u32 csum;
-                struct page *page = llap->llap_page;
-                char *kaddr = kmap_atomic(page, KM_USER0);
-                csum = init_checksum(OSC_DEFAULT_CKSUM);
-                csum = compute_checksum(csum, kaddr, CFS_PAGE_SIZE,
-                                        OSC_DEFAULT_CKSUM);
-                kunmap_atomic(kaddr, KM_USER0);
-                if (llap->llap_checksum == csum) {
-                        CDEBUG(D_PAGE, "page %p cksum %x confirmed\n",
-                               page, csum);
-                } else {
-                        CERROR("page %p old cksum %x != new cksum %x!\n",
-                               page, llap->llap_checksum, csum);
+        result = ll_cl_init(file, vmpage, &env, &io, &page, &refcheck);
+        /*
+         * Loop-back driver calls ->prepare_write() and ->sendfile() methods
+         * directly, bypassing file system ->write() operation, so cl_io has
+         * to be created here.
+         */
+        if (result == -EALREADY) {
+                io = &ccc_env_info(env)->cti_io;
+                result = ll_prepare_loop(env, io, file, vmpage, from, to);
+                if (result == 0) {
+                        result = ll_cl_init(file, vmpage,
+                                            &env, &io, &page, &refcheck);
+                        cl_env_put(env, NULL);
                  }
          }
-
-        rc = obd_queue_group_io(exp, ll_i2info(inode)->lli_smd, NULL, oig,
-                                llap->llap_cookie, OBD_BRW_WRITE | noquot,
-                                0, to, 0, ASYNC_READY | ASYNC_URGENT |
-                                ASYNC_COUNT_STABLE | ASYNC_GROUP_SYNC);
-        if (rc)
-                GOTO(free_oig, rc);
-
-        rc = obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd, NULL, oig);
-        if (rc)
-                GOTO(free_oig, rc);
-
-        rc = oig_wait(oig);
-
-        if (!rc && async_flags & ASYNC_READY) {
-                unlock_page(llap->llap_page);
-                if (PageWriteback(llap->llap_page))
-                        end_page_writeback(llap->llap_page);
-        }
-
-        if (rc == 0 && llap_write_complete(inode, llap))
-                ll_queue_done_writing(inode, 0);
-
-        LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, "sync write returned %d\n", rc);
-
-free_oig:
-        oig_release(oig);
-out:
-        RETURN(rc);
+        if (result == 0) {
+                cl_page_assume(env, io, page);
+                result = cl_io_prepare_write(env, io, page, from, to);
+                if (result == 0) {
+                        struct vvp_io *vio;
+
+                        /*
+                         * Add a reference, so that page is not evicted from
+                         * the cache until ->commit_write() is called.
+                         */
+                        cl_page_get(page);
+                        lu_ref_add(&page->cp_reference, "prepare_write",
+                                   cfs_current());
+                        vio = vvp_env_io(env);
+                        if (vio->cui_oneshot > 0)
+                                vio->cui_oneshot++;
+                } else
+                        cl_page_unassume(env, io, page);
+        }
+        ll_cl_fini(env, io, page, &refcheck);
+        RETURN(result);
  }
  
-/* update our write count to account for i_size increases that may have
- * happened since we've queued the page for io. */
-
-/* be careful not to return success without setting the page Uptodate or
- * the next pass through prepare_write will read in stale data from disk. */
-int ll_commit_write(struct file *file, struct page *page, unsigned from,
+int ll_commit_write(struct file *file, struct page *vmpage, unsigned from,
                      unsigned to)
  {
-        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
-        struct inode *inode = page->mapping->host;
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct lov_stripe_md *lsm = lli->lli_smd;
-        struct obd_export *exp;
-        struct ll_async_page *llap;
-        loff_t size;
-        struct lustre_handle *lockh = NULL;
-        int rc = 0;
+        struct lu_env    *env;
+        struct cl_io     *io;
+        struct cl_page   *page;
+        int result;
+        int refcheck;
          ENTRY;
  
-        SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */
-        LASSERT(inode == file->f_dentry->d_inode);
-        LASSERT(PageLocked(page));
-
-        CDEBUG(D_INODE, "inode %p is writing page %p from %d to %d at %lu\n",
-               inode, page, from, to, page->index);
-
-        if (fd->fd_flags & LL_FILE_GROUP_LOCKED)
-                lockh = &fd->fd_cwlockh;
-
-        llap = llap_from_page_with_lockh(page, LLAP_ORIGIN_COMMIT_WRITE, lockh);
-        if (IS_ERR(llap))
-                RETURN(PTR_ERR(llap));
-
-        exp = ll_i2dtexp(inode);
-        if (exp == NULL)
-                RETURN(-EINVAL);
-
-        llap->llap_ignore_quota = cfs_capable(CFS_CAP_SYS_RESOURCE);
-
-        /*
-         * queue a write for some time in the future the first time we
-         * dirty the page.
-         *
-         * This is different from what other file systems do: they usually
-         * just mark page (and some of its buffers) dirty and rely on
-         * balance_dirty_pages() to start a write-back. Lustre wants write-back
-         * to be started earlier for the following reasons:
-         *
-         *     (1) with a large number of clients we need to limit the amount
-         *     of cached data on the clients a lot;
-         *
-         *     (2) large compute jobs generally want compute-only then io-only
-         *     and the IO should complete as quickly as possible;
-         *
-         *     (3) IO is batched up to the RPC size and is async until the
-         *     client max cache is hit
-         *     (/proc/fs/lustre/osc/OSC.../max_dirty_mb)
-         *
-         */
-        if (!PageDirty(page)) {
-                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_DIRTY_MISSES, 1);
-
-                rc = queue_or_sync_write(exp, inode, llap, to, 0);
-                if (rc)
-                        GOTO(out, rc);
-        } else {
-                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_DIRTY_HITS, 1);
+        result = ll_cl_init(file, vmpage, &env, &io, &page, &refcheck);
+        LASSERT(result != -EALREADY);
+        if (result == 0) {
+                LASSERT(cl_page_is_owned(page, io));
+                result = cl_io_commit_write(env, io, page, from, to);
+                if (cl_page_is_owned(page, io))
+                        cl_page_unassume(env, io, page);
+                /*
+                 * Release reference acquired by ll_prepare_write().
+                 */
+                lu_ref_del(&page->cp_reference, "prepare_write", cfs_current());
+                cl_page_put(env, page);
          }
+        ll_cl_fini(env, io, page, &refcheck);
+        RETURN(result);
+}
  
-        /* put the page in the page cache, from now on ll_removepage is
-         * responsible for cleaning up the llap.
-         * only set page dirty when it's queued to be write out */
-        if (llap->llap_write_queued)
-                set_page_dirty(page);
+struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt)
+{
+        __u64 opc;
  
-out:
-        size = (((obd_off)page->index) << CFS_PAGE_SHIFT) + to;
-        ll_inode_size_lock(inode, 0);
-        if (rc == 0) {
-                lov_stripe_lock(lsm);
-                obd_adjust_kms(exp, lsm, size, 0);
-                lov_stripe_unlock(lsm);
-                if (size > i_size_read(inode))
-                        i_size_write(inode, size);
-                SetPageUptodate(page);
-        } else if (size > i_size_read(inode)) {
-                /* this page beyond the pales of i_size, so it can't be
-                 * truncated in ll_p_r_e during lock revoking. we must
-                 * teardown our book-keeping here. */
-                ll_removepage(page);
-        }
-        ll_inode_size_unlock(inode, 0);
-        RETURN(rc);
+        opc = crt == CRT_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW;
+        return ll_osscapa_get(inode, opc);
  }
  
  static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which);
  
-/* WARNING: This algorithm is used to reduce the contention on 
- * sbi->ll_lock. It should work well if the ra_max_pages is much 
+/* WARNING: This algorithm is used to reduce the contention on
+ * sbi->ll_lock. It should work well if the ra_max_pages is much
   * greater than the single file's read-ahead window.
   *
- * TODO: There may exist a `global sync problem' in this implementation. 
+ * TODO: There may exist a `global sync problem' in this implementation.
   * Considering the global ra window is 100M, and each file's ra window is 10M,
- * there are over 10 files trying to get its ra budget and reach 
+ * there are over 10 files trying to get its ra budget and reach
   * ll_ra_count_get at the exactly same time. All of them will get a zero ra
   * window, although the global window is 100M. -jay
   */
@@ -1116,187 +375,24 @@ out:
          RETURN(ret);
  }
  
-static void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len)
+void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len)
  {
          struct ll_ra_info *ra = &sbi->ll_ra_info;
          atomic_sub(len, &ra->ra_cur_pages);
  }
  
-/* called for each page in a completed rpc.*/
-int ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc)
-{
-        struct ll_async_page *llap;
-        struct page *page;
-        int ret = 0;
-        ENTRY;
-
-        llap = llap_from_cookie(data);
-        page = llap->llap_page;
-        LASSERT(PageLocked(page));
-        LASSERT(CheckWriteback(page,cmd));
-
-        LL_CDEBUG_PAGE(D_PAGE, page, "completing cmd %d with %d\n", cmd, rc);
-
-        if (cmd & OBD_BRW_READ && llap->llap_defer_uptodate)
-                ll_ra_count_put(ll_i2sbi(page->mapping->host), 1);
-
-        if (rc == 0)  {
-                if (cmd & OBD_BRW_READ) {
-                        if (!llap->llap_defer_uptodate)
-                                SetPageUptodate(page);
-                } else {
-                        llap->llap_write_queued = 0;
-                }
-                ClearPageError(page);
-        } else {
-                if (cmd & OBD_BRW_READ) {
-                        llap->llap_defer_uptodate = 0;
-                }
-                SetPageError(page);
-                if (rc == -ENOSPC)
-                        set_bit(AS_ENOSPC, &page->mapping->flags);
-                else
-                        set_bit(AS_EIO, &page->mapping->flags);
-        }
-
-        /* be carefull about clear WB.
-         * if WB will cleared after page lock is released - paralel IO can be
-         * started before ap_make_ready is finished - so we will be have page
-         * with PG_Writeback set from ->writepage() and completed READ which
-         * clear this flag */
-        if ((cmd & OBD_BRW_WRITE) && PageWriteback(page))
-                end_page_writeback(page);
-
-        unlock_page(page);
-
-        if (cmd & OBD_BRW_WRITE) {
-                /* Only rc == 0, write succeed, then this page could be deleted
-                 * from the pending_writing list
-                 */
-                if (rc == 0 && llap_write_complete(page->mapping->host, llap))
-                        ll_queue_done_writing(page->mapping->host, 0);
-        }
-
-        page_cache_release(page);
-
-        RETURN(ret);
-}
-
-static void __ll_put_llap(struct page *page)
-{
-        struct inode *inode = page->mapping->host;
-        struct obd_export *exp;
-        struct ll_async_page *llap;
-        struct ll_sb_info *sbi = ll_i2sbi(inode);
-        struct ll_pglist_data *pd;
-        int rc, cpu;
-        ENTRY;
-
-        exp = ll_i2dtexp(inode);
-        if (exp == NULL) {
-                CERROR("page %p ind %lu gave null export\n", page, page->index);
-                EXIT;
-                return;
-        }
-
-        llap = llap_from_page(page, LLAP_ORIGIN_REMOVEPAGE);
-        if (IS_ERR(llap)) {
-                CERROR("page %p ind %lu couldn't find llap: %ld\n", page,
-                       page->index, PTR_ERR(llap));
-                EXIT;
-                return;
-        }
-
-        if (llap_write_complete(inode, llap))
-                ll_queue_done_writing(inode, 0);
-
-        rc = obd_teardown_async_page(exp, ll_i2info(inode)->lli_smd, NULL,
-                                     llap->llap_cookie);
-        if (rc != 0)
-                CERROR("page %p ind %lu failed: %d\n", page, page->index, rc);
-
-        /* this unconditional free is only safe because the page lock
-         * is providing exclusivity to memory pressure/truncate/writeback..*/
-        __clear_page_ll_data(page);
-
-        lcounter_dec(&sbi->ll_async_page_count);
-        cpu = llap->llap_pglist_cpu;
-        pd = ll_pglist_cpu_lock(sbi, cpu);
-        pd->llpd_gen++;
-        pd->llpd_count--;
-        if (!list_empty(&llap->llap_pglist_item))
-                list_del_init(&llap->llap_pglist_item);
-        ll_pglist_cpu_unlock(sbi, cpu);
-        OBD_SLAB_FREE(llap, ll_async_page_slab, ll_async_page_slab_size);
-        EXIT;
-}
-
-/* the kernel calls us here when a page is unhashed from the page cache.
- * the page will be locked and the kernel is holding a spinlock, so
- * we need to be careful.  we're just tearing down our book-keeping
- * here. */
-void ll_removepage(struct page *page)
-{
-        struct ll_async_page *llap = llap_cast_private(page);
-        ENTRY;
-
-        LASSERT(!in_interrupt());
-
-        /* sync pages or failed read pages can leave pages in the page
-         * cache that don't have our data associated with them anymore */
-        if (page_private(page) == 0) {
-                EXIT;
-                return;
-        }
-
-        LASSERT(!llap->llap_lockless_io_page);
-        LASSERT(!llap->llap_nocache);
-        LL_CDEBUG_PAGE(D_PAGE, page, "being evicted\n");
-        __ll_put_llap(page);
-        EXIT;
-}
-
-static int ll_issue_page_read(struct obd_export *exp,
-                              struct ll_async_page *llap,
-                              struct obd_io_group *oig, int defer)
-{
-        struct page *page = llap->llap_page;
-        int rc;
-
-        page_cache_get(page);
-        llap->llap_defer_uptodate = defer;
-        llap->llap_ra_used = 0;
-        rc = obd_queue_group_io(exp, ll_i2info(page->mapping->host)->lli_smd,
-                                NULL, oig, llap->llap_cookie, OBD_BRW_READ, 0,
-                                CFS_PAGE_SIZE, 0, ASYNC_COUNT_STABLE |
-                                                  ASYNC_READY | ASYNC_URGENT);
-        if (rc) {
-                LL_CDEBUG_PAGE(D_ERROR, page, "read queue failed: rc %d\n", rc);
-                page_cache_release(page);
-        }
-        RETURN(rc);
-}
-
  static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which)
  {
          LASSERTF(which >= 0 && which < _NR_RA_STAT, "which: %u\n", which);
          lprocfs_counter_incr(sbi->ll_ra_stats, which);
  }
  
-static void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which)
+void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which)
  {
          struct ll_sb_info *sbi = ll_i2sbi(mapping->host);
          ll_ra_stats_inc_sbi(sbi, which);
  }
  
-void ll_ra_accounting(struct ll_async_page *llap, struct address_space *mapping)
-{
-        if (!llap->llap_defer_uptodate || llap->llap_ra_used)
-                return;
-
-        ll_ra_stats_inc(mapping, RA_STAT_DISCARDED);
-}
-
  #define RAS_CDEBUG(ras) \
          CDEBUG(D_READA,                                                      \
                 "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu"    \
@@ -1380,86 +476,102 @@ struct ll_ra_read *ll_ra_read_get(struct file *f)
          return bead;
  }
  
-static int ll_read_ahead_page(struct obd_export *exp, struct obd_io_group *oig,
+/**
+ * Decides whether an already looked-up \a page takes part in read-ahead.
+ *
+ * The page is first assumed with cl_page_assume(). If it is neither flagged
+ * cpg_defer_uptodate nor up-to-date, cl_page_is_under_lock() is consulted:
+ * on -EBUSY the page is flagged cpg_defer_uptodate, cpg_ra_used is reset and
+ * the page is added to \a queue; on any other result the page is removed
+ * with cl_page_delete(). Otherwise the page is skipped and unassumed.
+ *
+ * On every path cl_page_put() is called on \a page before returning
+ * (presumably dropping the reference the caller obtained when it looked the
+ * page up -- confirm against the caller).
+ *
+ * \retval       1: page was added to \a queue.
+ * \retval -ENOLCK: cl_page_is_under_lock() returned something other than
+ *                  -EBUSY; page was deleted.
+ * \retval       0: page was skipped (already deferred or up-to-date).
+ */
+static int cl_read_ahead_page(const struct lu_env *env, struct cl_io *io,
+                              struct cl_page_list *queue, struct cl_page *page,
+                              struct page *vmpage)
+{
+        struct ccc_page *cp;
+        int              rc;
+
+        ENTRY;
+
+        rc = 0;
+        cl_page_assume(env, io, page);
+        lu_ref_add(&page->cp_reference, "ra", cfs_current());
+        cp = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+        if (!cp->cpg_defer_uptodate && !Page_Uptodate(vmpage)) {
+                rc = cl_page_is_under_lock(env, io, page);
+                if (rc == -EBUSY) {
+                        cp->cpg_defer_uptodate = 1;
+                        cp->cpg_ra_used = 0;
+                        cl_page_list_add(queue, page);
+                        rc = 1;
+                } else {
+                        cl_page_delete(env, page);
+                        rc = -ENOLCK;
+                }
+        } else
+                /* skip completed pages */
+                cl_page_unassume(env, io, page);
+        lu_ref_del(&page->cp_reference, "ra", cfs_current());
+        cl_page_put(env, page);
+        RETURN(rc);
+}
+
+/**
+ * Initiates read-ahead of a page with given index.
+ *
+ * \retval     +ve: page was added to \a queue.
+ *
+ * \retval -ENOLCK: there is no extent lock for this part of a file, stop
+ *                  read-ahead.
+ *
+ * \retval  -ve, 0: page wasn't added to \a queue for some other reason.
+ */
+static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io,
+                              struct cl_page_list *queue,
                                int index, struct address_space *mapping)
  {
-        struct ll_async_page *llap;
-        struct page *page;
-        unsigned int gfp_mask = 0;
-        int rc = 0;
+        struct page      *vmpage;
+        struct cl_object *clob  = ll_i2info(mapping->host)->lli_clob;
+        struct cl_page   *page;
+        enum ra_stat      which = _NR_RA_STAT; /* keep gcc happy */
+        unsigned int      gfp_mask;
+        int               rc    = 0;
+        const char       *msg   = NULL;
+
+        ENTRY;
  
          gfp_mask = GFP_HIGHUSER & ~__GFP_WAIT;
  #ifdef __GFP_NOWARN
          gfp_mask |= __GFP_NOWARN;
  #endif
-        page = grab_cache_page_nowait_gfp(mapping, index, gfp_mask);
-        if (page == NULL) {
-                ll_ra_stats_inc(mapping, RA_STAT_FAILED_GRAB_PAGE);
-                CDEBUG(D_READA, "g_c_p_n failed\n");
-                return 0;
-        }
-
-        /* Check if page was truncated or reclaimed */
-        if (page->mapping != mapping) {
-                ll_ra_stats_inc(mapping, RA_STAT_WRONG_GRAB_PAGE);
-                CDEBUG(D_READA, "g_c_p_n returned invalid page\n");
-                GOTO(unlock_page, rc = 0);     
-        }
-
-        /* we do this first so that we can see the page in the /proc
-         * accounting */
-        llap = llap_from_page(page, LLAP_ORIGIN_READAHEAD);
-        if (IS_ERR(llap) || llap->llap_defer_uptodate) {
-                if (PTR_ERR(llap) == -ENOLCK) {
-                        ll_ra_stats_inc(mapping, RA_STAT_FAILED_MATCH);
-                        CDEBUG(D_READA | D_PAGE,
-                               "Adding page to cache failed index "
-                                "%d\n", index);
-                                CDEBUG(D_READA, "nolock page\n");
-                                GOTO(unlock_page, rc = -ENOLCK);
+        vmpage = grab_cache_page_nowait_gfp(mapping, index, gfp_mask);
+        if (vmpage != NULL) {
+                /* Check if vmpage was truncated or reclaimed */
+                if (vmpage->mapping == mapping) {
+                        page = cl_page_find(env, clob, vmpage->index,
+                                            vmpage, CPT_CACHEABLE);
+                        if (!IS_ERR(page)) {
+                                rc = cl_read_ahead_page(env, io, queue,
+                                                        page, vmpage);
+                                if (rc == -ENOLCK) {
+                                        which = RA_STAT_FAILED_MATCH;
+                                        msg   = "lock match failed";
+                                }
+                        } else {
+                                which = RA_STAT_FAILED_GRAB_PAGE;
+                                msg   = "cl_page_find failed";
+                        }
+                } else {
+                        which = RA_STAT_WRONG_GRAB_PAGE;
+                        msg   = "g_c_p_n returned invalid page";
                  }
-                CDEBUG(D_READA, "read-ahead page\n");
-                GOTO(unlock_page, rc = 0);     
-        }
-
-        /* skip completed pages */
-        if (Page_Uptodate(page))
-                GOTO(unlock_page, rc = 0);     
-
-        /* bail out when we hit the end of the lock. */
-        rc = ll_issue_page_read(exp, llap, oig, 1);
-        if (rc == 0) {
-                LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "started read-ahead\n");
-                rc = 1;
+                if (rc != 1)
+                        unlock_page(vmpage);
+                page_cache_release(vmpage);
          } else {
-unlock_page:   
-                unlock_page(page);
-                LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "skipping read-ahead\n");
+                which = RA_STAT_FAILED_GRAB_PAGE;
+                msg   = "g_c_p_n failed";
+        }
+        if (msg != NULL) {
+                ll_ra_stats_inc(mapping, which);
+                CDEBUG(D_READA, "%s\n", msg);
          }
-        page_cache_release(page);
-        return rc;
+        RETURN(rc);
  }
  
-/* ra_io_arg will be filled in the beginning of ll_readahead with
- * ras_lock, then the following ll_read_ahead_pages will read RA
- * pages according to this arg, all the items in this structure are
- * counted by page index.
- */
-struct ra_io_arg {
-        unsigned long ria_start;  /* start offset of read-ahead*/
-        unsigned long ria_end;    /* end offset of read-ahead*/
-        /* If stride read pattern is detected, ria_stoff means where
-         * stride read is started. Note: for normal read-ahead, the
-         * value here is meaningless, and also it will not be accessed*/
-        pgoff_t ria_stoff;
-        /* ria_length and ria_pages are the length and pages length in the
-         * stride I/O mode. And they will also be used to check whether
-         * it is stride I/O read-ahead in the read-ahead pages*/
-        unsigned long ria_length;
-        unsigned long ria_pages;
-};
-
-#define RIA_DEBUG(ria)                                                       \
+#define RIA_DEBUG(ria)                                                       \
          CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n",       \
          ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\
          ria->ria_pages)
@@ -1522,9 +634,9 @@ static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria)
                 (idx - ria->ria_stoff) % ria->ria_length < ria->ria_pages;
  }
  
-static int ll_read_ahead_pages(struct obd_export *exp,
-                               struct obd_io_group *oig,
-                               struct ra_io_arg *ria,  
+static int ll_read_ahead_pages(const struct lu_env *env,
+                               struct cl_io *io, struct cl_page_list *queue,
+                               struct ra_io_arg *ria,
                                 unsigned long *reserved_pages,
                                 struct address_space *mapping,
                                 unsigned long *ra_end)
@@ -1540,17 +652,18 @@ static int ll_read_ahead_pages(struct obd_export *exp,
                          *reserved_pages > 0; page_idx++) {
                  if (ras_inside_ra_window(page_idx, ria)) {
                          /* If the page is inside the read-ahead window*/
-                        rc = ll_read_ahead_page(exp, oig, page_idx, mapping);
-                       if (rc == 1) {
-                               (*reserved_pages)--;
-                               count ++;
-                       } else if (rc == -ENOLCK)
-                               break;
+                        rc = ll_read_ahead_page(env, io, queue,
+                                                page_idx, mapping);
+                        if (rc == 1) {
+                                (*reserved_pages)--;
+                                count ++;
+                        } else if (rc == -ENOLCK)
+                                break;
                  } else if (stride_ria) {
                          /* If it is not in the read-ahead window, and it is
                           * read-ahead mode, then check whether it should skip
                           * the stride gap */
-                       pgoff_t offset;
+                        pgoff_t offset;
                          /* FIXME: This assertion only is valid when it is for
                           * forward read-ahead, it will be fixed when backward
                           * read-ahead is implemented */
@@ -1559,9 +672,9 @@ static int ll_read_ahead_pages(struct obd_export *exp,
                                  " offset %lu \n", page_idx, ria->ria_stoff);
  
                          offset = page_idx - ria->ria_stoff;
-                       offset = offset % (ria->ria_length);
-                       if (offset > ria->ria_pages) {
-                               page_idx += ria->ria_length - offset;
+                        offset = offset % (ria->ria_length);
+                        if (offset > ria->ria_pages) {
+                                page_idx += ria->ria_length - offset;
                                  CDEBUG(D_READA, "i %lu skip %lu \n", page_idx,
                                         ria->ria_length - offset);
                                  continue;
@@ -1572,43 +685,56 @@ static int ll_read_ahead_pages(struct obd_export *exp,
          return count;
  }
  
-static int ll_readahead(struct ll_readahead_state *ras,
-                         struct obd_export *exp, struct address_space *mapping,
-                         struct obd_io_group *oig, int flags)
+int ll_readahead(const struct lu_env *env, struct cl_io *io,
+                 struct ll_readahead_state *ras, struct address_space *mapping,
+                 struct cl_page_list *queue, int flags)
  {
+        struct vvp_io *vio = vvp_env_io(env);
+        struct vvp_thread_info *vti = vvp_env_info(env);
+        struct ccc_thread_info *cti = ccc_env_info(env);
          unsigned long start = 0, end = 0, reserved;
          unsigned long ra_end, len;
          struct inode *inode;
-        struct lov_stripe_md *lsm;
          struct ll_ra_read *bead;
-        struct ost_lvb lvb;
-        struct ra_io_arg ria = { 0 };
+        struct ra_io_arg *ria = &vti->vti_ria;
+        struct ll_inode_info *lli;
+        struct cl_object *clob;
+        struct cl_attr   *attr = &cti->cti_attr;
          int ret = 0;
          __u64 kms;
          ENTRY;
  
          inode = mapping->host;
-        lsm = ll_i2info(inode)->lli_smd;
+        lli = ll_i2info(inode);
+        clob = lli->lli_clob;
+
+        memset(ria, 0, sizeof *ria);
+
+        cl_object_attr_lock(clob);
+        ret = cl_object_attr_get(env, clob, attr);
+        cl_object_attr_unlock(clob);
  
-        lov_stripe_lock(lsm);
-        inode_init_lvb(inode, &lvb);
-        obd_merge_lvb(ll_i2dtexp(inode), lsm, &lvb, 1);
-        kms = lvb.lvb_size;
-        lov_stripe_unlock(lsm);
+        if (ret != 0)
+                RETURN(ret);
+        kms = attr->cat_kms;
          if (kms == 0) {
                  ll_ra_stats_inc(mapping, RA_STAT_ZERO_LEN);
                  RETURN(0);
          }
  
          spin_lock(&ras->ras_lock);
-        bead = ll_ra_read_get_locked(ras);
+        if (vio->cui_ra_window_set)
+                bead = &vio->cui_bead;
+        else
+                bead = NULL;
+
          /* Enlarge the RA window to encompass the full read */
          if (bead != NULL && ras->ras_window_start + ras->ras_window_len <
              bead->lrr_start + bead->lrr_count) {
                  ras->ras_window_len = bead->lrr_start + bead->lrr_count -
                                        ras->ras_window_start;
          }
-               /* Reserve a part of the read-ahead window that we'll be issuing */
+        /* Reserve a part of the read-ahead window that we'll be issuing */
          if (ras->ras_window_len) {
                  start = ras->ras_next_readahead;
                  end = ras->ras_window_start + ras->ras_window_len - 1;
@@ -1619,13 +745,13 @@ static int ll_readahead(struct ll_readahead_state *ras,
                  ras->ras_next_readahead = max(end, end + 1);
                  RAS_CDEBUG(ras);
          }
-        ria.ria_start = start;
-        ria.ria_end = end;
+        ria->ria_start = start;
+        ria->ria_end = end;
          /* If stride I/O mode is detected, get stride window*/
          if (stride_io_mode(ras)) {
-                ria.ria_stoff = ras->ras_stride_offset;
-                ria.ria_length = ras->ras_stride_length;
-                ria.ria_pages = ras->ras_stride_pages;
+                ria->ria_stoff = ras->ras_stride_offset;
+                ria->ria_length = ras->ras_stride_length;
+                ria->ria_pages = ras->ras_stride_pages;
          }
          spin_unlock(&ras->ras_lock);
  
@@ -1633,7 +759,7 @@ static int ll_readahead(struct ll_readahead_state *ras,
                  ll_ra_stats_inc(mapping, RA_STAT_ZERO_WINDOW);
                  RETURN(0);
          }
-        len = ria_page_count(&ria);
+        len = ria_page_count(ria);
          if (len == 0)
                  RETURN(0);
  
@@ -1643,8 +769,9 @@ static int ll_readahead(struct ll_readahead_state *ras,
                  ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT);
  
          CDEBUG(D_READA, "reserved page %lu \n", reserved);
-       
-        ret = ll_read_ahead_pages(exp, oig, &ria, &reserved, mapping, &ra_end);
+
+        ret = ll_read_ahead_pages(env, io, queue,
+                                  ria, &reserved, mapping, &ra_end);
  
          LASSERTF(reserved >= 0, "reserved %lu\n", reserved);
          if (reserved != 0)
@@ -1659,15 +786,15 @@ static int ll_readahead(struct ll_readahead_state *ras,
           * if the region we failed to issue read-ahead on is still ahead
           * of the app and behind the next index to start read-ahead from */
          CDEBUG(D_READA, "ra_end %lu end %lu stride end %lu \n",
-               ra_end, end, ria.ria_end);
+               ra_end, end, ria->ria_end);
  
-        if (ra_end != (end + 1)) {
+        if (ra_end != end + 1) {
                  spin_lock(&ras->ras_lock);
                  if (ra_end < ras->ras_next_readahead &&
                      index_in_window(ra_end, ras->ras_window_start, 0,
                                      ras->ras_window_len)) {
-                       ras->ras_next_readahead = ra_end;
-                               RAS_CDEBUG(ras);
+                        ras->ras_next_readahead = ra_end;
+                               RAS_CDEBUG(ras);
                  }
                  spin_unlock(&ras->ras_lock);
          }
@@ -1710,7 +837,7 @@ void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras)
          INIT_LIST_HEAD(&ras->ras_read_beads);
  }
  
-/* 
+/*
   * Check whether the read request is in the stride window.
   * If it is in the stride window, return 1, otherwise return 0.
   */
@@ -1719,7 +846,7 @@ static int index_in_stride_window(unsigned long index,
                                    struct inode *inode)
  {
          unsigned long stride_gap = index - ras->ras_last_readpage - 1;
- 
+
          if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0)
                  return 0;
  
@@ -1729,7 +856,7 @@ static int index_in_stride_window(unsigned long index,
  
          /* Otherwise check the stride by itself */
          return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap &&
-                ras->ras_consecutive_pages == ras->ras_stride_pages; 
+                ras->ras_consecutive_pages == ras->ras_stride_pages;
  }
  
  static void ras_update_stride_detector(struct ll_readahead_state *ras,
@@ -1737,7 +864,7 @@ static void ras_update_stride_detector(struct ll_readahead_state *ras,
  {
          unsigned long stride_gap = index - ras->ras_last_readpage - 1;
  
-        if (!stride_io_mode(ras) && (stride_gap != 0 || 
+        if (!stride_io_mode(ras) && (stride_gap != 0 ||
               ras->ras_consecutive_stride_requests == 0)) {
                  ras->ras_stride_pages = ras->ras_consecutive_pages;
                  ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages;
@@ -1809,14 +936,15 @@ static void ras_set_stride_offset(struct ll_readahead_state *ras)
          RAS_CDEBUG(ras);
  }
  
-static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
-                       struct ll_readahead_state *ras, unsigned long index,
-                       unsigned hit)
+void ras_update(struct ll_sb_info *sbi, struct inode *inode,
+                struct ll_readahead_state *ras, unsigned long index,
+                unsigned hit)
  {
          struct ll_ra_info *ra = &sbi->ll_ra_info;
          int zero = 0, stride_detect = 0, ra_miss = 0;
          ENTRY;
  
+        spin_lock(&sbi->ll_lock);
          spin_lock(&ras->ras_lock);
  
          ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS);
@@ -1879,14 +1007,14 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode,
                  if (ra_miss) {
                          if (index_in_stride_window(index, ras, inode) &&
                              stride_io_mode(ras)) {
-                                /*If stride-RA hit cache miss, the stride dector 
+                                /*If stride-RA hit cache miss, stride detector
                                   *will not be reset to avoid the overhead of
                                   *redetecting read-ahead mode */
                                  if (index != ras->ras_last_readpage + 1)
                                         ras->ras_consecutive_pages = 0;
                                  RAS_CDEBUG(ras);
                          } else {
-                                /* Reset both stride window and normal RA window */ 
+                                /* Reset both stride window and normal RA window */
                                  ras_reset(ras, index);
                                  ras->ras_consecutive_pages++;
                                  ras_stride_reset(ras);
@@ -1940,433 +1068,107 @@ out_unlock:
          RAS_CDEBUG(ras);
          ras->ras_request_index++;
          spin_unlock(&ras->ras_lock);
+        spin_unlock(&sbi->ll_lock);
          return;
  }
  
-int ll_writepage(struct page *page)
-{
-        struct inode *inode = page->mapping->host;
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct obd_export *exp;
-        struct ll_async_page *llap;
-        int rc = 0;
-        ENTRY;
-
-        LASSERT(PageLocked(page));
-
-        exp = ll_i2dtexp(inode);
-        if (exp == NULL)
-                GOTO(out, rc = -EINVAL);
-
-        llap = llap_from_page(page, LLAP_ORIGIN_WRITEPAGE);
-        if (IS_ERR(llap))
-                GOTO(out, rc = PTR_ERR(llap));
-
-        LASSERT(!llap->llap_nocache);
-        LASSERT(!PageWriteback(page));
-        set_page_writeback(page);
-
-        page_cache_get(page);
-        if (llap->llap_write_queued) {
-                LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
-                rc = obd_set_async_flags(exp, lli->lli_smd, NULL,
-                                         llap->llap_cookie,
-                                         ASYNC_READY | ASYNC_URGENT);
-        } else {
-                rc = queue_or_sync_write(exp, inode, llap, CFS_PAGE_SIZE,
-                                         ASYNC_READY | ASYNC_URGENT);
-        }
-        if (rc) {
-                /* re-dirty page on error so it retries write */
-                if (PageWriteback(page))
-                        end_page_writeback(page);
-
-                /* resend page only for not started IO*/
-                if (!PageError(page))
-                        ll_redirty_page(page);
-
-                page_cache_release(page);
-        }
-out:
-        if (rc) {
-                if (!lli->lli_async_rc)
-                        lli->lli_async_rc = rc;
-                /* resend page only for not started IO*/
-                unlock_page(page);
-        }
-        RETURN(rc);
-}
-
-/*
- * for now we do our readpage the same on both 2.4 and 2.5.  The kernel's
- * read-ahead assumes it is valid to issue readpage all the way up to
- * i_size, but our dlm locks make that not the case.  We disable the
- * kernel's read-ahead and do our own by walking ahead in the page cache
- * checking for dlm lock coverage.  the main difference between 2.4 and
- * 2.6 is how read-ahead gets batched and issued, but we're using our own,
- * so they look the same.
- */
-int ll_readpage(struct file *filp, struct page *page)
+int ll_writepage(struct page *vmpage, struct writeback_control *_)
  {
-        struct ll_file_data *fd = LUSTRE_FPRIVATE(filp);
-        struct inode *inode = page->mapping->host;
-        struct obd_export *exp;
-        struct ll_async_page *llap;
-        struct obd_io_group *oig = NULL;
-        struct lustre_handle *lockh = NULL;
-        int rc;
+        struct inode           *inode = vmpage->mapping->host;
+        struct lu_env          *env;
+        struct cl_io           *io;
+        struct cl_page         *page;
+        struct cl_object       *clob;
+        struct cl_2queue       *queue;
+        struct cl_env_nest      nest;
+        int result;
          ENTRY;
  
-        LASSERT(PageLocked(page));
-        LASSERT(!PageUptodate(page));
-        CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),offset=%Lu=%#Lx\n",
-               inode->i_ino, inode->i_generation, inode,
-               (((loff_t)page->index) << CFS_PAGE_SHIFT),
-               (((loff_t)page->index) << CFS_PAGE_SHIFT));
-        LASSERT(atomic_read(&filp->f_dentry->d_inode->i_count) > 0);
-
-        if (!ll_i2info(inode)->lli_smd) {
-                /* File with no objects - one big hole */
-                /* We use this just for remove_from_page_cache that is not
-                 * exported, we'd make page back up to date. */
-                ll_truncate_complete_page(page);
-                clear_page(kmap(page));
-                kunmap(page);
-                SetPageUptodate(page);
-                unlock_page(page);
-                RETURN(0);
-        }
-
-        rc = oig_init(&oig);
-        if (rc < 0)
-                GOTO(out, rc);
-
-        exp = ll_i2dtexp(inode);
-        if (exp == NULL)
-                GOTO(out, rc = -EINVAL);
+        LASSERT(PageLocked(vmpage));
+        LASSERT(!PageWriteback(vmpage));
  
-        if (fd->fd_flags & LL_FILE_GROUP_LOCKED)
-                lockh = &fd->fd_cwlockh;
-
-        llap = llap_from_page_with_lockh(page, LLAP_ORIGIN_READPAGE, lockh);
-        if (IS_ERR(llap)) {
-                if (PTR_ERR(llap) == -ENOLCK) {
-                        CWARN("ino %lu page %lu (%llu) not covered by "
-                              "a lock (mmap?).  check debug logs.\n",
-                              inode->i_ino, page->index,
-                              (long long)page->index << PAGE_CACHE_SHIFT);
-                }
-                GOTO(out, rc = PTR_ERR(llap));
-        }
-
-        if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
-                ras_update(ll_i2sbi(inode), inode, &fd->fd_ras, page->index,
-                           llap->llap_defer_uptodate);
-
-
-        if (llap->llap_defer_uptodate) {
-                /* This is the callpath if we got the page from a readahead */
-                llap->llap_ra_used = 1;
-                rc = ll_readahead(&fd->fd_ras, exp, page->mapping, oig,
-                                  fd->fd_flags);
-                if (rc > 0)
-                        obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd,
-                                             NULL, oig);
-                LL_CDEBUG_PAGE(D_PAGE, page, "marking uptodate from defer\n");
-                SetPageUptodate(page);
-                unlock_page(page);
-                GOTO(out_oig, rc = 0);
-        }
-
-        rc = ll_issue_page_read(exp, llap, oig, 0);
-        if (rc)
-                GOTO(out, rc);
-
-        LL_CDEBUG_PAGE(D_PAGE, page, "queued readpage\n");
-        /* We have just requested the actual page we want, see if we can tack
-         * on some readahead to that page's RPC before it is sent. */
-        if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages)
-                ll_readahead(&fd->fd_ras, exp, page->mapping, oig,
-                             fd->fd_flags);
-
-        rc = obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd, NULL, oig);
-
-out:
-        if (rc)
-                unlock_page(page);
-out_oig:
-        if (oig != NULL)
-                oig_release(oig);
-        RETURN(rc);
-}
-
-static void ll_file_put_pages(struct page **pages, int numpages)
-{
-        int i;
-        struct page **pp;
-        ENTRY;
-
-        for (i = 0, pp = pages; i < numpages; i++, pp++) {
-                if (*pp) {
-                        LL_CDEBUG_PAGE(D_PAGE, (*pp), "free\n");
-                        __ll_put_llap(*pp);
-                        if (page_private(*pp))
-                                CERROR("the llap wasn't freed\n");
-                        (*pp)->mapping = NULL;
-                        if (page_count(*pp) != 1)
-                                CERROR("page %p, flags %#lx, count %i, private %p\n",
-                                (*pp), (unsigned long)(*pp)->flags, page_count(*pp),
-                                (void*)page_private(*pp));
-                        __free_pages(*pp, 0);
-                }
-        }
-        OBD_FREE(pages, numpages * sizeof(struct page*));
-        EXIT;
-}
-
-static struct page **ll_file_prepare_pages(int numpages, struct inode *inode,
-                                           unsigned long first)
-{
-        struct page **pages;
-        int i;
-        int rc = 0;
-        ENTRY;
-
-        OBD_ALLOC(pages, sizeof(struct page *) * numpages);
-        if (pages == NULL)
-                RETURN(ERR_PTR(-ENOMEM));
-        for (i = 0; i < numpages; i++) {
-                struct page *page;
-                struct ll_async_page *llap;
-
-                page = alloc_pages(GFP_HIGHUSER, 0);
-                if (page == NULL)
-                        GOTO(err, rc = -ENOMEM);
-                pages[i] = page;
-                /* llap_from_page needs page index and mapping to be set */
-                page->index = first++;
-                page->mapping = inode->i_mapping;
-                llap = llap_from_page(page, LLAP_ORIGIN_LOCKLESS_IO);
-                if (IS_ERR(llap))
-                        GOTO(err, rc = PTR_ERR(llap));
-                llap->llap_lockless_io_page = 1;
-        }
-        RETURN(pages);
-err:
-        ll_file_put_pages(pages, numpages);
-        RETURN(ERR_PTR(rc));
- }
-
-static ssize_t ll_file_copy_pages(struct page **pages, int numpages,
-                                  char *buf, loff_t pos, size_t count, int rw)
-{
-        ssize_t amount = 0;
-        int i;
-        int updatechecksum = ll_i2sbi(pages[0]->mapping->host)->ll_flags &
-                             LL_SBI_CHECKSUM;
-        ENTRY;
+        if (ll_i2dtexp(inode) == NULL)
+                RETURN(-EINVAL);
  
-        for (i = 0; i < numpages; i++) {
-                unsigned offset, bytes, left;
-                char *vaddr;
-
-                vaddr = kmap(pages[i]);
-                offset = pos & (CFS_PAGE_SIZE - 1);
-                bytes = min_t(unsigned, CFS_PAGE_SIZE - offset, count);
-                LL_CDEBUG_PAGE(D_PAGE, pages[i], "op = %s, addr = %p, "
-                               "buf = %p, bytes = %u\n",
-                               (rw == WRITE) ? "CFU" : "CTU",
-                               vaddr + offset, buf, bytes);
-                if (rw == WRITE) {
-                        left = copy_from_user(vaddr + offset, buf, bytes);
-                        if (updatechecksum) {
-                                struct ll_async_page *llap;
-
-                                llap = llap_cast_private(pages[i]);
-                                llap->llap_checksum = crc32_le(0, vaddr,
-                                                               CFS_PAGE_SIZE);
+        env = cl_env_nested_get(&nest);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
+
+        io    = &ccc_env_info(env)->cti_io;
+        queue = &vvp_env_info(env)->vti_queue;
+        clob  = ll_i2info(inode)->lli_clob;
+        LASSERT(clob != NULL);
+
+        io->ci_obj = clob;
+        result = cl_io_init(env, io, CIT_MISC, clob);
+        if (result == 0) {
+                page = cl_page_find(env, clob, vmpage->index,
+                                    vmpage, CPT_CACHEABLE);
+                if (!IS_ERR(page)) {
+                        lu_ref_add(&page->cp_reference, "writepage",
+                                   cfs_current());
+                        cl_page_assume(env, io, page);
+                        /*
+                         * Mark page dirty, because this is what
+                         * ->vio_submit()->cpo_prep_write() assumes.
+                         *
+                         * XXX better solution is to detect this from within
+                         * cl_io_submit_rw() somehow.
+                         */
+                        set_page_dirty(vmpage);
+                        cl_2queue_init_page(queue, page);
+                        result = cl_io_submit_rw(env, io, CRT_WRITE, queue);
+                        cl_page_list_disown(env, io, &queue->c2_qin);
+                        if (result != 0) {
+                                /*
+                                 * There is no need to clear PG_writeback, as
+                                 * cl_io_submit_rw() calls completion callback
+                                 * on failure.
+                                 */
+                                /*
+                                 * Re-dirty page on error so it retries write,
+                                 * but not in case when IO has actually
+                                 * occurred and completed with an error.
+                                 */
+                                if (!PageError(vmpage))
+                                        set_page_dirty(vmpage);
                          }
-                } else {
-                        left = copy_to_user(buf, vaddr + offset, bytes);
-                }
-                kunmap(pages[i]);
-                amount += bytes;
-                if (left) {
-                        amount -= left;
-                        break;
+                        LASSERT(!cl_page_is_owned(page, io));
+                        lu_ref_del(&page->cp_reference,
+                                   "writepage", cfs_current());
+                        cl_page_put(env, page);
+                        cl_2queue_fini(env, queue);
                  }
-                buf += bytes;
-                count -= bytes;
-                pos += bytes;
          }
-        if (amount == 0)
-                RETURN(-EFAULT);
-        RETURN(amount);
+        cl_io_fini(env, io);
+        cl_env_nested_put(&nest, env);
+        RETURN(result);
  }
  
-static int ll_file_oig_pages(struct inode * inode, struct page **pages,
-                             int numpages, loff_t pos, size_t count, int rw)
+int ll_readpage(struct file *file, struct page *vmpage)
  {
-        struct obd_io_group *oig;
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct obd_export *exp;
-        loff_t org_pos = pos;
-        obd_flag brw_flags;
-        int rc;
-        int i;
+        struct lu_env    *env;
+        struct cl_io     *io;
+        struct cl_page   *page;
+        int result;
+        int refcheck;
          ENTRY;
  
-        exp = ll_i2dtexp(inode);
-        if (exp == NULL)
-                RETURN(-EINVAL);
-        rc = oig_init(&oig);
-        if (rc)
-                RETURN(rc);
-        brw_flags = OBD_BRW_SRVLOCK;
-        if (cfs_capable(CFS_CAP_SYS_RESOURCE))
-                brw_flags |= OBD_BRW_NOQUOTA;
-
-        for (i = 0; i < numpages; i++) {
-                struct ll_async_page *llap;
-                unsigned from, bytes;
-
-                from = pos & (CFS_PAGE_SIZE - 1);
-                bytes = min_t(unsigned, CFS_PAGE_SIZE - from,
-                              count - pos + org_pos);
-                llap = llap_cast_private(pages[i]);
-                LASSERT(llap);
-
-                lock_page(pages[i]);
-
-                LL_CDEBUG_PAGE(D_PAGE, pages[i], "offset "LPU64","
-                               " from %u, bytes = %u\n",
-                               (__u64)pos, from, bytes);
-                LASSERTF(pos >> CFS_PAGE_SHIFT == pages[i]->index,
-                         "wrong page index %lu (%lu)\n",
-                         pages[i]->index,
-                         (unsigned long)(pos >> CFS_PAGE_SHIFT));
-                rc = obd_queue_group_io(exp, lli->lli_smd, NULL, oig,
-                                        llap->llap_cookie,
-                                        (rw == WRITE) ?
-                                        OBD_BRW_WRITE:OBD_BRW_READ,
-                                        from, bytes, brw_flags,
-                                        ASYNC_READY | ASYNC_URGENT |
-                                        ASYNC_COUNT_STABLE | ASYNC_GROUP_SYNC);
-                if (rc) {
-                        i++;
-                        GOTO(out, rc);
+        result = ll_cl_init(file, vmpage, &env, &io, &page, &refcheck);
+        if (result == 0) {
+                LASSERT(page->cp_type == CPT_CACHEABLE);
+                if (likely(!PageUptodate(vmpage))) {
+                        cl_page_assume(env, io, page);
+                        result = cl_io_read_page(env, io, page);
+                } else {
+                        /* Page from a non-object file. */
+                        LASSERT(!ll_i2info(vmpage->mapping->host)->lli_smd);
+                        unlock_page(vmpage);
+                        result = 0;
                  }
-                pos += bytes;
          }
-        rc = obd_trigger_group_io(exp, lli->lli_smd, NULL, oig);
-        if (rc)
-                GOTO(out, rc);
-        rc = oig_wait(oig);
-out:
-        while(--i >= 0)
-                unlock_page(pages[i]);
-        oig_release(oig);
-        RETURN(rc);
+        LASSERT(!cl_page_is_owned(page, io));
+        ll_cl_fini(env, io, page, &refcheck);
+        RETURN(result);
  }
  
-ssize_t ll_file_lockless_io(struct file *file, char *buf, size_t count,
-                                   loff_t *ppos, int rw)
-{
-        loff_t pos;
-        struct inode *inode = file->f_dentry->d_inode;
-        ssize_t rc = 0;
-        int max_pages;
-        size_t amount = 0;
-        unsigned long first, last;
-        ENTRY;
-
-        if (rw == READ) {
-                loff_t isize;
-
-                ll_inode_size_lock(inode, 0);
-                isize = i_size_read(inode);
-                ll_inode_size_unlock(inode, 0);
-                if (*ppos >= isize)
-                        GOTO(out, rc = 0);
-                if (*ppos + count >= isize)
-                        count -= *ppos + count - isize;
-                if (count == 0)
-                        GOTO(out, rc);
-        } else {
-                rc = generic_write_checks(file, ppos, &count, 0);
-                if (rc)
-                        GOTO(out, rc);
-                rc = ll_remove_suid(file->f_dentry, file->f_vfsmnt);
-                if (rc)
-                        GOTO(out, rc);
-        }
-        pos = *ppos;
-        first = pos >> CFS_PAGE_SHIFT;
-        last = (pos + count - 1) >> CFS_PAGE_SHIFT;
-        max_pages = PTLRPC_MAX_BRW_PAGES *
-                ll_i2info(inode)->lli_smd->lsm_stripe_count;
-        CDEBUG(D_INFO, "%u, stripe_count = %u\n",
-               PTLRPC_MAX_BRW_PAGES /* max_pages_per_rpc */,
-               ll_i2info(inode)->lli_smd->lsm_stripe_count);
-
-        while (first <= last && rc >= 0) {
-                int pages_for_io;
-                struct page **pages;
-                size_t bytes = count - amount;
-
-                pages_for_io = min_t(int, last - first + 1, max_pages);
-                pages = ll_file_prepare_pages(pages_for_io, inode, first);
-                if (IS_ERR(pages)) {
-                        rc = PTR_ERR(pages);
-                        break;
-                }
-                if (rw == WRITE) {
-                        rc = ll_file_copy_pages(pages, pages_for_io, buf,
-                                                pos + amount, bytes, rw);
-                        if (rc < 0)
-                                GOTO(put_pages, rc);
-                        bytes = rc;
-                }
-                rc = ll_file_oig_pages(inode, pages, pages_for_io,
-                                       pos + amount, bytes, rw);
-                if (rc)
-                        GOTO(put_pages, rc);
-                if (rw == READ) {
-                        rc = ll_file_copy_pages(pages, pages_for_io, buf,
-                                                pos + amount, bytes, rw);
-                        if (rc < 0)
-                                GOTO(put_pages, rc);
-                        bytes = rc;
-                }
-                amount += bytes;
-                buf += bytes;
-put_pages:
-                ll_file_put_pages(pages, pages_for_io);
-                first += pages_for_io;
-                /* a short read/write check */
-                if (pos + amount < ((loff_t)first << CFS_PAGE_SHIFT))
-                        break;
-        }
-        /* NOTE: don't update i_size and KMS in absence of LDLM locks even
-         * write makes the file large */
-        file_accessed(file);
-        if (rw == READ && amount < count && rc == 0) {
-                unsigned long not_cleared;
-
-                not_cleared = clear_user(buf, count - amount);
-                amount = count - not_cleared;
-                if (not_cleared)
-                        rc = -EFAULT;
-        }
-        if (amount > 0) {
-                lprocfs_counter_add(ll_i2sbi(inode)->ll_stats,
-                                    (rw == WRITE) ?
-                                    LPROC_LL_LOCKLESS_WRITE :
-                                    LPROC_LL_LOCKLESS_READ,
-                                    (long)amount);
-                *ppos += amount;
-                RETURN(amount);
-        }
-out:
-        RETURN(rc);
-}
diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c

index 1b0d1bc..031b1ab 100644 (file)
--- a/lustre/llite/rw26.c
+++ b/lustre/llite/rw26.c
@@ -68,28 +68,68 @@
  #include "llite_internal.h"
  #include <linux/lustre_compat25.h>
  
-static int ll_writepage_26(struct page *page, struct writeback_control *wbc)
+/**
+ * Implements Linux VM address_space::invalidatepage() method. This method is
+ * called when the page is truncated from a file, either as a result of
+ * explicit truncate, or when inode is removed from memory (as a result of
+ * final iput(), umount, or memory pressure induced icache shrinking).
+ *
+ * [0, offset] bytes of the page remain valid (this is for the case of a
+ * non-page-aligned truncate). Lustre leaves a partially truncated page in the
+ * cache, relying on struct inode::i_size to limit further accesses.
+ */
+static int cl_invalidatepage(struct page *vmpage, unsigned long offset)
  {
-        return ll_writepage(page);
+        struct inode     *inode;
+        struct lu_env    *env;
+        struct cl_page   *page;
+        struct cl_object *obj;
+
+        int result;
+        int refcheck;
+
+        LASSERT(PageLocked(vmpage));
+        LASSERT(!PageWriteback(vmpage));
+
+        /*
+         * It is safe to not check anything in invalidatepage/releasepage
+         * below because they are run with page locked and all our io is
+         * happening with locked page too
+         */
+        result = 0;
+        if (offset == 0) {
+                env = cl_env_get(&refcheck);
+                if (!IS_ERR(env)) {
+                        inode = vmpage->mapping->host;
+                        obj = ll_i2info(inode)->lli_clob;
+                        if (obj != NULL) {
+                                page = cl_vmpage_page(vmpage, obj);
+                                if (page != NULL) {
+                                        lu_ref_add(&page->cp_reference,
+                                                   "delete", vmpage);
+                                        cl_page_delete(env, page);
+                                        result = 1;
+                                        lu_ref_del(&page->cp_reference,
+                                                   "delete", vmpage);
+                                        cl_page_put(env, page);
+                                }
+                        } else
+                                LASSERT(vmpage->private == 0);
+                        cl_env_put(env, &refcheck);
+                }
+        }
+        return result;
  }
  
-/* It is safe to not check anything in invalidatepage/releasepage below
-   because they are run with page locked and all our io is happening with
-   locked page too */
  #ifdef HAVE_INVALIDATEPAGE_RETURN_INT
  static int ll_invalidatepage(struct page *page, unsigned long offset)
  {
-        if (offset)
-                return 0;
-        if (PagePrivate(page))
-                ll_removepage(page);
-        return 1;
+        return cl_invalidatepage(page, offset);
  }
-#else
+#else /* !HAVE_INVALIDATEPAGE_RETURN_INT */
  static void ll_invalidatepage(struct page *page, unsigned long offset)
  {
-        if (offset == 0 && PagePrivate(page))
-                ll_removepage(page);
+        cl_invalidatepage(page, offset);
  }
  #endif
  
@@ -100,22 +140,34 @@ static void ll_invalidatepage(struct page *page, unsigned long offset)
  #endif
  static int ll_releasepage(struct page *page, RELEASEPAGE_ARG_TYPE gfp_mask)
  {
-        if (PagePrivate(page))
-                ll_removepage(page);
+        void *cookie;
+
+        cookie = cl_env_reenter();
+        ll_invalidatepage(page, 0);
+        cl_env_reexit(cookie);
          return 1;
  }
  
-static int ll_set_page_dirty(struct page *page)
+static int ll_set_page_dirty(struct page *vmpage)
  {
-        struct ll_async_page *llap;
-        ENTRY;
-
-        llap = llap_from_page(page, LLAP_ORIGIN_UNKNOWN);
-        if (IS_ERR(llap))
-                RETURN(PTR_ERR(llap));
-
-        llap_write_pending(page->mapping->host, llap);
-        RETURN(__set_page_dirty_nobuffers(page));
+#if 0
+        struct cl_page    *page = vvp_vmpage_page_transient(vmpage);
+        struct vvp_object *obj  = cl_inode2vvp(vmpage->mapping->host);
+        struct vvp_page   *cpg;
+
+        /*
+         * XXX should page method be called here?
+         */
+        LASSERT(&obj->co_cl == page->cp_obj);
+        cpg = cl2vvp_page(cl_page_at(page, &vvp_device_type));
+        /*
+         * XXX cannot do much here, because page is possibly not locked:
+         * sys_munmap()->...
+         *     ->unmap_page_range()->zap_pte_range()->set_page_dirty().
+         */
+        vvp_write_pending(obj, cpg);
+#endif
+        RETURN(__set_page_dirty_nobuffers(vmpage));
  }
  
  #define MAX_DIRECTIO_SIZE 2*1024*1024*1024UL
@@ -164,45 +216,116 @@ static void ll_free_user_pages(struct page **pages, int npages, int do_dirty)
          OBD_FREE(pages, npages * sizeof(*pages));
  }
  
-static ssize_t ll_direct_IO_26_seg(int rw, struct inode *inode,
+static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io,
+                                   int rw, struct inode *inode,
                                     struct address_space *mapping,
-                                   struct obd_info *oinfo,
-                                   struct ptlrpc_request_set *set,
                                     size_t size, loff_t file_offset,
                                     struct page **pages, int page_count)
  {
-        struct brw_page *pga;
-        int i, rc = 0;
-        size_t length;
+        struct cl_page    *clp;
+        struct ccc_page   *clup;
+        struct cl_2queue  *queue;
+        struct cl_object  *obj = io->ci_obj;
+        struct cl_sync_io *anchor = &ccc_env_info(env)->cti_sync_io;
+        int i;
+        ssize_t rc = 0;
+        ssize_t size_orig = size;
+        size_t page_size  = cl_page_size(obj);
          ENTRY;
  
-        OBD_ALLOC(pga, sizeof(*pga) * page_count);
-        if (!pga) {
-                CDEBUG(D_VFSTRACE, "sizeof(*pga) = %u page_count = %u\n",
-                       (int)sizeof(*pga), page_count);
-                RETURN(-ENOMEM);
-        }
+        cl_sync_io_init(anchor, page_count);
  
-        for (i = 0, length = size; length > 0;
-             length -=pga[i].count, file_offset +=pga[i].count,i++) {/*i last!*/
-                pga[i].pg = pages[i];
-                pga[i].off = file_offset;
-                /* To the end of the page, or the length, whatever is less */
-                pga[i].count = min_t(int, CFS_PAGE_SIZE -
-                                          (file_offset & ~CFS_PAGE_MASK),
-                                     length);
-                pga[i].flag = 0;
-                if (rw == READ)
-                        POISON_PAGE(pages[i], 0x0d);
+        queue = &io->ci_queue;
+        cl_2queue_init(queue);
+        for (i = 0; i < page_count; i++) {
+                clp = cl_page_find(env, obj, cl_index(obj, file_offset),
+                                   pages[i], CPT_TRANSIENT);
+                if (IS_ERR(clp)) {
+                        rc = PTR_ERR(clp);
+                        break;
+                }
+
+                /* check the page type: if the page is a host page, then do
+                 * write directly */
+                /*
+                 * Very rare case that the host pages can be found for
+                 * directIO case, since linux kernel truncated all covered
+                 * pages before getting here. So, to make the OST happy(to
+                 * write a contiguous region), all pages are issued
+                 * here. -jay */
+                if (clp->cp_type == CPT_CACHEABLE) {
+                        cfs_page_t *vmpage = cl_page_vmpage(env, clp);
+                        cfs_page_t *src_page;
+                        cfs_page_t *dst_page;
+                        void       *src;
+                        void       *dst;
+
+                        src_page = (rw == WRITE) ? pages[i] : vmpage;
+                        dst_page = (rw == WRITE) ? vmpage : pages[i];
+
+                        src = kmap_atomic(src_page, KM_USER0);
+                        dst = kmap_atomic(dst_page, KM_USER1);
+                        memcpy(dst, (const void *)src, min(page_size, size));
+                        kunmap_atomic(dst, KM_USER1);
+                        kunmap_atomic(src, KM_USER0);
+
+                        /* make sure page will be added to the transfer by
+                         * cl_io_submit()->...->vvp_page_prep_write(). */
+                        if (rw == WRITE)
+                                set_page_dirty(vmpage);
+                        /*
+                         * If direct-io read finds up-to-date page in the
+                         * cache, just copy it to the user space. Page will be
+                         * filtered out by vvp_page_prep_read(). This
+                         * preserves an invariant, that page is read at most
+                         * once, see cl_page_flags::CPF_READ_COMPLETED.
+                         */
+                }
+
+                rc = cl_page_own(env, io, clp);
+                if (rc) {
+                        LASSERT(clp->cp_state == CPS_FREEING);
+                        cl_page_put(env, clp);
+                        break;
+                }
+
+                clup = cl2ccc_page(cl_page_at(clp, &vvp_device_type));
+                clup->cpg_sync_io = anchor;
+                cl_2queue_add(queue, clp);
+
+                /* drop the reference count for cl_page_find, so that the page
+                 * will be freed in cl_2queue_fini. */
+                cl_page_put(env, clp);
+                /*
+                 * Set page clip to tell transfer formation engine that page
+                 * has to be sent even if it is beyond KMS.
+                 */
+                cl_page_clip(env, clp, 0, min(size, page_size));
+                size -= page_size;
+                file_offset += page_size;
          }
  
-        rc = obd_brw_async(rw == WRITE ? OBD_BRW_WRITE : OBD_BRW_READ,
-                                ll_i2dtexp(inode), oinfo, page_count,
-                                pga, NULL, set);
-        if (rc == 0)
-                rc = size;
+        if (rc == 0) {
+                rc = cl_io_submit_rw(env, io, rw == READ ? CRT_READ : CRT_WRITE,
+                                     queue);
+                if (rc == 0) {
+                        /*
+                         * If some pages weren't sent for any reason (e.g.,
+                         * direct-io read found up-to-date pages in the
+                         * cache), count them as completed to avoid infinite
+                         * wait.
+                         */
+                        cl_page_list_for_each(clp, &queue->c2_qin)
+                                cl_sync_io_note(anchor, +1);
+                        /* wait for the IO to be finished. */
+                        rc = cl_sync_io_wait(env, io, &queue->c2_qout,
+                                             anchor) ?: size_orig;
+                }
+        }
  
-        OBD_FREE(pga, sizeof(*pga) * page_count);
+        cl_2queue_discard(env, io, queue);
+        cl_2queue_disown(env, io, queue);
+        cl_2queue_fini(env, queue);
          RETURN(rc);
  }
  
@@ -216,17 +339,17 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
                                 const struct iovec *iov, loff_t file_offset,
                                 unsigned long nr_segs)
  {
+        struct lu_env *env;
+        struct cl_io *io;
          struct file *file = iocb->ki_filp;
          struct inode *inode = file->f_mapping->host;
+        struct ccc_object *obj = cl_inode2ccc(inode);
          ssize_t count = iov_length(iov, nr_segs), tot_bytes = 0;
          struct ll_inode_info *lli = ll_i2info(inode);
          struct lov_stripe_md *lsm = lli->lli_smd;
-        struct ptlrpc_request_set *set;
-        struct obd_info oinfo;
-        struct obdo oa;
          unsigned long seg = 0;
          size_t size = MAX_DIO_SIZE;
-        int opc;
+        int refcheck;
          ENTRY;
  
          if (!lli->lli_smd || !lli->lli_smd->lsm_object_id)
@@ -242,15 +365,6 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
                 file_offset, file_offset, count >> CFS_PAGE_SHIFT,
                 MAX_DIO_SIZE >> CFS_PAGE_SHIFT);
  
-        if (rw == WRITE) {
-                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_DIRECT_WRITE, count);
-                opc = CAPA_OPC_OSS_WRITE;
-                llap_write_pending(inode, NULL);
-        } else {
-                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_DIRECT_READ, count);
-                opc = CAPA_OPC_OSS_RW;
-        }
-
          /* Check that all user buffers are aligned as well */
          for (seg = 0; seg < nr_segs; seg++) {
                  if (((unsigned long)iov[seg].iov_base & ~CFS_PAGE_MASK) ||
@@ -258,20 +372,18 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
                          RETURN(-EINVAL);
          }
  
-        set = ptlrpc_prep_set();
-        if (set == NULL)
-                RETURN(-ENOMEM);
+        env = cl_env_get(&refcheck);
+        LASSERT(!IS_ERR(env));
+        io = ccc_env_io(env)->cui_cl.cis_io;
+        LASSERT(io != NULL);
  
-        ll_inode_fill_obdo(inode, rw, &oa);
-        oinfo.oi_oa = &oa;
-        oinfo.oi_md = lsm;
-        oinfo.oi_capa = ll_osscapa_get(inode, opc);
-
-        /* need locking between buffered and direct access. and race with
-         *size changing by concurrent truncates and writes. */
+        /* 0. Need locking between buffered and direct access. and race with
+         *size changing by concurrent truncates and writes.
+         * 1. Need inode sem to operate transient pages. */
          if (rw == READ)
                  LOCK_INODE_MUTEX(inode);
  
+        LASSERT(obj->cob_transient_pages == 0);
          for (seg = 0; seg < nr_segs; seg++) {
                  size_t iov_left = iov[seg].iov_len;
                  unsigned long user_addr = (unsigned long)iov[seg].iov_base;
@@ -293,9 +405,8 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
                                                         &pages);
                          LASSERT(page_count != 0);
                          if (page_count > 0) {
-                                result = ll_direct_IO_26_seg(rw, inode,
+                                result = ll_direct_IO_26_seg(env, io, rw, inode,
                                                               file->f_mapping,
-                                                             &oinfo, set,
                                                               min(size,iov_left),
                                                               file_offset, pages,
                                                               page_count);
@@ -332,24 +443,19 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
                  }
          }
  out:
+        LASSERT(obj->cob_transient_pages == 0);
          if (rw == READ)
                  UNLOCK_INODE_MUTEX(inode);
  
          if (tot_bytes > 0) {
-                int rc;
-
-                rc = ptlrpc_set_wait(set);
-                if (rc) {
-                        tot_bytes = rc;
-                } else if (rw == WRITE) {
+                if (rw == WRITE) {
                          lov_stripe_lock(lsm);
                          obd_adjust_kms(ll_i2dtexp(inode), lsm, file_offset, 0);
                          lov_stripe_unlock(lsm);
                  }
          }
  
-        capa_put(oinfo.oi_capa);
-        ptlrpc_set_destroy(set);
+        cl_env_put(env, &refcheck);
          RETURN(tot_bytes);
  }
  
@@ -357,13 +463,13 @@ struct address_space_operations ll_aops = {
          .readpage       = ll_readpage,
  //        .readpages      = ll_readpages,
          .direct_IO      = ll_direct_IO_26,
-        .writepage      = ll_writepage_26,
+        .writepage      = ll_writepage,
          .writepages     = generic_writepages,
          .set_page_dirty = ll_set_page_dirty,
          .sync_page      = NULL,
          .prepare_write  = ll_prepare_write,
          .commit_write   = ll_commit_write,
          .invalidatepage = ll_invalidatepage,
-        .releasepage    = ll_releasepage,
+        .releasepage    = (void *)ll_releasepage,
          .bmap           = NULL
  };
diff --git a/lustre/llite/super25.c b/lustre/llite/super25.c

index 979f0c4..b61e7fc 100644 (file)
--- a/lustre/llite/super25.c
+++ b/lustre/llite/super25.c
@@ -105,12 +105,21 @@ struct super_operations lustre_super_operations =
  
  void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg));
  
+int vvp_global_init(void);
+void vvp_global_fini(void);
+
  static int __init init_lustre_lite(void)
  {
          int i, rc, seed[2];
          struct timeval tv;
          lnet_process_id_t lnet_id;
- 
+
+        /* print an address of _any_ initialized kernel symbol from this
+         * module, to allow debugging with gdb that doesn't support data
+         * symbols from modules.*/
+        CDEBUG(D_CONSOLE, "Lustre client module (%p).\n",
+               &lustre_super_operations);
+
          rc = ll_init_inodecache();
          if (rc)
                  return -ENOMEM;
@@ -148,8 +157,6 @@ static int __init init_lustre_lite(void)
          proc_lustre_fs_root = proc_lustre_root ?
                                lprocfs_register("llite", proc_lustre_root, NULL, NULL) : NULL;
  
-        ll_register_cache(&ll_cache_definition);
-
          lustre_register_client_fill_super(ll_fill_super);
          lustre_register_kill_super_cb(ll_kill_super);
  
@@ -174,13 +181,20 @@ static int __init init_lustre_lite(void)
          init_timer(&ll_capa_timer);
          ll_capa_timer.function = ll_capa_timer_callback;
          rc = ll_capa_thread_start();
+        /*
+         * XXX normal cleanup is needed here.
+         */
+        if (rc == 0)
+                rc = vvp_global_init();
+
          return rc;
  }
  
  static void __exit exit_lustre_lite(void)
  {
          int rc;
-        
+
+        vvp_global_fini();
          del_timer(&ll_capa_timer);
          ll_capa_thread_stop();
          LASSERTF(capa_count[CAPA_SITE_CLIENT] == 0,
@@ -192,8 +206,6 @@ static void __exit exit_lustre_lite(void)
  
          lustre_register_client_process_config(NULL);
  
-        ll_unregister_cache(&ll_cache_definition);
-
          ll_destroy_inodecache();
  
          rc = cfs_mem_cache_destroy(ll_rmtperm_hash_cachep);
@@ -206,11 +218,6 @@ static void __exit exit_lustre_lite(void)
  
          rc = cfs_mem_cache_destroy(ll_file_data_slab);
          LASSERTF(rc == 0, "couldn't destroy ll_file_data slab\n");
-        if (ll_async_page_slab) {
-                rc = cfs_mem_cache_destroy(ll_async_page_slab);
-                LASSERTF(rc == 0, "couldn't destroy ll_async_page slab\n");
-        }
-
          if (proc_lustre_fs_root)
                  lprocfs_remove(&proc_lustre_fs_root);
  }
diff --git a/lustre/llite/symlink.c b/lustre/llite/symlink.c

index b1754da..365910f 100644 (file)
--- a/lustre/llite/symlink.c
+++ b/lustre/llite/symlink.c
@@ -78,7 +78,7 @@ static int ll_readlink_internal(struct inode *inode,
                  CERROR("OBD_MD_LINKNAME not set on reply\n");
                  GOTO(failed, rc = -EPROTO);
          }
-        
+
          LASSERT(symlen != 0);
          if (body->eadatasize != symlen) {
                  CERROR("inode %lu: symlink length %d not expected %d\n",
@@ -114,7 +114,6 @@ static int ll_readlink_internal(struct inode *inode,
  static int ll_readlink(struct dentry *dentry, char *buffer, int buflen)
  {
          struct inode *inode = dentry->d_inode;
-        struct ll_inode_info *lli = ll_i2info(inode);
          struct ptlrpc_request *request;
          char *symname;
          int rc;
@@ -122,7 +121,7 @@ static int ll_readlink(struct dentry *dentry, char *buffer, int buflen)
  
          CDEBUG(D_VFSTRACE, "VFS Op\n");
          /* on symlinks lli_open_sem protects lli_symlink_name allocation/data */
-        down(&lli->lli_size_sem);
+        ll_inode_size_lock(inode, 0);
          rc = ll_readlink_internal(inode, &request, &symname);
          if (rc)
                  GOTO(out, rc);
@@ -130,7 +129,7 @@ static int ll_readlink(struct dentry *dentry, char *buffer, int buflen)
          rc = vfs_readlink(dentry, buffer, buflen, symname);
          ptlrpc_req_finished(request);
   out:
-        up(&lli->lli_size_sem);
+        ll_inode_size_unlock(inode, 0);
          RETURN(rc);
  }
  
@@ -144,7 +143,6 @@ static LL_FOLLOW_LINK_RETURN_TYPE ll_follow_link(struct dentry *dentry,
                                                   struct nameidata *nd)
  {
          struct inode *inode = dentry->d_inode;
-        struct ll_inode_info *lli = ll_i2info(inode);
  #ifdef HAVE_VFS_INTENT_PATCHES
          struct lookup_intent *it = ll_nd2it(nd);
  #endif
@@ -166,8 +164,8 @@ static LL_FOLLOW_LINK_RETURN_TYPE ll_follow_link(struct dentry *dentry,
  
          CDEBUG(D_VFSTRACE, "VFS Op\n");
  #if THREAD_SIZE < 8192
-        /* 
-         *  We set the limits recursive symlink to 5 
+        /*
+         *  We set the limits recursive symlink to 5
           *  instead of default 8 when kernel has 4k stack
           *  to prevent stack overflow.
           */
@@ -176,9 +174,9 @@ static LL_FOLLOW_LINK_RETURN_TYPE ll_follow_link(struct dentry *dentry,
                  GOTO(out_release, rc);
          }
  #endif
-        down(&lli->lli_size_sem);
+        ll_inode_size_lock(inode, 0);
          rc = ll_readlink_internal(inode, &request, &symname);
-        up(&lli->lli_size_sem);
+        ll_inode_size_unlock(inode, 0);
          if (rc) {
  #if THREAD_SIZE < 8192
  out_release:
diff --git a/lustre/llite/vvp_dev.c b/lustre/llite/vvp_dev.c

new file mode 100644 (file)

index 0000000..8347789
--- /dev/null
+++ b/lustre/llite/vvp_dev.c
@@ -0,0 +1,559 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl_device and cl_device_type implementation for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#ifndef __KERNEL__
+# error This file is kernel only.
+#endif
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Vvp device and device type functions.
+ *
+ */
+
+/*
+ * vvp_ prefix stands for "Vfs Vm Posix". It corresponds to historical
+ * "llite_" (var. "ll_") prefix.
+ */
+
+cfs_mem_cache_t *vvp_page_kmem;
+cfs_mem_cache_t *vvp_thread_kmem;
+static cfs_mem_cache_t *vvp_session_kmem;
+static struct lu_kmem_descr vvp_caches[] = {
+        {
+                .ckd_cache = &vvp_page_kmem,
+                .ckd_name  = "vvp_page_kmem",
+                .ckd_size  = sizeof (struct ccc_page)
+        },
+        {
+                .ckd_cache = &vvp_thread_kmem,
+                .ckd_name  = "vvp_thread_kmem",
+                .ckd_size  = sizeof (struct vvp_thread_info),
+        },
+        {
+                .ckd_cache = &vvp_session_kmem,
+                .ckd_name  = "vvp_session_kmem",
+                .ckd_size  = sizeof (struct vvp_session)
+        },
+        {
+                .ckd_cache = NULL
+        }
+};
+
+static void *vvp_key_init(const struct lu_context *ctx,
+                          struct lu_context_key *key)
+{
+        struct vvp_thread_info *info;
+
+        OBD_SLAB_ALLOC_PTR(info, vvp_thread_kmem);
+        if (info == NULL)
+                info = ERR_PTR(-ENOMEM);
+        return info;
+}
+
+static void vvp_key_fini(const struct lu_context *ctx,
+                         struct lu_context_key *key, void *data)
+{
+        struct vvp_thread_info *info = data;
+        OBD_SLAB_FREE_PTR(info, vvp_thread_kmem);
+}
+
+static void *vvp_session_key_init(const struct lu_context *ctx,
+                                  struct lu_context_key *key)
+{
+        struct vvp_session *session;
+
+        OBD_SLAB_ALLOC_PTR(session, vvp_session_kmem);
+        if (session == NULL)
+                session = ERR_PTR(-ENOMEM);
+        return session;
+}
+
+static void vvp_session_key_fini(const struct lu_context *ctx,
+                                 struct lu_context_key *key, void *data)
+{
+        struct vvp_session *session = data;
+        OBD_SLAB_FREE_PTR(session, vvp_session_kmem);
+}
+
+
+struct lu_context_key vvp_key = {
+        .lct_tags = LCT_CL_THREAD,
+        .lct_init = vvp_key_init,
+        .lct_fini = vvp_key_fini
+};
+
+struct lu_context_key vvp_session_key = {
+        .lct_tags = LCT_SESSION,
+        .lct_init = vvp_session_key_init,
+        .lct_fini = vvp_session_key_fini
+};
+
+/* type constructor/destructor: vvp_type_{init,fini,start,stop}(). */
+LU_TYPE_INIT_FINI(vvp, &ccc_key, &ccc_session_key, &vvp_key, &vvp_session_key);
+
+static const struct lu_device_operations vvp_lu_ops = {
+        .ldo_object_alloc      = vvp_object_alloc
+};
+
+static const struct cl_device_operations vvp_cl_ops = {
+        .cdo_req_init = ccc_req_init
+};
+
+static struct lu_device *vvp_device_alloc(const struct lu_env *env,
+                                          struct lu_device_type *t,
+                                          struct lustre_cfg *cfg)
+{
+        return ccc_device_alloc(env, t, cfg, &vvp_lu_ops, &vvp_cl_ops);
+}
+
+static const struct lu_device_type_operations vvp_device_type_ops = {
+        .ldto_init = vvp_type_init,
+        .ldto_fini = vvp_type_fini,
+
+        .ldto_start = vvp_type_start,
+        .ldto_stop  = vvp_type_stop,
+
+        .ldto_device_alloc = vvp_device_alloc,
+        .ldto_device_free  = ccc_device_free,
+        .ldto_device_init  = ccc_device_init,
+        .ldto_device_fini  = ccc_device_fini
+};
+
+struct lu_device_type vvp_device_type = {
+        .ldt_tags     = LU_DEVICE_CL,
+        .ldt_name     = LUSTRE_VVP_NAME,
+        .ldt_ops      = &vvp_device_type_ops,
+        .ldt_ctx_tags = LCT_CL_THREAD
+};
+
+/**
+ * Module-wide VVP set-up: lu_kmem_init() on vvp_caches, then
+ * ccc_global_init() on vvp_device_type (caches are undone if it fails).
+ */
+int vvp_global_init(void)
+{
+        int rc = lu_kmem_init(vvp_caches);
+
+        if (rc != 0)
+                return rc;
+        rc = ccc_global_init(&vvp_device_type);
+        if (rc != 0)
+                /* second step failed: release what the first one set up */
+                lu_kmem_fini(vvp_caches);
+        return rc;
+}
+
+void vvp_global_fini(void)
+{
+        ccc_global_fini(&vvp_device_type);
+        lu_kmem_fini(vvp_caches);
+}
+
+
+/*****************************************************************************
+ *
+ * mirror obd-devices into cl devices.
+ *
+ */
+
+/* Set up the vvp cl-device for sb; returns 0 or a PTR_ERR() code. */
+int cl_sb_init(struct super_block *sb)
+{
+        struct ll_sb_info *sbi = ll_s2sbi(sb);
+        struct cl_device  *cl;
+        struct lu_env     *env;
+        int                refcheck;
+
+        ENTRY;
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
+        cl = cl_type_setup(env, NULL, &vvp_device_type,
+                           sbi->ll_dt_exp->exp_obd->obd_lu_dev);
+        if (!IS_ERR(cl)) {
+                cl2ccc_dev(cl)->cdv_sb = sb;
+                sbi->ll_cl = cl;
+                sbi->ll_site = cl2lu_dev(cl)->ld_site;
+        }
+        cl_env_put(env, &refcheck);
+        /* report a failed cl_type_setup() instead of returning 0 */
+        RETURN(IS_ERR(cl) ? PTR_ERR(cl) : 0);
+}
+
+int cl_sb_fini(struct super_block *sb)
+{
+        struct ll_sb_info *sbi;
+        struct lu_env     *env;
+        struct cl_device  *cld;
+        int                refcheck;
+        int                result;
+
+        ENTRY;
+        sbi = ll_s2sbi(sb);
+        env = cl_env_get(&refcheck);
+        if (!IS_ERR(env)) {
+                cld = sbi->ll_cl;
+
+                if (cld != NULL) { /* set by cl_sb_init() on success */
+                        cl_stack_fini(env, cld);
+                        sbi->ll_cl = NULL;
+                        sbi->ll_site = NULL;
+                }
+                cl_env_put(env, &refcheck);
+                result = 0;
+        } else {
+                CERROR("Cannot cleanup cl-stack due to memory shortage.\n");
+                result = PTR_ERR(env);
+        }
+        /*
+         * If mount failed (sbi->ll_cl == NULL), and there are no other
+         * mounts, stop device types manually (this usually happens
+         * automatically when the last device is destroyed).
+         */
+        lu_types_stop();
+        RETURN(result);
+}
+
+/****************************************************************************
+ *
+ * /proc/fs/lustre/llite/$MNT/dump_page_cache
+ *
+ ****************************************************************************/
+
+/*
+ * To represent contents of a page cache as a byte stream, following
+ * information is encoded in a 64bit offset:
+ *
+ *       - file hash bucket in lu_site::ls_hash[]       28bits
+ *
+ *       - how far file is from bucket head              4bits
+ *
+ *       - page index                                   32bits
+ *
+ * First two data identify a file in the cache uniquely.
+ */
+
+#define PGC_OBJ_SHIFT (32 + 4)
+#define PGC_DEPTH_SHIFT (32)
+
+struct vvp_pgcache_id {
+        unsigned vpi_bucket;
+        unsigned vpi_depth;
+        uint32_t vpi_index;
+};
+
+static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id)
+{
+        CLASSERT(sizeof(pos) == sizeof(__u64));
+
+        id->vpi_index  = pos & 0xffffffff;
+        id->vpi_depth  = (pos >> PGC_DEPTH_SHIFT) & 0xf;
+        id->vpi_bucket = ((unsigned long long)pos >> PGC_OBJ_SHIFT);
+}
+
+static loff_t vvp_pgcache_id_pack(struct vvp_pgcache_id *id)
+{
+        return
+                ((__u64)id->vpi_index) |
+                ((__u64)id->vpi_depth  << PGC_DEPTH_SHIFT) |
+                ((__u64)id->vpi_bucket << PGC_OBJ_SHIFT);
+}
+
+static struct cl_object *vvp_pgcache_obj(const struct lu_env *env,
+                                         struct lu_device *dev,
+                                         struct vvp_pgcache_id *id)
+{
+        struct hlist_head       *bucket;
+        struct lu_object_header *hdr;
+        struct lu_site          *site;
+        struct hlist_node       *scan;
+        struct lu_object_header *found;
+        struct cl_object        *clob;
+        unsigned                 depth;
+
+        LASSERT(lu_device_is_cl(dev));
+
+        site   = dev->ld_site;
+        bucket = site->ls_hash + (id->vpi_bucket & site->ls_hash_mask);
+        depth  = id->vpi_depth & 0xf;
+        found  = NULL;
+        clob   = NULL;
+
+        /* XXX copy of lu_object.c:htable_lookup() */
+        read_lock(&site->ls_guard);
+        hlist_for_each_entry(hdr, scan, bucket, loh_hash) {
+                if (depth-- == 0) {
+                        if (!lu_object_is_dying(hdr)) {
+                                if (atomic_add_return(1, &hdr->loh_ref) == 1)
+                                        ++ site->ls_busy;
+                                found = hdr;
+                        }
+                        break;
+                }
+        }
+        read_unlock(&site->ls_guard);
+
+        if (found != NULL) {
+                struct lu_object *lu_obj;
+
+                lu_obj = lu_object_locate(found, dev->ld_type);
+                if (lu_obj != NULL) {
+                        lu_object_ref_add(lu_obj, "dump", cfs_current());
+                        clob = lu2cl(lu_obj);
+                } else
+                        lu_object_put(env, lu_object_top(found));
+        } else if (depth > 0)
+                id->vpi_depth = 0xf;
+        return clob;
+}
+
+static loff_t vvp_pgcache_find(const struct lu_env *env,
+                               struct lu_device *dev, loff_t pos)
+{
+        struct cl_object     *clob;
+        struct lu_site       *site;
+        struct vvp_pgcache_id id;
+
+        site = dev->ld_site;
+        vvp_pgcache_id_unpack(pos, &id);
+
+        while (1) {
+                if (id.vpi_bucket >= site->ls_hash_size)
+                        return ~0ULL;
+                clob = vvp_pgcache_obj(env, dev, &id);
+                if (clob != NULL) {
+                        struct cl_object_header *hdr;
+                        int                      nr;
+                        struct cl_page          *pg;
+
+                        /* Got an object: find first page >= vpi_index. */
+                        hdr = cl_object_header(clob);
+
+                        spin_lock(&hdr->coh_page_guard);
+                        nr = radix_tree_gang_lookup(&hdr->coh_tree,
+                                                    (void **)&pg,
+                                                    id.vpi_index, 1);
+                        if (nr > 0) {
+                                id.vpi_index = pg->cp_index;
+                                /* Can't support files over 16T. */
+                                nr = !(pg->cp_index > 0xffffffff);
+                        }
+                        spin_unlock(&hdr->coh_page_guard);
+
+                        lu_object_ref_del(&clob->co_lu, "dump", cfs_current());
+                        cl_object_put(env, clob);
+                        if (nr > 0)
+                                return vvp_pgcache_id_pack(&id);
+                }
+                /* Advance to the next object. */
+                ++id.vpi_depth;
+                id.vpi_depth &= 0xf; /* depth is a 4-bit field */
+                if (id.vpi_depth == 0 && ++id.vpi_bucket == 0)
+                        return ~0ULL;
+                id.vpi_index = 0;
+        }
+}
+
+#define seq_page_flag(seq, page, flag, has_flags) do {                  \
+        if (test_bit(PG_##flag, &(page)->flags)) {                      \
+                seq_printf(seq, "%s"#flag, has_flags ? "|" : "");       \
+                has_flags = 1;                                          \
+        }                                                               \
+} while (0) /* prints #flag when PG_<flag> is set; caller adds the ';' */
+
+/*
+ * Print a one-line description of \a page into \a seq.
+ *
+ * The line contains, in order: a generation placeholder (always 0), the
+ * ccc_page and cl_page pointers, the literal "none", markers for the
+ * write-queued / defer-uptodate / writeback states, the VM page pointer, the
+ * owning inode's number, generation and pointer, the page index, the VM page
+ * reference count and, in brackets, the names of the PG_* flags that are set
+ * ("-" when none of the tested flags is set).
+ *
+ * \a env is not used by this function.
+ */
+static void vvp_pgcache_page_show(const struct lu_env *env,
+                                  struct seq_file *seq, struct cl_page *page)
+{
+        struct ccc_page *cpg;
+        cfs_page_t      *vmpage;
+        int              has_flags;
+
+        /* VVP-layer slice of the page and the VM page attached to it. */
+        cpg = cl2ccc_page(cl_page_at(page, &vvp_device_type));
+        vmpage = cpg->cpg_page;
+        seq_printf(seq," %5i | %p %p %s %s %s %s | %p %lu/%u(%p) %lu %u [",
+                   0 /* gen */,
+                   cpg, page,
+                   "none",
+                   cpg->cpg_write_queued ? "wq" : "- ",
+                   cpg->cpg_defer_uptodate ? "du" : "- ",
+                   PageWriteback(vmpage) ? "wb" : "-",
+                   vmpage, vmpage->mapping->host->i_ino,
+                   vmpage->mapping->host->i_generation,
+                   vmpage->mapping->host, vmpage->index,
+                   page_count(vmpage));
+        /* has_flags tracks whether a "|" separator is needed. */
+        has_flags = 0;
+        seq_page_flag(seq, vmpage, locked, has_flags);
+        seq_page_flag(seq, vmpage, error, has_flags);
+        seq_page_flag(seq, vmpage, referenced, has_flags);
+        seq_page_flag(seq, vmpage, uptodate, has_flags);
+        seq_page_flag(seq, vmpage, dirty, has_flags);
+#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,12))
+        seq_page_flag(seq, vmpage, highmem, has_flags);
+#endif
+        seq_page_flag(seq, vmpage, writeback, has_flags);
+        seq_printf(seq, "%s]\n", has_flags ? "" : "-");
+}
+
+/*
+ * seq_file ->show() method of the page cache dump.
+ *
+ * \a v points to the loff_t iterator position. The position is unpacked into
+ * a vvp_pgcache_id, the object it names is looked up, and the page at
+ * id.vpi_index of that object is printed (or "missing" when either the object
+ * or the page cannot be found).
+ *
+ * Returns 0, or PTR_ERR(env) when no environment can be obtained.
+ */
+static int vvp_pgcache_show(struct seq_file *f, void *v)
+{
+        loff_t                   pos;
+        struct ll_sb_info       *sbi;
+        struct cl_object        *clob;
+        struct lu_env           *env;
+        struct cl_page          *page;
+        struct cl_object_header *hdr;
+        struct vvp_pgcache_id    id;
+        int                      refcheck;
+        int                      result;
+
+        env = cl_env_get(&refcheck);
+        if (!IS_ERR(env)) {
+                pos = *(loff_t *) v;
+                vvp_pgcache_id_unpack(pos, &id);
+                sbi = f->private;
+                clob = vvp_pgcache_obj(env, &sbi->ll_cl->cd_lu_dev, &id);
+                if (clob != NULL) {
+                        hdr = cl_object_header(clob);
+
+                        /* Page lookup is done under the header's page guard. */
+                        spin_lock(&hdr->coh_page_guard);
+                        page = cl_page_lookup(hdr, id.vpi_index);
+                        spin_unlock(&hdr->coh_page_guard);
+
+                        seq_printf(f, "%8x@"DFID": ",
+                                   id.vpi_index, PFID(&hdr->coh_lu.loh_fid));
+                        if (page != NULL) {
+                                vvp_pgcache_page_show(env, f, page);
+                                /* Drop the page after printing it. */
+                                cl_page_put(env, page);
+                        } else
+                                seq_puts(f, "missing\n");
+                        /*
+                         * NOTE(review): the "dump" reference tag is presumably
+                         * added by vvp_pgcache_obj() -- confirm.
+                         */
+                        lu_object_ref_del(&clob->co_lu, "dump", cfs_current());
+                        cl_object_put(env, clob);
+                } else
+                        seq_printf(f, "%llx missing\n", pos);
+                cl_env_put(env, &refcheck);
+                result = 0;
+        } else
+                result = PTR_ERR(env);
+        return result;
+}
+
+/*
+ * seq_file ->start() method of the page cache dump.
+ *
+ * Replaces *pos with the position vvp_pgcache_find() returns for it.
+ *
+ * Returns \a pos on success, NULL when vvp_pgcache_find() returns ~0ULL,
+ * or ERR_PTR(-EFBIG) when the site hash uses more bits than are available
+ * above PGC_OBJ_SHIFT in a 64-bit position. If no environment can be
+ * obtained, \a pos is returned unchanged.
+ */
+static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos)
+{
+        struct ll_sb_info *sbi;
+        struct lu_env     *env;
+        int                refcheck;
+
+        env = cl_env_get(&refcheck);
+        if (!IS_ERR(env)) {
+                sbi = f->private;
+                if (sbi->ll_site->ls_hash_bits > 64 - PGC_OBJ_SHIFT)
+                        pos = ERR_PTR(-EFBIG);
+                else {
+                        *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev,
+                                                *pos);
+                        if (*pos == ~0ULL)
+                                pos = NULL;
+                }
+                cl_env_put(env, &refcheck);
+        }
+        return pos;
+}
+
+/*
+ * seq_file ->next() method of the page cache dump.
+ *
+ * Advances *pos to the position vvp_pgcache_find() returns for *pos + 1.
+ * Returns \a pos, or NULL when vvp_pgcache_find() returns ~0ULL. If no
+ * environment can be obtained, \a pos is returned unchanged. \a v is unused.
+ */
+static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos)
+{
+        struct ll_sb_info *sbi = f->private;
+        struct lu_env     *env;
+        int                refcheck;
+
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                return pos;
+
+        *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos + 1);
+        cl_env_put(env, &refcheck);
+
+        if (*pos == ~0ULL)
+                return NULL;
+        return pos;
+}
+
+/*
+ * seq_file ->stop() method of the page cache dump. The start/next methods
+ * release their environment before returning, so nothing is left to undo.
+ */
+static void vvp_pgcache_stop(struct seq_file *f, void *v)
+{
+        /* Nothing to do */
+}
+
+/* seq_file iterator used by vvp_dump_pgcache_seq_open(). */
+static struct seq_operations vvp_pgcache_ops = {
+        .start = vvp_pgcache_start,
+        .next  = vvp_pgcache_next,
+        .stop  = vvp_pgcache_stop,
+        .show  = vvp_pgcache_show
+};
+
+/*
+ * ->open() method of the page cache dump file.
+ *
+ * Opens a seq_file driven by vvp_pgcache_ops and stores the ll_sb_info taken
+ * from the proc entry's data pointer in seq_file::private, where the
+ * iterator methods read it back. Returns the result of seq_open().
+ */
+static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp)
+{
+        struct ll_sb_info *sbi = PDE(inode)->data;
+        struct seq_file   *seq;
+        int                rc;
+
+        rc = seq_open(filp, &vvp_pgcache_ops);
+        if (rc != 0)
+                return rc;
+
+        seq = filp->private_data;
+        seq->private = sbi;
+        return 0;
+}
+
+/*
+ * File operations of the page cache dump file: a custom open that sets up
+ * the seq_file iterator, with the generic seq_file read/llseek/release.
+ */
+struct file_operations vvp_dump_pgcache_file_ops = {
+        .owner   = THIS_MODULE,
+        .open    = vvp_dump_pgcache_seq_open,
+        .read    = seq_read,
+        .llseek         = seq_lseek,
+        .release = seq_release,
+};
diff --git a/lustre/llite/vvp_internal.h b/lustre/llite/vvp_internal.h

new file mode 100644 (file)

index 0000000..42042a9
--- /dev/null
+++ b/lustre/llite/vvp_internal.h
@@ -0,0 +1,68 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal definitions for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#ifndef VVP_INTERNAL_H
+#define VVP_INTERNAL_H
+
+#ifndef __KERNEL__
+# error This file is kernel only.
+#endif
+
+#include <cl_object.h>
+#include "llite_internal.h"
+
+/*
+ * VVP layer entry points for io, lock and page initialization and for object
+ * allocation. Only the declarations live here.
+ * NOTE(review): presumably defined in the vvp_*.c files of llite -- confirm.
+ */
+int               vvp_io_init     (const struct lu_env *env,
+                                   struct cl_object *obj, struct cl_io *io);
+int               vvp_lock_init   (const struct lu_env *env,
+                                   struct cl_object *obj, struct cl_lock *lock,
+                                   const struct cl_io *io);
+struct cl_page   *vvp_page_init   (const struct lu_env *env,
+                                   struct cl_object *obj,
+                                   struct cl_page *page, cfs_page_t *vmpage);
+struct lu_object *vvp_object_alloc(const struct lu_env *env,
+                                   const struct lu_object_header *hdr,
+                                   struct lu_device *dev);
+
+/* Returns the ccc_object associated with \a inode (defined elsewhere). */
+struct ccc_object *cl_inode2ccc(struct inode *inode);
+
+/* Memory caches of the VVP layer; defined elsewhere. */
+extern cfs_mem_cache_t *vvp_page_kmem;
+extern cfs_mem_cache_t *vvp_thread_kmem;
+
+#endif /* VVP_INTERNAL_H */
diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c

new file mode 100644 (file)

index 0000000..30eae91
--- /dev/null
+++ b/lustre/llite/vvp_io.c
@@ -0,0 +1,996 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#ifndef __KERNEL__
+# error This file is kernel only.
+#endif
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*
+ * Forward declaration: converts an io slice into the vvp_io it belongs to.
+ * Being static, the definition is further down in this file.
+ */
+static struct vvp_io *cl2vvp_io(const struct lu_env *env,
+                                const struct cl_io_slice *slice);
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+/*
+ * Iteration initializer for page fault io.
+ *
+ * Records the inode's modification time (in seconds) in the fault io state;
+ * vvp_io_fault_start() compares against it to detect an executable that
+ * changed while the fault was waiting for its lock. Always returns 0.
+ */
+static int vvp_io_fault_iter_init(const struct lu_env *env,
+                                  const struct cl_io_slice *ios)
+{
+        struct vvp_io *vio   = cl2vvp_io(env, ios);
+        struct inode  *inode = ccc_object_inode(ios->cis_obj);
+
+        /* The object's inode must be the inode of the file the io is on. */
+        LASSERT(inode ==
+                cl2ccc_io(env, ios)->cui_fd->fd_file->f_dentry->d_inode);
+        vio->u.fault.ft_mtime = LTIME_S(inode->i_mtime);
+        return 0;
+}
+
+/*
+ * Common io finalizer of the VVP layer.
+ *
+ * For a write io, releases the inode's lli_write_sem.
+ * NOTE(review): the matching down() is not in this part of the file;
+ * presumably it is taken when the write io is initialized -- confirm.
+ *
+ * For every other io type, leaves the read-ahead window through
+ * ll_ra_read_ex() if vvp_io_read_start() entered one (cui_ra_window_set).
+ */
+static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+        struct cl_io     *io  = ios->cis_io;
+        struct cl_object *obj = io->ci_obj;
+
+        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+        if (io->ci_type == CIT_WRITE)
+                up(&ll_i2info(ccc_object_inode(obj))->lli_write_sem);
+        else {
+                struct vvp_io     *vio  = cl2vvp_io(env, ios);
+                struct ccc_io     *cio  = cl2ccc_io(env, ios);
+
+                if (vio->cui_ra_window_set)
+                        ll_ra_read_ex(cio->cui_fd->fd_file, &vio->cui_bead);
+        }
+
+}
+
+/*
+ * Finalizer for page fault io.
+ *
+ * Drops the "fault" reference that vvp_io_fault_start() took on the faulted
+ * page (if any), clears ft_page, then runs the common vvp_io_fini().
+ */
+static void vvp_io_fault_fini(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+        struct cl_io   *io   = ios->cis_io;
+        struct cl_page *page = io->u.ci_fault.ft_page;
+
+        CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj));
+
+        if (page != NULL) {
+                /* Remove the debugging reference record before the put. */
+                lu_ref_del(&page->cp_reference, "fault", io);
+                cl_page_put(env, page);
+                io->u.ci_fault.ft_page = NULL;
+        }
+        vvp_io_fini(env, ios);
+}
+
+/*
+ * Lock mode needed to cover \a vma.
+ *
+ * A write lock is wanted only when the mapping can generate writes back to
+ * the file, which requires the vma to be both shared and writable; every
+ * other mapping is covered by a read lock.
+ */
+enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma)
+{
+        int shared   = (vma->vm_flags & VM_SHARED) != 0;
+        int writable = (vma->vm_flags & VM_WRITE) != 0;
+
+        return shared && writable ? CLM_WRITE : CLM_READ;
+}
+
+/*
+ * Add to \a io the locks needed for user buffers that are themselves mapped
+ * files.
+ *
+ * For every non-empty iovec segment, walks the vmas returned by our_vma()
+ * over the buffer range and, for each one, adds a lock on the mapped file's
+ * cl_object covering the extent computed by policy_from_vma(), in the mode
+ * given by vvp_mode_from_vma().
+ *
+ * Returns 0 on success (immediately for sendfile io, which has no user
+ * buffer), or the negative error from cl_io_lock_alloc_add().
+ */
+static int vvp_mmap_locks(const struct lu_env *env,
+                          struct ccc_io *vio, struct cl_io *io)
+{
+        struct ccc_thread_info *cti = ccc_env_info(env);
+        struct vm_area_struct  *vma;
+        struct cl_lock_descr   *descr = &cti->cti_descr;
+        ldlm_policy_data_t      policy;
+        struct inode           *inode;
+        unsigned long           addr;
+        unsigned long           seg;
+        ssize_t                 count;
+        int                     result;
+        ENTRY;
+
+        LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+        if (cl_io_is_sendfile(io))
+                RETURN(0);
+
+        for (seg = 0; seg < vio->cui_nrsegs; seg++) {
+                const struct iovec *iv = &vio->cui_iov[seg];
+
+                addr = (unsigned long)iv->iov_base;
+                count = iv->iov_len;
+                if (count == 0)
+                        continue;
+
+                /*
+                 * Extend the range down to the start of its first page:
+                 * grow count by the in-page offset, then page-align addr
+                 * (assuming CFS_PAGE_MASK clears the in-page offset bits).
+                 */
+                count += addr & (~CFS_PAGE_MASK);
+                addr &= CFS_PAGE_MASK;
+                while((vma = our_vma(addr, count)) != NULL) {
+                        LASSERT(vma->vm_file);
+
+                        inode = vma->vm_file->f_dentry->d_inode;
+                        /*
+                         * XXX: Required lock mode can be weakened: CIT_WRITE
+                         * io only ever reads user level buffer, and CIT_READ
+                         * only writes on it.
+                         */
+                        policy_from_vma(&policy, vma, addr, count);
+                        descr->cld_mode = vvp_mode_from_vma(vma);
+                        descr->cld_obj = ll_i2info(inode)->lli_clob;
+                        descr->cld_start = cl_index(descr->cld_obj,
+                                                    policy.l_extent.start);
+                        descr->cld_end = cl_index(descr->cld_obj,
+                                                  policy.l_extent.end);
+                        result = cl_io_lock_alloc_add(env, io, descr);
+                        if (result < 0)
+                                RETURN(result);
+                        /* Done once this vma covers the rest of the range. */
+                        if (vma->vm_end - addr >= count)
+                                break;
+                        /* Otherwise continue right after this vma. */
+                        count -= vma->vm_end - addr;
+                        addr = vma->vm_end;
+                }
+        }
+        RETURN(0);
+}
+
+/*
+ * Trim the iovec of \a vio so that it describes exactly the
+ * io->u.ci_rw.crw_count bytes of the current io iteration.
+ *
+ * Finds the segment in which the iteration ends; if the iteration ends in
+ * the middle of that segment, the segment is shortened and its original
+ * length is saved in cui_iov_olen so that vvp_io_advance() can restore the
+ * remainder. cui_nrsegs is set to the number of segments the iteration uses.
+ *
+ * Nothing is changed (apart from resetting cui_iov_olen) for sendfile io or
+ * when the iteration covers all remaining bytes.
+ */
+static void vvp_io_update_iov(const struct lu_env *env,
+                              struct ccc_io *vio, struct cl_io *io)
+{
+        int i;
+        size_t size = io->u.ci_rw.crw_count;
+
+        vio->cui_iov_olen = 0;
+        if (cl_io_is_sendfile(io) || size == vio->cui_tot_count)
+                return;
+
+        /* First call: remember the total number of segments. */
+        if (vio->cui_tot_nrsegs == 0)
+                vio->cui_tot_nrsegs =  vio->cui_nrsegs;
+
+        for (i = 0; i < vio->cui_tot_nrsegs; i++) {
+                struct iovec *iv = &vio->cui_iov[i];
+
+                if (iv->iov_len < size)
+                        size -= iv->iov_len;
+                else {
+                        if (iv->iov_len > size) {
+                                /* Iteration ends inside this segment. */
+                                vio->cui_iov_olen = iv->iov_len;
+                                iv->iov_len = size;
+                        }
+                        break;
+                }
+        }
+
+        /*
+         * NOTE(review): relies on size not exceeding the sum of the segment
+         * lengths, so that the loop above always leaves through the break.
+         */
+        vio->cui_nrsegs = i + 1;
+}
+
+/*
+ * Common lock step of read and write io.
+ *
+ * Trims the iovec to the current iteration, adds locks for mmapped user
+ * buffers (vvp_mmap_locks()) and then one lock of \a mode on the byte range
+ * [\a start, \a end] of the file itself. CEF_NONBLOCK is passed for the file
+ * lock when the io is non-blocking.
+ *
+ * Returns 0 on success or the error of the first step that failed.
+ */
+static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io,
+                          enum cl_lock_mode mode, loff_t start, loff_t end)
+{
+        struct ccc_io *cio = ccc_env_io(env);
+        int result;
+        int ast_flags = 0;
+
+        LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+        LASSERT(vvp_env_io(env)->cui_oneshot == 0);
+        ENTRY;
+
+        vvp_io_update_iov(env, cio, io);
+
+        if (io->u.ci_rw.crw_nonblock)
+                ast_flags |= CEF_NONBLOCK;
+        result = vvp_mmap_locks(env, cio, io);
+        if (result == 0)
+                result = ccc_io_one_lock(env, io, ast_flags, mode, start, end);
+        RETURN(result);
+}
+
+/*
+ * Lock step of read io: takes a read lock on the byte range of the current
+ * iteration, unless the file has no lsm, in which case no lock is taken and
+ * 0 is returned.
+ */
+static int vvp_io_read_lock(const struct lu_env *env,
+                            const struct cl_io_slice *ios)
+{
+        struct cl_io         *io  = ios->cis_io;
+        struct ll_inode_info *lli = ll_i2info(ccc_object_inode(io->ci_obj));
+        int result;
+
+        ENTRY;
+        /* XXX: Layer violation, we shouldn't see lsm at llite level. */
+        if (lli->lli_smd != NULL)
+                result = vvp_io_rw_lock(env, io, CLM_READ,
+                                        io->u.ci_rd.rd.crw_pos,
+                                        io->u.ci_rd.rd.crw_pos +
+                                        io->u.ci_rd.rd.crw_count - 1);
+        else
+                /* lsm-less file, don't need to lock */
+                result = 0;
+        RETURN(result);
+}
+
+/*
+ * Lock step of page fault io: locks the single faulted page index, in the
+ * mode dictated by the faulting vma (see vvp_mode_from_vma()).
+ */
+static int vvp_io_fault_lock(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+        struct vvp_io      *vio  = cl2vvp_io(env, ios);
+        struct cl_fault_io *fio  = &ios->cis_io->u.ci_fault;
+        enum cl_lock_mode   mode = vvp_mode_from_vma(vio->u.fault.ft_vma);
+
+        /*
+         * XXX LDLM_FL_CBPENDING
+         */
+        return ccc_io_one_lock_index(env, ios->cis_io, 0, mode,
+                                     fio->ft_index, fio->ft_index);
+}
+
+/*
+ * Lock step of write io.
+ *
+ * One-shot io takes no lock here. Otherwise a write lock is taken on the
+ * byte range of the current iteration, or on the whole file [0, EOF] for
+ * an append write.
+ */
+static int vvp_io_write_lock(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+        struct cl_io *io = ios->cis_io;
+        loff_t        first;
+        loff_t        last;
+
+        if (cl2vvp_io(env, ios)->cui_oneshot != 0)
+                return 0;
+
+        if (io->u.ci_wr.wr_append) {
+                first = 0;
+                last  = OBD_OBJECT_EOF;
+        } else {
+                first = io->u.ci_wr.wr.crw_pos;
+                last  = first + io->u.ci_wr.wr.crw_count - 1;
+        }
+        return vvp_io_rw_lock(env, io, CLM_WRITE, first, last);
+}
+
+/*
+ * Iteration initializer for truncate io.
+ *
+ * Drops the inode mutex and the i_alloc_sem write lock so that the extent
+ * lock can be taken first, and records the fact in cui_locks_released;
+ * vvp_io_trunc_start() or vvp_io_trunc_fini() takes them back.
+ * Always returns 0.
+ */
+static int vvp_io_trunc_iter_init(const struct lu_env *env,
+                                  const struct cl_io_slice *ios)
+{
+        struct ccc_io *vio   = cl2ccc_io(env, ios);
+        struct inode  *inode = ccc_object_inode(ios->cis_obj);
+
+        /*
+         * We really need to get our PW lock before we change inode->i_size.
+         * If we don't we can race with other i_size updaters on our node,
+         * like ll_file_read.  We can also race with i_size propagation to
+         * other nodes through dirtying and writeback of final cached pages.
+         * This last one is especially bad for racing o_append users on other
+         * nodes.
+         */
+
+        UNLOCK_INODE_MUTEX(inode);
+        UP_WRITE_I_ALLOC_SEM(inode);
+        vio->u.trunc.cui_locks_released = 1;
+        return 0;
+}
+
+/**
+ * Implementation of cl_io_operations::vio_lock() method for CIT_TRUNC io.
+ *
+ * Takes a write lock on [new size, EOF]. When truncating to zero the lock
+ * is enqueued with CEF_DISCARD_DATA.
+ *
+ * The new size is kept in loff_t, the type this file uses for byte offsets
+ * passed to ccc_io_one_lock(), so that it is not narrowed to a 32-bit
+ * size_t on 32-bit kernels.
+ *
+ * \retval 0 or the error returned by ccc_io_one_lock()
+ */
+static int vvp_io_trunc_lock(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+        struct ccc_io     *vio   = cl2ccc_io(env, ios);
+        struct cl_io      *io    = ios->cis_io;
+        loff_t new_size          = io->u.ci_truncate.tr_size;
+        __u32 enqflags = new_size == 0 ? CEF_DISCARD_DATA : 0;
+        int result;
+
+        vio->u.trunc.cui_local_lock = TRUNC_EXTENT;
+        result = ccc_io_one_lock(env, io, enqflags, CLM_WRITE,
+                                 new_size, OBD_OBJECT_EOF);
+        return result;
+}
+
+/*
+ * Start step of truncate io.
+ *
+ * Re-takes the inode mutex and i_alloc_sem released by
+ * vvp_io_trunc_iter_init(), truncates the inode through vmtruncate() under
+ * ll_inode_size_lock and, when the new size is not page aligned, looks up
+ * and owns the partially truncated page; it is disowned in
+ * vvp_io_trunc_end().
+ *
+ * The new size is kept in loff_t (the type vmtruncate() takes) so that it
+ * is not narrowed to a 32-bit size_t on 32-bit kernels.
+ *
+ * Returns the result of vmtruncate().
+ */
+static int vvp_io_trunc_start(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+        struct ccc_io        *cio   = cl2ccc_io(env, ios);
+        struct vvp_io        *vio   = cl2vvp_io(env, ios);
+        struct cl_io         *io    = ios->cis_io;
+        struct inode         *inode = ccc_object_inode(io->ci_obj);
+        struct cl_object     *obj   = ios->cis_obj;
+        loff_t                size  = io->u.ci_truncate.tr_size;
+        pgoff_t               start = cl_index(obj, size);
+        int                   result;
+
+        LASSERT(cio->u.trunc.cui_locks_released);
+        LASSERT(vio->cui_oneshot == 0);
+
+        LOCK_INODE_MUTEX(inode);
+        DOWN_WRITE_I_ALLOC_SEM(inode);
+        cio->u.trunc.cui_locks_released = 0;
+
+        /*
+         * Only ll_inode_size_lock is taken at this level. lov_stripe_lock()
+         * is grabbed by ll_truncate() only over call to obd_adjust_kms().  If
+         * vmtruncate returns 0, then ll_truncate dropped ll_inode_size_lock()
+         */
+        ll_inode_size_lock(inode, 0);
+        result = vmtruncate(inode, size);
+        if (result != 0)
+                ll_inode_size_unlock(inode, 0);
+        /*
+         * If a page is partially truncated, keep it owned across truncate to
+         * prevent... races.
+         *
+         * XXX this properly belongs to osc, because races in question are OST
+         * specific.
+         */
+        if (cl_offset(obj, start) != size) {
+                struct cl_object_header *hdr;
+
+                hdr = cl_object_header(obj);
+                spin_lock(&hdr->coh_page_guard);
+                vio->cui_partpage = cl_page_lookup(hdr, start);
+                spin_unlock(&hdr->coh_page_guard);
+
+                if (vio->cui_partpage != NULL)
+                        /*
+                         * Wait for the transfer completion for a partially
+                         * truncated page to avoid dead-locking an OST with
+                         * the concurrent page-wise overlapping WRITE and
+                         * PUNCH requests.
+                         *
+                         * Partial page is disowned in vvp_io_trunc_end().
+                         *
+                         * NOTE(review): the result of cl_page_own() is
+                         * ignored here -- confirm it cannot fail.
+                         */
+                        cl_page_own(env, io, vio->cui_partpage);
+        } else
+                vio->cui_partpage = NULL;
+        return result;
+}
+
+/*
+ * End step of truncate io: disowns and releases the partially truncated
+ * page that vvp_io_trunc_start() looked up and owned, if there is one.
+ */
+static void vvp_io_trunc_end(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+        struct vvp_io  *vio  = cl2vvp_io(env, ios);
+        struct cl_page *part = vio->cui_partpage;
+
+        if (part == NULL)
+                return;
+
+        cl_page_disown(env, ios->cis_io, part);
+        cl_page_put(env, part);
+        vio->cui_partpage = NULL;
+}
+
+/*
+ * Finalizer for truncate io.
+ *
+ * If the inode mutex and i_alloc_sem are still released (that is,
+ * vvp_io_trunc_start() did not run after vvp_io_trunc_iter_init()), takes
+ * them back, then runs the common vvp_io_fini().
+ */
+static void vvp_io_trunc_fini(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+        struct ccc_io *cio   = ccc_env_io(env);
+        struct inode  *inode = ccc_object_inode(ios->cis_io->ci_obj);
+
+        if (cio->u.trunc.cui_locks_released) {
+                LOCK_INODE_MUTEX(inode);
+                DOWN_WRITE_I_ALLOC_SEM(inode);
+                cio->u.trunc.cui_locks_released = 0;
+        }
+        vvp_io_fini(env, ios);
+}
+
+/*
+ * Wrappers hiding which generic read/write entry point the kernel has.
+ *
+ * With HAVE_FILE_READV the readv/writev variants are called with \a file
+ * and \a ppos. Otherwise the aio variants are called on the io's kiocb
+ * (cui_iocb) with the value of *ppos; \a file is unused in that case.
+ * Both use the io's current iovec (cui_iov, cui_nrsegs).
+ */
+#ifdef HAVE_FILE_READV
+static ssize_t lustre_generic_file_read(struct file *file,
+                                        struct ccc_io *vio, loff_t *ppos)
+{
+        return generic_file_readv(file, vio->cui_iov, vio->cui_nrsegs, ppos);
+}
+
+static ssize_t lustre_generic_file_write(struct file *file,
+                                         struct ccc_io *vio, loff_t *ppos)
+{
+        return generic_file_writev(file, vio->cui_iov, vio->cui_nrsegs, ppos);
+}
+#else
+static ssize_t lustre_generic_file_read(struct file *file,
+                                        struct ccc_io *vio, loff_t *ppos)
+{
+        return generic_file_aio_read(vio->cui_iocb, vio->cui_iov,
+                                     vio->cui_nrsegs, *ppos);
+}
+
+static ssize_t lustre_generic_file_write(struct file *file,
+                                        struct ccc_io *vio, loff_t *ppos)
+{
+        return generic_file_aio_write(vio->cui_iocb, vio->cui_iov,
+                                      vio->cui_nrsegs, *ppos);
+}
+#endif
+
+/*
+ * Start step of read io.
+ *
+ * Calls ccc_prep_size() for the last byte of the whole read, disables the
+ * kernel's own read-ahead on the file, enters the Lustre read-ahead window
+ * (once per io), and then performs the read through the generic kernel
+ * path (sendfile or iovec read).
+ *
+ * On success the number of bytes read is added to io->ci_nob, ci_continue
+ * is cleared on a short read, and 0 is returned. A negative result of the
+ * generic read, or a non-zero result of ccc_prep_size(), is returned as is.
+ */
+static int vvp_io_read_start(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+        struct vvp_io     *vio   = cl2vvp_io(env, ios);
+        struct ccc_io     *cio   = cl2ccc_io(env, ios);
+        struct cl_io      *io    = ios->cis_io;
+        struct cl_object  *obj   = io->ci_obj;
+        struct inode      *inode = ccc_object_inode(obj);
+        struct ll_ra_read *bead  = &vio->cui_bead;
+        struct file       *file  = cio->cui_fd->fd_file;
+
+        int     result;
+        loff_t  pos = io->u.ci_rd.rd.crw_pos;
+        size_t  cnt = io->u.ci_rd.rd.crw_count;
+        size_t  tot = cio->cui_tot_count;
+
+        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+        LASSERT(vio->cui_oneshot == 0);
+
+        CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt);
+
+        /* Uses tot (all bytes still to read), not just this iteration. */
+        result = ccc_prep_size(env, obj, io, pos + tot - 1, 1);
+        if (result != 0)
+                return result;
+
+        LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu,
+                        "Read ino %lu, "LPSZ" bytes, offset %lld, size %llu\n",
+                        inode->i_ino, cnt, pos, i_size_read(inode));
+
+        /* turn off the kernel's read-ahead */
+        cio->cui_fd->fd_file->f_ra.ra_pages = 0;
+
+        /* initialize read-ahead window once per syscall */
+        if (!vio->cui_ra_window_set) {
+                vio->cui_ra_window_set = 1;
+                bead->lrr_start = cl_index(obj, pos);
+                /*
+                 * XXX: explicit CFS_PAGE_SIZE
+                 */
+                bead->lrr_count = cl_index(obj, tot + CFS_PAGE_SIZE - 1);
+                /* Left again through ll_ra_read_ex() in vvp_io_fini(). */
+                ll_ra_read_in(file, bead);
+        }
+
+        /* BUG: 5972 */
+        file_accessed(file);
+        if (cl_io_is_sendfile(io)) {
+                result = generic_file_sendfile(file, &pos, cnt,
+                                vio->u.read.cui_actor, vio->u.read.cui_target);
+        } else {
+                result = lustre_generic_file_read(file, cio, &pos);
+        }
+
+        if (result >= 0) {
+                /* A short read ends the io. */
+                if (result < cnt)
+                        io->ci_continue = 0;
+                io->ci_nob += result;
+                ll_rw_stats_tally(ll_i2sbi(inode), current->pid,
+                                  cio->cui_fd, pos, result, 0);
+                result = 0;
+        }
+        return result;
+}
+
+/*
+ * Start step of write io.
+ *
+ * For an append write the position is first moved to the current file size.
+ * Unless the io is one-shot (in which case nothing is written here), the
+ * data is written through the generic kernel write path.
+ *
+ * On a positive result the number of bytes written is added to io->ci_nob,
+ * ci_continue is cleared on a short write, the transfer is accounted in the
+ * read/write statistics, and 0 is returned. Zero and negative results of
+ * the generic write are returned as is.
+ */
+static int vvp_io_write_start(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+        struct ccc_io      *cio   = cl2ccc_io(env, ios);
+        struct cl_io       *io    = ios->cis_io;
+        struct cl_object   *obj   = io->ci_obj;
+        struct inode       *inode = ccc_object_inode(obj);
+        struct file        *file  = cio->cui_fd->fd_file;
+        ssize_t result = 0;
+        loff_t pos = io->u.ci_wr.wr.crw_pos;
+        size_t cnt = io->u.ci_wr.wr.crw_count;
+
+        ENTRY;
+
+        if (cl_io_is_append(io))
+                /*
+                 * PARALLEL IO This has to be changed for parallel IO doing
+                 * out-of-order writes.
+                 */
+                pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode);
+
+        CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + cnt);
+
+        if (cl2vvp_io(env, ios)->cui_oneshot > 0)
+                result = 0;
+        else
+                result = lustre_generic_file_write(file, cio, &pos);
+
+        if (result > 0) {
+                /* A short write ends the io. */
+                if (result < cnt)
+                        io->ci_continue = 0;
+                io->ci_nob += result;
+                /*
+                 * The last argument selects the read (0) or write (1)
+                 * histogram; vvp_io_read_start() passes 0, so a write must
+                 * pass 1 or it is accounted as a read.
+                 */
+                ll_rw_stats_tally(ll_i2sbi(inode), current->pid,
+                                  cio->cui_fd, pos, result, 1);
+                result = 0;
+        }
+        RETURN(result);
+}
+
+/*
+ * Start step of page fault io.
+ *
+ * Warns if an executable changed since vvp_io_fault_iter_init() sampled its
+ * mtime, calls ccc_prep_size() for the last byte of the faulted page, gets
+ * the VM page from filemap_nopage(), and finds the cl_page for it. The
+ * cl_page is stored in fio->ft_page with a "fault" reference (dropped in
+ * vvp_io_fault_fini()) and fio->ft_nob is set to the number of valid bytes
+ * on the page.
+ *
+ * Returns 0 on success, +1 when the page lies beyond the end of file,
+ * -EFAULT / -ENOMEM for NOPAGE_SIGBUS / NOPAGE_OOM, or the error from
+ * ccc_prep_size() or cl_page_find().
+ */
+static int vvp_io_fault_start(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+        struct vvp_io       *vio     = cl2vvp_io(env, ios);
+        struct cl_io        *io      = ios->cis_io;
+        struct cl_object    *obj     = io->ci_obj;
+        struct inode        *inode   = ccc_object_inode(obj);
+        struct cl_fault_io  *fio     = &io->u.ci_fault;
+        struct vvp_fault_io *cfio    = &vio->u.fault;
+        cfs_page_t          *vmpage;
+        loff_t               offset;
+        int                  result  = 0;
+
+        LASSERT(vio->cui_oneshot == 0);
+
+        if (fio->ft_executable &&
+            LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime)
+                CWARN("binary "DFID
+                      " changed while waiting for the page fault lock\n",
+                      PFID(lu_object_fid(&obj->co_lu)));
+
+        /* offset of the last byte on the page */
+        offset = cl_offset(obj, fio->ft_index + 1) - 1;
+        LASSERT(cl_index(obj, offset) == fio->ft_index);
+        result = ccc_prep_size(env, obj, io, offset, 0);
+        if (result != 0)
+                return result;
+
+        vmpage = filemap_nopage(cfio->ft_vma, cfio->ft_address, cfio->ft_type);
+        /* Note: the second message says "SIGBUS" for NOPAGE_OOM as well. */
+        if (vmpage != NOPAGE_SIGBUS && vmpage != NOPAGE_OOM)
+                LL_CDEBUG_PAGE(D_PAGE, vmpage,
+                               "got addr %lu type %lx\n",
+                               cfio->ft_address, (long)cfio->ft_type);
+        else
+                CDEBUG(D_PAGE, "got addr %lu type %lx - SIGBUS\n",
+                       cfio->ft_address, (long)cfio->ft_type);
+
+        if (vmpage == NOPAGE_SIGBUS)
+                result = -EFAULT;
+        else if (vmpage == NOPAGE_OOM)
+                result = -ENOMEM;
+        else {
+                struct cl_page *page;
+                loff_t          size;
+                pgoff_t         last; /* last page in a file data region */
+
+                /* Temporarily lock vmpage to keep cl_page_find() happy. */
+                lock_page(vmpage);
+                page = cl_page_find(env, obj, fio->ft_index, vmpage,
+                                    CPT_CACHEABLE);
+                unlock_page(vmpage);
+                if (!IS_ERR(page)) {
+                        size = i_size_read(inode);
+                        last = cl_index(obj, size - 1);
+                        if (fio->ft_index == last)
+                                /*
+                                 * Last page is mapped partially.
+                                 */
+                                fio->ft_nob = size - cl_offset(obj,
+                                                               fio->ft_index);
+                        else
+                                fio->ft_nob = cl_page_size(obj);
+                        /* Reference is released in vvp_io_fault_fini(). */
+                        lu_ref_add(&page->cp_reference, "fault", io);
+                        fio->ft_page = page;
+                        /*
+                         * Certain 2.6 kernels return not-NULL from
+                         * filemap_nopage() when page is beyond the file size,
+                         * on the grounds that "An external ptracer can access
+                         * pages that normally aren't accessible.." Don't
+                         * propagate such page fault to the lower layers to
+                         * avoid side-effects like KMS updates.
+                         */
+                        if (fio->ft_index > last)
+                                result = +1;
+                } else
+                        result = PTR_ERR(page);
+        }
+        return result;
+}
+
+/*
+ * Advance the io's iovec past the \a nob bytes just transferred.
+ *
+ * Skips the segments consumed by the finished iteration and reduces the
+ * remaining segment and byte counts. If vvp_io_update_iov() shortened the
+ * last segment (cui_iov_olen != 0), steps back onto that segment and turns
+ * it into its not yet transferred remainder.
+ *
+ * Does nothing for sendfile io or when the io is not going to continue.
+ */
+static void vvp_io_advance(const struct lu_env *env,
+                           const struct cl_io_slice *ios, size_t nob)
+{
+        struct ccc_io    *vio = cl2ccc_io(env, ios);
+        struct cl_io     *io  = ios->cis_io;
+        struct cl_object *obj = ios->cis_io->ci_obj;
+
+        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+        if (!cl_io_is_sendfile(io) && io->ci_continue) {
+                /* update the iov */
+                LASSERT(vio->cui_tot_nrsegs >= vio->cui_nrsegs);
+                LASSERT(vio->cui_tot_count  >= nob);
+
+                vio->cui_iov        += vio->cui_nrsegs;
+                vio->cui_tot_nrsegs -= vio->cui_nrsegs;
+                vio->cui_tot_count  -= nob;
+
+                if (vio->cui_iov_olen) {
+                        struct iovec *iv;
+
+                        /* The last segment was only partially consumed. */
+                        vio->cui_iov--;
+                        vio->cui_tot_nrsegs++;
+                        iv = &vio->cui_iov[0];
+                        /* Skip the consumed part, restore the rest. */
+                        iv->iov_base += iv->iov_len;
+                        LASSERT(vio->cui_iov_olen > iv->iov_len);
+                        iv->iov_len = vio->cui_iov_olen - iv->iov_len;
+                }
+        }
+}
+
+/*
+ * Read-page method of the VVP io slice.
+ *
+ * Updates the read-ahead state for the page, checks that the page is
+ * covered by a lock (unless the file is flagged LL_FILE_IGNORE_LOCK), adds
+ * the page to the io's 2-queue and, when read-ahead is enabled, calls
+ * ll_readahead() on the incoming queue.
+ *
+ * Returns 0, or the result of cl_page_is_under_lock() when that is
+ * neither -EBUSY nor -ENODATA.
+ */
+static int vvp_io_read_page(const struct lu_env *env,
+                            const struct cl_io_slice *ios,
+                            const struct cl_page_slice *slice)
+{
+        struct cl_io              *io     = ios->cis_io;
+        struct cl_object          *obj    = slice->cpl_obj;
+        struct ccc_page           *cp     = cl2ccc_page(slice);
+        struct cl_page            *page   = slice->cpl_page;
+        struct inode              *inode  = ccc_object_inode(obj);
+        struct ll_sb_info         *sbi    = ll_i2sbi(inode);
+        struct ll_file_data       *fd     = cl2ccc_io(env, ios)->cui_fd;
+        struct ll_readahead_state *ras    = &fd->fd_ras;
+        cfs_page_t                *vmpage = cp->cpg_page;
+        struct cl_2queue          *queue  = &io->ci_queue;
+
+        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+        LASSERT(cl2vvp_io(env, ios)->cui_oneshot == 0);
+        LASSERT(slice->cpl_obj == obj);
+
+        ENTRY;
+
+        if (sbi->ll_ra_info.ra_max_pages)
+                ras_update(sbi, inode, ras, page->cp_index,
+                           cp->cpg_defer_uptodate);
+
+        /* Sanity check whether the page is protected by a lock. */
+        if (likely(!(fd->fd_flags & LL_FILE_IGNORE_LOCK))) {
+                int rc;
+
+                /*
+                 * -EBUSY is the silent case here; -ENODATA is only warned
+                 * about; any other value is warned about and returned.
+                 */
+                rc = cl_page_is_under_lock(env, io, page);
+                if (rc != -EBUSY) {
+                        CL_PAGE_HEADER(D_WARNING, env, page, "%s: %i\n",
+                                       rc == -ENODATA ? "without a lock" :
+                                       "match failed", rc);
+                        if (rc != -ENODATA)
+                                RETURN(rc);
+                }
+        }
+
+        /* Page was brought in by read-ahead: mark it used and export it. */
+        if (cp->cpg_defer_uptodate) {
+                cp->cpg_ra_used = 1;
+                cl_page_export(env, page);
+        }
+        /*
+         * Add page into the queue even when it is marked uptodate above.
+         * this will unlock it automatically as part of cl_page_list_disown().
+         */
+        cl_2queue_add(queue, page);
+        if (sbi->ll_ra_info.ra_max_pages)
+                ll_readahead(env, io, ras,
+                             vmpage->mapping, &queue->c2_qin, fd->fd_flags);
+
+        RETURN(0);
+}
+
+/**
+ * Submits the single \a page for a transfer of type \a crt through
+ * io->ci_queue and, if submission succeeded, calls cl_sync_io_wait() on the
+ * per-environment sync-io anchor.
+ *
+ * The page is clipped with cl_page_clip(from, to) for the transfer and the
+ * clip is reset to the whole page afterwards. The page is asserted to be
+ * still owned by \a io after the transfer; for CRT_READ the input list of
+ * the queue is then disowned.
+ *
+ * \retval result of cl_io_submit_rw() if it is non-zero, otherwise the
+ *         result of cl_sync_io_wait()
+ */
+static int vvp_page_sync_io(const struct lu_env *env, struct cl_io *io,
+                            struct cl_page *page, struct ccc_page *cp,
+                            int from, int to, enum cl_req_type crt)
+{
+        struct cl_2queue  *queue;
+        struct ccc_object *cobo   = cl2ccc(page->cp_obj);
+        struct cl_sync_io *anchor = &ccc_env_info(env)->cti_sync_io;
+
+        int writing = io->ci_type == CIT_WRITE;
+        int result;
+
+        LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+
+        queue = &io->ci_queue;
+
+        cl_2queue_init_page(queue, page);
+
+        if (writing)
+                /* Do not pass llap here as it is sync write. */
+                vvp_write_pending(cobo, cp);
+
+        /* one page is in flight; completion is reported via cpg_sync_io */
+        cl_sync_io_init(anchor, 1);
+        cp->cpg_sync_io = anchor;
+        cl_page_clip(env, page, from, to);
+        result = cl_io_submit_rw(env, io, crt, queue);
+        if (result == 0)
+                result = cl_sync_io_wait(env, io, &queue->c2_qout, anchor);
+        else
+                /* nothing was submitted: detach the anchor again */
+                cp->cpg_sync_io = NULL;
+        LASSERT(cl_page_is_owned(page, io));
+        cl_page_clip(env, page, 0, CFS_PAGE_SIZE);
+
+        if (crt == CRT_READ)
+                /*
+                 * in CRT_WRITE case page is left locked even in case of
+                 * error.
+                 */
+                cl_page_list_disown(env, io, &queue->c2_qin);
+        cl_2queue_fini(env, queue);
+
+        return result;
+}
+
+/**
+ * Prepare partially written-to page for a write.
+ *
+ * Fetches the object attributes and then either zeroes the page (when the
+ * page starts at or beyond the known minimum size, KMS), marks a deferred
+ * read-ahead page as used, or reads the whole page synchronously. On
+ * success the page is exported with cl_page_export().
+ *
+ * \a from and \a to are not used by this function.
+ *
+ * \retval 0 on success, otherwise the result of cl_object_attr_get() or
+ *         vvp_page_sync_io()
+ */
+static int vvp_io_prepare_partial(const struct lu_env *env, struct cl_io *io,
+                                  struct cl_object *obj, struct cl_page *pg,
+                                  struct ccc_page *cp,
+                                  unsigned from, unsigned to)
+{
+        struct cl_attr *attr   = &ccc_env_info(env)->cti_attr;
+        loff_t          offset = cl_offset(obj, pg->cp_index);
+        int             result;
+
+        cl_object_attr_lock(obj);
+        result = cl_object_attr_get(env, obj, attr);
+        cl_object_attr_unlock(obj);
+        if (result == 0) {
+                /*
+                 * If we are writing to a new page, no need to read old data.
+                 * The extent locking will have updated the KMS, and for our
+                 * purposes here we can treat it like i_size.
+                 */
+                if (attr->cat_kms <= offset) {
+                        char *kaddr = kmap_atomic(cp->cpg_page, KM_USER0);
+
+                        memset(kaddr, 0, cl_page_size(obj));
+                        kunmap_atomic(kaddr, KM_USER0);
+                } else if (cp->cpg_defer_uptodate)
+                        /* data is already in the page via read-ahead */
+                        cp->cpg_ra_used = 1;
+                else
+                        result = vvp_page_sync_io(env, io, pg, cp,
+                                                  0, CFS_PAGE_SIZE, CRT_READ);
+                /*
+                 * In older implementations, obdo_refresh_inode is called here
+                 * to update the inode because the write might modify the
+                 * object info at OST. However, this has been proven useless,
+                 * since LVB functions will be called when user space program
+                 * tries to retrieve inode attribute.  Also, see bug 15909 for
+                 * details. -jay
+                 */
+                if (result == 0)
+                        cl_page_export(env, pg);
+        }
+        return result;
+}
+
+/**
+ * Prepare-write method of the VVP IO slice (installed as
+ * vvp_io_ops.cio_prepare_write).
+ *
+ * An up-to-date page needs no preparation. A page that is going to be
+ * overwritten completely ([0, CFS_PAGE_SIZE]) is only poisoned; any other
+ * page is handed to vvp_io_prepare_partial().
+ *
+ * \retval 0 on success, otherwise the result of vvp_io_prepare_partial()
+ */
+static int vvp_io_prepare_write(const struct lu_env *env,
+                                const struct cl_io_slice *ios,
+                                const struct cl_page_slice *slice,
+                                unsigned from, unsigned to)
+{
+        struct cl_object *obj    = slice->cpl_obj;
+        struct ccc_page  *cp     = cl2ccc_page(slice);
+        struct cl_page   *pg     = slice->cpl_page;
+        cfs_page_t       *vmpage = cp->cpg_page;
+
+        int result;
+
+        ENTRY;
+
+        LINVRNT(cl_page_is_vmlocked(env, pg));
+        LASSERT(vmpage->mapping->host == ccc_object_inode(obj));
+
+        result = 0;
+
+        CL_PAGE_HEADER(D_PAGE, env, pg, "preparing: [%d, %d]\n", from, to);
+        if (!PageUptodate(vmpage)) {
+                /*
+                 * We're completely overwriting an existing page, so _don't_
+                 * set it up to date until commit_write
+                 */
+                if (from == 0 && to == CFS_PAGE_SIZE) {
+                        CL_PAGE_HEADER(D_PAGE, env, pg, "full page write\n");
+                        /*
+                         * Poison the VM page itself: there is no variable
+                         * called "page" in this function.
+                         */
+                        POISON_PAGE(vmpage, 0x11);
+                } else
+                        result = vvp_io_prepare_partial(env, ios->cis_io, obj,
+                                                        pg, cp, from, to);
+        } else
+                CL_PAGE_HEADER(D_PAGE, env, pg, "uptodate\n");
+        RETURN(result);
+}
+
+/**
+ * Commit-write method of the VVP IO slice (installed as
+ * vvp_io_ops.cio_commit_write).
+ *
+ * The first time a page is dirtied it is marked dirty and added to the
+ * cache with cl_page_cache_add(); if that fails with -EDQUOT the page is
+ * written synchronously instead. On success i_size is extended when the
+ * write ends beyond it and the page is exported; on failure a page ending
+ * beyond i_size is discarded.
+ *
+ * \retval 0 on success, otherwise the result of cl_page_cache_add() or
+ *         vvp_page_sync_io()
+ */
+static int vvp_io_commit_write(const struct lu_env *env,
+                               const struct cl_io_slice *ios,
+                               const struct cl_page_slice *slice,
+                               unsigned from, unsigned to)
+{
+        struct cl_object  *obj    = slice->cpl_obj;
+        struct cl_io      *io     = ios->cis_io;
+        struct ccc_page   *cp     = cl2ccc_page(slice);
+        struct cl_page    *pg     = slice->cpl_page;
+        struct inode      *inode  = ccc_object_inode(obj);
+        struct ll_sb_info *sbi    = ll_i2sbi(inode);
+        cfs_page_t        *vmpage = cp->cpg_page;
+
+        int    result;
+        int    tallyop;
+        loff_t size;
+
+        ENTRY;
+
+        LINVRNT(cl_page_is_vmlocked(env, pg));
+        LASSERT(vmpage->mapping->host == inode);
+
+        LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, "commiting page write\n");
+        CL_PAGE_HEADER(D_PAGE, env, pg, "committing: [%d, %d]\n", from, to);
+
+        /*
+         * queue a write for some time in the future the first time we
+         * dirty the page.
+         *
+         * This is different from what other file systems do: they usually
+         * just mark page (and some of its buffers) dirty and rely on
+         * balance_dirty_pages() to start a write-back. Lustre wants write-back
+         * to be started earlier for the following reasons:
+         *
+         *     (1) with a large number of clients we need to limit the amount
+         *     of cached data on the clients a lot;
+         *
+         *     (2) large compute jobs generally want compute-only then io-only
+         *     and the IO should complete as quickly as possible;
+         *
+         *     (3) IO is batched up to the RPC size and is async until the
+         *     client max cache is hit
+         *     (/proc/fs/lustre/osc/OSC.../max_dirty_mb)
+         *
+         */
+        if (!PageDirty(vmpage)) {
+                tallyop = LPROC_LL_DIRTY_MISSES;
+                vvp_write_pending(cl2ccc(obj), cp);
+                set_page_dirty(vmpage);
+                result = cl_page_cache_add(env, io, pg, CRT_WRITE);
+                if (result == -EDQUOT)
+                        /*
+                         * Client ran out of disk space grant. Possible
+                         * strategies are:
+                         *
+                         *     (a) do a sync write, renewing grant;
+                         *
+                         *     (b) stop writing on this stripe, switch to the
+                         *     next one.
+                         *
+                         * (b) is a part of "parallel io" design that is the
+                         * ultimate goal. (a) is what "old" client did, and
+                         * what the new code continues to do for the time
+                         * being.
+                         */
+                        result = vvp_page_sync_io(env, io, pg, cp,
+                                                  from, to, CRT_WRITE);
+        } else {
+                /* page was already dirty: nothing new to queue */
+                tallyop = LPROC_LL_DIRTY_HITS;
+                result = 0;
+        }
+        ll_stats_ops_tally(sbi, tallyop, 1);
+
+        /* file offset at which this write ends */
+        size = cl_offset(obj, pg->cp_index) + to;
+
+        if (result == 0) {
+                if (size > i_size_read(inode))
+                        i_size_write(inode, size);
+                cl_page_export(env, pg);
+        } else if (size > i_size_read(inode))
+                /* failed write that would have extended the file */
+                cl_page_discard(env, io, pg);
+        RETURN(result);
+}
+
+/**
+ * IO operations of the VVP layer: a table of per-IO-type methods indexed by
+ * CIT_* type, followed by the page-level read-page, prepare-write and
+ * commit-write methods. Registered on the IO slice by vvp_io_init().
+ */
+static const struct cl_io_operations vvp_io_ops = {
+        .op = {
+                [CIT_READ] = {
+                        .cio_fini      = vvp_io_fini,
+                        .cio_lock      = vvp_io_read_lock,
+                        .cio_start     = vvp_io_read_start,
+                        .cio_advance   = vvp_io_advance
+                },
+                [CIT_WRITE] = {
+                        .cio_fini      = vvp_io_fini,
+                        .cio_lock      = vvp_io_write_lock,
+                        .cio_start     = vvp_io_write_start,
+                        .cio_advance   = vvp_io_advance
+                },
+                [CIT_TRUNC] = {
+                        .cio_fini       = vvp_io_trunc_fini,
+                        .cio_iter_init  = vvp_io_trunc_iter_init,
+                        .cio_lock       = vvp_io_trunc_lock,
+                        .cio_start      = vvp_io_trunc_start,
+                        .cio_end        = vvp_io_trunc_end
+                },
+                [CIT_FAULT] = {
+                        .cio_fini      = vvp_io_fault_fini,
+                        .cio_iter_init = vvp_io_fault_iter_init,
+                        .cio_lock      = vvp_io_fault_lock,
+                        .cio_start     = vvp_io_fault_start,
+                        .cio_end       = ccc_io_end
+                },
+                [CIT_MISC] = {
+                        .cio_fini   = vvp_io_fini
+                }
+        },
+        .cio_read_page     = vvp_io_read_page,
+        .cio_prepare_write = vvp_io_prepare_write,
+        .cio_commit_write  = vvp_io_commit_write
+};
+
+/**
+ * Adds the VVP slice (with vvp_io_ops) to \a io and resets the per-IO VVP
+ * state kept in \a env.
+ *
+ * For reads and writes the byte count is recorded in cui_tot_count and
+ * tallied in the llite statistics; for truncates one LPROC_LL_TRUNC
+ * operation is tallied. For writes lli_write_sem is taken here and is not
+ * released by this function.
+ *
+ * \retval 0 normally
+ * \retval 1 for a read or write with a zero byte count
+ */
+int vvp_io_init(const struct lu_env *env, struct cl_object *obj,
+                struct cl_io *io)
+{
+        struct vvp_io      *vio   = vvp_env_io(env);
+        struct ccc_io      *cio   = ccc_env_io(env);
+        struct inode       *inode = ccc_object_inode(obj);
+        struct ll_sb_info  *sbi   = ll_i2sbi(inode);
+        int                 result;
+
+        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+        ENTRY;
+
+        CL_IO_SLICE_CLEAN(cio, cui_cl);
+        cl_io_slice_add(io, &cio->cui_cl, obj, &vvp_io_ops);
+        vio->cui_oneshot = 0;
+        vio->cui_ra_window_set = 0;
+        result = 0;
+        if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) {
+                int    op;
+                size_t count;
+
+                count = io->u.ci_rw.crw_count;
+                op    = io->ci_type == CIT_READ ?
+                        LPROC_LL_READ_BYTES : LPROC_LL_WRITE_BYTES;
+                /*
+                 * NOTE(review): taken even when count == 0; presumably
+                 * dropped by the IO's fini method -- confirm.
+                 */
+                if (io->ci_type == CIT_WRITE)
+                        down(&ll_i2info(inode)->lli_write_sem);
+                /* "If nbyte is 0, read() will return 0 and have no other
+                 *  results."  -- Single Unix Spec */
+                if (count == 0)
+                        result = 1;
+                else {
+                        cio->cui_tot_count = count;
+                        cio->cui_tot_nrsegs = 0;
+                        ll_stats_ops_tally(sbi, op, count);
+                }
+        } else if (io->ci_type == CIT_TRUNC) {
+                /* lockless truncate? */
+                ll_stats_ops_tally(sbi, LPROC_LL_TRUNC, 1);
+        }
+        RETURN(result);
+}
+
+/**
+ * Returns the VVP IO state kept in \a env. \a slice is used only to run
+ * cl2ccc_io() on it.
+ */
+static struct vvp_io *cl2vvp_io(const struct lu_env *env,
+                                const struct cl_io_slice *slice)
+{
+        struct vvp_io *vio;
+
+        /* Calling just for assertion; the result is not needed. */
+        (void)cl2ccc_io(env, slice);
+        vio = vvp_env_io(env);
+        return vio;
+}
+
diff --git a/lustre/llite/vvp_lock.c b/lustre/llite/vvp_lock.c

new file mode 100644 (file)

index 0000000..f0c487d
--- /dev/null
+++ b/lustre/llite/vvp_lock.c
@@ -0,0 +1,89 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#ifndef __KERNEL__
+# error This file is kernel only.
+#endif
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Vvp lock functions.
+ *
+ */
+
+/**
+ * Weighs a lock so that the lock cache can decide what to keep when memory
+ * is short.
+ *
+ * A lock on an object with a non-zero mmap count is treated as almost
+ * infinitely valuable; every other lock weighs nothing. The "mapped" weight
+ * is deliberately below the maximum so that weights assigned by other
+ * layers can still order such locks among themselves.
+ *
+ * \retval ~0UL >> 2 the object has a positive cob_mmap_cnt
+ * \retval 0         otherwise
+ */
+static unsigned long vvp_lock_weigh(const struct lu_env *env,
+                                    const struct cl_lock_slice *slice)
+{
+        struct ccc_object *cob = cl2ccc(slice->cls_obj);
+        unsigned long      weight;
+
+        ENTRY;
+        if (atomic_read(&cob->cob_mmap_cnt) > 0)
+                weight = ~0UL >> 2;
+        else
+                weight = 0;
+        RETURN(weight);
+}
+
+/**
+ * Lock operations of the VVP layer. All methods are the shared ccc_lock_*()
+ * ones except for weighing, which is VVP specific.
+ */
+static const struct cl_lock_operations vvp_lock_ops = {
+        .clo_fini      = ccc_lock_fini,
+        .clo_enqueue   = ccc_lock_enqueue,
+        .clo_wait      = ccc_lock_wait,
+        .clo_unuse     = ccc_lock_unuse,
+        .clo_fits_into = ccc_lock_fits_into,
+        .clo_state     = ccc_lock_state,
+        .clo_weigh     = vvp_lock_weigh
+};
+
+/**
+ * Lock-init method of the VVP object: delegates to ccc_lock_init(), passing
+ * vvp_lock_ops as the lock operations, and returns its result.
+ */
+int vvp_lock_init(const struct lu_env *env, struct cl_object *obj,
+                  struct cl_lock *lock, const struct cl_io *io)
+{
+        return ccc_lock_init(env, obj, lock, io, &vvp_lock_ops);
+}
diff --git a/lustre/llite/vvp_object.c b/lustre/llite/vvp_object.c

new file mode 100644 (file)

index 0000000..412a877
--- /dev/null
+++ b/lustre/llite/vvp_object.c
@@ -0,0 +1,153 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * cl_object implementation for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#ifndef __KERNEL__
+# error This file is kernel only.
+#endif
+
+#include <libcfs/libcfs.h>
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+/**
+ * Print method of the VVP object (installed as
+ * vvp_lu_obj_ops.loo_object_print).
+ *
+ * Emits, through printer \a p, whether the pending list is empty ("-") or
+ * not ("+"), the transient page and mmap counters and the inode pointer;
+ * when an inode is attached its number, generation, mode, link count,
+ * reference count, cl_object pointer and FID follow.
+ *
+ * \retval 0 always
+ */
+static int vvp_object_print(const struct lu_env *env, void *cookie,
+                            lu_printer_t p, const struct lu_object *o)
+{
+        struct ccc_object    *cob   = lu2ccc(o);
+        struct inode         *inode = cob->cob_inode;
+        struct ll_inode_info *lli;
+        const char           *pending;
+
+        pending = list_empty(&cob->cob_pending_list) ? "-" : "+";
+        (*p)(env, cookie, "(%s %i %i) inode: %p ",
+             pending, cob->cob_transient_pages,
+             atomic_read(&cob->cob_mmap_cnt), inode);
+
+        /* no inode attached: nothing more to show */
+        if (inode == NULL)
+                return 0;
+
+        lli = ll_i2info(inode);
+        (*p)(env, cookie, "%lu/%u %o %u %i %p "DFID,
+             inode->i_ino, inode->i_generation, inode->i_mode,
+             inode->i_nlink, atomic_read(&inode->i_count),
+             lli->lli_clob, PFID(&lli->lli_fid));
+        return 0;
+}
+
+/**
+ * Attribute-get method of the VVP object: copies size, the three time
+ * stamps, block count, uid and gid from the inode into \a attr. KMS is not
+ * filled in here.
+ *
+ * \retval 0 always
+ */
+static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj,
+                        struct cl_attr *attr)
+{
+        struct inode *inode = ccc_object_inode(obj);
+
+        /*
+         * lov overwrites most of these fields in
+         * lov_attr_get()->...lov_merge_lvb_kms(), except when inode
+         * attributes are newer.
+         */
+
+        attr->cat_size = i_size_read(inode);
+        attr->cat_mtime = LTIME_S(inode->i_mtime);
+        attr->cat_atime = LTIME_S(inode->i_atime);
+        attr->cat_ctime = LTIME_S(inode->i_ctime);
+        attr->cat_blocks = inode->i_blocks;
+        attr->cat_uid = inode->i_uid;
+        attr->cat_gid = inode->i_gid;
+        /* KMS is not known by this layer */
+        return 0; /* layers below have to fill in the rest */
+}
+
+/**
+ * Attribute-set method of the VVP object: stores uid and/or gid from
+ * \a attr into the inode, as selected by \a valid.
+ *
+ * The size update and the mark_inode_dirty() call are switched off by their
+ * "0 &&" conditions and never execute.
+ *
+ * \retval 0 always
+ */
+static int vvp_attr_set(const struct lu_env *env, struct cl_object *obj,
+                        const struct cl_attr *attr, unsigned valid)
+{
+        struct inode *inode = ccc_object_inode(obj);
+
+        if (valid & CAT_UID)
+                inode->i_uid = attr->cat_uid;
+        if (valid & CAT_GID)
+                inode->i_gid = attr->cat_gid;
+        /* disabled: "0 &&" makes the condition always false */
+        if (0 && valid & CAT_SIZE)
+                i_size_write(inode, attr->cat_size);
+        /* not currently necessary */
+        if (0 && valid & (CAT_UID|CAT_GID|CAT_SIZE))
+                mark_inode_dirty(inode);
+        return 0;
+}
+
+/**
+ * cl_object operations of the VVP layer: VVP-specific page, lock, io and
+ * attribute methods, plus the shared ccc_*() methods for conf_set and
+ * glimpse. Passed to ccc_object_alloc() by vvp_object_alloc().
+ */
+static const struct cl_object_operations vvp_ops = {
+        .coo_page_init = vvp_page_init,
+        .coo_lock_init = vvp_lock_init,
+        .coo_io_init   = vvp_io_init,
+        .coo_attr_get  = vvp_attr_get,
+        .coo_attr_set  = vvp_attr_set,
+        .coo_conf_set  = ccc_conf_set,
+        .coo_glimpse   = ccc_object_glimpse
+};
+
+/**
+ * lu_object operations of the VVP layer: init and free are the shared
+ * ccc_*() methods, printing is VVP specific.
+ */
+static const struct lu_object_operations vvp_lu_obj_ops = {
+        .loo_object_init  = ccc_object_init,
+        .loo_object_free  = ccc_object_free,
+        .loo_object_print = vvp_object_print
+};
+
+/**
+ * Maps \a inode to its ccc_object: takes the cl_object recorded in the
+ * inode info and locates the slice of vvp_device_type in that object's
+ * header. Both the cl_object and the located slice are asserted to exist.
+ */
+struct ccc_object *cl_inode2ccc(struct inode *inode)
+{
+        struct cl_object *clob = cl_i2info(inode)->lli_clob;
+        struct lu_object *slice;
+
+        LASSERT(clob != NULL);
+        slice = lu_object_locate(clob->co_lu.lo_header, &vvp_device_type);
+        LASSERT(slice != NULL);
+        return lu2ccc(slice);
+}
+
+/**
+ * Allocates a VVP object by delegating to ccc_object_alloc() with the VVP
+ * cl_object and lu_object operation tables; returns its result.
+ */
+struct lu_object *vvp_object_alloc(const struct lu_env *env,
+                                   const struct lu_object_header *hdr,
+                                   struct lu_device *dev)
+{
+        return ccc_object_alloc(env, hdr, dev, &vvp_ops, &vvp_lu_obj_ops);
+}
+
diff --git a/lustre/llite/vvp_page.c b/lustre/llite/vvp_page.c

new file mode 100644 (file)

index 0000000..2432c50
--- /dev/null
+++ b/lustre/llite/vvp_page.c
@@ -0,0 +1,566 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for VVP layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LLITE
+
+#ifndef __KERNEL__
+# error This file is kernel only.
+#endif
+
+#include <obd.h>
+#include <lustre_lite.h>
+
+#include "vvp_internal.h"
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+
+/**
+ * Common part of page finalization: drops the page-cache reference on the
+ * VM page of \a cp and frees \a cp back to vvp_page_kmem.
+ */
+static void vvp_page_fini_common(struct ccc_page *cp)
+{
+        cfs_page_t *vmpage = cp->cpg_page;
+
+        LASSERT(vmpage != NULL);
+        page_cache_release(vmpage);
+        OBD_SLAB_FREE_PTR(cp, vvp_page_kmem);
+}
+
+/**
+ * Finalizes the VVP slice of a cl_page: checks that the VM page no longer
+ * points back at the cl_page and then releases the slice through
+ * vvp_page_fini_common().
+ */
+static void vvp_page_fini(const struct lu_env *env,
+                          struct cl_page_slice *slice)
+{
+        struct ccc_page *cp = cl2ccc_page(slice);
+        cfs_page_t *vmpage  = cp->cpg_page;
+
+        /*
+         * vmpage->private was already cleared when page was moved into
+         * VPG_FREEING state.
+         */
+        LASSERT((struct cl_page *)vmpage->private != slice->cpl_page);
+        vvp_page_fini_common(cp);
+}
+
+/**
+ * Own method of the VVP page: locks the VM page and waits until any
+ * write-back on it has finished. The cl_io argument is unused.
+ */
+static void vvp_page_own(const struct lu_env *env,
+                         const struct cl_page_slice *slice, struct cl_io *_)
+{
+        struct ccc_page *vpg    = cl2ccc_page(slice);
+        cfs_page_t      *vmpage = vpg->cpg_page;
+
+        LASSERT(vmpage != NULL);
+        lock_page(vmpage);
+        wait_on_page_writeback(vmpage);
+}
+
+/**
+ * Assume method of the VVP page: the VM page must already be locked; only
+ * waits until any write-back on it has finished. The cl_io argument is
+ * unused.
+ */
+static void vvp_page_assume(const struct lu_env *env,
+                            const struct cl_page_slice *slice, struct cl_io *_)
+{
+        cfs_page_t *vmpage = cl2vm_page(slice);
+
+        LASSERT(vmpage != NULL);
+        LASSERT(PageLocked(vmpage));
+        wait_on_page_writeback(vmpage);
+}
+
+/**
+ * Unassume method of the VVP page: only asserts that the VM page exists and
+ * is locked; the lock is not touched. The cl_io argument is unused.
+ */
+static void vvp_page_unassume(const struct lu_env *env,
+                              const struct cl_page_slice *slice,
+                              struct cl_io *_)
+{
+        cfs_page_t *vmpage = cl2vm_page(slice);
+
+        LASSERT(vmpage != NULL);
+        LASSERT(PageLocked(vmpage));
+}
+
+/**
+ * Disown method of the VVP page: asserts that the VM page is locked and
+ * unlocks it. \a io is not used.
+ */
+static void vvp_page_disown(const struct lu_env *env,
+                            const struct cl_page_slice *slice, struct cl_io *io)
+{
+        cfs_page_t *vmpage = cl2vm_page(slice);
+
+        LASSERT(vmpage != NULL);
+        LASSERT(PageLocked(vmpage));
+
+        unlock_page(cl2vm_page(slice));
+}
+
+/**
+ * Discard method of the VVP page: removes the locked VM page from its
+ * mapping with truncate_complete_page().
+ *
+ * A page that was brought in with deferred uptodate and never used is
+ * counted as RA_STAT_DISCARDED first. The cl_io argument is unused.
+ */
+static void vvp_page_discard(const struct lu_env *env,
+                             const struct cl_page_slice *slice, struct cl_io *_)
+{
+        cfs_page_t           *vmpage  = cl2vm_page(slice);
+        struct address_space *mapping;
+        struct ccc_page      *cpg     = cl2ccc_page(slice);
+
+        LASSERT(vmpage != NULL);
+        LASSERT(PageLocked(vmpage));
+
+        /* dereference vmpage only after it has been checked above */
+        mapping = vmpage->mapping;
+
+        if (cpg->cpg_defer_uptodate && !cpg->cpg_ra_used)
+                ll_ra_stats_inc(mapping, RA_STAT_DISCARDED);
+
+        /*
+         * truncate_complete_page() calls
+         * a_ops->invalidatepage()->cl_page_delete()->vvp_page_delete().
+         */
+        truncate_complete_page(mapping, vmpage);
+}
+
+/**
+ * Unmap method of the VVP page: tears down the memory mappings that cover
+ * the byte range of the locked VM page. The cl_io argument is unused.
+ *
+ * \retval 0 always
+ */
+static int vvp_page_unmap(const struct lu_env *env,
+                          const struct cl_page_slice *slice, struct cl_io *_)
+{
+        cfs_page_t *vmpage = cl2vm_page(slice);
+        __u64       offset;
+
+        LASSERT(vmpage != NULL);
+        LASSERT(PageLocked(vmpage));
+        /*
+         * Widen the page index before shifting: shifting it at its own
+         * width would wrap on 32-bit for offsets of 4GiB and above.
+         */
+        offset = (__u64)vmpage->index << CFS_PAGE_SHIFT;
+        /*
+         * XXX is it safe to call this with the page lock held?
+         */
+        ll_teardown_mmaps(vmpage->mapping, offset, offset + CFS_PAGE_SIZE);
+        return 0;
+}
+
+/**
+ * Delete method of the VVP page: breaks the link from the VM page to the
+ * cl_page.
+ *
+ * The VM page must be locked, must still point at this cl_page and must
+ * belong to the inode of the slice's object. The page is passed to
+ * vvp_write_complete(), then PG_private and vmpage->private are cleared.
+ */
+static void vvp_page_delete(const struct lu_env *env,
+                            const struct cl_page_slice *slice)
+{
+        cfs_page_t       *vmpage = cl2vm_page(slice);
+        struct inode     *inode  = vmpage->mapping->host;
+        struct cl_object *obj    = slice->cpl_obj;
+
+        LASSERT(PageLocked(vmpage));
+        LASSERT((struct cl_page *)vmpage->private == slice->cpl_page);
+        LASSERT(inode == ccc_object_inode(obj));
+
+        vvp_write_complete(cl2ccc(obj), cl2ccc_page(slice));
+        ClearPagePrivate(vmpage);
+        vmpage->private = 0;
+        /*
+         * Reference from vmpage to cl_page is removed, but the reference back
+         * is still here. It is removed later in vvp_page_fini().
+         */
+}
+
+/**
+ * Export method of the VVP page: marks the locked VM page as up to date.
+ */
+static void vvp_page_export(const struct lu_env *env,
+                            const struct cl_page_slice *slice)
+{
+        cfs_page_t *vmpage = cl2vm_page(slice);
+
+        LASSERT(vmpage != NULL);
+        LASSERT(PageLocked(vmpage));
+        SetPageUptodate(vmpage);
+}
+
+/**
+ * Reports whether the VM page behind \a slice is locked.
+ *
+ * \retval -EBUSY   the VM page is locked
+ * \retval -ENODATA the VM page is not locked
+ */
+static int vvp_page_is_vmlocked(const struct lu_env *env,
+                                const struct cl_page_slice *slice)
+{
+        if (PageLocked(cl2vm_page(slice)))
+                return -EBUSY;
+        return -ENODATA;
+}
+
+/**
+ * Prepares the page for a read transfer. The cl_io argument is unused.
+ *
+ * \retval 0         the page has to be read
+ * \retval -EALREADY the VM page is already PG_uptodate and is skipped
+ */
+static int vvp_page_prep_read(const struct lu_env *env,
+                              const struct cl_page_slice *slice,
+                              struct cl_io *_)
+{
+        int rc;
+
+        ENTRY;
+        /* An up-to-date page has nothing left to read. */
+        if (PageUptodate(cl2vm_page(slice)))
+                rc = -EALREADY;
+        else
+                rc = 0;
+        RETURN(rc);
+}
+
+/**
+ * Prepares the page for a write transfer: if the VM page was dirty, its
+ * dirty state is cleared for IO and write-back is started on it. The cl_io
+ * argument is unused.
+ *
+ * \retval 0         the page is now under write-back
+ * \retval -EALREADY the page was not dirty
+ */
+static int vvp_page_prep_write(const struct lu_env *env,
+                               const struct cl_page_slice *slice,
+                               struct cl_io *_)
+{
+        cfs_page_t *vmpage = cl2vm_page(slice);
+
+        if (!clear_page_dirty_for_io(vmpage))
+                return -EALREADY;
+
+        set_page_writeback(vmpage);
+        return 0;
+}
+
+/**
+ * Records the outcome \a ioret of a page transfer at VM level.
+ *
+ * Success clears the error bit of \a vmpage. -EINTR changes nothing. Any
+ * other failure sets the page error bit and flags the mapping of \a inode
+ * with AS_ENOSPC (for -ENOSPC) or AS_EIO (for everything else).
+ *
+ * The inode is a separate argument because, for direct-io, the inode that
+ * has to carry the error can differ from the inode of \a vmpage.
+ */
+static void vvp_vmpage_error(struct inode *inode, cfs_page_t *vmpage, int ioret)
+{
+        if (ioret == 0) {
+                ClearPageError(vmpage);
+                return;
+        }
+
+        /* an interrupted transfer leaves the error state untouched */
+        if (ioret == -EINTR)
+                return;
+
+        SetPageError(vmpage);
+        if (ioret == -ENOSPC)
+                set_bit(AS_ENOSPC, &inode->i_mapping->flags);
+        else
+                set_bit(AS_EIO, &inode->i_mapping->flags);
+}
+
+/**
+ * Common tail of read and write completion.
+ *
+ * Records \a ioret on the VM page and the inode's mapping, then either
+ * detaches the sync-io anchor of the page and notifies it of \a ioret, or,
+ * when no anchor is attached and the page is CPT_CACHEABLE, unlocks the VM
+ * page.
+ */
+static void vvp_page_completion_common(const struct lu_env *env,
+                                       struct ccc_page *cp, int ioret)
+{
+        struct cl_page    *clp    = cp->cpg_cl.cpl_page;
+        cfs_page_t        *vmpage = cp->cpg_page;
+        struct inode      *inode  = ccc_object_inode(clp->cp_obj);
+        struct cl_sync_io *anchor = cp->cpg_sync_io;
+
+        LINVRNT(cl_page_is_vmlocked(env, clp));
+        KLASSERT(!PageWriteback(vmpage));
+
+        vvp_vmpage_error(inode, vmpage, ioret);
+
+        if (anchor != NULL) {
+                /* detach the anchor before notifying it */
+                cp->cpg_sync_io  = NULL;
+                cl_sync_io_note(anchor, ioret);
+        } else if (clp->cp_type == CPT_CACHEABLE)
+                unlock_page(vmpage);
+}
+
+/**
+ * Completion method for a read transfer that finished with \a ioret.
+ *
+ * For a deferred-uptodate page one unit is returned through
+ * ll_ra_count_put(). On success a page that is not deferred-uptodate is
+ * exported; on failure the deferred-uptodate flag is dropped. The common
+ * completion work follows.
+ */
+static void vvp_page_completion_read(const struct lu_env *env,
+                                     const struct cl_page_slice *slice,
+                                     int ioret)
+{
+        struct ccc_page *cp    = cl2ccc_page(slice);
+        struct cl_page  *page  = cl_page_top(slice->cpl_page);
+        struct inode    *inode = ccc_object_inode(page->cp_obj);
+        ENTRY;
+
+        CL_PAGE_HEADER(D_PAGE, env, page, "completing READ with %d\n", ioret);
+
+        if (cp->cpg_defer_uptodate)
+                ll_ra_count_put(ll_i2sbi(inode), 1);
+
+        if (ioret == 0)  {
+                /* XXX: do we need this for transient pages? */
+                if (!cp->cpg_defer_uptodate)
+                        cl_page_export(env, page);
+        } else
+                /* failed read: the page holds no deferred data */
+                cp->cpg_defer_uptodate = 0;
+        vvp_page_completion_common(env, cp, ioret);
+
+        EXIT;
+}
+
+/**
+ * Write-completion work that does not touch the write-back state of the VM
+ * page: on success clears cpg_write_queued and calls vvp_write_complete()
+ * for the page, then runs the common completion work with \a ioret.
+ */
+static void vvp_page_completion_write_common(const struct lu_env *env,
+                                             const struct cl_page_slice *slice,
+                                             int ioret)
+{
+        struct ccc_page *cp = cl2ccc_page(slice);
+
+        if (ioret == 0) {
+                cp->cpg_write_queued = 0;
+                /*
+                 * Only when ioret == 0, i.e. the write succeeded, can this
+                 * page be removed from the pending-write accounting.
+                 */
+                vvp_write_complete(cl2ccc(slice->cpl_obj), cp);
+        }
+        vvp_page_completion_common(env, cp, ioret);
+}
+
+/**
+ * Completion handler for writes of cacheable pages: the CRT_WRITE
+ * cpo_completion method of vvp_page_ops.
+ *
+ * Ends writeback on the VM page and then runs
+ * vvp_page_completion_write_common().
+ *
+ * \param env   execution environment
+ * \param slice this layer's slice of the page whose write has completed
+ * \param ioret result of the write, passed on unchanged
+ */
+static void vvp_page_completion_write(const struct lu_env *env,
+                                      const struct cl_page_slice *slice,
+                                      int ioret)
+{
+        struct ccc_page *cp     = cl2ccc_page(slice);
+        struct cl_page  *pg     = slice->cpl_page;
+        cfs_page_t      *vmpage = cp->cpg_page;
+
+        ENTRY;
+
+        /* On entry the page must be VM-locked and still under writeback. */
+        LINVRNT(cl_page_is_vmlocked(env, pg));
+        LASSERT(PageWriteback(vmpage));
+
+        CL_PAGE_HEADER(D_PAGE, env, pg, "completing WRITE with %d\n", ioret);
+
+        /*
+         * Writeback has to be over before the common path runs:
+         * vvp_page_completion_common() checks !PageWriteback().
+         */
+        end_page_writeback(vmpage);
+        LASSERT(!PageWriteback(vmpage));
+
+        vvp_page_completion_write_common(env, slice, ioret);
+        EXIT;
+}
+
+/**
+ * Implements cl_page_operations::cpo_make_ready() method.
+ *
+ * This is called to yank page from the transfer page and to send it out as a
+ * part of transfer. This function try-locks the page. If the try-lock fails,
+ * the page is owned by some concurrent IO, and should be skipped (this is a
+ * bad, but hopefully rare situation, as it usually results in the transfer
+ * being shorter than possible).
+ *
+ * \param env   execution environment, used only for the debug message
+ * \param slice this layer's slice of the page to be made ready
+ *
+ * \retval 0      success, page can be placed into transfer
+ *
+ * \retval -EAGAIN page is either used by concurrent IO or has been
+ * truncated. Skip it.
+ *
+ * NOTE(review): this function ends in RETURN() but has no ENTRY, unlike the
+ * completion handlers in this file -- confirm that is intended.
+ */
+static int vvp_page_make_ready(const struct lu_env *env,
+                               const struct cl_page_slice *slice)
+{
+        cfs_page_t *vmpage = cl2vm_page(slice);
+        struct cl_page *pg = slice->cpl_page;
+        int result;
+
+        result = -EAGAIN;
+        /* we're trying to write, but the page is locked.. come back later */
+        if (!TestSetPageLocked(vmpage)) {
+                if (pg->cp_state == CPS_CACHED) {
+                        /*
+                         * We can cancel IO if page wasn't dirty after all.
+                         */
+                        clear_page_dirty_for_io(vmpage);
+                        /*
+                         * This actually clears the dirty bit in the radix
+                         * tree.
+                         */
+                        set_page_writeback(vmpage);
+
+                        CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n");
+                        result = 0;
+                } else
+                        /*
+                         * Page was concurrently truncated.
+                         *
+                         * NOTE(review): the page lock obtained by
+                         * TestSetPageLocked() above does not appear to be
+                         * dropped on this path -- confirm who unlocks the
+                         * page when -EAGAIN is returned from here.
+                         */
+                        LASSERT(pg->cp_state == CPS_FREEING);
+        }
+        RETURN(result);
+}
+
+/**
+ * Prints a one-line description of this layer's page slice via \a printer.
+ *
+ * The first part shows the ccc_page address, its cpg_defer_uptodate,
+ * cpg_ra_used and cpg_write_queued fields and the VM page address. When a VM
+ * page is attached, its flags, page count, map count, private word and index
+ * follow, together with "lru" or "not-lru" depending on whether the page's
+ * lru list head is empty.
+ *
+ * \param env     execution environment, handed to \a printer
+ * \param slice   this layer's slice of the page being printed
+ * \param cookie  opaque argument handed to \a printer
+ * \param printer output callback
+ *
+ * \retval 0 always
+ */
+static int vvp_page_print(const struct lu_env *env,
+                          const struct cl_page_slice *slice,
+                          void *cookie, lu_printer_t printer)
+{
+        struct ccc_page *cpg = cl2ccc_page(slice);
+        cfs_page_t      *pg  = cpg->cpg_page;
+
+        printer(env, cookie, LUSTRE_VVP_NAME"-page@%p(%d:%d:%d) "
+                "vm@%p ",
+                cpg, cpg->cpg_defer_uptodate, cpg->cpg_ra_used,
+                cpg->cpg_write_queued, pg);
+        if (pg != NULL) {
+                const char *lru_prefix;
+
+                lru_prefix = list_empty(&pg->lru) ? "not-" : "";
+                printer(env, cookie, "%lx %d:%d %lx %lu %slru",
+                        (long)pg->flags, page_count(pg),
+                        page_mapcount(pg), pg->private,
+                        page_index(pg), lru_prefix);
+        }
+        printer(env, cookie, "\n");
+        return 0;
+}
+
+/**
+ * cpo_cancel method shared by vvp_page_ops and vvp_transient_page_ops.
+ *
+ * Does no cancellation work itself: it only asserts that the page has a
+ * cpg_sync_io anchor attached.
+ *
+ * NOTE(review): the function carries an osc_ prefix although it is a static
+ * helper of this file and operates on a ccc_page -- confirm the name.
+ *
+ * \retval 0 always
+ */
+static int osc_page_cancel(const struct lu_env *env,
+                           const struct cl_page_slice *slice)
+{
+        struct ccc_page *vp = cl2ccc_page(slice);
+        LASSERT(vp->cpg_sync_io != NULL);
+        return 0;
+}
+
+/**
+ * Page operations that vvp_page_init() installs on pages whose cp_type is
+ * CPT_CACHEABLE.
+ */
+static const struct cl_page_operations vvp_page_ops = {
+        .cpo_own           = vvp_page_own,
+        .cpo_assume        = vvp_page_assume,
+        .cpo_unassume      = vvp_page_unassume,
+        .cpo_disown        = vvp_page_disown,
+        .cpo_vmpage        = ccc_page_vmpage,
+        .cpo_discard       = vvp_page_discard,
+        .cpo_delete        = vvp_page_delete,
+        .cpo_unmap         = vvp_page_unmap,
+        .cpo_export        = vvp_page_export,
+        .cpo_is_vmlocked   = vvp_page_is_vmlocked,
+        .cpo_fini          = vvp_page_fini,
+        .cpo_print         = vvp_page_print,
+        .cpo_is_under_lock = ccc_page_is_under_lock,
+        .cpo_cancel        = osc_page_cancel,
+        /* Per-request-type methods, indexed by CRT_READ / CRT_WRITE. */
+        .io = {
+                [CRT_READ] = {
+                        .cpo_prep        = vvp_page_prep_read,
+                        .cpo_completion  = vvp_page_completion_read,
+                        /*
+                         * NOTE(review): presumably ccc_fail marks
+                         * make_ready as invalid for reads -- confirm.
+                         */
+                        .cpo_make_ready  = ccc_fail,
+                },
+                [CRT_WRITE] = {
+                        .cpo_prep        = vvp_page_prep_write,
+                        .cpo_completion  = vvp_page_completion_write,
+                        .cpo_make_ready  = vvp_page_make_ready,
+                }
+        }
+};
+
+/**
+ * Sanity check used by the transient-page methods: asserts that a try-lock
+ * of the inode mutex of the page's object fails.
+ *
+ * NOTE(review): presumably this stands for "the caller holds the inode
+ * mutex" -- confirm; a failed try-lock only shows that somebody holds it.
+ */
+static void vvp_transient_page_verify(const struct cl_page *page)
+{
+        struct inode *inode = ccc_object_inode(page->cp_obj);
+
+        LASSERT(!TRYLOCK_INODE_MUTEX(inode));
+        /* LASSERT_SEM_LOCKED(&inode->i_alloc_sem); */
+}
+
+/**
+ * cpo_own method for transient pages. Does nothing beyond running
+ * vvp_transient_page_verify() on the page; \a env and the cl_io argument
+ * are unused.
+ */
+static void vvp_transient_page_own(const struct lu_env *env,
+                                   const struct cl_page_slice *slice,
+                                   struct cl_io *_)
+{
+        vvp_transient_page_verify(slice->cpl_page);
+}
+
+/**
+ * cpo_assume method for transient pages. Does nothing beyond running
+ * vvp_transient_page_verify() on the page; \a env and the cl_io argument
+ * are unused.
+ */
+static void vvp_transient_page_assume(const struct lu_env *env,
+                                      const struct cl_page_slice *slice,
+                                      struct cl_io *_)
+{
+        vvp_transient_page_verify(slice->cpl_page);
+}
+
+/**
+ * cpo_unassume method for transient pages. Does nothing beyond running
+ * vvp_transient_page_verify() on the page; \a env and the cl_io argument
+ * are unused.
+ */
+static void vvp_transient_page_unassume(const struct lu_env *env,
+                                        const struct cl_page_slice *slice,
+                                        struct cl_io *_)
+{
+        vvp_transient_page_verify(slice->cpl_page);
+}
+
+/**
+ * cpo_disown method for transient pages. Does nothing beyond running
+ * vvp_transient_page_verify() on the page; \a env and the cl_io argument
+ * are unused.
+ */
+static void vvp_transient_page_disown(const struct lu_env *env,
+                                      const struct cl_page_slice *slice,
+                                      struct cl_io *_)
+{
+        vvp_transient_page_verify(slice->cpl_page);
+}
+
+/**
+ * cpo_discard method for transient pages: runs the transient-page sanity
+ * check and then deletes the page with cl_page_delete(). The cl_io argument
+ * is unused.
+ */
+static void vvp_transient_page_discard(const struct lu_env *env,
+                                       const struct cl_page_slice *slice,
+                                       struct cl_io *_)
+{
+        struct cl_page *clp = slice->cpl_page;
+
+        vvp_transient_page_verify(clp);
+
+        /* Discarding a transient page takes it out of the radix tree. */
+        cl_page_delete(env, clp);
+}
+
+/**
+ * cpo_is_vmlocked method for transient pages. Probes the inode mutex of the
+ * slice's object with a try-lock and releases it again at once if the
+ * try-lock succeeded.
+ *
+ * \retval -EBUSY   the try-lock failed, i.e. the mutex is held
+ * \retval -ENODATA the try-lock succeeded, i.e. the mutex was free
+ */
+static int vvp_transient_page_is_vmlocked(const struct lu_env *env,
+                                          const struct cl_page_slice *slice)
+{
+        struct inode *inode = ccc_object_inode(slice->cpl_obj);
+
+        if (TRYLOCK_INODE_MUTEX(inode)) {
+                /* The mutex was free: undo the probe. */
+                UNLOCK_INODE_MUTEX(inode);
+                return -ENODATA;
+        }
+        return -EBUSY;
+}
+
+/**
+ * Completion handler for writes of transient pages: the CRT_WRITE
+ * cpo_completion method of vvp_transient_page_ops.
+ *
+ * Runs the transient-page sanity check and then
+ * vvp_page_completion_write_common(). Unlike vvp_page_completion_write(),
+ * it does not touch the writeback state of the VM page.
+ */
+static void
+vvp_transient_page_completion_write(const struct lu_env *env,
+                                    const struct cl_page_slice *slice,
+                                    int ioret)
+{
+        vvp_transient_page_verify(slice->cpl_page);
+        vvp_page_completion_write_common(env, slice, ioret);
+}
+
+
+/**
+ * cpo_fini method for transient pages.
+ *
+ * Runs vvp_page_fini_common() on the slice and decrements the object's
+ * cob_transient_pages counter, which vvp_page_init() increments when it sets
+ * up a transient page.
+ */
+static void vvp_transient_page_fini(const struct lu_env *env,
+                                    struct cl_page_slice *slice)
+{
+        struct ccc_page *cp = cl2ccc_page(slice);
+        struct cl_page *clp = slice->cpl_page;
+        struct ccc_object *clobj = cl2ccc(clp->cp_obj);
+
+        vvp_page_fini_common(cp);
+        /*
+         * NOTE(review): presumably the inode mutex is what protects
+         * cob_transient_pages -- confirm.
+         */
+        LASSERT(!TRYLOCK_INODE_MUTEX(clobj->cob_inode));
+        clobj->cob_transient_pages--;
+}
+
+/**
+ * Page operations that vvp_page_init() installs on pages whose cp_type is
+ * not CPT_CACHEABLE.
+ *
+ * Compared with vvp_page_ops this table sets no cpo_delete, cpo_unmap or
+ * cpo_export method and no cpo_make_ready io method.
+ */
+static const struct cl_page_operations vvp_transient_page_ops = {
+        .cpo_own           = vvp_transient_page_own,
+        .cpo_assume        = vvp_transient_page_assume,
+        .cpo_unassume      = vvp_transient_page_unassume,
+        .cpo_disown        = vvp_transient_page_disown,
+        .cpo_discard       = vvp_transient_page_discard,
+        .cpo_vmpage        = ccc_page_vmpage,
+        .cpo_fini          = vvp_transient_page_fini,
+        .cpo_is_vmlocked   = vvp_transient_page_is_vmlocked,
+        .cpo_print         = vvp_page_print,
+        .cpo_is_under_lock = ccc_page_is_under_lock,
+        .cpo_cancel        = osc_page_cancel,
+        /* Per-request-type methods, indexed by CRT_READ / CRT_WRITE. */
+        .io = {
+                [CRT_READ] = {
+                        .cpo_prep        = ccc_transient_page_prep,
+                        .cpo_completion  = vvp_page_completion_read,
+                },
+                [CRT_WRITE] = {
+                        .cpo_prep        = ccc_transient_page_prep,
+                        .cpo_completion  = vvp_transient_page_completion_write,
+                }
+        }
+};
+
+/**
+ * Allocates this layer's slice for \a page and attaches it.
+ *
+ * The slice takes a reference on \a vmpage. For a CPT_CACHEABLE page the VM
+ * page is marked private with its private word pointing back at \a page, and
+ * the slice is added with vvp_page_ops. For any other page type the slice is
+ * added with vvp_transient_page_ops and the object's cob_transient_pages
+ * counter is incremented.
+ *
+ * \param env    execution environment
+ * \param obj    object the page belongs to
+ * \param page   cl_page being initialized
+ * \param vmpage VM page backing \a page
+ *
+ * \retval ERR_PTR(0)       slice allocated and added
+ * \retval ERR_PTR(-ENOMEM) slab allocation failed
+ */
+struct cl_page *vvp_page_init(const struct lu_env *env, struct cl_object *obj,
+                              struct cl_page *page, cfs_page_t *vmpage)
+{
+        struct ccc_page *cpg;
+
+        CLOBINVRNT(env, obj, ccc_object_invariant(obj));
+
+        OBD_SLAB_ALLOC_PTR(cpg, vvp_page_kmem);
+        if (cpg == NULL)
+                return ERR_PTR(-ENOMEM);
+
+        cpg->cpg_page = vmpage;
+        page_cache_get(vmpage);
+
+        CFS_INIT_LIST_HEAD(&cpg->cpg_pending_linkage);
+        if (page->cp_type != CPT_CACHEABLE) {
+                struct ccc_object *clobj = cl2ccc(obj);
+
+                LASSERT(!TRYLOCK_INODE_MUTEX(clobj->cob_inode));
+                cl_page_slice_add(page, &cpg->cpg_cl, obj,
+                                  &vvp_transient_page_ops);
+                clobj->cob_transient_pages++;
+        } else {
+                SetPagePrivate(vmpage);
+                vmpage->private = (unsigned long)page;
+                cl_page_slice_add(page, &cpg->cpg_cl, obj,
+                                  &vvp_page_ops);
+        }
+        /* Success is reported as ERR_PTR(0), exactly as before. */
+        return ERR_PTR(0);
+}
+
diff --git a/lustre/lmv/lmv_internal.h b/lustre/lmv/lmv_internal.h

index 818d53d..2311f4e 100644 (file)
--- a/lustre/lmv/lmv_internal.h
+++ b/lustre/lmv/lmv_internal.h
@@ -49,15 +49,15 @@
         ((it) ? ldlm_it2str((it)->it_op) : "0")
  
  struct lmv_stripe {
-        /** 
-         * Dir stripe fid. 
+        /**
+         * Dir stripe fid.
           */
          struct lu_fid           ls_fid;
-        /** 
-         * Cached home mds number for @li_fid. 
+        /**
+         * Cached home mds number for @li_fid.
           */
          mdsno_t                 ls_mds;
-        /** 
+        /**
           * Stripe object size.
           */
          unsigned long           ls_size;
@@ -78,15 +78,15 @@ struct lmv_object {
           * Sema for protecting fields.
           */
          struct semaphore        lo_guard;
-        /** 
+        /**
           * Object state like O_FREEING.
           */
          int                     lo_state;
-        /** 
-         * Object ref counter. 
+        /**
+         * Object ref counter.
           */
          atomic_t                lo_count;
-        /** 
+        /**
           * Object master fid.
           */
          struct lu_fid           lo_fid;
@@ -94,15 +94,15 @@ struct lmv_object {
           * Object hash type to find stripe by name.
           */
          __u32                  lo_hashtype;
-        /** 
-         * Number of stripes. 
+        /**
+         * Number of stripes.
           */
          int                     lo_objcount;
-        /** 
-         * Array of sub-objs. 
+        /**
+         * Array of sub-objs.
           */
          struct lmv_stripe      *lo_stripes;
-        /** 
+        /**
           * Pointer to LMV obd.
           */
          struct obd_device      *lo_obd;
@@ -233,7 +233,7 @@ lmv_find_target(struct lmv_obd *lmv, const struct lu_fid *fid)
  {
          mdsno_t mds;
          int rc;
-        
+
          rc = lmv_fld_lookup(lmv, fid, &mds);
          if (rc)
                  return ERR_PTR(rc);
diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c

index 1c8f940..ad6c65f 100644 (file)
--- a/lustre/lmv/lmv_obd.c
+++ b/lustre/lmv/lmv_obd.c
@@ -127,7 +127,7 @@ static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *uuid,
                  GOTO(out_lmv_lock, rc);
          }
  
-        CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, 
+        CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd,
                 activate ? "" : "in");
          lmv_activate_target(lmv, tgt, activate);
          EXIT;
@@ -199,7 +199,7 @@ static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
          } else if (ev == OBD_NOTIFY_OCD) {
                  conn_data = &watched->u.cli.cl_import->imp_connect_data;
  
-                /* 
+                /*
                   * Set connect data to desired target, update exp_connect_flags.
                   */
                  rc = lmv_set_mdc_data(lmv, uuid, conn_data);
@@ -219,14 +219,14 @@ static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
          }
  #if 0
          else if (ev == OBD_NOTIFY_DISCON) {
-                /* 
-                 * For disconnect event, flush fld cache for failout MDS case. 
+                /*
+                 * For disconnect event, flush fld cache for failout MDS case.
                   */
                  fld_client_flush(&lmv->lmv_fld);
          }
  #endif
-        /* 
-         * Pass the notification up the chain. 
+        /*
+         * Pass the notification up the chain.
           */
          if (obd->obd_observer)
                  rc = obd_notify(obd->obd_observer, watched, ev, data);
@@ -236,7 +236,7 @@ static int lmv_notify(struct obd_device *obd, struct obd_device *watched,
  
  /**
   * This is fake connect function. Its purpose is to initialize lmv and say
- * caller that everything is okay. Real connection will be performed later. 
+ * caller that everything is okay. Real connection will be performed later.
   */
  static int lmv_connect(const struct lu_env *env,
                         struct lustre_handle *conn, struct obd_device *obd,
@@ -259,9 +259,9 @@ static int lmv_connect(const struct lu_env *env,
  
          exp = class_conn2export(conn);
  
-        /* 
+        /*
           * We don't want to actually do the underlying connections more than
-         * once, so keep track. 
+         * once, so keep track.
           */
          lmv->refcount++;
          if (lmv->refcount > 1) {
@@ -286,11 +286,11 @@ static int lmv_connect(const struct lu_env *env,
          }
  #endif
  
-        /* 
+        /*
           * All real clients should perform actual connection right away, because
           * it is possible, that LMV will not have opportunity to connect targets
           * and MDC stuff will be called directly, for instance while reading
-         * ../mdc/../kbytesfree procfs file, etc. 
+         * ../mdc/../kbytesfree procfs file, etc.
           */
          if (data->ocd_connect_flags & OBD_CONNECT_REAL)
                  rc = lmv_check_connect(obd);
@@ -416,7 +416,7 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
  
          mdc_exp = class_conn2export(&conn);
  
-        /* 
+        /*
           * Init fid sequence client for this mdc and add new fld target.
           */
          rc = obd_fid_init(mdc_exp);
@@ -440,7 +440,7 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
          }
  
          if (obd->obd_observer) {
-                /* 
+                /*
                   * Tell the observer about the new target.
                   */
                  rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd,
@@ -455,7 +455,7 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt)
          tgt->ltd_exp = mdc_exp;
          lmv->desc.ld_active_tgt_count++;
  
-        /* 
+        /*
           * Copy connect data, it may be used later.
           */
          lmv->datas[tgt->ltd_idx] = *mdc_data;
@@ -684,8 +684,8 @@ static int lmv_disconnect(struct obd_export *exp)
          if (!lmv->tgts)
                  goto out_local;
  
-        /* 
-         * Only disconnect the underlying layers on the final disconnect. 
+        /*
+         * Only disconnect the underlying layers on the final disconnect.
           */
          lmv->refcount--;
          if (lmv->refcount != 0)
@@ -807,7 +807,7 @@ static int lmv_nid_policy(struct lmv_obd *lmv)
  {
          struct obd_import *imp;
          __u32              id;
-        
+
          /*
           * XXX: To get nid we assume that underlying obd device is mdc.
           */
@@ -836,7 +836,7 @@ static int lmv_choose_mds(struct lmv_obd *lmv, struct md_op_data *op_data,
  }
  
  /**
- * This is _inode_ placement policy function (not name). 
+ * This is _inode_ placement policy function (not name).
   */
  static int lmv_placement_policy(struct obd_device *obd,
                                  struct md_op_data *op_data,
@@ -913,8 +913,8 @@ int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid,
          ENTRY;
  
          tgt = lmv_get_target(lmv, mds);
-    
-        /* 
+
+        /*
           * New seq alloc and FLD setup should be atomic. Otherwise we may find
           * on server that seq in new allocated fid is not yet known.
           */
@@ -923,20 +923,20 @@ int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid,
          if (!tgt->ltd_active)
                  GOTO(out, rc = -ENODEV);
  
-        /* 
-         * Asking underlaying tgt layer to allocate new fid. 
+        /*
+         * Asking underlaying tgt layer to allocate new fid.
           */
          rc = obd_fid_alloc(tgt->ltd_exp, fid, NULL);
          if (rc > 0) {
                  LASSERT(fid_is_sane(fid));
  
-                /* 
-                 * Client switches to new sequence, setup FLD. 
+                /*
+                 * Client switches to new sequence, setup FLD.
                   */
                  rc = fld_client_create(&lmv->lmv_fld, fid_seq(fid),
                                         mds, NULL);
                  if (rc) {
-                        /* 
+                        /*
                           * Delete just allocated fid sequence in case
                           * of fail back.
                           */
@@ -1218,7 +1218,7 @@ static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid,
          if (IS_ERR(tgt))
                  RETURN(PTR_ERR(tgt));
  
-        rc = md_setxattr(tgt->ltd_exp, fid, oc, valid, name, input, 
+        rc = md_setxattr(tgt->ltd_exp, fid, oc, valid, name, input,
                           input_size, output_size, flags, suppgid,
                           request);
  
@@ -1280,7 +1280,7 @@ static int lmv_getattr(struct obd_export *exp, const struct lu_fid *fid,
                                  continue;
                          }
  
-                        /* 
+                        /*
                           * Skip master object.
                           */
                          if (lu_fid_eq(&obj->lo_fid, &obj->lo_stripes[i].ls_fid))
@@ -1369,8 +1369,8 @@ int lmv_handle_split(struct obd_export *exp, const struct lu_fid *fid)
          if (IS_ERR(tgt))
                  RETURN(PTR_ERR(tgt));
  
-        /* 
-         * Time to update mea of parent fid. 
+        /*
+         * Time to update mea of parent fid.
           */
          rc = md_getattr(tgt->ltd_exp, fid, NULL, valid, mealen, &req);
          if (rc) {
@@ -1449,7 +1449,7 @@ repeat:
          else if (rc)
                  RETURN(rc);
  
-        CDEBUG(D_INODE, "CREATE '%*s' on "DFID" -> mds #"LPU64"\n", 
+        CDEBUG(D_INODE, "CREATE '%*s' on "DFID" -> mds #"LPU64"\n",
                 op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1),
                 op_data->op_mds);
  
@@ -1560,8 +1560,8 @@ cleanup:
          OBD_FREE_PTR(op_data2);
  
          if (rc != 0) {
-                /* 
-                 * Drop all taken locks. 
+                /*
+                 * Drop all taken locks.
                   */
                  while (--i >= 0) {
                          if (lockh[i].cookie)
@@ -1599,8 +1599,8 @@ lmv_enqueue_remote(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
          CDEBUG(D_INODE, "REMOTE_ENQUEUE '%s' on "DFID" -> "DFID"\n",
                 LL_IT2STR(it), PFID(&op_data->op_fid1), PFID(&body->fid1));
  
-        /* 
-         * We got LOOKUP lock, but we really need attrs. 
+        /*
+         * We got LOOKUP lock, but we really need attrs.
           */
          pmode = it->d.lustre.it_lock_mode;
          LASSERT(pmode != 0);
@@ -1678,7 +1678,7 @@ lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
  
          CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID" -> mds #%d\n",
                 LL_IT2STR(it), PFID(&op_data->op_fid1), tgt->ltd_idx);
-        
+
          rc = md_enqueue(tgt->ltd_exp, einfo, it, op_data, lockh,
                          lmm, lmmsize, req, extra_lock_flags);
  
@@ -1803,7 +1803,7 @@ static int lmv_early_cancel_slaves(struct obd_export *exp,
          obj = lmv_object_find(obd, op_fid);
          if (obj == NULL)
                  RETURN(-EALREADY);
-                
+
          policy.l_inodebits.bits = bits;
          for (i = 0; i < obj->lo_objcount; i++) {
                  tgt = lmv_get_target(lmv, obj->lo_stripes[i].ls_mds);
@@ -1811,12 +1811,12 @@ static int lmv_early_cancel_slaves(struct obd_export *exp,
                  if (op_tgt != tgt->ltd_idx) {
                          CDEBUG(D_INODE, "EARLY_CANCEL slave "DFID" -> mds #%d\n",
                                 PFID(st_fid), tgt->ltd_idx);
-                        rc = md_cancel_unused(tgt->ltd_exp, st_fid, &policy, 
+                        rc = md_cancel_unused(tgt->ltd_exp, st_fid, &policy,
                                                mode, LDLM_FL_ASYNC, NULL);
                          if (rc)
                                  GOTO(out_put_obj, rc);
                  } else {
-                        CDEBUG(D_INODE, 
+                        CDEBUG(D_INODE,
                                 "EARLY_CANCEL skip operation target %d on "DFID"\n",
                                 op_tgt, PFID(st_fid));
                          /*
@@ -1865,7 +1865,7 @@ static int lmv_early_cancel(struct obd_export *exp, struct md_op_data *op_data,
                          rc = md_cancel_unused(tgt->ltd_exp, fid, &policy,
                                                mode, LDLM_FL_ASYNC, NULL);
                  } else {
-                        CDEBUG(D_INODE, 
+                        CDEBUG(D_INODE,
                                 "EARLY_CANCEL skip operation target %d on "DFID"\n",
                                 op_tgt, PFID(fid));
                          op_data->op_flags |= flag;
@@ -1929,8 +1929,8 @@ repeat:
          op_data->op_cap = cfs_curproc_cap_pack();
          tgt = lmv_get_target(lmv, mds);
  
-        /* 
-         * Cancel UPDATE lock on child (fid1). 
+        /*
+         * Cancel UPDATE lock on child (fid1).
           */
          op_data->op_flags |= MF_MDC_CANCEL_FID2;
          rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX,
@@ -2025,34 +2025,34 @@ repeat:
          src_tgt = lmv_get_target(lmv, mds1);
          tgt_tgt = lmv_get_target(lmv, mds2);
  
-        /* 
+        /*
           * LOOKUP lock on src child (fid3) should also be cancelled for
-         * src_tgt in mdc_rename. 
+         * src_tgt in mdc_rename.
           */
          op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
  
-        /* 
+        /*
           * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its
-         * own target. 
+         * own target.
           */
-        rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, 
-                              LCK_EX, MDS_INODELOCK_UPDATE, 
+        rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
+                              LCK_EX, MDS_INODELOCK_UPDATE,
                                MF_MDC_CANCEL_FID2);
  
-        /* 
+        /*
           * Cancel LOOKUP locks on tgt child (fid4) for parent tgt_tgt.
           */
          if (rc == 0) {
-                rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, 
+                rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
                                        LCK_EX, MDS_INODELOCK_LOOKUP,
                                        MF_MDC_CANCEL_FID4);
          }
  
-        /* 
-         * Cancel all the locks on tgt child (fid4). 
+        /*
+         * Cancel all the locks on tgt child (fid4).
           */
          if (rc == 0)
-                rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, 
+                rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx,
                                        LCK_EX, MDS_INODELOCK_FULL,
                                        MF_MDC_CANCEL_FID4);
  
@@ -2062,7 +2062,7 @@ repeat:
  
          if (rc == -ERESTART) {
                  LASSERT(*request != NULL);
-                DEBUG_REQ(D_WARNING|D_RPCTRACE, *request, 
+                DEBUG_REQ(D_WARNING|D_RPCTRACE, *request,
                            "Got -ERESTART during rename!\n");
                  ptlrpc_req_finished(*request);
                  *request = NULL;
@@ -2164,7 +2164,7 @@ static int lmv_sync(struct obd_export *exp, const struct lu_fid *fid,
  
  /**
   * Main purpose of LMV blocking ast is to remove split directory LMV
- * presentation object (struct lmv_object) attached to the lock being revoked. 
+ * presentation object (struct lmv_object) attached to the lock being revoked.
   */
  int lmv_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
                       void *data, int flag)
@@ -2184,7 +2184,7 @@ int lmv_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
                  }
                  break;
          case LDLM_CB_CANCELING:
-                /* 
+                /*
                   * Time to drop cached attrs for split directory object
                   */
                  obj = lock->l_ast_data;
@@ -2312,7 +2312,7 @@ static int lmv_readpage(struct obd_export *exp, const struct lu_fid *fid,
                  hash_adj += rank * seg_size;
  
                  CDEBUG(D_INODE, "Readpage hash adjustment: %x "LPX64" "
-                       LPX64"/%x -> "LPX64"/%x\n", rank, hash_adj, 
+                       LPX64"/%x -> "LPX64"/%x\n", rank, hash_adj,
                         offset, tgt0_idx, offset + hash_adj, tgt_idx);
  
                  offset = (offset + hash_adj) & MAX_HASH_SIZE;
@@ -2411,17 +2411,17 @@ repeat:
          op_data->op_fsgid = current->fsgid;
          op_data->op_cap = cfs_curproc_cap_pack();
  
-        /* 
+        /*
           * If child's fid is given, cancel unused locks for it if it is from
           * another export than parent.
           *
-         * LOOKUP lock for child (fid3) should also be cancelled on parent 
-         * tgt_tgt in mdc_unlink(). 
+         * LOOKUP lock for child (fid3) should also be cancelled on parent
+         * tgt_tgt in mdc_unlink().
           */
          op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3;
  
-        /* 
-         * Cancel FULL locks on child (fid3). 
+        /*
+         * Cancel FULL locks on child (fid3).
           */
          rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX,
                                MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3);
@@ -2468,7 +2468,7 @@ static int lmv_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage)
  }
  
  static int lmv_get_info(struct obd_export *exp, __u32 keylen,
-                        void *key, __u32 *vallen, void *val, 
+                        void *key, __u32 *vallen, void *val,
                          struct lov_stripe_md *lsm)
  {
          struct obd_device       *obd;
@@ -2496,8 +2496,8 @@ static int lmv_get_info(struct obd_export *exp, __u32 keylen,
                  for (i = 0, tgts = lmv->tgts; i < lmv->desc.ld_tgt_count;
                       i++, tgts++) {
  
-                        /* 
-                         * All tgts should be connected when this gets called. 
+                        /*
+                         * All tgts should be connected when this gets called.
                           */
                          if (!tgts || !tgts->ltd_exp) {
                                  CERROR("target not setup?\n");
@@ -2514,9 +2514,9 @@ static int lmv_get_info(struct obd_export *exp, __u32 keylen,
                  if (rc)
                          RETURN(rc);
  
-                /* 
+                /*
                   * Forwarding this request to first MDS, it should know LOV
-                 * desc. 
+                 * desc.
                   */
                  rc = obd_get_info(lmv->tgts[0].ltd_exp, keylen, key,
                                    vallen, val, NULL);
@@ -2657,8 +2657,8 @@ int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
          {
                  magic = le32_to_cpu(mea->mea_magic);
          } else {
-                /* 
-                 * Old mea is not handled here. 
+                /*
+                 * Old mea is not handled here.
                   */
                  CERROR("Old not supportable EA is found\n");
                  LBUG();
@@ -2676,7 +2676,7 @@ int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp,
  }
  
  static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid,
-                             ldlm_policy_data_t *policy, ldlm_mode_t mode, 
+                             ldlm_policy_data_t *policy, ldlm_mode_t mode,
                               int flags, void *opaque)
  {
          struct obd_device       *obd = exp->exp_obd;
@@ -2723,11 +2723,11 @@ ldlm_mode_t lmv_lock_match(struct obd_export *exp, int flags,
  
          CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid));
  
-        /* 
+        /*
           * With CMD every object can have two locks in different namespaces:
           * lookup lock in space of mds storing direntry and update/open lock in
           * space of mds storing inode. Thus we check all targets, not only that
-         * one fid was created in. 
+         * one fid was created in.
           */
          for (i = 0; i < lmv->desc.ld_tgt_count; i++) {
                  rc = md_lock_match(lmv->tgts[i].ltd_exp, flags, fid,
@@ -2862,10 +2862,10 @@ int lmv_intent_getattr_async(struct obd_export *exp,
                                              (char *)op_data->op_name,
                                              op_data->op_namelen);
                          op_data->op_fid1 = obj->lo_stripes[sidx].ls_fid;
-                        tgt = lmv_get_target(lmv, 
+                        tgt = lmv_get_target(lmv,
                                               obj->lo_stripes[sidx].ls_mds);
                          CDEBUG(D_INODE,
-                               "Choose slave dir ("DFID") -> mds #%d\n", 
+                               "Choose slave dir ("DFID") -> mds #%d\n",
                                 PFID(&op_data->op_fid1), tgt->ltd_idx);
                  } else {
                          tgt = lmv_find_target(lmv, &op_data->op_fid1);
@@ -2883,7 +2883,7 @@ int lmv_intent_getattr_async(struct obd_export *exp,
                  if (minfo->mi_it.it_op & IT_LOOKUP)
                          minfo->mi_it.it_op = IT_GETATTR;
          }
-        
+
          if (IS_ERR(tgt))
                  RETURN(PTR_ERR(tgt));
  
diff --git a/lustre/lmv/lproc_lmv.c b/lustre/lmv/lproc_lmv.c

index e880d23..555f4dc 100644 (file)
--- a/lustre/lmv/lproc_lmv.c
+++ b/lustre/lmv/lproc_lmv.c
@@ -90,7 +90,7 @@ static int lmv_rd_placement(char *page, char **start, off_t off, int count,
          LASSERT(dev != NULL);
          lmv = &dev->u.lmv;
          *eof = 1;
-        return snprintf(page, count, "%s\n", 
+        return snprintf(page, count, "%s\n",
                          placement_policy2name(lmv->lmv_placement));
  
  }
@@ -182,7 +182,7 @@ static int lmv_tgt_seq_show(struct seq_file *p, void *v)
          struct obd_device       *dev = p->private;
          struct lmv_obd          *lmv = &dev->u.lmv;
          int                      idx = tgt - &(lmv->tgts[0]);
-        
+
          return seq_printf(p, "%d: %s %sACTIVE\n", idx, tgt->ltd_uuid.uuid,
                            tgt->ltd_active ? "" : "IN");
  }
@@ -199,7 +199,7 @@ static int lmv_target_seq_open(struct inode *inode, struct file *file)
          struct proc_dir_entry   *dp = PDE(inode);
          struct seq_file         *seq;
          int                     rc;
-        
+
          rc = seq_open(file, &lmv_tgt_sops);
          if (rc)
                  return rc;
diff --git a/lustre/lov/Makefile.in b/lustre/lov/Makefile.in

index 0f223f8..5a2aad7 100644 (file)
--- a/lustre/lov/Makefile.in
+++ b/lustre/lov/Makefile.in
@@ -1,4 +1,4 @@
  MODULES := lov
-lov-objs := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o lov_request.o lov_qos.o lov_ea.o lov_pool.o
+lov-objs := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o lov_request.o lov_qos.o lov_ea.o lov_dev.o lov_object.o lov_page.o lov_lock.o lov_io.o lovsub_dev.o lovsub_object.o lovsub_page.o lovsub_lock.o lovsub_io.o lov_pool.o
  
  @INCLUDE_RULES@
diff --git a/lustre/lov/autoMakefile.am b/lustre/lov/autoMakefile.am

index c65e095..e18070c 100644 (file)
--- a/lustre/lov/autoMakefile.am
+++ b/lustre/lov/autoMakefile.am
@@ -36,7 +36,7 @@
  
  if LIBLUSTRE
  noinst_LIBRARIES = liblov.a
-liblov_a_SOURCES = lov_log.c lov_pool.c lov_obd.c lov_pack.c lov_request.c lov_offset.c lov_qos.c lov_merge.c lov_ea.c lov_internal.h
+liblov_a_SOURCES = lov_log.c lov_pool.c lov_obd.c lov_pack.c lov_request.c lov_offset.c lov_qos.c lov_merge.c lov_ea.c lov_internal.h lov_cl_internal.h lov_dev.c lov_object.c lov_page.c lov_lock.c lov_io.c lovsub_dev.c lovsub_object.c lovsub_page.c lovsub_lock.c lovsub_io.c
  liblov_a_CPPFLAGS = $(LLCPPFLAGS)
  liblov_a_CFLAGS = $(LLCFLAGS)
  endif
@@ -51,12 +51,22 @@ macos_PROGRAMS := lov
  
  lov_SOURCES :=          \
          lov_log.c       \
-        lov_pool.c     \
+        lov_pool.c         \
          lov_obd.c       \
          lov_pack.c      \
          lov_request.c   \
          lov_merge.c     \
          lov_qos.c       \
+        lov_dev.c       \
+        lov_object.c    \
+        lov_page.c      \
+        lov_lock.c      \
+        lov_io.c        \
+        lovsub_dev.c    \
+        lovsub_object.c \
+        lovsub_page.c   \
+        lovsub_lock.c   \
+        lovsub_io.c     \
          lov_offset.c    \
          lov_internal.h
  
@@ -74,5 +84,5 @@ endif # MODULES
  
  install-data-hook: $(install_data_hook)
  
-DIST_SOURCES = $(lov-objs:.o=.c) lov_internal.h
+DIST_SOURCES = $(lov-objs:.o=.c) lov_internal.h lov_cl_internal.h
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
diff --git a/lustre/lov/lov_cl_internal.h b/lustre/lov/lov_cl_internal.h

new file mode 100644 (file)

index 0000000..aa9583f
--- /dev/null
+++ b/lustre/lov/lov_cl_internal.h
@@ -0,0 +1,782 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal interfaces of LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#ifndef LOV_CL_INTERNAL_H
+#define LOV_CL_INTERNAL_H
+
+#ifdef __KERNEL__
+# include <libcfs/libcfs.h>
+#else
+# include <liblustre.h>
+#endif
+
+#include <obd.h>
+#include <cl_object.h>
+#include "lov_internal.h"
+
+/** \addtogroup lov lov @{ */
+
+/** \defgroup lov lov
+ * Logical object volume layer. This layer implements data striping (raid0).
+ *
+ * At the lov layer top-entity (object, page, lock, io) is connected to one or
+ * more sub-entities: top-object, representing a file is connected to a set of
+ * sub-objects, each representing a stripe, file-level top-lock is connected
+ * to a set of per-stripe sub-locks, top-page is connected to a (single)
+ * sub-page, and a top-level IO is connected to a set of (potentially
+ * concurrent) sub-IO's.
+ *
+ * Sub-object, sub-page, and sub-io each have a well-defined top-level parent,
+ * while a single sub-lock can be part of multiple top-locks.
+ *
+ * Reference counting models are different for different types of entities:
+ *
+ *     - top-object keeps a reference to its sub-objects, and destroys them
+ *       when it is destroyed.
+ *
+ *     - top-page keeps a reference to its sub-page, and destroys it when it
+ *       is destroyed.
+ *
+ *     - sub-lock keeps a reference to its top-locks. Top-lock keeps a
+ *       reference (and a hold, see cl_lock_hold()) on its sub-locks when it
+ *       is actively using them (that is, in cl_lock_state::CLS_QUEUING,
+ *       cl_lock_state::CLS_ENQUEUED, cl_lock_state::CLS_HELD states). When
+ *       moving into cl_lock_state::CLS_CACHED state, top-lock releases a
+ *       hold. From this moment top-lock has only a 'weak' reference to its
+ *       sub-locks. This reference is protected by top-lock
+ *       cl_lock::cll_guard, and will be automatically cleared by the sub-lock
+ *       when the latter is destroyed. When a sub-lock is canceled, a
+ *       reference to it is removed from the top-lock array, and top-lock is
+ *       moved into CLS_NEW state. It is guaranteed that all sub-locks exist
+ *       while their top-lock is in CLS_HELD or CLS_CACHED states.
+ *
+ *     - IO's are not reference counted.
+ *
+ * To implement a connection between top and sub entities, lov layer is split
+ * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both
+ * implementing full set of cl-interfaces. For example, top-object has clu and
+ * lov layers, and its sub-object has lovsub and osc layers. lovsub layer is
+ * used to track child-parent relationship.
+ *
+ * @{
+ */
+
+struct lovsub_device;
+struct lovsub_object;
+struct lovsub_lock;
+
+enum lov_device_flags {
+        LOV_DEV_INITIALIZED = 1 << 0
+};
+
+/*
+ * Upper half.
+ */
+
+/**
+ * Resources that are used in memory-cleaning path, and whose allocation
+ * cannot fail even when memory is tight. They are preallocated in sufficient
+ * quantities in lov_device::ld_emrg[], and access to them is serialized by
+ * lov_device::ld_mutex.
+ */
+struct lov_device_emerg {
+        /**
+         * Page list used to submit IO when memory is under pressure.
+         */
+        struct cl_page_list emrg_page_list;
+        /**
+         * sub-io's shared by all threads accessing this device when memory is
+         * too low to allocate sub-io's dynamically.
+         */
+        struct cl_io        emrg_subio;
+        /**
+         * Environments used by sub-io's in
+         * lov_device_emerg::emrg_subio.
+         */
+        struct lu_env      *emrg_env;
+        /**
+         * Refchecks for lov_device_emerg::emrg_env.
+         *
+         * \see cl_env_get()
+         */
+        int                 emrg_refcheck;
+};
+
+struct lov_device {
+        /*
+         * XXX Locking of lov-private data is missing.
+         */
+        struct cl_device          ld_cl;
+        struct lov_obd           *ld_lov;
+        /** size of lov_device::ld_target[] array */
+        __u32                     ld_target_nr;
+        struct lovsub_device    **ld_target;
+        __u32                     ld_flags;
+
+        /** Emergency resources used in memory-cleansing paths. */
+        struct lov_device_emerg **ld_emrg;
+        /**
+         * Serializes access to lov_device::ld_emrg in low-memory
+         * conditions.
+         */
+        struct mutex              ld_mutex;
+};
+
+/**
+ * Layout type.
+ */
+enum lov_layout_type {
+        /** empty file without body */
+        LLT_EMPTY,
+        /** striped file */
+        LLT_RAID0,
+        /** join file */
+        LLT_JOIN,
+        LLT_NR
+};
+
+/**
+ * lov-specific file state.
+ *
+ * lov object has particular layout type, determining how top-object is built
+ * on top of sub-objects. Layout type can change dynamically. When this
+ * happens, lov_object::lo_type_guard semaphore is taken in exclusive mode,
+ * all state pertaining to the old layout type is destroyed, and new state is
+ * constructed. All object methods take said semaphore in the shared mode,
+ * providing serialization against transition between layout types.
+ *
+ * To avoid multiple `if' or `switch' statements, selecting behavior for the
+ * current layout type, object methods perform double-dispatch, invoking
+ * function corresponding to the current layout type.
+ */
+struct lov_object {
+        struct cl_object       lo_cl;
+        /**
+         * Serializes object operations with transitions between layout types.
+         *
+         * This semaphore is taken in shared mode by all object methods, and
+         * is taken in exclusive mode when object type is changed.
+         *
+         * \see lov_object::lo_type
+         */
+        struct rw_semaphore    lo_type_guard;
+        /**
+         * Type of an object. Protected by lov_object::lo_type_guard.
+         */
+        enum lov_layout_type   lo_type;
+
+        union lov_layout_state {
+                struct lov_layout_raid0 {
+                        unsigned               lo_nr;
+                        struct lov_stripe_md  *lo_lsm;
+                        struct lovsub_object **lo_sub;
+                        /**
+                         * When this is true, lov_object::lo_attr contains
+                         * valid up to date attributes for a top-level
+                         * object. This field is reset to 0 when attributes of
+                         * any sub-object change.
+                         */
+                        int                    lo_attr_valid;
+                        /**
+                         * Cached object attribute, built from sub-object
+                         * attributes.
+                         */
+                        struct cl_attr         lo_attr;
+                } raid0;
+                struct lov_layout_state_empty {
+                } empty;
+                struct lov_layout_state_join {
+                } join;
+        } u;
+        /**
+         * Thread that acquired lov_object::lo_type_guard in an exclusive
+         * mode.
+         */
+        cfs_task_t            *lo_owner;
+};
+
+/**
+ * Flags that top-lock can set on each of its sub-locks.
+ */
+enum lov_sub_flags {
+        /** Top-lock acquired a hold (cl_lock_hold()) on a sub-lock. */
+        LSF_HELD = 1 << 0
+};
+
+/**
+ * State lov_lock keeps for each sub-lock.
+ */
+struct lov_lock_sub {
+        /** sub-lock itself */
+        struct lovsub_lock  *sub_lock;
+        /** A bitmask of per-sub-lock flags, taken from enum lov_sub_flags */
+        unsigned             sub_flags;
+        int                  sub_stripe;
+        struct cl_lock_descr sub_descr;
+        struct cl_lock_descr sub_got;
+};
+
+/**
+ * lov-specific lock state.
+ */
+struct lov_lock {
+        struct cl_lock_slice   lls_cl;
+        /** Number of sub-locks in this lock */
+        int                    lls_nr;
+        /**
+         * Number of existing sub-locks.
+         */
+        unsigned               lls_nr_filled;
+        /**
+         * Set when sub-lock was canceled, while top-lock was being
+         * unlocked.
+         */
+        int                    lls_unuse_race;
+        /**
+         * An array of sub-locks
+         *
+         * There are two issues with managing sub-locks:
+         *
+         *     - sub-locks are concurrently canceled, and
+         *
+         *     - sub-locks are shared with other top-locks.
+         *
+         * To manage cancellation, top-lock acquires a hold on a sublock
+         * (lov_sublock_adopt()) when the latter is inserted into
+         * lov_lock::lls_sub[]. This hold is released (lov_sublock_release())
+         * when top-lock is going into CLS_CACHED state or destroyed. Hold
+         * prevents sub-lock from cancellation.
+         *
+         * Sub-lock sharing means, among other things, that top-lock that is
+         * in the process of creation (i.e., not yet inserted into lock list)
+         * is already accessible to other threads once at least one of its
+         * sub-locks is created, see lov_lock_sub_init().
+         *
+         * Sub-lock can be in one of the following states:
+         *
+         *     - doesn't exist, lov_lock::lls_sub[]::sub_lock == NULL. Such
+         *       sub-lock was either never created (top-lock is in CLS_NEW
+         *       state), or it was created, then canceled, then destroyed
+         *       (lov_lock_unlink() cleared sub-lock pointer in the top-lock).
+         *
+         *     - sub-lock exists and is on hold
+         *       (lov_lock::lls_sub[]::sub_flags & LSF_HELD). This is a
+         *       normal state of a sub-lock in CLS_HELD and CLS_CACHED states
+         *       of a top-lock.
+         *
+         *     - sub-lock exists, but is not held by the top-lock. This
+         *       happens after top-lock released a hold on sub-locks before
+         *       going into cache (lov_lock_unuse()).
+         *
+         * \todo To support wide-striping, array has to be replaced with a set
+         * of queues to avoid scanning.
+         */
+        struct lov_lock_sub   *lls_sub;
+        /**
+         * Original description with which lock was enqueued.
+         */
+        struct cl_lock_descr   lls_orig;
+};
+
+struct lov_page {
+        struct cl_page_slice lps_cl;
+        int                  lps_invalid;
+};
+
+/*
+ * Bottom half.
+ */
+
+struct lovsub_device {
+        struct cl_device   acid_cl;
+        struct lov_device *acid_super;
+        int                acid_idx;
+        struct cl_device  *acid_next;
+};
+
+struct lovsub_object {
+        struct cl_object_header lso_header;
+        struct cl_object        lso_cl;
+        struct lov_object      *lso_super;
+        int                     lso_index;
+};
+
+/**
+ * A link between a top-lock and a sub-lock. Separate data-structure is
+ * necessary, because top-locks and sub-locks are in M:N relationship.
+ *
+ * \todo This can be optimized for the (by far) most frequent case of a single
+ * top-lock per sub-lock.
+ */
+struct lov_lock_link {
+        struct lov_lock *lll_super;
+        /** An index within parent lock. */
+        int              lll_idx;
+        /**
+         * A linkage into per sub-lock list of all corresponding top-locks,
+         * hanging off lovsub_lock::lss_parents.
+         */
+        struct list_head lll_list;
+};
+
+/**
+ * Lock state at lovsub layer.
+ */
+struct lovsub_lock {
+        struct cl_lock_slice  lss_cl;
+        /**
+         * List of top-locks that have given sub-lock as their part. Protected
+         * by cl_lock::cll_guard mutex.
+         */
+        struct list_head      lss_parents;
+        /**
+         * Top-lock that initiated current operation on this sub-lock. This is
+         * only set during top-to-bottom lock operations like enqueue, and is
+         * used to optimize state change notification. Protected by
+         * cl_lock::cll_guard mutex.
+         *
+         * \see lovsub_lock_state_one().
+         */
+        struct cl_lock       *lss_active;
+};
+
+struct lovsub_page {
+        struct cl_page_slice lsb_cl;
+};
+
+
+struct lov_thread_info {
+        struct cl_object_conf   lti_stripe_conf;
+        struct lu_fid           lti_fid;
+        struct cl_lock_descr    lti_ldescr;
+        struct ost_lvb          lti_lvb;
+        struct cl_2queue        lti_cl2q;
+        union  lov_layout_state lti_state;
+        struct cl_lock_closure  lti_closure;
+};
+
+/**
+ * State that lov_io maintains for every sub-io.
+ */
+struct lov_io_sub {
+        int                  sub_stripe;
+        /**
+         * sub-io for a stripe. Ideally sub-io's can be stopped and resumed
+         * independently, with lov acting as a scheduler to maximize overall
+         * throughput.
+         */
+        struct cl_io        *sub_io;
+        /**
+         * Linkage into a list (hanging off lov_io::lis_active) of all
+         * sub-io's active for the current IO iteration.
+         */
+        struct list_head     sub_linkage;
+        /**
+         * true, iff cl_io_init() was successfully executed against
+         * lov_io_sub::sub_io.
+         */
+        int                  sub_io_initialized;
+        /**
+         * True, iff lov_io_sub::sub_io and lov_io_sub::sub_env weren't
+         * allocated, but borrowed from a per-device emergency pool.
+         */
+        int                  sub_borrowed;
+        /**
+         * environment, in which sub-io executes.
+         */
+        struct lu_env *sub_env;
+        /**
+         * environment's refcheck.
+         *
+         * \see cl_env_get()
+         */
+        int                  sub_refcheck;
+        int                  sub_refcheck2;
+        int                  sub_reenter;
+        void                *sub_cookie;
+};
+
+/**
+ * IO state private for LOV.
+ */
+struct lov_io {
+        /** super-class */
+        struct cl_io_slice lis_cl;
+        /**
+         * Pointer to the object slice. This is a duplicate of
+         * lov_io::lis_cl::cis_object.
+         */
+        struct lov_object *lis_object;
+        /**
+         * Original end-of-io position for this IO, set by the upper layer as
+         * cl_io::u::ci_rw::pos + cl_io::u::ci_rw::count. lov remembers this,
+         * changes pos and count to fit IO into a single stripe and uses saved
+         * value to determine when IO iterations have to stop.
+         *
+         * This is used only for CIT_READ and CIT_WRITE io's.
+         */
+        loff_t             lis_io_endpos;
+
+        /**
+         * starting position within a file, for the current io loop iteration
+         * (stripe), used by ci_io_loop().
+         */
+        obd_off            lis_pos;
+        /**
+         * end position within a file, for the current stripe io. This is
+         * exclusive (i.e., next offset after last byte affected by io).
+         */
+        obd_off            lis_endpos;
+
+        int                lis_mem_frozen;
+        int                lis_stripe_count;
+        int                lis_active_subios;
+
+        /**
+         * the index of lis_single_subio in lis_subs array
+         */
+        int                lis_single_subio_index;
+        struct cl_io       lis_single_subio;
+
+        /**
+         * size of lis_subs array, actually the highest stripe #
+         */
+        int                lis_nr_subios;
+        struct lov_io_sub *lis_subs;
+        /**
+         * List of active sub-io's.
+         */
+        struct list_head   lis_active;
+};
+
+struct lov_session {
+        struct lov_io ls_io;
+};
+
+/**
+ * State of transfer for lov.
+ */
+struct lov_req {
+        struct cl_req_slice lr_cl;
+};
+
+/**
+ * State of transfer for lovsub.
+ */
+struct lovsub_req {
+        struct cl_req_slice lsrq_cl;
+};
+
+extern struct lu_device_type lov_device_type;
+extern struct lu_device_type lovsub_device_type;
+
+extern struct lu_context_key lov_key;
+extern struct lu_context_key lov_session_key;
+
+extern cfs_mem_cache_t *lov_page_kmem;
+extern cfs_mem_cache_t *lov_lock_kmem;
+extern cfs_mem_cache_t *lov_object_kmem;
+extern cfs_mem_cache_t *lov_thread_kmem;
+extern cfs_mem_cache_t *lov_session_kmem;
+extern cfs_mem_cache_t *lov_req_kmem;
+
+extern cfs_mem_cache_t *lovsub_page_kmem;
+extern cfs_mem_cache_t *lovsub_lock_kmem;
+extern cfs_mem_cache_t *lovsub_object_kmem;
+extern cfs_mem_cache_t *lovsub_req_kmem;
+
+extern cfs_mem_cache_t *lov_lock_link_kmem;
+
+int   lov_object_init     (const struct lu_env *env, struct lu_object *obj,
+                           const struct lu_object_conf *conf);
+int   lovsub_object_init  (const struct lu_env *env, struct lu_object *obj,
+                           const struct lu_object_conf *conf);
+int   lov_lock_init       (const struct lu_env *env, struct cl_object *obj,
+                           struct cl_lock *lock, const struct cl_io *io);
+int   lov_io_init         (const struct lu_env *env, struct cl_object *obj,
+                           struct cl_io *io);
+int   lovsub_lock_init    (const struct lu_env *env, struct cl_object *obj,
+                           struct cl_lock *lock, const struct cl_io *io);
+
+int   lov_lock_init_raid0 (const struct lu_env *env, struct cl_object *obj,
+                           struct cl_lock *lock, const struct cl_io *io);
+int   lov_io_init_raid0   (const struct lu_env *env, struct cl_object *obj,
+                           struct cl_io *io);
+int   lov_io_init_empty   (const struct lu_env *env, struct cl_object *obj,
+                           struct cl_io *io);
+void  lov_lock_unlink     (const struct lu_env *env, struct lov_lock_link *link,
+                           struct lovsub_lock *sub);
+
+void  lov_sub_put         (struct lov_io_sub *sub);
+int   lov_sublock_modify  (const struct lu_env *env, struct lov_lock *lov,
+                           struct lovsub_lock *sublock,
+                           const struct cl_lock_descr *d, int idx);
+
+
+struct cl_page *lov_page_init   (const struct lu_env *env, struct cl_object *ob,
+                                 struct cl_page *page, cfs_page_t *vmpage);
+struct cl_page *lovsub_page_init(const struct lu_env *env, struct cl_object *ob,
+                                 struct cl_page *page, cfs_page_t *vmpage);
+
+struct cl_page   *lov_page_init_empty(const struct lu_env *env,
+                                      struct cl_object *obj,
+                                      struct cl_page *page, cfs_page_t *vmpage);
+struct cl_page   *lov_page_init_raid0(const struct lu_env *env,
+                                      struct cl_object *obj,
+                                      struct cl_page *page, cfs_page_t *vmpage);
+struct lu_object *lov_object_alloc   (const struct lu_env *env,
+                                      const struct lu_object_header *hdr,
+                                      struct lu_device *dev);
+struct lu_object *lovsub_object_alloc(const struct lu_env *env,
+                                      const struct lu_object_header *hdr,
+                                      struct lu_device *dev);
+
+struct lov_lock_link *lov_lock_link_find(const struct lu_env *env,
+                                         struct lov_lock *lck,
+                                         struct lovsub_lock *sub);
+struct lov_io_sub    *lov_page_subio    (const struct lu_env *env,
+                                         struct lov_io *lio,
+                                         const struct cl_page_slice *slice);
+
+
+#define lov_foreach_target(lov, var)                    \
+        for (var = 0; var < lov_targets_nr(lov); ++var)
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ * Accessors.
+ *
+ */
+
+static inline struct lov_session *lov_env_session(const struct lu_env *env)
+{
+        struct lov_session *ses;
+
+        ses = lu_context_key_get(env->le_ses, &lov_session_key);
+        LASSERT(ses != NULL);
+        return ses;
+}
+
+static inline struct lov_io *lov_env_io(const struct lu_env *env)
+{
+        return &lov_env_session(env)->ls_io;
+}
+
+static inline int lov_is_object(const struct lu_object *obj)
+{
+        return obj->lo_dev->ld_type == &lov_device_type;
+}
+
+static inline int lovsub_is_object(const struct lu_object *obj)
+{
+        return obj->lo_dev->ld_type == &lovsub_device_type;
+}
+
+static inline struct lu_device *lov2lu_dev(struct lov_device *lov)
+{
+        return &lov->ld_cl.cd_lu_dev;
+}
+
+static inline struct lov_device *lu2lov_dev(const struct lu_device *d)
+{
+        LINVRNT(d->ld_type == &lov_device_type);
+        return container_of0(d, struct lov_device, ld_cl.cd_lu_dev);
+}
+
+static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub)
+{
+        return &lovsub->acid_cl;
+}
+
+static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub)
+{
+        return &lovsub2cl_dev(lovsub)->cd_lu_dev;
+}
+
+static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d)
+{
+        LINVRNT(d->ld_type == &lovsub_device_type);
+        return container_of0(d, struct lovsub_device, acid_cl.cd_lu_dev);
+}
+
+static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d)
+{
+        LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type);
+        return container_of0(d, struct lovsub_device, acid_cl);
+}
+
+static inline struct lu_object *lov2lu(struct lov_object *lov)
+{
+        return &lov->lo_cl.co_lu;
+}
+
+static inline struct cl_object *lov2cl(struct lov_object *lov)
+{
+        return &lov->lo_cl;
+}
+
+static inline struct lov_object *lu2lov(const struct lu_object *obj)
+{
+        LINVRNT(lov_is_object(obj));
+        return container_of0(obj, struct lov_object, lo_cl.co_lu);
+}
+
+static inline struct lov_object *cl2lov(const struct cl_object *obj)
+{
+        LINVRNT(lov_is_object(&obj->co_lu));
+        return container_of0(obj, struct lov_object, lo_cl);
+}
+
+static inline struct lu_object *lovsub2lu(struct lovsub_object *los)
+{
+        return &los->lso_cl.co_lu;
+}
+
+static inline struct cl_object *lovsub2cl(struct lovsub_object *los)
+{
+        return &los->lso_cl;
+}
+
+static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj)
+{
+        LINVRNT(lovsub_is_object(&obj->co_lu));
+        return container_of0(obj, struct lovsub_object, lso_cl);
+}
+
+static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj)
+{
+        LINVRNT(lovsub_is_object(obj));
+        return container_of0(obj, struct lovsub_object, lso_cl.co_lu);
+}
+
+static inline struct lovsub_lock *
+cl2lovsub_lock(const struct cl_lock_slice *slice)
+{
+        LINVRNT(lovsub_is_object(&slice->cls_obj->co_lu));
+        return container_of(slice, struct lovsub_lock, lss_cl);
+}
+
+static inline struct lovsub_lock *cl2sub_lock(const struct cl_lock *lock)
+{
+        const struct cl_lock_slice *slice;
+
+        slice = cl_lock_at(lock, &lovsub_device_type);
+        LASSERT(slice != NULL);
+        return cl2lovsub_lock(slice);
+}
+
+static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice)
+{
+        LINVRNT(lov_is_object(&slice->cls_obj->co_lu));
+        return container_of(slice, struct lov_lock, lls_cl);
+}
+
+static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice)
+{
+        LINVRNT(lov_is_object(&slice->cpl_obj->co_lu));
+        return container_of0(slice, struct lov_page, lps_cl);
+}
+
+static inline struct lov_req *cl2lov_req(const struct cl_req_slice *slice)
+{
+        return container_of0(slice, struct lov_req, lr_cl);
+}
+
+static inline struct lovsub_page *
+cl2lovsub_page(const struct cl_page_slice *slice)
+{
+        LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu));
+        return container_of0(slice, struct lovsub_page, lsb_cl);
+}
+
+static inline struct lovsub_req *cl2lovsub_req(const struct cl_req_slice *slice)
+{
+        return container_of0(slice, struct lovsub_req, lsrq_cl);
+}
+
+static inline struct cl_page *lov_sub_page(const struct cl_page_slice *slice)
+{
+        return slice->cpl_page->cp_child;
+}
+
+static inline struct lov_io *cl2lov_io(const struct lu_env *env,
+                                const struct cl_io_slice *ios)
+{
+        struct lov_io *lio;
+
+        lio = container_of(ios, struct lov_io, lis_cl);
+        LASSERT(lio == lov_env_io(env));
+        return lio;
+}
+
+static inline int lov_targets_nr(const struct lov_device *lov)
+{
+        return lov->ld_lov->desc.ld_tgt_count;
+}
+
+static inline struct lov_thread_info *lov_env_info(const struct lu_env *env)
+{
+        struct lov_thread_info *info;
+
+        info = lu_context_key_get(&env->le_ctx, &lov_key);
+        LASSERT(info != NULL);
+        return info;
+}
+
+static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov)
+{
+        struct lov_layout_raid0 *raid0;
+
+        LASSERT(lov->lo_type == LLT_RAID0);
+        raid0 = &lov->u.raid0;
+        LASSERT(raid0->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC);
+        return raid0;
+}
+
+/** @} lov */
+
+#endif
+
diff --git a/lustre/lov/lov_dev.c b/lustre/lov/lov_dev.c

new file mode 100644 (file)

index 0000000..32dfe3a
--- /dev/null
+++ b/lustre/lov/lov_dev.c
@@ -0,0 +1,540 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device and cl_device_type for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+/* class_name2obd() */
+#include <obd_class.h>
+
+#include "lov_cl_internal.h"
+
+/*
+ * Caches backing the per-layer structures of the LOV layer. Each one is
+ * described (name and object size) by an entry in lov_caches[] below.
+ */
+cfs_mem_cache_t *lov_page_kmem;
+cfs_mem_cache_t *lov_lock_kmem;
+cfs_mem_cache_t *lov_object_kmem;
+cfs_mem_cache_t *lov_thread_kmem;
+cfs_mem_cache_t *lov_session_kmem;
+cfs_mem_cache_t *lov_req_kmem;
+
+/* Caches backing the LOVSUB layer structures. */
+cfs_mem_cache_t *lovsub_page_kmem;
+cfs_mem_cache_t *lovsub_lock_kmem;
+cfs_mem_cache_t *lovsub_object_kmem;
+cfs_mem_cache_t *lovsub_req_kmem;
+
+/* Cache for struct lov_lock_link objects. */
+cfs_mem_cache_t *lov_lock_link_kmem;
+
+/** Lock class of lov_device::ld_mutex. */
+struct lock_class_key cl_lov_device_mutex_class;
+
+/**
+ * Descriptors (cache pointer, name, object size) of all LOV and LOVSUB
+ * caches declared above. The array is terminated by an entry whose
+ * ckd_cache is NULL.
+ *
+ * NOTE(review): presumably consumed by the generic lu_kmem init/fini
+ * helpers at module load/unload -- confirm against the caller.
+ */
+struct lu_kmem_descr lov_caches[] = {
+        {
+                .ckd_cache = &lov_page_kmem,
+                .ckd_name  = "lov_page_kmem",
+                .ckd_size  = sizeof (struct lov_page)
+        },
+        {
+                .ckd_cache = &lov_lock_kmem,
+                .ckd_name  = "lov_lock_kmem",
+                .ckd_size  = sizeof (struct lov_lock)
+        },
+        {
+                .ckd_cache = &lov_object_kmem,
+                .ckd_name  = "lov_object_kmem",
+                .ckd_size  = sizeof (struct lov_object)
+        },
+        {
+                .ckd_cache = &lov_thread_kmem,
+                .ckd_name  = "lov_thread_kmem",
+                .ckd_size  = sizeof (struct lov_thread_info)
+        },
+        {
+                .ckd_cache = &lov_session_kmem,
+                .ckd_name  = "lov_session_kmem",
+                .ckd_size  = sizeof (struct lov_session)
+        },
+        {
+                .ckd_cache = &lov_req_kmem,
+                .ckd_name  = "lov_req_kmem",
+                .ckd_size  = sizeof (struct lov_req)
+        },
+        {
+                .ckd_cache = &lovsub_page_kmem,
+                .ckd_name  = "lovsub_page_kmem",
+                .ckd_size  = sizeof (struct lovsub_page)
+        },
+        {
+                .ckd_cache = &lovsub_lock_kmem,
+                .ckd_name  = "lovsub_lock_kmem",
+                .ckd_size  = sizeof (struct lovsub_lock)
+        },
+        {
+                .ckd_cache = &lovsub_object_kmem,
+                .ckd_name  = "lovsub_object_kmem",
+                .ckd_size  = sizeof (struct lovsub_object)
+        },
+        {
+                .ckd_cache = &lovsub_req_kmem,
+                .ckd_name  = "lovsub_req_kmem",
+                .ckd_size  = sizeof (struct lovsub_req)
+        },
+        {
+                .ckd_cache = &lov_lock_link_kmem,
+                .ckd_name  = "lov_lock_link_kmem",
+                .ckd_size  = sizeof (struct lov_lock_link)
+        },
+        {
+                /* terminator */
+                .ckd_cache = NULL
+        }
+};
+
+/*****************************************************************************
+ *
+ * Lov transfer operations.
+ *
+ */
+
+/**
+ * cl_req_operations::cro_completion() method of the LOV layer.
+ *
+ * Returns the lov_req that embeds \a slice to lov_req_kmem. Neither \a env
+ * nor \a ioret is used.
+ */
+static void lov_req_completion(const struct lu_env *env,
+                               const struct cl_req_slice *slice, int ioret)
+{
+        struct lov_req *lr = cl2lov_req(slice);
+
+        ENTRY;
+        OBD_SLAB_FREE_PTR(lr, lov_req_kmem);
+        EXIT;
+}
+
+/** Request operations of the LOV layer; only completion is implemented. */
+static const struct cl_req_operations lov_req_ops = {
+        .cro_completion = lov_req_completion
+};
+
+/*****************************************************************************
+ *
+ * Lov device and device type functions.
+ *
+ */
+
+/**
+ * lu_context_key::lct_init() for lov_key.
+ *
+ * Allocates a lov_thread_info from lov_thread_kmem and initializes the list
+ * head of its embedded closure.
+ *
+ * \retval the new lov_thread_info on success
+ * \retval ERR_PTR(-ENOMEM) if the allocation failed
+ */
+static void *lov_key_init(const struct lu_context *ctx,
+                          struct lu_context_key *key)
+{
+        struct lov_thread_info *info;
+
+        OBD_SLAB_ALLOC_PTR(info, lov_thread_kmem);
+        if (info == NULL)
+                return ERR_PTR(-ENOMEM);
+
+        CFS_INIT_LIST_HEAD(&info->lti_closure.clc_list);
+        return info;
+}
+
+/**
+ * lu_context_key::lct_fini() for lov_key.
+ *
+ * Frees the lov_thread_info passed as \a data back to lov_thread_kmem. The
+ * closure list is expected to be empty by now (checked by LINVRNT).
+ */
+static void lov_key_fini(const struct lu_context *ctx,
+                         struct lu_context_key *key, void *data)
+{
+        struct lov_thread_info *info = data;
+        LINVRNT(list_empty(&info->lti_closure.clc_list));
+        OBD_SLAB_FREE_PTR(info, lov_thread_kmem);
+}
+
+/**
+ * Context key under which the lov_thread_info is stored; tagged
+ * LCT_CL_THREAD. See lov_env_info() for the accessor.
+ */
+struct lu_context_key lov_key = {
+        .lct_tags = LCT_CL_THREAD,
+        .lct_init = lov_key_init,
+        .lct_fini = lov_key_fini
+};
+
+/**
+ * lu_context_key::lct_init() for lov_session_key.
+ *
+ * \retval a lov_session allocated from lov_session_kmem on success
+ * \retval ERR_PTR(-ENOMEM) if the allocation failed
+ */
+static void *lov_session_key_init(const struct lu_context *ctx,
+                                  struct lu_context_key *key)
+{
+        struct lov_session *info;
+
+        OBD_SLAB_ALLOC_PTR(info, lov_session_kmem);
+        if (info != NULL)
+                return info;
+
+        return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * lu_context_key::lct_fini() for lov_session_key: frees the lov_session
+ * passed as \a data back to lov_session_kmem.
+ */
+static void lov_session_key_fini(const struct lu_context *ctx,
+                                 struct lu_context_key *key, void *data)
+{
+        struct lov_session *info = data;
+        OBD_SLAB_FREE_PTR(info, lov_session_kmem);
+}
+
+/** Context key under which the lov_session is stored; tagged LCT_SESSION. */
+struct lu_context_key lov_session_key = {
+        .lct_tags = LCT_SESSION,
+        .lct_init = lov_session_key_init,
+        .lct_fini = lov_session_key_fini
+};
+
+/* type constructor/destructor: lov_type_{init,fini,start,stop}() */
+LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key);
+
+/**
+ * lu_device_type_operations::ldto_device_fini() for the LOV device.
+ *
+ * Finalizes the stack of every instantiated sub-device with cl_stack_fini()
+ * and clears its slot in ld_target[]. Does nothing if ld_target[] was never
+ * allocated.
+ *
+ * \retval NULL always
+ */
+static struct lu_device *lov_device_fini(const struct lu_env *env,
+                                         struct lu_device *d)
+{
+        int i;
+        struct lov_device *ld = lu2lov_dev(d);
+
+        /* pairs with the RETURN() calls below, as in the rest of the file */
+        ENTRY;
+        LASSERT(ld->ld_lov != NULL);
+        if (ld->ld_target == NULL)
+                RETURN(NULL);
+
+        lov_foreach_target(ld, i) {
+                struct lovsub_device *lsd;
+
+                lsd = ld->ld_target[i];
+                if (lsd != NULL) {
+                        cl_stack_fini(env, lovsub2cl_dev(lsd));
+                        ld->ld_target[i] = NULL;
+                }
+        }
+        RETURN(NULL);
+}
+
+/**
+ * lu_device_type_operations::ldto_device_init() for the LOV device.
+ *
+ * For every active target, sets up a lovsub device stacked on top of the
+ * target's lu_device and records it in ld_target[]. On any failure all
+ * sub-devices set up so far are torn down via lov_device_fini(). On success
+ * the device is flagged LOV_DEV_INITIALIZED.
+ *
+ * \a name and \a next are not used.
+ *
+ * \retval 0 on success (also when ld_target[] has not been allocated yet)
+ * \retval negative error code returned by cl_type_setup() on failure
+ */
+static int lov_device_init(const struct lu_env *env, struct lu_device *d,
+                           const char *name, struct lu_device *next)
+{
+        struct lov_device *ld = lu2lov_dev(d);
+        int i;
+        int rc = 0;
+
+        /* pairs with the RETURN() calls below */
+        ENTRY;
+        LASSERT(d->ld_site != NULL);
+        if (ld->ld_target == NULL)
+                RETURN(rc);
+
+        lov_foreach_target(ld, i) {
+                struct lovsub_device *lsd;
+                struct cl_device     *cl;
+                struct lov_tgt_desc  *desc;
+
+                desc = ld->ld_lov->lov_tgts[i];
+                /*
+                 * A lov_tgts[] slot is not guaranteed to be populated
+                 * (lov_cl_add_target() has to assert it for its own index),
+                 * so skip empty slots instead of dereferencing NULL.
+                 */
+                if (desc != NULL && desc->ltd_active) {
+                        cl = cl_type_setup(env, d->ld_site, &lovsub_device_type,
+                                           desc->ltd_exp->exp_obd->obd_lu_dev);
+                        if (IS_ERR(cl)) {
+                                rc = PTR_ERR(cl);
+                                break;
+                        }
+                        lsd = cl2lovsub_dev(cl);
+                        lsd->acid_idx = i;
+                        lsd->acid_super = ld;
+                        ld->ld_target[i] = lsd;
+                }
+        }
+
+        if (rc)
+                lov_device_fini(env, d);
+        else
+                ld->ld_flags |= LOV_DEV_INITIALIZED;
+
+        RETURN(rc);
+}
+
+/**
+ * cl_device_operations::cdo_req_init() for the LOV layer.
+ *
+ * Allocates a lov_req from lov_req_kmem and attaches its slice to \a req
+ * with lov_req_ops as the operations vector.
+ *
+ * \retval 0       on success
+ * \retval -ENOMEM if the lov_req could not be allocated
+ */
+static int lov_req_init(const struct lu_env *env, struct cl_device *dev,
+                        struct cl_req *req)
+{
+        struct lov_req *lr;
+
+        ENTRY;
+        OBD_SLAB_ALLOC_PTR(lr, lov_req_kmem);
+        if (lr == NULL)
+                RETURN(-ENOMEM);
+
+        cl_req_slice_add(req, &lr->lr_cl, dev, &lov_req_ops);
+        RETURN(0);
+}
+
+/** cl_device operations of the LOV device; only request init is provided. */
+static const struct cl_device_operations lov_cl_ops = {
+        .cdo_req_init = lov_req_init
+};
+
+/**
+ * Frees an array of \a nr emergency slots together with the array itself.
+ *
+ * Empty (NULL) slots are skipped. For each populated slot the page list is
+ * asserted to be empty, the environment (if any) is released with
+ * cl_env_put(), and the slot is freed.
+ */
+static void lov_emerg_free(struct lov_device_emerg **emrg, int nr)
+{
+        int i;
+
+        for (i = 0; i < nr; ++i) {
+                struct lov_device_emerg *em = emrg[i];
+
+                if (em == NULL)
+                        continue;
+
+                LASSERT(em->emrg_page_list.pl_nr == 0);
+                if (em->emrg_env != NULL)
+                        cl_env_put(em->emrg_env, &em->emrg_refcheck);
+                OBD_FREE_PTR(em);
+        }
+        OBD_FREE(emrg, nr * sizeof emrg[0]);
+}
+
+/**
+ * lu_device_type_operations::ldto_device_free() for the LOV device.
+ *
+ * Finalizes the embedded cl_device, then frees the ld_target[] array and the
+ * emergency slots (both sized by ld_target_nr) if they were allocated, and
+ * finally the lov_device itself.
+ *
+ * \retval NULL always
+ */
+static struct lu_device *lov_device_free(const struct lu_env *env,
+                                         struct lu_device *d)
+{
+        struct lov_device *ld = lu2lov_dev(d);
+        const int          nr = ld->ld_target_nr;
+
+        cl_device_fini(lu2cl_dev(d));
+        if (ld->ld_target != NULL)
+                OBD_FREE(ld->ld_target, nr * sizeof ld->ld_target[0]);
+        if (ld->ld_emrg != NULL)
+                lov_emerg_free(ld->ld_emrg, nr);
+        OBD_FREE_PTR(ld);
+        return NULL;
+}
+
+/**
+ * Tears down the sub-device stack of target \a index, if one is set up, and
+ * clears its slot in ld_target[].
+ *
+ * NOTE(review): ld_target[] itself is dereferenced unconditionally; callers
+ * are assumed to guarantee that it is allocated and covers \a index.
+ */
+static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev,
+                              __u32 index)
+{
+        struct lov_device    *ld = lu2lov_dev(dev);
+        struct lovsub_device *lsd;
+        ENTRY;
+
+        lsd = ld->ld_target[index];
+        if (lsd != NULL) {
+                cl_stack_fini(env, lovsub2cl_dev(lsd));
+                ld->ld_target[index] = NULL;
+        }
+        EXIT;
+}
+
+/**
+ * Allocates an array of \a nr emergency slots, each with an initialized
+ * (empty) page list and its own environment obtained from cl_env_alloc().
+ *
+ * If anything fails part-way, everything allocated so far is released with
+ * lov_emerg_free().
+ *
+ * \retval the new array on success
+ * \retval ERR_PTR(-ENOMEM) or ERR_PTR() of the cl_env_alloc() error on
+ *         failure
+ */
+static struct lov_device_emerg **lov_emerg_alloc(int nr)
+{
+        struct lov_device_emerg **emerg;
+        int i;
+        int result;
+
+        OBD_ALLOC(emerg, nr * sizeof emerg[0]);
+        if (emerg == NULL)
+                return ERR_PTR(-ENOMEM);
+        /* the loop stops at the first failure; later slots stay unset */
+        for (result = i = 0; i < nr && result == 0; i++) {
+                struct lov_device_emerg *em;
+                void *cookie;
+
+                OBD_ALLOC_PTR(em);
+                if (em != NULL) {
+                        /* store the slot first so that lov_emerg_free()
+                         * can find it on the error path */
+                        emerg[i] = em;
+                        cl_page_list_init(&em->emrg_page_list);
+                        cookie = cl_env_reenter();
+                        em->emrg_env = cl_env_alloc(&em->emrg_refcheck,
+                                                    LCT_REMEMBER|LCT_NOREF);
+                        cl_env_reexit(cookie);
+                        if (!IS_ERR(em->emrg_env))
+                                /* NOTE(review): the meaning of cookie value
+                                 * 0x2 is not visible here -- confirm */
+                                em->emrg_env->le_ctx.lc_cookie = 0x2;
+                        else {
+                                result = PTR_ERR(em->emrg_env);
+                                /* lov_emerg_free() must not cl_env_put()
+                                 * an error pointer */
+                                em->emrg_env = NULL;
+                        }
+                } else
+                        result = -ENOMEM;
+        }
+        if (result != 0) {
+                lov_emerg_free(emerg, nr);
+                emerg = ERR_PTR(result);
+        }
+        return emerg;
+}
+
+/**
+ * Grows dev->ld_target[] and dev->ld_emrg[] to match the size of the
+ * underlying target table (ld_lov->lov_tgt_size).
+ *
+ * Nothing is done when ld_target[] is already at least that large. The new
+ * arrays are allocated outside of ld_mutex; the old contents of ld_target[]
+ * are copied and the arrays are switched while holding the mutex. The old
+ * emergency slots are freed and replaced by freshly allocated ones rather
+ * than copied.
+ *
+ * \retval 0 on success
+ * \retval negative error code on allocation failure
+ */
+static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev)
+{
+        int   result;
+        __u32 tgt_size;
+        __u32 sub_size;
+
+        ENTRY;
+        result = 0;
+        tgt_size = dev->ld_lov->lov_tgt_size;
+        sub_size = dev->ld_target_nr;
+        if (sub_size < tgt_size) {
+                struct lovsub_device    **newd;
+                struct lov_device_emerg **emerg;
+                const size_t              sz   = sizeof newd[0];
+
+                emerg = lov_emerg_alloc(tgt_size);
+                if (IS_ERR(emerg))
+                        RETURN(PTR_ERR(emerg));
+
+                OBD_ALLOC(newd, tgt_size * sz);
+                if (newd != NULL) {
+                        mutex_lock(&dev->ld_mutex);
+                        if (sub_size > 0) {
+                                /* carry over the already set up sub-devices */
+                                memcpy(newd, dev->ld_target, sub_size * sz);
+                                OBD_FREE(dev->ld_target, sub_size * sz);
+                        }
+                        dev->ld_target    = newd;
+                        dev->ld_target_nr = tgt_size;
+
+                        /* the old emergency array has the old size */
+                        if (dev->ld_emrg != NULL)
+                                lov_emerg_free(dev->ld_emrg, sub_size);
+                        dev->ld_emrg = emerg;
+                        mutex_unlock(&dev->ld_mutex);
+                } else {
+                        lov_emerg_free(emerg, tgt_size);
+                        result = -ENOMEM;
+                }
+        }
+        RETURN(result);
+}
+
+/**
+ * Sets up the client-side sub-device for the target at \a index.
+ *
+ * Expands the per-target arrays if needed. If the LOV device has already
+ * been initialized (LOV_DEV_INITIALIZED), a lovsub device is set up on top
+ * of the target's lu_device and recorded in ld_target[index]; otherwise
+ * only the arrays are expanded.
+ *
+ * \retval 0 on success
+ * \retval negative error code from lov_expand_targets() or cl_type_setup()
+ */
+static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev,
+                             __u32 index)
+{
+        struct obd_device    *obd = dev->ld_obd;
+        struct lov_device    *ld  = lu2lov_dev(dev);
+        struct lov_tgt_desc  *tgt;
+        struct lovsub_device *lsd;
+        struct cl_device     *cl;
+        int rc;
+        ENTRY;
+
+        lov_getref(obd);
+
+        tgt = obd->u.lov.lov_tgts[index];
+        LASSERT(tgt != NULL);
+
+        rc = lov_expand_targets(env, ld);
+        if (rc == 0 && (ld->ld_flags & LOV_DEV_INITIALIZED)) {
+                LASSERT(dev->ld_site != NULL);
+                cl = cl_type_setup(env, dev->ld_site, &lovsub_device_type,
+                                   tgt->ltd_exp->exp_obd->obd_lu_dev);
+                if (!IS_ERR(cl)) {
+                        lsd = cl2lovsub_dev(cl);
+                        lsd->acid_idx = index;
+                        lsd->acid_super = ld;
+                        ld->ld_target[index] = lsd;
+                } else {
+                        /*
+                         * Fetch the error code before logging: rc still
+                         * holds 0 from lov_expand_targets() at this point.
+                         */
+                        rc = PTR_ERR(cl);
+                        CERROR("add failed (%d), deleting %s\n", rc,
+                               obd_uuid2str(&tgt->ltd_uuid));
+                        lov_cl_del_target(env, dev, index);
+                }
+        }
+        lov_putref(obd);
+        RETURN(rc);
+}
+
+/**
+ * lu_device_operations::ldo_process_config() for the LOV device.
+ *
+ * Lets lov_process_config_base() handle the command first, then mirrors
+ * target addition and deletion on the client-side device: a sub-device is
+ * added for LCFG_LOV_ADD_OBD / LCFG_LOV_ADD_INA and removed for
+ * LCFG_LOV_DEL_OBD. If the sub-device cannot be added, the target added by
+ * the base handler is deleted again.
+ *
+ * \retval 0 on success
+ * \retval negative error code otherwise
+ */
+static int lov_process_config(const struct lu_env *env,
+                              struct lu_device *d, struct lustre_cfg *cfg)
+{
+        struct obd_device *obd = d->ld_obd;
+        int cmd;
+        int rc;
+        int gen;
+        __u32 index;
+
+        /* pairs with the RETURN() below */
+        ENTRY;
+        lov_getref(obd);
+
+        cmd = cfg->lcfg_command;
+        rc = lov_process_config_base(obd, cfg, &index, &gen);
+        if (rc == 0) {
+                switch(cmd) {
+                case LCFG_LOV_ADD_OBD:
+                case LCFG_LOV_ADD_INA:
+                        rc = lov_cl_add_target(env, d, index);
+                        if (rc != 0)
+                                lov_del_target(obd, index, 0, 0);
+                        break;
+                case LCFG_LOV_DEL_OBD:
+                        lov_cl_del_target(env, d, index);
+                        break;
+                default:
+                        /* other commands need no client-side work here */
+                        break;
+                }
+        }
+        lov_putref(obd);
+        RETURN(rc);
+}
+
+/** lu_device operations of the LOV device. */
+static const struct lu_device_operations lov_lu_ops = {
+        .ldo_object_alloc      = lov_object_alloc,
+        .ldo_process_config    = lov_process_config,
+};
+
+/**
+ * lu_device_type_operations::ldto_device_alloc() for the LOV device.
+ *
+ * Allocates and initializes a lov_device, installs its operation vectors
+ * and mutex, then runs lov_setup() on the obd device named by the first
+ * string of \a cfg. That obd device must exist (asserted).
+ *
+ * \retval the new lu_device on success
+ * \retval ERR_PTR(-ENOMEM) if the device could not be allocated
+ * \retval ERR_PTR(rc) if lov_setup() failed with rc; the device is freed
+ */
+static struct lu_device *lov_device_alloc(const struct lu_env *env,
+                                          struct lu_device_type *t,
+                                          struct lustre_cfg *cfg)
+{
+        struct lu_device *d;
+        struct lov_device *ld;
+        struct obd_device *obd;
+        int rc;
+
+        /* pairs with the RETURN() calls below */
+        ENTRY;
+        OBD_ALLOC_PTR(ld);
+        if (ld == NULL)
+                RETURN(ERR_PTR(-ENOMEM));
+
+        cl_device_init(&ld->ld_cl, t);
+        d = lov2lu_dev(ld);
+        d->ld_ops        = &lov_lu_ops;
+        ld->ld_cl.cd_ops = &lov_cl_ops;
+
+        mutex_init(&ld->ld_mutex);
+        lockdep_set_class(&ld->ld_mutex, &cl_lov_device_mutex_class);
+
+        /* setup the LOV OBD */
+        obd = class_name2obd(lustre_cfg_string(cfg, 0));
+        LASSERT(obd != NULL);
+        rc = lov_setup(obd, cfg);
+        if (rc) {
+                lov_device_free(env, d);
+                RETURN(ERR_PTR(rc));
+        }
+
+        ld->ld_lov = &obd->u.lov;
+        RETURN(d);
+}
+
+/**
+ * Device type operations of LOV. The lov_type_{init,fini,start,stop}()
+ * methods are generated by LU_TYPE_INIT_FINI() above.
+ */
+static const struct lu_device_type_operations lov_device_type_ops = {
+        .ldto_init = lov_type_init,
+        .ldto_fini = lov_type_fini,
+
+        .ldto_start = lov_type_start,
+        .ldto_stop  = lov_type_stop,
+
+        .ldto_device_alloc = lov_device_alloc,
+        .ldto_device_free  = lov_device_free,
+
+        .ldto_device_init    = lov_device_init,
+        .ldto_device_fini    = lov_device_fini
+};
+
+/** The LOV device type: a client (LU_DEVICE_CL) device named LUSTRE_LOV_NAME. */
+struct lu_device_type lov_device_type = {
+        .ldt_tags     = LU_DEVICE_CL,
+        .ldt_name     = LUSTRE_LOV_NAME,
+        .ldt_ops      = &lov_device_type_ops,
+        .ldt_ctx_tags = LCT_CL_THREAD
+};
+EXPORT_SYMBOL(lov_device_type);
+
+/** @} lov */
diff --git a/lustre/lov/lov_ea.c b/lustre/lov/lov_ea.c

index 457d9dc..564246a 100755 (executable)
--- a/lustre/lov/lov_ea.c
+++ b/lustre/lov/lov_ea.c
@@ -500,7 +500,7 @@ static int lsm_revalidate_join(struct lov_stripe_md *lsm,
          OBD_ALLOC(lsm->lsm_array->lai_ext_array,lsm->lsm_array->lai_ext_count *
                                                  sizeof (struct lov_extent));
          if (!lsm->lsm_array->lai_ext_array)
-                GOTO(release_ctxt, rc = -ENOMEM);        
+                GOTO(release_ctxt, rc = -ENOMEM);
  
          CDEBUG(D_INFO, "get lsm logid: "LPU64":"LPU64"\n",
                 lsm->lsm_array->lai_array_id.lgl_oid,
@@ -526,7 +526,7 @@ release_ctxt:
          RETURN(rc);
  }
  
-int lsm_destroy_join(struct lov_stripe_md *lsm, struct obdo *oa, 
+int lsm_destroy_join(struct lov_stripe_md *lsm, struct obdo *oa,
                        struct obd_export *md_exp)
  {
          struct llog_ctxt *ctxt;
diff --git a/lustre/lov/lov_internal.h b/lustre/lov/lov_internal.h

index c9468ae..b7e49c9 100644 (file)
--- a/lustre/lov/lov_internal.h
+++ b/lustre/lov/lov_internal.h
@@ -82,26 +82,6 @@ struct lov_request_set {
          struct list_head         set_list;
  };
  
-#define LOV_AP_MAGIC 8200
-
-struct lov_async_page {
-        int                             lap_magic;
-        int                             lap_stripe;
-        obd_off                         lap_sub_offset;
-        obd_id                          lap_loi_id;
-        obd_gr                          lap_loi_gr;
-        void                            *lap_sub_cookie;
-        struct obd_async_page_ops       *lap_caller_ops;
-        void                            *lap_caller_data;
-};
-
-static inline struct lov_async_page *lap_from_cookie(void *ptr)
-{
-        struct lov_async_page *ap = ptr;
-        LASSERT(ap->lap_magic == LOV_AP_MAGIC);
-        return ap;
-}
-
  extern cfs_mem_cache_t *lov_oinfo_slab;
  
  static inline void lov_llh_addref(void *llhp)
@@ -142,7 +122,7 @@ static inline void lov_llh_put(struct lov_lock_handles *llh)
                  atomic_read(&llh->llh_refcount) < 0x5a5a);
          if (atomic_dec_and_test(&llh->llh_refcount)) {
                  class_handle_unhash(&llh->llh_handle);
-                /* The structure may be held by other threads because RCU. 
+                /* The structure may be held by other threads because RCU.
                   *   -jxiong */
                  if (atomic_read(&llh->llh_refcount))
                          return;
@@ -163,6 +143,8 @@ int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm,
                    struct ost_lvb *lvb, int kms_only);
  int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm,
                     obd_off size, int shrink);
+int lov_merge_lvb_kms(struct lov_stripe_md *lsm,
+                      struct ost_lvb *lvb, __u64 *kms_place);
  
  /* lov_offset.c */
  obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size,
@@ -264,10 +246,16 @@ void lov_fix_desc_qos_maxage(__u32 *val);
  int lov_get_stripecnt(struct lov_obd *lov, __u32 stripe_count);
  void lov_getref(struct obd_device *obd);
  void lov_putref(struct obd_device *obd);
-
+int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
+                    struct obd_connect_data *data);
+int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg,
+                            __u32 *indexp, int *genp);
+int lov_del_target(struct obd_device *obd, __u32 index,
+                   struct obd_uuid *uuidp, int gen);
  /* lov_log.c */
  int lov_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
-                  struct obd_device *tgt, int count, struct llog_catid *logid, 
+                  struct obd_device *tgt, int count, struct llog_catid *logid,
                    struct obd_uuid *uuid);
  int lov_llog_finish(struct obd_device *obd, int count);
  
@@ -312,6 +300,9 @@ static inline void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars)
  }
  #endif
  
+/* lov_dev.c */
+extern struct lu_device_type lov_device_type;
+
  /* pools */
  extern lustre_hash_ops_t pool_hash_operations;
  /* ost_pool methods */
@@ -330,5 +321,4 @@ void lov_dump_pool(int level, struct pool_desc *pool);
  struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname);
  int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool);
  
-
  #endif
diff --git a/lustre/lov/lov_io.c b/lustre/lov/lov_io.c

new file mode 100644 (file)

index 0000000..346a992
--- /dev/null
+++ b/lustre/lov/lov_io.c
@@ -0,0 +1,894 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov lov @{ */
+
+/**
+ * Enters the environment of sub-io \a sub.
+ *
+ * Calls may nest; sub_reenter counts the nesting depth. Only the outermost
+ * call saves the result of cl_env_reenter() in sub_cookie and implants
+ * sub->sub_env with cl_env_implant(). Must be balanced by lov_sub_exit().
+ */
+static void lov_sub_enter(struct lov_io_sub *sub)
+{
+        ENTRY;
+        /* post-increment: the test sees the depth before this call */
+        if (sub->sub_reenter++ == 0) {
+                sub->sub_cookie = cl_env_reenter();
+                cl_env_implant(sub->sub_env, &sub->sub_refcheck2);
+        }
+        EXIT;
+}
+
+/**
+ * Leaves the environment of sub-io \a sub; counterpart of lov_sub_enter().
+ *
+ * Only the call that drops the nesting depth back to zero unplants
+ * sub->sub_env and hands the cookie saved on entry to cl_env_reexit().
+ */
+static void lov_sub_exit(struct lov_io_sub *sub)
+{
+        ENTRY;
+        /* pre-decrement: the test sees the depth after this call */
+        if (--sub->sub_reenter == 0) {
+                cl_env_unplant(sub->sub_env, &sub->sub_refcheck2);
+                cl_env_reexit(sub->sub_cookie);
+        }
+        EXIT;
+}
+
+/**
+ * Releases everything lov_io_sub_init() attached to \a sub.
+ *
+ * If the sub-io was initialized it is finalized with cl_io_fini() inside
+ * the sub-io environment and the count of active sub-ios is dropped. The
+ * sub-io memory is freed unless it is the embedded single sub-io of \a lio
+ * (whose index is then reset to -1) or was borrowed. The environment
+ * reference is put unless the environment was borrowed.
+ *
+ * On return sub->sub_io and sub->sub_env are both NULL, so \a sub can be
+ * passed to lov_io_sub_init() again.
+ */
+static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio,
+                            struct lov_io_sub *sub)
+{
+        ENTRY;
+        if (sub->sub_io != NULL) {
+                if (sub->sub_io_initialized) {
+                        lov_sub_enter(sub);
+                        cl_io_fini(sub->sub_env, sub->sub_io);
+                        lov_sub_exit(sub);
+                        sub->sub_io_initialized = 0;
+                        lio->lis_active_subios--;
+                }
+                if (sub->sub_stripe == lio->lis_single_subio_index)
+                        lio->lis_single_subio_index = -1;
+                else if (!sub->sub_borrowed)
+                        OBD_FREE_PTR(sub->sub_io);
+                sub->sub_io = NULL;
+        }
+        if (sub->sub_env != NULL) {
+                /*
+                 * lov_io_sub_init() may leave an ERR_PTR here when
+                 * cl_env_get() fails. It holds no reference to put, but it
+                 * still has to be cleared: a later lov_io_sub_init() on
+                 * this slot asserts that sub_env is NULL.
+                 */
+                if (!IS_ERR(sub->sub_env) && !sub->sub_borrowed)
+                        cl_env_put(sub->sub_env, &sub->sub_refcheck);
+                sub->sub_env = NULL;
+        }
+        EXIT;
+}
+
+/**
+ * Copies the type-specific parameters of the parent io of \a lio into the
+ * sub-io \a io for stripe \a stripe.
+ *
+ * - CIT_TRUNC: the truncate size is translated to the stripe with
+ *   lov_size_to_stripe(); the capability is passed through.
+ * - CIT_FAULT: the fault descriptor is copied and its page index is
+ *   translated to the stripe.
+ * - CIT_READ / CIT_WRITE: position and count are set from \a start and
+ *   \a end.
+ * - other types: nothing to inherit.
+ */
+static void lov_io_sub_inherit(struct cl_io *io, struct lov_io *lio,
+                               int stripe, loff_t start, loff_t end)
+{
+        struct lov_stripe_md *lsm    = lov_r0(lio->lis_object)->lo_lsm;
+        struct cl_io         *parent = lio->lis_cl.cis_io;
+
+        switch(io->ci_type) {
+        case CIT_TRUNC: {
+                /*
+                 * Hold the size in loff_t, like the CIT_FAULT branch does
+                 * for the same helper: size_t is only 32 bits wide on
+                 * 32-bit platforms and would truncate large file sizes.
+                 */
+                loff_t new_size = parent->u.ci_truncate.tr_size;
+
+                new_size = lov_size_to_stripe(lsm, new_size, stripe);
+                io->u.ci_truncate.tr_capa = parent->u.ci_truncate.tr_capa;
+                io->u.ci_truncate.tr_size = new_size;
+                break;
+        }
+        case CIT_FAULT: {
+                struct cl_object *obj = parent->ci_obj;
+                loff_t off = cl_offset(obj, parent->u.ci_fault.ft_index);
+
+                io->u.ci_fault = parent->u.ci_fault;
+                off = lov_size_to_stripe(lsm, off, stripe);
+                io->u.ci_fault.ft_index = cl_index(obj, off);
+                break;
+        }
+        case CIT_READ:
+        case CIT_WRITE: {
+                io->u.ci_rw.crw_pos = start;
+                io->u.ci_rw.crw_count = end - start;
+                break;
+        }
+        default:
+                break;
+        }
+}
+
+/**
+ * Sets up the sub-io of \a lio for the stripe recorded in sub->sub_stripe.
+ *
+ * The cl_io and the environment for the sub-io come from one of three
+ * places:
+ *  - the first active sub-io uses the embedded lio->lis_single_subio and
+ *    the current environment;
+ *  - while lio->lis_mem_frozen is set, the per-stripe emergency slot of the
+ *    device is borrowed (ld_mutex must be held);
+ *  - otherwise both are allocated.
+ *
+ * The sub-io then inherits object, parent, lock requirements and type from
+ * the top io and is initialized with cl_io_sub_init() inside the sub-io
+ * environment. On failure everything is undone by lov_io_sub_fini().
+ *
+ * \pre sub->sub_io == NULL && sub->sub_env == NULL
+ *
+ * \retval 0 on success
+ * \retval negative error code on failure
+ */
+static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio,
+                           struct lov_io_sub *sub)
+{
+        struct lov_object *lov = lio->lis_object;
+        struct lov_device *ld  = lu2lov_dev(lov2cl(lov)->co_lu.lo_dev);
+        struct cl_io      *sub_io;
+        struct cl_object  *sub_obj;
+        struct cl_io      *io  = lio->lis_cl.cis_io;
+
+        int stripe = sub->sub_stripe;
+        int result;
+
+        LASSERT(sub->sub_io == NULL);
+        LASSERT(sub->sub_env == NULL);
+        LASSERT(sub->sub_stripe < lio->lis_stripe_count);
+        ENTRY;
+
+        result = 0;
+        sub->sub_io_initialized = 0;
+        sub->sub_borrowed = 0;
+
+        /*
+         * First sub-io. Use ->lis_single_subio and current environment, to
+         * avoid dynamic allocation.
+         */
+        if (lio->lis_active_subios == 0) {
+                sub->sub_io = &lio->lis_single_subio;
+                lio->lis_single_subio_index = stripe;
+                sub->sub_env = cl_env_get(&sub->sub_refcheck);
+                LASSERT(sub->sub_env == env);
+        } else if (lio->lis_mem_frozen) {
+                /* no allocation allowed: borrow the emergency slot */
+                LASSERT(mutex_is_locked(&ld->ld_mutex));
+                sub->sub_io  = &ld->ld_emrg[stripe]->emrg_subio;
+                sub->sub_env = ld->ld_emrg[stripe]->emrg_env;
+                sub->sub_borrowed = 1;
+        } else {
+                void *cookie;
+
+                /* obtain new environment */
+                cookie = cl_env_reenter();
+                sub->sub_env = cl_env_get(&sub->sub_refcheck);
+                cl_env_reexit(cookie);
+
+                /* both results are checked together below; whatever was
+                 * obtained is released by lov_io_sub_fini() on failure */
+                OBD_ALLOC_PTR(sub->sub_io);
+                if (IS_ERR(sub->sub_env))
+                        result = PTR_ERR(sub->sub_env);
+                else if (sub->sub_io == NULL)
+                        result = -ENOMEM;
+        }
+
+        if (result == 0) {
+                sub_obj = lovsub2cl(lov_r0(lov)->lo_sub[stripe]);
+                sub_io  = sub->sub_io;
+
+                sub_io->ci_obj    = sub_obj;
+                sub_io->ci_result = 0;
+
+                sub_io->ci_parent  = io;
+                sub_io->ci_lockreq = io->ci_lockreq;
+                sub_io->ci_type    = io->ci_type;
+
+                lov_sub_enter(sub);
+                result = cl_io_sub_init(sub->sub_env, sub_io,
+                                        io->ci_type, sub_obj);
+                lov_sub_exit(sub);
+                /* any non-negative result counts as success */
+                if (result >= 0) {
+                        lio->lis_active_subios++;
+                        sub->sub_io_initialized = 1;
+                        result = 0;
+                }
+        }
+        if (result != 0)
+                lov_io_sub_fini(env, lio, sub);
+        RETURN(result);
+}
+
+/**
+ * Returns the sub-io of \a lio for \a stripe, initializing it on first use,
+ * with its environment entered. The caller releases it with lov_sub_put().
+ *
+ * \retval the sub-io on success
+ * \retval ERR_PTR() of the lov_io_sub_init() error on failure
+ */
+static struct lov_io_sub *lov_sub_get(const struct lu_env *env,
+                                      struct lov_io *lio, int stripe)
+{
+        int rc = 0;
+        struct lov_io_sub *sub = &lio->lis_subs[stripe];
+
+        LASSERT(stripe < lio->lis_stripe_count);
+        ENTRY;
+
+        if (!sub->sub_io_initialized) {
+                sub->sub_stripe = stripe;
+                rc = lov_io_sub_init(env, lio, sub);
+        }
+        if (rc != 0)
+                RETURN(ERR_PTR(rc));
+
+        lov_sub_enter(sub);
+        RETURN(sub);
+}
+
+/** Releases a sub-io obtained with lov_sub_get() by leaving its environment. */
+void lov_sub_put(struct lov_io_sub *sub)
+{
+        lov_sub_exit(sub);
+}
+
+/*****************************************************************************
+ *
+ * Lov io operations.
+ *
+ */
+
+/**
+ * Returns the stripe index of \a page.
+ *
+ * The index is taken from the lovsub object (lso_index) located, by device
+ * type, in the object header of the page's child page. The lovsub object
+ * must exist (asserted).
+ */
+static int lov_page_stripe(const struct cl_page *page)
+{
+        struct lovsub_object *subobj;
+
+        ENTRY;
+        subobj = lu2lovsub(
+                lu_object_locate(page->cp_child->cp_obj->co_lu.lo_header,
+                                 &lovsub_device_type));
+        LASSERT(subobj != NULL);
+        RETURN(subobj->lso_index);
+}
+
+/**
+ * Returns the sub-io of \a lio that serves the stripe of the page owning
+ * \a slice, entered as by lov_sub_get(). The caller releases it with
+ * lov_sub_put().
+ *
+ * \retval the sub-io on success
+ * \retval ERR_PTR() on failure to set the sub-io up
+ */
+struct lov_io_sub *lov_page_subio(const struct lu_env *env, struct lov_io *lio,
+                                  const struct cl_page_slice *slice)
+{
+        struct lov_stripe_md *lsm  = lov_r0(lio->lis_object)->lo_lsm;
+        struct cl_page       *page = slice->cpl_page;
+
+        LASSERT(lio->lis_cl.cis_io != NULL);
+        LASSERT(cl2lov(slice->cpl_obj) == lio->lis_object);
+        LASSERT(lsm != NULL);
+        LASSERT(lio->lis_nr_subios > 0);
+        ENTRY;
+
+        RETURN(lov_sub_get(env, lio, lov_page_stripe(page)));
+}
+
+
+/**
+ * Allocates the array of sub-io descriptors of \a lio, one per stripe, and
+ * resets the sub-io bookkeeping (no single sub-io in use, no active
+ * sub-ios). \a env and \a io are not used.
+ *
+ * \retval 0       on success
+ * \retval -ENOMEM if the array could not be allocated
+ */
+static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio,
+                             struct cl_io *io)
+{
+        struct lov_object    *lov = lio->lis_object;
+        struct lov_stripe_md *lsm = lov_r0(lov)->lo_lsm;
+
+        LASSERT(lio->lis_object != NULL);
+        ENTRY;
+
+        /*
+         * This needs to be optimized: we cannot afford to allocate memory
+         * every time a page is written. -jay
+         */
+        OBD_ALLOC(lio->lis_subs,
+                  lsm->lsm_stripe_count * sizeof lio->lis_subs[0]);
+        if (lio->lis_subs == NULL)
+                RETURN(-ENOMEM);
+
+        lio->lis_nr_subios = lio->lis_stripe_count;
+        lio->lis_single_subio_index = -1;
+        lio->lis_active_subios = 0;
+        RETURN(0);
+}
+
+/**
+ * Initializes the LOV part \a lio of io \a io against object \a obj.
+ *
+ * Records the object and its stripe count, resets io->ci_result, and
+ * derives the byte range [lis_pos, lis_endpos) covered by the io from its
+ * type:
+ *  - read/write: the requested range; an append write covers the whole
+ *    object, while lis_io_endpos keeps the end of the requested range;
+ *  - truncate: from the new size to end of object;
+ *  - fault: the single page at the fault index;
+ *  - misc: the whole object.
+ * Any other io type is a bug (LBUG).
+ */
+static void lov_io_slice_init(struct lov_io *lio,
+                              struct lov_object *obj, struct cl_io *io)
+{
+        struct lov_stripe_md *lsm = lov_r0(obj)->lo_lsm;
+
+        LASSERT(lsm != NULL);
+        ENTRY;
+
+        io->ci_result = 0;
+        lio->lis_object = obj;
+        lio->lis_stripe_count = lsm->lsm_stripe_count;
+
+        switch (io->ci_type) {
+        case CIT_READ:
+        case CIT_WRITE:
+                lio->lis_pos = io->u.ci_rw.crw_pos;
+                lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count;
+                lio->lis_io_endpos = lio->lis_endpos;
+                if (cl_io_is_append(io)) {
+                        /* the final position of an append is not known
+                         * here, so cover the whole object */
+                        LASSERT(io->ci_type == CIT_WRITE);
+                        lio->lis_pos = 0;
+                        lio->lis_endpos = OBD_OBJECT_EOF;
+                }
+                break;
+
+        case CIT_TRUNC:
+                lio->lis_pos = io->u.ci_truncate.tr_size;
+                lio->lis_endpos = OBD_OBJECT_EOF;
+                break;
+
+        case CIT_FAULT: {
+                pgoff_t index = io->u.ci_fault.ft_index;
+                lio->lis_pos = cl_offset(io->ci_obj, index);
+                lio->lis_endpos = cl_offset(io->ci_obj, index + 1);
+                break;
+        }
+
+        case CIT_MISC:
+                lio->lis_pos = 0;
+                lio->lis_endpos = OBD_OBJECT_EOF;
+                break;
+
+        default:
+                LBUG();
+        }
+
+        EXIT;
+}
+
+/**
+ * lov implementation of cl_io_operations::cio_fini(): finalizes every sub-io
+ * slot and releases the lis_subs array, if one was allocated.
+ */
+static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+        struct lov_io *lio = cl2lov_io(env, ios);
+        int idx;
+
+        ENTRY;
+        if (lio->lis_subs == NULL) {
+                /* nothing was allocated, nothing to tear down */
+                EXIT;
+                return;
+        }
+
+        idx = 0;
+        while (idx < lio->lis_nr_subios) {
+                lov_io_sub_fini(env, lio, &lio->lis_subs[idx]);
+                idx++;
+        }
+        OBD_FREE(lio->lis_subs,
+                 lio->lis_nr_subios * sizeof lio->lis_subs[0]);
+        lio->lis_nr_subios = 0;
+        EXIT;
+}
+
+/**
+ * Shifts offset \a val by \a delta, except that OBD_OBJECT_EOF is returned
+ * unchanged.
+ */
+static obd_off lov_offset_mod(obd_off val, int delta)
+{
+        return val == OBD_OBJECT_EOF ? val : val + delta;
+}
+
+/**
+ * lov implementation of cl_io_operations::cio_iter_init(): for every stripe
+ * intersecting [lis_pos, lis_endpos) sets up the matching sub-io, runs
+ * cl_io_iter_init() on it and links it into lov_io::lis_active.
+ *
+ * Stops at the first failure and returns its error code; returns 0 otherwise.
+ */
+static int lov_io_iter_init(const struct lu_env *env,
+                            const struct cl_io_slice *ios)
+{
+        struct lov_io        *lio = cl2lov_io(env, ios);
+        struct lov_stripe_md *lsm = lov_r0(lio->lis_object)->lo_lsm;
+        struct lov_io_sub    *sub;
+        obd_off endpos;
+        obd_off start;
+        obd_off end;
+        int stripe;
+        int rc = 0;
+
+        ENTRY;
+        /* lov_stripe_intersects() is handed the last byte, not one past it */
+        endpos = lov_offset_mod(lio->lis_endpos, -1);
+        for (stripe = 0; stripe < lio->lis_stripe_count; stripe++) {
+                if (!lov_stripe_intersects(lsm, stripe, lio->lis_pos,
+                                           endpos, &start, &end))
+                        continue;
+
+                /* back to an exclusive end for the sub-io extent */
+                end = lov_offset_mod(end, +1);
+                sub = lov_sub_get(env, lio, stripe);
+                if (IS_ERR(sub)) {
+                        rc = PTR_ERR(sub);
+                        break;
+                }
+
+                lov_io_sub_inherit(sub->sub_io, lio, stripe, start, end);
+                rc = cl_io_iter_init(sub->sub_env, sub->sub_io);
+                lov_sub_put(sub);
+                CDEBUG(D_VFSTRACE, "shrink: %i [%llu, %llu)\n",
+                       stripe, start, end);
+                if (rc != 0)
+                        break;
+
+                list_add_tail(&sub->sub_linkage, &lio->lis_active);
+        }
+        RETURN(rc);
+}
+
+/**
+ * cio_iter_init() method used for CIT_READ and CIT_WRITE ios.
+ *
+ * Unless the io has exactly one sub-io or is an append, the current iteration
+ * is clipped so that it ends no later than the next multiple of the stripe
+ * size after crw_pos; io->ci_continue is set when that boundary lies before
+ * lis_io_endpos. The clipped extent is then handed to lov_io_iter_init().
+ *
+ * \retval result of lov_io_iter_init()
+ */
+static int lov_io_rw_iter_init(const struct lu_env *env,
+                               const struct cl_io_slice *ios)
+{
+        struct lov_io        *lio = cl2lov_io(env, ios);
+        struct cl_io         *io  = ios->cis_io;
+        struct lov_stripe_md *lsm = lov_r0(cl2lov(ios->cis_obj))->lo_lsm;
+        loff_t start = io->u.ci_rw.crw_pos;
+        loff_t next;
+        int ssize = lsm->lsm_stripe_size;
+
+        LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE);
+        ENTRY;
+
+        /*
+         * fast path for common case: a single sub-io or an append skips the
+         * clipping below.
+         */
+        if (lio->lis_nr_subios != 1 && !cl_io_is_append(io)) {
+
+                /*
+                 * NOTE(review): relies on do_div() leaving the quotient in
+                 * start, i.e. the number of the stripe-size chunk holding
+                 * crw_pos (it is printed as "stripe" below) - confirm.
+                 */
+                do_div(start, ssize);
+                /* offset at which the following chunk begins */
+                next = (start + 1) * ssize;
+                /* the multiplication wrapped around: saturate instead */
+                if (next <= start * ssize)
+                        next = ~0ull;
+
+                io->ci_continue = next < lio->lis_io_endpos;
+                io->u.ci_rw.crw_count = min_t(loff_t, lio->lis_io_endpos,
+                                              next) - io->u.ci_rw.crw_pos;
+                lio->lis_pos    = io->u.ci_rw.crw_pos;
+                lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count;
+                CDEBUG(D_VFSTRACE, "stripe: %llu chunk: [%llu, %llu) %llu\n",
+                       (__u64)start, lio->lis_pos, lio->lis_endpos,
+                       (__u64)lio->lis_io_endpos);
+        }
+        /*
+         * XXX The following call should be optimized: we know, that
+         * [lio->lis_pos, lio->lis_endpos) intersects with exactly one stripe.
+         */
+        RETURN(lov_io_iter_init(env, ios));
+}
+
+/**
+ * Invokes \a iofunc on every sub-io linked into lov_io::lis_active, each call
+ * bracketed by lov_sub_enter()/lov_sub_exit().
+ *
+ * Iteration ends at the first non-zero result, which is returned; 0 is
+ * returned when every call succeeded or the list is empty.
+ */
+static int lov_io_call(const struct lu_env *env, struct lov_io *lio,
+                       int (*iofunc)(const struct lu_env *, struct cl_io *))
+{
+        struct lov_io_sub *sub;
+        int result = 0;
+
+        ENTRY;
+        list_for_each_entry(sub, &lio->lis_active, sub_linkage) {
+                lov_sub_enter(sub);
+                result = iofunc(sub->sub_env, sub->sub_io);
+                lov_sub_exit(sub);
+                if (result != 0)
+                        break;
+        }
+        RETURN(result);
+}
+
+/**
+ * lov implementation of cl_io_operations::cio_lock(): runs cl_io_lock() on
+ * every active sub-io.
+ */
+static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+        struct lov_io *lio;
+
+        ENTRY;
+        lio = cl2lov_io(env, ios);
+        RETURN(lov_io_call(env, lio, cl_io_lock));
+}
+
+/**
+ * lov implementation of cl_io_operations::cio_start(): runs cl_io_start() on
+ * every active sub-io.
+ */
+static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+        struct lov_io *lio;
+
+        ENTRY;
+        lio = cl2lov_io(env, ios);
+        RETURN(lov_io_call(env, lio, cl_io_start));
+}
+
+/**
+ * lov_io_call() adapter for ending a sub-io. Always returns 0.
+ */
+static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+        ENTRY;
+        /*
+         * lov_io_start() may never have been called against this sub-io:
+         * either a previous sub-io failed, or the upper layer completed the
+         * IO. In that case only mark the sub-io as finished.
+         */
+        if (io->ci_state != CIS_IO_GOING)
+                io->ci_state = CIS_IO_FINISHED;
+        else
+                cl_io_end(env, io);
+        RETURN(0);
+}
+
+/**
+ * Adapter that gives cl_io_iter_fini() the int-returning signature expected
+ * by lov_io_call(). Always returns 0.
+ */
+static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+        /* pair the RETURN() below with an ENTRY, as lov_io_end_wrapper() does */
+        ENTRY;
+        cl_io_iter_fini(env, io);
+        RETURN(0);
+}
+
+/**
+ * Adapter that gives cl_io_unlock() the int-returning signature expected by
+ * lov_io_call(). Always returns 0.
+ */
+static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io)
+{
+        /* pair the RETURN() below with an ENTRY, as lov_io_end_wrapper() does */
+        ENTRY;
+        cl_io_unlock(env, io);
+        RETURN(0);
+}
+
+/**
+ * lov implementation of cl_io_operations::cio_end(): ends every active
+ * sub-io through lov_io_end_wrapper().
+ */
+static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios)
+{
+        struct lov_io *lio = cl2lov_io(env, ios);
+        int rc;
+
+        rc = lov_io_call(env, lio, lov_io_end_wrapper);
+        /* the wrapper always returns 0 */
+        LASSERT(rc == 0);
+}
+
+/**
+ * lov implementation of cl_io_operations::cio_iter_fini(): finishes the
+ * iteration on every active sub-io, then empties lov_io::lis_active.
+ */
+static void lov_io_iter_fini(const struct lu_env *env,
+                             const struct cl_io_slice *ios)
+{
+        struct lov_io    *lio    = cl2lov_io(env, ios);
+        struct list_head *active = &lio->lis_active;
+        int rc;
+
+        ENTRY;
+        rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper);
+        LASSERT(rc == 0);
+        /* unlink the sub-ios one by one, always taking the first entry */
+        for (; !list_empty(active); )
+                list_del_init(active->next);
+        EXIT;
+}
+
+/**
+ * lov implementation of cl_io_operations::cio_unlock(): unlocks every active
+ * sub-io through lov_io_unlock_wrapper().
+ */
+static void lov_io_unlock(const struct lu_env *env,
+                          const struct cl_io_slice *ios)
+{
+        struct lov_io *lio;
+        int rc;
+
+        ENTRY;
+        lio = cl2lov_io(env, ios);
+        rc = lov_io_call(env, lio, lov_io_unlock_wrapper);
+        /* the wrapper always returns 0 */
+        LASSERT(rc == 0);
+        EXIT;
+}
+
+
+/**
+ * Selects the page list used for stripe \a idx by lov_io_submit(): slot
+ * \a idx of the caller-supplied array \a qin when \a alloc is non-zero,
+ * otherwise the emrg_page_list of lov_device::ld_emrg[\a idx].
+ */
+static struct cl_page_list *lov_io_submit_qin(struct lov_device *ld,
+                                              struct cl_page_list *qin,
+                                              int idx, int alloc)
+{
+        if (alloc)
+                return &qin[idx];
+
+        return &ld->ld_emrg[idx]->emrg_page_list;
+}
+
+/**
+ * lov implementation of cl_operations::cio_submit() method. It takes a list
+ * of pages in \a queue, splits it into per-stripe sub-lists, invokes
+ * cl_io_submit() on underlying devices to submit sub-lists, and then splices
+ * everything back.
+ *
+ * Major complication of this function is a need to handle memory cleansing:
+ * cl_io_submit() is called to write out pages as a part of VM memory
+ * reclamation, and hence it may not fail due to memory shortages (system
+ * dead-locks otherwise). To deal with this, some resources (sub-lists,
+ * sub-environment, etc.) are allocated per-device on "startup" (i.e., in a
+ * not-memory cleansing context), and in case of memory shortage, these
+ * pre-allocated resources are used by lov_io_submit() under
+ * lov_device::ld_mutex mutex.
+ */
+static int lov_io_submit(const struct lu_env *env,
+                         const struct cl_io_slice *ios,
+                         enum cl_req_type crt, struct cl_2queue *queue)
+{
+        struct lov_io          *lio = cl2lov_io(env, ios);
+        struct lov_object      *obj = lio->lis_object;
+        struct lov_device       *ld = lu2lov_dev(lov2cl(obj)->co_lu.lo_dev);
+        struct cl_page_list    *qin = &queue->c2_qin;
+        struct cl_2queue      *cl2q = &lov_env_info(env)->lti_cl2q;
+        struct cl_page_list *stripes_qin = NULL;
+        struct cl_page *page;
+        struct cl_page *tmp;
+        int stripe;
+
+/* per-stripe input list: allocated array or the device emergency lists */
+#define QIN(stripe) lov_io_submit_qin(ld, stripes_qin, stripe, alloc)
+
+        int rc = 0;
+        /*
+         * alloc is zero only in Linux kernel builds when the current task has
+         * PF_MEMALLOC set; in every other build it is always 1.
+         */
+        int alloc =
+#if defined(__KERNEL__) && defined(__linux__)
+                !(current->flags & PF_MEMALLOC);
+#else
+                1;
+#endif
+        ENTRY;
+        /* exactly one active sub-io: pass the whole queue to it unchanged */
+        if (lio->lis_active_subios == 1) {
+                int idx = lio->lis_single_subio_index;
+                struct lov_io_sub *sub;
+
+                LASSERT(idx < lio->lis_nr_subios);
+                sub = lov_sub_get(env, lio, idx);
+                LASSERT(!IS_ERR(sub));
+                LASSERT(sub->sub_io == &lio->lis_single_subio);
+                rc = cl_io_submit_rw(sub->sub_env, sub->sub_io, crt, queue);
+                lov_sub_put(sub);
+                RETURN(rc);
+        }
+
+        LASSERT(lio->lis_subs != NULL);
+        if (alloc) {
+                /* normal context: one freshly allocated list per sub-io */
+                OBD_ALLOC(stripes_qin,
+                          sizeof(*stripes_qin) * lio->lis_nr_subios);
+                if (stripes_qin == NULL)
+                        RETURN(-ENOMEM);
+
+                for (stripe = 0; stripe < lio->lis_nr_subios; stripe++)
+                        cl_page_list_init(&stripes_qin[stripe]);
+        } else {
+                /*
+                 * If we get here, it means that pageout & swap didn't help.
+                 * In order not to make things worse, don't even try to
+                 * allocate memory with __GFP_NOWARN. -jay
+                 */
+                mutex_lock(&ld->ld_mutex);
+                lio->lis_mem_frozen = 1;
+        }
+
+        cl_2queue_init(cl2q);
+        /* distribute the incoming pages over the per-stripe lists */
+        cl_page_list_for_each_safe(page, tmp, qin) {
+                stripe = lov_page_stripe(page);
+                cl_page_list_move(QIN(stripe), qin, page);
+        }
+
+        /* submit every non-empty per-stripe list through its sub-io */
+        for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) {
+                struct lov_io_sub   *sub;
+                struct cl_page_list *sub_qin = QIN(stripe);
+
+                if (list_empty(&sub_qin->pl_pages))
+                        continue;
+
+                cl_page_list_splice(sub_qin, &cl2q->c2_qin);
+                sub = lov_sub_get(env, lio, stripe);
+                if (!IS_ERR(sub)) {
+                        rc = cl_io_submit_rw(sub->sub_env, sub->sub_io,
+                                             crt, cl2q);
+                        lov_sub_put(sub);
+                } else
+                        rc = PTR_ERR(sub);
+                /* hand both lists of the scratch 2-queue back to the caller */
+                cl_page_list_splice(&cl2q->c2_qin,  &queue->c2_qin);
+                cl_page_list_splice(&cl2q->c2_qout, &queue->c2_qout);
+                if (rc != 0)
+                        break;
+        }
+
+        /* after an early break some per-stripe lists may still hold pages */
+        for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) {
+                struct cl_page_list *sub_qin = QIN(stripe);
+
+                if (list_empty(&sub_qin->pl_pages))
+                        continue;
+
+                cl_page_list_splice(sub_qin, qin);
+        }
+
+        if (alloc) {
+                OBD_FREE(stripes_qin,
+                         sizeof(*stripes_qin) * lio->lis_nr_subios);
+        } else {
+                int i;
+
+                /* finalize the sub-ios that ended up on emergency sub-ios */
+                for (i = 0; i < lio->lis_nr_subios; i++) {
+                        struct cl_io *cio = lio->lis_subs[i].sub_io;
+
+                        if (cio && cio == &ld->ld_emrg[i]->emrg_subio)
+                                lov_io_sub_fini(env, lio, &lio->lis_subs[i]);
+                }
+                lio->lis_mem_frozen = 0;
+                mutex_unlock(&ld->ld_mutex);
+        }
+
+        RETURN(rc);
+#undef QIN
+}
+
+/**
+ * lov implementation of cl_io_operations::cio_prepare_write(): forwards the
+ * call for the sub-page of \a slice to the sub-io of the page's stripe.
+ *
+ * Returns the error of lov_page_subio() if no sub-io could be obtained,
+ * otherwise the result of cl_io_prepare_write().
+ */
+static int lov_io_prepare_write(const struct lu_env *env,
+                                const struct cl_io_slice *ios,
+                                const struct cl_page_slice *slice,
+                                unsigned from, unsigned to)
+{
+        struct lov_io     *lio      = cl2lov_io(env, ios);
+        struct cl_page    *sub_page = lov_sub_page(slice);
+        struct lov_io_sub *sub;
+        int rc;
+
+        ENTRY;
+        sub = lov_page_subio(env, lio, slice);
+        if (IS_ERR(sub))
+                RETURN(PTR_ERR(sub));
+
+        rc = cl_io_prepare_write(sub->sub_env, sub->sub_io,
+                                 sub_page, from, to);
+        lov_sub_put(sub);
+        RETURN(rc);
+}
+
+/**
+ * lov implementation of cl_io_operations::cio_commit_write(): forwards the
+ * call for the sub-page of \a slice to the sub-io of the page's stripe.
+ *
+ * Returns the error of lov_page_subio() if no sub-io could be obtained,
+ * otherwise the result of cl_io_commit_write().
+ */
+static int lov_io_commit_write(const struct lu_env *env,
+                               const struct cl_io_slice *ios,
+                               const struct cl_page_slice *slice,
+                               unsigned from, unsigned to)
+{
+        struct lov_io     *lio      = cl2lov_io(env, ios);
+        struct cl_page    *sub_page = lov_sub_page(slice);
+        struct lov_io_sub *sub;
+        int rc;
+
+        ENTRY;
+        sub = lov_page_subio(env, lio, slice);
+        if (IS_ERR(sub))
+                RETURN(PTR_ERR(sub));
+
+        rc = cl_io_commit_write(sub->sub_env, sub->sub_io,
+                                sub_page, from, to);
+        lov_sub_put(sub);
+        RETURN(rc);
+}
+
+/**
+ * cio_start() method used for CIT_FAULT ios.
+ *
+ * Copies ft_nob from the top-level fault io into the sub-io of the stripe
+ * that holds the faulted page, then starts the active sub-ios through
+ * lov_io_start().
+ *
+ * \retval negative error code if the sub-io of the page's stripe could not
+ *         be obtained
+ * \retval result of lov_io_start() otherwise
+ */
+static int lov_io_fault_start(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+        struct cl_fault_io *fio;
+        struct lov_io      *lio;
+        struct lov_io_sub  *sub;
+
+        ENTRY;
+        fio = &ios->cis_io->u.ci_fault;
+        lio = cl2lov_io(env, ios);
+        sub = lov_sub_get(env, lio, lov_page_stripe(fio->ft_page));
+        /* lov_sub_get() reports failure as ERR_PTR(): never dereference it */
+        if (IS_ERR(sub))
+                RETURN(PTR_ERR(sub));
+        sub->sub_io->u.ci_fault.ft_nob = fio->ft_nob;
+        lov_sub_put(sub);
+        RETURN(lov_io_start(env, ios));
+}
+
+/**
+ * io operation vector registered by lov_io_init_raid0().
+ *
+ * CIT_READ and CIT_WRITE use lov_io_rw_iter_init() as their cio_iter_init
+ * method, CIT_TRUNC and CIT_FAULT use lov_io_iter_init() directly, and
+ * CIT_FAULT starts through lov_io_fault_start(). CIT_MISC only provides
+ * cio_fini.
+ */
+static const struct cl_io_operations lov_io_ops = {
+        .op = {
+                [CIT_READ] = {
+                        .cio_fini      = lov_io_fini,
+                        .cio_iter_init = lov_io_rw_iter_init,
+                        .cio_iter_fini = lov_io_iter_fini,
+                        .cio_lock      = lov_io_lock,
+                        .cio_unlock    = lov_io_unlock,
+                        .cio_start     = lov_io_start,
+                        .cio_end       = lov_io_end
+                },
+                [CIT_WRITE] = {
+                        .cio_fini      = lov_io_fini,
+                        .cio_iter_init = lov_io_rw_iter_init,
+                        .cio_iter_fini = lov_io_iter_fini,
+                        .cio_lock      = lov_io_lock,
+                        .cio_unlock    = lov_io_unlock,
+                        .cio_start     = lov_io_start,
+                        .cio_end       = lov_io_end
+                },
+                [CIT_TRUNC] = {
+                        .cio_fini      = lov_io_fini,
+                        .cio_iter_init = lov_io_iter_init,
+                        .cio_iter_fini = lov_io_iter_fini,
+                        .cio_lock      = lov_io_lock,
+                        .cio_unlock    = lov_io_unlock,
+                        .cio_start     = lov_io_start,
+                        .cio_end       = lov_io_end
+                },
+                [CIT_FAULT] = {
+                        .cio_fini      = lov_io_fini,
+                        .cio_iter_init = lov_io_iter_init,
+                        .cio_iter_fini = lov_io_iter_fini,
+                        .cio_lock      = lov_io_lock,
+                        .cio_unlock    = lov_io_unlock,
+                        .cio_start     = lov_io_fault_start,
+                        .cio_end       = lov_io_end
+                },
+                [CIT_MISC] = {
+                        .cio_fini   = lov_io_fini
+                }
+        },
+        .req_op = {
+                 [CRT_READ] = {
+                         .cio_submit    = lov_io_submit
+                 },
+                 [CRT_WRITE] = {
+                         .cio_submit    = lov_io_submit
+                 }
+         },
+        .cio_prepare_write = lov_io_prepare_write,
+        .cio_commit_write  = lov_io_commit_write
+};
+
+/*****************************************************************************
+ *
+ * Empty lov io operations.
+ *
+ */
+
+/**
+ * cio_fini() method of lov_empty_io_ops. Does nothing apart from the
+ * ENTRY/EXIT tracing.
+ */
+static void lov_empty_io_fini(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+        ENTRY;
+        EXIT;
+}
+
+/**
+ * Placeholder for lov_empty_io_ops methods that are not expected to be
+ * called: invoking it triggers LBUG().
+ */
+static void lov_empty_impossible(const struct lu_env *env,
+                                 struct cl_io_slice *ios)
+{
+        LBUG();
+}
+
+/*
+ * lov_empty_impossible() cast to void *, so that it can initialize method
+ * pointers of differing signatures in lov_empty_io_ops.
+ */
+#define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible)
+
+/**
+ * An io operation vector for files without stripes.
+ *
+ * Registered by lov_io_init_empty(). Methods other than cio_fini are set to
+ * LOV_EMPTY_IMPOSSIBLE, which calls LBUG(). The CIT_READ entry is compiled
+ * out with "#if 0", so all of its methods are left unset.
+ */
+static const struct cl_io_operations lov_empty_io_ops = {
+        .op = {
+                [CIT_READ] = {
+#if 0
+                        .cio_fini       = lov_empty_io_fini,
+                        .cio_iter_init  = LOV_EMPTY_IMPOSSIBLE,
+                        .cio_lock       = LOV_EMPTY_IMPOSSIBLE,
+                        .cio_start      = LOV_EMPTY_IMPOSSIBLE,
+                        .cio_end        = LOV_EMPTY_IMPOSSIBLE
+#endif
+                },
+                [CIT_WRITE] = {
+                        .cio_fini      = lov_empty_io_fini,
+                        .cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+                        .cio_lock      = LOV_EMPTY_IMPOSSIBLE,
+                        .cio_start     = LOV_EMPTY_IMPOSSIBLE,
+                        .cio_end       = LOV_EMPTY_IMPOSSIBLE
+                },
+                [CIT_TRUNC] = {
+                        .cio_fini      = lov_empty_io_fini,
+                        .cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+                        .cio_lock      = LOV_EMPTY_IMPOSSIBLE,
+                        .cio_start     = LOV_EMPTY_IMPOSSIBLE,
+                        .cio_end       = LOV_EMPTY_IMPOSSIBLE
+                },
+                [CIT_FAULT] = {
+                        .cio_fini      = lov_empty_io_fini,
+                        .cio_iter_init = LOV_EMPTY_IMPOSSIBLE,
+                        .cio_lock      = LOV_EMPTY_IMPOSSIBLE,
+                        .cio_start     = LOV_EMPTY_IMPOSSIBLE,
+                        .cio_end       = LOV_EMPTY_IMPOSSIBLE
+                },
+                [CIT_MISC] = {
+                        .cio_fini   = lov_empty_io_fini
+                }
+        },
+        .req_op = {
+                 [CRT_READ] = {
+                         .cio_submit    = LOV_EMPTY_IMPOSSIBLE
+                 },
+                 [CRT_WRITE] = {
+                         .cio_submit    = LOV_EMPTY_IMPOSSIBLE
+                 }
+         },
+        .cio_commit_write = LOV_EMPTY_IMPOSSIBLE
+};
+
+/**
+ * Sets up the lov slice of \a io for \a obj: initializes the slice state,
+ * allocates the sub-io array and, when both steps succeed, registers the
+ * slice with lov_io_ops.
+ *
+ * Returns io->ci_result, which is 0 on success.
+ */
+int lov_io_init_raid0(const struct lu_env *env, struct cl_object *obj,
+                      struct cl_io *io)
+{
+        struct lov_io       *lio = lov_env_io(env);
+        struct lov_object   *lov = cl2lov(obj);
+
+        ENTRY;
+        CFS_INIT_LIST_HEAD(&lio->lis_active);
+        lov_io_slice_init(lio, lov, io);
+        if (io->ci_result != 0)
+                RETURN(io->ci_result);
+
+        LASSERT(lov_r0(lov)->lo_lsm != NULL);
+        io->ci_result = lov_io_subio_init(env, lio, io);
+        /* the slice is only added once its sub-io array exists */
+        if (io->ci_result == 0)
+                cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops);
+
+        RETURN(io->ci_result);
+}
+
+/**
+ * Sets up the lov slice of \a io for a file without stripes.
+ *
+ * CIT_MISC and CIT_READ succeed and register the slice with
+ * lov_empty_io_ops; CIT_WRITE and CIT_TRUNC fail with -EBADF, CIT_FAULT
+ * fails with -EFAULT and logs an error. The code is stored in io->ci_result.
+ *
+ * \retval 0 on success
+ * \retval 1 on failure (the error code itself is in io->ci_result)
+ */
+int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj,
+                      struct cl_io *io)
+{
+        struct lov_io *lio = lov_env_io(env);
+        int result;
+
+        ENTRY;
+        switch (io->ci_type) {
+        /* unknown io types hit LBUG() here, ahead of the CIT_MISC label */
+        default:
+                LBUG();
+        case CIT_MISC:
+        case CIT_READ:
+                result = 0;
+                break;
+        case CIT_WRITE:
+        case CIT_TRUNC:
+                result = -EBADF;
+                break;
+        case CIT_FAULT:
+                result = -EFAULT;
+                CERROR("Page fault on a file without stripes: "DFID"\n",
+                       PFID(lu_object_fid(&obj->co_lu)));
+                break;
+        }
+        if (result == 0)
+                cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops);
+        io->ci_result = result;
+        /* note: the return value is a 0/1 flag, not the error code */
+        RETURN(result != 0);
+}
+
+/** @} lov */
diff --git a/lustre/lov/lov_lock.c b/lustre/lov/lov_lock.c

new file mode 100644 (file)

index 0000000..14ecd68
--- /dev/null
+++ b/lustre/lov/lov_lock.c
@@ -0,0 +1,935 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov lov @{ */
+
+static struct cl_lock_closure *lov_closure_get(const struct lu_env *env,
+                                               struct cl_lock *parent);
+
+/*****************************************************************************
+ *
+ * Lov lock operations.
+ *
+ */
+
+/**
+ * Attaches \a sublock to slot \a idx of top-lock \a lck through \a link.
+ *
+ * Both the top-lock and \a sublock must be mutexed by the caller. The
+ * sub-lock is recorded in lck->lls_sub[idx], \a link is added to the
+ * sub-lock's lss_parents list, a reference on the top-lock is taken, the
+ * slot is flagged LSF_HELD and a user is added to \a sublock.
+ */
+static void lov_sublock_adopt(const struct lu_env *env, struct lov_lock *lck,
+                              struct cl_lock *sublock, int idx,
+                              struct lov_lock_link *link)
+{
+        struct lovsub_lock *lsl;
+        struct cl_lock     *parent = lck->lls_cl.cls_lock;
+        int                 rc;
+
+        LASSERT(cl_lock_is_mutexed(parent));
+        LASSERT(cl_lock_is_mutexed(sublock));
+        ENTRY;
+
+        lsl = cl2sub_lock(sublock);
+        /*
+         * check that sub-lock doesn't have lock link to this top-lock.
+         */
+        LASSERT(lov_lock_link_find(env, lck, lsl) == NULL);
+        LASSERT(idx < lck->lls_nr);
+
+        lck->lls_sub[idx].sub_lock = lsl;
+        lck->lls_nr_filled++;
+        LASSERT(lck->lls_nr_filled <= lck->lls_nr);
+        /* make the top-lock reachable from the sub-lock */
+        list_add_tail(&link->lll_list, &lsl->lss_parents);
+        link->lll_idx = idx;
+        link->lll_super = lck;
+        /* reference on the top-lock, tagged "lov-child" in its lu_ref */
+        cl_lock_get(parent);
+        lu_ref_add(&parent->cll_reference, "lov-child", sublock);
+        lck->lls_sub[idx].sub_flags |= LSF_HELD;
+        cl_lock_user_add(env, sublock);
+
+        rc = lov_sublock_modify(env, lck, lsl, &sublock->cll_descr, idx);
+        LASSERT(rc == 0); /* there is no way this can fail, currently */
+        EXIT;
+}
+
+/**
+ * Allocates a lock link and takes a hold on the sub-lock described by
+ * lck->lls_sub[\a idx].sub_descr.
+ *
+ * On success the held sub-lock is returned and the link is stored in \a out.
+ * On failure an ERR_PTR() value is returned (-ENOMEM when the link cannot be
+ * allocated), no link is left allocated and \a out is not written.
+ */
+static struct cl_lock *lov_sublock_alloc(const struct lu_env *env,
+                                         const struct cl_io *io,
+                                         struct lov_lock *lck,
+                                         int idx, struct lov_lock_link **out)
+{
+        struct cl_lock       *sublock;
+        struct cl_lock       *parent;
+        struct cl_lock_descr *descr;
+        struct lov_lock_link *link;
+
+        LASSERT(idx < lck->lls_nr);
+        ENTRY;
+
+        OBD_SLAB_ALLOC_PTR(link, lov_lock_link_kmem);
+        if (link == NULL) {
+                sublock = ERR_PTR(-ENOMEM);
+                RETURN(sublock);
+        }
+
+        parent = lck->lls_cl.cls_lock;
+        descr  = &lck->lls_sub[idx].sub_descr;
+
+        /* XXX maybe sub-io? */
+        sublock = cl_lock_hold(env, io, descr, "lov-parent", parent);
+        if (IS_ERR(sublock))
+                /* without a sub-lock the link is of no use */
+                OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+        else
+                *out = link;
+        RETURN(sublock);
+}
+
+/**
+ * Counterpart of lov_sublock_lock(): clears lovsub_lock::lss_active of
+ * \a lsl and tears \a closure down with cl_lock_disclosure().
+ */
+static void lov_sublock_unlock(const struct lu_env *env,
+                               struct lovsub_lock *lsl,
+                               struct cl_lock_closure *closure)
+{
+        ENTRY;
+        lsl->lss_active = NULL;
+        cl_lock_disclosure(env, closure);
+        EXIT;
+}
+
+/**
+ * Builds \a closure around the cl_lock of sub-lock \a lsl. \a closure must be
+ * empty on entry.
+ *
+ * On success (0) the sub-lock is mutexed and lsl->lss_active is set to the
+ * origin of the closure; otherwise the result of cl_lock_closure_build() is
+ * returned and lss_active is left alone.
+ */
+static int lov_sublock_lock(const struct lu_env *env, struct lovsub_lock *lsl,
+                            struct cl_lock_closure *closure)
+{
+        struct cl_lock *child;
+        int             result;
+
+        LASSERT(list_empty(&closure->clc_list));
+
+        ENTRY;
+        child  = lsl->lss_cl.cls_lock;
+        result = cl_lock_closure_build(env, child, closure);
+        if (result != 0)
+                RETURN(result);
+
+        LASSERT(cl_lock_is_mutexed(child));
+        lsl->lss_active = closure->clc_origin;
+        RETURN(0);
+}
+
+/**
+ * Folds the outcome \a rc of one sub-lock sub-operation into the running
+ * \a result of a top-lock operation and returns the new running result.
+ * Top-operations such as lov_lock_{enqueue,use,unuse}() iterate over their
+ * sub-locks and use this to calculate their own return value.
+ *
+ * Possible outcomes are ordered, from weakest to strongest, as
+ *
+ *     - 0                  success
+ *     - CLO_WAIT           wait for event
+ *     - CLO_REPEAT         repeat top-operation
+ *     - -ne                fundamental error
+ *
+ * The running result only ever moves down this list: \a rc replaces
+ * \a result only when it is strictly stronger. CLO_REPEAT overrides CLO_WAIT
+ * because the lock mutex was released and the sleeping condition has to be
+ * rechecked by the upper layer.
+ */
+static int lov_subresult(int result, int rc)
+{
+        int result_rank;
+        int rc_rank;
+
+        LASSERT(result <= 0 || result == CLO_REPEAT || result == CLO_WAIT);
+        LASSERT(rc <= 0 || rc == CLO_REPEAT || rc == CLO_WAIT);
+        CLASSERT(CLO_WAIT < CLO_REPEAT);
+
+        ENTRY;
+
+        /* any error outranks CLO_REPEAT; other codes rank as themselves */
+        if (result < 0)
+                result_rank = 1 + CLO_REPEAT;
+        else
+                result_rank = result;
+
+        if (rc < 0)
+                rc_rank = 1 + CLO_REPEAT;
+        else
+                rc_rank = rc;
+
+        RETURN(rc_rank > result_rank ? rc : result);
+}
+
+/**
+ * Creates sub-locks for a given lov_lock for the first time.
+ *
+ * Goes through all sub-objects of top-object, and creates sub-locks on every
+ * sub-object intersecting with top-lock extent. This is complicated by the
+ * fact that top-lock (that is being created) can be accessed concurrently
+ * through already created sub-locks (possibly shared with other top-locks).
+ *
+ * \retval 0        on success (some sub-locks may still be missing, see the
+ *                  comment at the end of the function)
+ * \retval -ENOMEM  if the lls_sub array cannot be allocated
+ * \retval -ve      error returned by lov_sublock_alloc()
+ */
+static int lov_lock_sub_init(const struct lu_env *env,
+                             struct lov_lock *lck, const struct cl_io *io)
+{
+        int result = 0;
+        int i;
+        int j;
+        int nr;
+        int stripe;
+        int start_stripe;
+        obd_off start;
+        obd_off end;
+        obd_off file_start;
+        obd_off file_end;
+
+        struct lov_object       *loo    = cl2lov(lck->lls_cl.cls_obj);
+        struct lov_layout_raid0 *r0     = lov_r0(loo);
+        struct cl_lock          *parent = lck->lls_cl.cls_lock;
+
+        ENTRY;
+
+        lck->lls_orig = parent->cll_descr;
+        /* byte extent of the top-lock; file_end is the last byte covered */
+        file_start = cl_offset(lov2cl(loo), parent->cll_descr.cld_start);
+        file_end   = cl_offset(lov2cl(loo), parent->cll_descr.cld_end + 1) - 1;
+
+        start_stripe = lov_stripe_number(r0->lo_lsm, file_start);
+        /* first pass: count the stripes intersecting the extent */
+        for (i = 0, nr = 0; i < r0->lo_nr; i++) {
+                /*
+                 * XXX for wide striping smarter algorithm is desirable,
+                 * breaking out of the loop, early.
+                 */
+                stripe = (start_stripe + i) % r0->lo_nr;
+                if (lov_stripe_intersects(r0->lo_lsm, stripe,
+                                          file_start, file_end, &start, &end))
+                        nr++;
+        }
+        LASSERT(nr > 0);
+        OBD_ALLOC(lck->lls_sub, nr * sizeof lck->lls_sub[0]);
+        if (lck->lls_sub == NULL)
+                RETURN(-ENOMEM);
+
+        lck->lls_nr = nr;
+        /*
+         * First, fill in sub-lock descriptions in
+         * lck->lls_sub[].sub_descr. They are used by lov_sublock_alloc()
+         * (called below in this function, and by lov_lock_enqueue()) to
+         * create sub-locks. At this moment, no other thread can access
+         * top-lock.
+         */
+        /* i == r0->lo_nr here, so the same stripes are visited again */
+        for (j = 0, nr = 0; j < i; ++j) {
+                stripe = (start_stripe + j) % r0->lo_nr;
+                if (lov_stripe_intersects(r0->lo_lsm, stripe,
+                                          file_start, file_end, &start, &end)) {
+                        struct cl_lock_descr *descr;
+
+                        descr = &lck->lls_sub[nr].sub_descr;
+
+                        LASSERT(descr->cld_obj == NULL);
+                        descr->cld_obj   = lovsub2cl(r0->lo_sub[stripe]);
+                        descr->cld_start = cl_index(descr->cld_obj, start);
+                        descr->cld_end   = cl_index(descr->cld_obj, end);
+                        descr->cld_mode  = parent->cll_descr.cld_mode;
+                        lck->lls_sub[nr].sub_got = *descr;
+                        lck->lls_sub[nr].sub_stripe = stripe;
+                        nr++;
+                }
+        }
+        LASSERT(nr == lck->lls_nr);
+        /*
+         * Then, create sub-locks. Once at least one sub-lock was created,
+         * top-lock can be reached by other threads.
+         */
+        for (i = 0; i < lck->lls_nr; ++i) {
+                struct cl_lock       *sublock;
+                struct lov_lock_link *link;
+
+                if (lck->lls_sub[i].sub_lock == NULL) {
+                        sublock = lov_sublock_alloc(env, io, lck, i, &link);
+                        if (IS_ERR(sublock)) {
+                                result = PTR_ERR(sublock);
+                                break;
+                        }
+                        /* the sub-lock mutex is taken before the parent's */
+                        cl_lock_mutex_get(env, sublock);
+                        cl_lock_mutex_get(env, parent);
+                        /*
+                         * recheck under mutex that sub-lock wasn't created
+                         * concurrently, and that top-lock is still alive.
+                         */
+                        if (lck->lls_sub[i].sub_lock == NULL &&
+                            parent->cll_state < CLS_FREEING) {
+                                lov_sublock_adopt(env, lck, sublock, i, link);
+                                cl_lock_mutex_put(env, parent);
+                        } else {
+                                cl_lock_mutex_put(env, parent);
+                                cl_lock_unhold(env, sublock,
+                                               "lov-parent", parent);
+                        }
+                        cl_lock_mutex_put(env, sublock);
+                }
+        }
+        /*
+         * Some sub-locks can be missing at this point. This is not a problem,
+         * because enqueue will create them anyway. Main duty of this function
+         * is to fill in sub-lock descriptions in a race free manner.
+         */
+        RETURN(result);
+}
+
+/**
+ * Drops the hold that top-lock \a lck keeps on its \a i-th sub-lock, if the
+ * LSF_HELD flag of that slot says one is kept; otherwise does nothing.
+ *
+ * The caller must hold the mutexes of both the top-lock and the sub-lock
+ * (asserted). When \a deluser is non-zero, cl_lock_user_del() is called on
+ * the sub-lock as well.
+ *
+ * \return \a rc, merged with CLO_REPEAT through lov_subresult() when the
+ *         parent mutex had to be released and re-acquired around
+ *         cl_lock_unhold().
+ */
+static int lov_sublock_release(const struct lu_env *env, struct lov_lock *lck,
+                               int i, int deluser, int rc)
+{
+        struct cl_lock *parent = lck->lls_cl.cls_lock;
+
+        LASSERT(cl_lock_is_mutexed(parent));
+        ENTRY;
+
+        if (lck->lls_sub[i].sub_flags & LSF_HELD) {
+                struct cl_lock *sublock;
+                int dying;
+
+                LASSERT(lck->lls_sub[i].sub_lock != NULL);
+                sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock;
+                LASSERT(cl_lock_is_mutexed(sublock));
+
+                lck->lls_sub[i].sub_flags &= ~LSF_HELD;
+                if (deluser)
+                        cl_lock_user_del(env, sublock);
+                /*
+                 * If the last hold is released, and cancellation is pending
+                 * for a sub-lock, release parent mutex, to avoid keeping it
+                 * while sub-lock is being paged out.
+                 */
+                dying = (sublock->cll_descr.cld_mode == CLM_PHANTOM ||
+                         (sublock->cll_flags & (CLF_CANCELPEND|CLF_DOOMED))) &&
+                        sublock->cll_holds == 1;
+                if (dying)
+                        cl_lock_mutex_put(env, parent);
+                cl_lock_unhold(env, sublock, "lov-parent", parent);
+                if (dying) {
+                        /* parent mutex was dropped above: re-take it and
+                         * fold CLO_REPEAT into the result. */
+                        cl_lock_mutex_get(env, parent);
+                        rc = lov_subresult(rc, CLO_REPEAT);
+                }
+                /*
+                 * From now on lck->lls_sub[i].sub_lock is a "weak" pointer,
+                 * not backed by a reference on a
+                 * sub-lock. lovsub_lock_delete() will clear
+                 * lck->lls_sub[i].sub_lock under semaphores, just before
+                 * sub-lock is destroyed.
+                 */
+        }
+        RETURN(rc);
+}
+
+/**
+ * Takes a hold on the \a i-th sub-lock on behalf of top-lock \a lck, unless
+ * LSF_HELD is already set for that slot.
+ *
+ * Sets LSF_HELD and registers both a "lov-parent" hold and a user on the
+ * sub-lock; lov_sublock_release() undoes this. The caller must hold the
+ * mutexes of the top-lock and of the sub-lock (asserted), and the sub-lock
+ * must not be in CLS_FREEING state.
+ */
+static void lov_sublock_hold(const struct lu_env *env, struct lov_lock *lck,
+                             int i)
+{
+        struct cl_lock *parent = lck->lls_cl.cls_lock;
+
+        LASSERT(cl_lock_is_mutexed(parent));
+        ENTRY;
+
+        if (!(lck->lls_sub[i].sub_flags & LSF_HELD)) {
+                struct cl_lock *sublock;
+
+                LASSERT(lck->lls_sub[i].sub_lock != NULL);
+                sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock;
+                LASSERT(cl_lock_is_mutexed(sublock));
+                LASSERT(sublock->cll_state != CLS_FREEING);
+
+                lck->lls_sub[i].sub_flags |= LSF_HELD;
+
+                /* NOTE(review): the get_trust/put pair looks like a temporary
+                 * reference kept only while the hold and the user are being
+                 * added -- confirm against cl_lock_get_trust(). */
+                cl_lock_get_trust(sublock);
+                cl_lock_hold_add(env, sublock, "lov-parent", parent);
+                cl_lock_user_add(env, sublock);
+                cl_lock_put(env, sublock);
+        }
+        EXIT;
+}
+
+/**
+ * Frees a lov lock slice: the lls_sub array (when one was allocated) and the
+ * lov_lock itself. Installed as lov_lock_ops::clo_fini.
+ *
+ * Asserts that no sub-lock is linked any more, i.e., lls_nr_filled is 0 and
+ * every lls_sub[].sub_lock pointer is NULL.
+ */
+static void lov_lock_fini(const struct lu_env *env,
+                          struct cl_lock_slice *slice)
+{
+        struct lov_lock *lck;
+        int i;
+
+        ENTRY;
+        lck = cl2lov_lock(slice);
+        LASSERT(lck->lls_nr_filled == 0);
+        if (lck->lls_sub != NULL) {
+                for (i = 0; i < lck->lls_nr; ++i)
+                        /*
+                         * No sub-locks exist at this point, as a sub-lock
+                         * holds a reference on its parent.
+                         */
+                        LASSERT(lck->lls_sub[i].sub_lock == NULL);
+                OBD_FREE(lck->lls_sub, lck->lls_nr * sizeof lck->lls_sub[0]);
+        }
+        OBD_SLAB_FREE_PTR(lck, lov_lock_kmem);
+        EXIT;
+}
+
+/**
+ * Tries to advance a state machine of a given sub-lock toward enqueuing of
+ * the top-lock.
+ *
+ * \param lck      top-lock; not referenced by the body of this function
+ * \param sublock  sub-lock to enqueue
+ * \param enqflags enqueue flags; CEF_ASYNC allows moving on without waiting
+ *                 for the sub-lock to be granted
+ * \param last     non-zero when \a sublock is the last sub-lock of the
+ *                 top-lock; CLO_WAIT is then never converted into 0
+ *
+ * \retval 0 if state-transition can proceed
+ * \retval otherwise the result of cl_enqueue_try() or cl_wait_try() (an
+ *         error, or a CLO_* code such as CLO_WAIT).
+ */
+static int lov_lock_enqueue_one(const struct lu_env *env, struct lov_lock *lck,
+                                struct cl_lock *sublock,
+                                struct cl_io *io, __u32 enqflags, int last)
+{
+        int result;
+
+        ENTRY;
+        /* first, try to enqueue a sub-lock ... */
+        result = cl_enqueue_try(env, sublock, io, enqflags);
+        if (sublock->cll_state == CLS_ENQUEUED)
+                /* if it is enqueued, try to `wait' on it---maybe it's already
+                 * granted */
+                result = cl_wait_try(env, sublock);
+        /*
+         * If CEF_ASYNC flag is set, then all sub-locks can be enqueued in
+         * parallel, otherwise---enqueue has to wait until sub-lock is granted
+         * before proceeding to the next one.
+         */
+        if (result == CLO_WAIT && sublock->cll_state <= CLS_HELD &&
+            enqflags & CEF_ASYNC && !last)
+                result = 0;
+        RETURN(result);
+}
+
+/**
+ * Helper function for lov_lock_enqueue() that creates missing sub-lock.
+ *
+ * The mutex of \a parent (asserted to be held with depth 1) is released while
+ * the sub-lock is allocated and is held again on return. The new sub-lock is
+ * adopted into slot \a idx only if \a parent is still in CLS_QUEUING state
+ * and the slot is still empty; otherwise the new sub-lock is unheld.
+ *
+ * \retval CLO_REPEAT if a sub-lock was allocated (whether or not it was
+ *         adopted)
+ * \retval PTR_ERR() of the lov_sublock_alloc() result on allocation failure
+ */
+static int lov_sublock_fill(const struct lu_env *env, struct cl_lock *parent,
+                            struct cl_io *io, struct lov_lock *lck, int idx)
+{
+        struct lov_lock_link *link;
+        struct cl_lock       *sublock;
+        int                   result;
+
+        LASSERT(parent->cll_depth == 1);
+        cl_lock_mutex_put(env, parent);
+        sublock = lov_sublock_alloc(env, io, lck, idx, &link);
+        /* mutexes are taken in sub-lock, then parent order, the same order
+         * that is used when sub-locks are first created. */
+        if (!IS_ERR(sublock))
+                cl_lock_mutex_get(env, sublock);
+        cl_lock_mutex_get(env, parent);
+
+        if (!IS_ERR(sublock)) {
+                if (parent->cll_state == CLS_QUEUING &&
+                    lck->lls_sub[idx].sub_lock == NULL)
+                        lov_sublock_adopt(env, lck, sublock, idx, link);
+                else {
+                        /* other thread allocated sub-lock, or enqueue is no
+                         * longer going on */
+                        cl_lock_mutex_put(env, parent);
+                        cl_lock_unhold(env, sublock, "lov-parent", parent);
+                        cl_lock_mutex_get(env, parent);
+                }
+                cl_lock_mutex_put(env, sublock);
+                result = CLO_REPEAT;
+        } else
+                result = PTR_ERR(sublock);
+        return result;
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_enqueue() for lov layer. This
+ * function is rather subtle, as it enqueues top-lock (i.e., advances top-lock
+ * state machine from CLS_QUEUING to CLS_ENQUEUED states) by juggling sub-lock
+ * state machines in the face of sub-locks sharing (by multiple top-locks),
+ * and concurrent sub-lock cancellations.
+ *
+ * \return the accumulated non-zero result of the sub-lock operations, if any;
+ *         otherwise 0 when every visited sub-lock reached at least
+ *         CLS_ENQUEUED state, and CLO_WAIT when some did not.
+ */
+static int lov_lock_enqueue(const struct lu_env *env,
+                            const struct cl_lock_slice *slice,
+                            struct cl_io *io, __u32 enqflags)
+{
+        struct cl_lock         *lock    = slice->cls_lock;
+        struct lov_lock        *lck     = cl2lov_lock(slice);
+        struct cl_lock_closure *closure = lov_closure_get(env, lock);
+        int i;
+        int result;
+        enum cl_lock_state minstate;
+
+        ENTRY;
+
+        /* minstate tracks the lowest state seen among the sub-locks. */
+        for (result = 0, minstate = CLS_FREEING, i = 0; i < lck->lls_nr; ++i) {
+                int rc;
+                struct lovsub_lock *sub;
+                struct cl_lock *sublock;
+
+                if (lock->cll_state != CLS_QUEUING) {
+                        /*
+                         * Lock might have left QUEUING state if previous
+                         * iteration released its mutex. Stop enqueuing in
+                         * this case and let the upper layer decide what to
+                         * do.
+                         */
+                        LASSERT(i > 0 && result != 0);
+                        break;
+                }
+
+                sub = lck->lls_sub[i].sub_lock;
+                /*
+                 * Sub-lock might have been canceled, while top-lock was
+                 * cached.
+                 */
+                if (sub == NULL) {
+                        result = lov_sublock_fill(env, lock, io, lck, i);
+                        /* lov_sublock_fill() released @lock mutex,
+                         * restart. */
+                        break;
+                }
+                sublock = sub->lss_cl.cls_lock;
+                rc = lov_sublock_lock(env, sub, closure);
+                if (rc == 0) {
+                        lov_sublock_hold(env, lck, i);
+                        rc = lov_lock_enqueue_one(env, lck, sublock, io,
+                                                  enqflags,
+                                                  i == lck->lls_nr - 1);
+                        minstate = min(minstate, sublock->cll_state);
+                        /*
+                         * Don't hold a sub-lock in CLS_CACHED state, see
+                         * description for lov_lock::lls_sub.
+                         */
+                        if (sublock->cll_state > CLS_HELD)
+                                rc = lov_sublock_release(env, lck, i, 1, rc);
+                        lov_sublock_unlock(env, sub, closure);
+                }
+                result = lov_subresult(result, rc);
+                if (result < 0)
+                        break;
+        }
+        cl_lock_closure_fini(closure);
+        RETURN(result ?: minstate >= CLS_ENQUEUED ? 0 : CLO_WAIT);
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_unuse() for lov layer: calls
+ * cl_unuse_try() on every sub-lock that is currently held (LSF_HELD) and,
+ * unless that returned CLO_WAIT, drops the hold on it. Empty sub-lock slots
+ * are skipped.
+ *
+ * \return the accumulated result of the sub-lock operations (the loop stops
+ *         at the first negative one); -ESTALE if that result is 0 but
+ *         lls_unuse_race was set, in which case the flag is cleared.
+ */
+static int lov_lock_unuse(const struct lu_env *env,
+                          const struct cl_lock_slice *slice)
+{
+        struct lov_lock        *lck     = cl2lov_lock(slice);
+        struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+        int i;
+        int result;
+
+        ENTRY;
+
+        for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+                int rc;
+                struct lovsub_lock *sub;
+                struct cl_lock *sublock;
+
+                /* top-lock state cannot change concurrently, because single
+                 * thread (one that released the last hold) carries unlocking
+                 * to the completion. */
+                LASSERT(slice->cls_lock->cll_state == CLS_UNLOCKING);
+                sub = lck->lls_sub[i].sub_lock;
+                if (sub == NULL)
+                        continue;
+
+                sublock = sub->lss_cl.cls_lock;
+                rc = lov_sublock_lock(env, sub, closure);
+                if (rc == 0) {
+                        if (lck->lls_sub[i].sub_flags & LSF_HELD) {
+                                LASSERT(sublock->cll_state == CLS_HELD);
+                                rc = cl_unuse_try(env, sublock);
+                                /* the hold is kept while the sub-lock
+                                 * reports CLO_WAIT. */
+                                if (rc != CLO_WAIT)
+                                        rc = lov_sublock_release(env, lck,
+                                                                 i, 0, rc);
+                        }
+                        lov_sublock_unlock(env, sub, closure);
+                }
+                result = lov_subresult(result, rc);
+                if (result < 0)
+                        break;
+        }
+        if (result == 0 && lck->lls_unuse_race) {
+                lck->lls_unuse_race = 0;
+                result = -ESTALE;
+        }
+        cl_lock_closure_fini(closure);
+        RETURN(result);
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_wait() for lov layer: calls
+ * cl_wait_try() on every sub-lock that has not yet reached CLS_HELD state.
+ * All sub-locks are expected to be present and at least in CLS_ENQUEUED
+ * state (asserted).
+ *
+ * \return the accumulated non-zero result of the sub-lock operations, if any;
+ *         otherwise 0 when every sub-lock reached at least CLS_HELD state,
+ *         and CLO_WAIT when some did not.
+ */
+static int lov_lock_wait(const struct lu_env *env,
+                         const struct cl_lock_slice *slice)
+{
+        struct lov_lock        *lck     = cl2lov_lock(slice);
+        struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+        enum cl_lock_state      minstate;
+        int                     result;
+        int                     i;
+
+        ENTRY;
+
+        /* minstate tracks the lowest state seen among the sub-locks. */
+        for (result = 0, minstate = CLS_FREEING, i = 0; i < lck->lls_nr; ++i) {
+                int rc;
+                struct lovsub_lock *sub;
+                struct cl_lock *sublock;
+
+                sub = lck->lls_sub[i].sub_lock;
+                LASSERT(sub != NULL);
+                sublock = sub->lss_cl.cls_lock;
+                rc = lov_sublock_lock(env, sub, closure);
+                if (rc == 0) {
+                        LASSERT(sublock->cll_state >= CLS_ENQUEUED);
+                        if (sublock->cll_state < CLS_HELD)
+                                rc = cl_wait_try(env, sublock);
+                        minstate = min(minstate, sublock->cll_state);
+                        lov_sublock_unlock(env, sub, closure);
+                }
+                result = lov_subresult(result, rc);
+                if (result < 0)
+                        break;
+        }
+        cl_lock_closure_fini(closure);
+        RETURN(result ?: minstate >= CLS_HELD ? 0 : CLO_WAIT);
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_use() for lov layer: takes a
+ * hold on every sub-lock and calls cl_use_try() on those that are in
+ * CLS_CACHED state. The top-lock must be in CLS_CACHED state on entry
+ * (asserted).
+ *
+ * If cl_use_try() fails for a sub-lock, the hold just taken on it is
+ * released again.
+ *
+ * \return the accumulated result of the sub-lock operations; the loop stops
+ *         at the first negative one.
+ */
+static int lov_lock_use(const struct lu_env *env,
+                        const struct cl_lock_slice *slice)
+{
+        struct lov_lock        *lck     = cl2lov_lock(slice);
+        struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+        int                     result;
+        int                     i;
+
+        LASSERT(slice->cls_lock->cll_state == CLS_CACHED);
+        ENTRY;
+
+        for (result = 0, i = 0; i < lck->lls_nr; ++i) {
+                int rc;
+                struct lovsub_lock *sub;
+                struct cl_lock *sublock;
+
+                if (slice->cls_lock->cll_state != CLS_CACHED) {
+                        /* see comment in lov_lock_enqueue(). */
+                        LASSERT(i > 0 && result != 0);
+                        break;
+                }
+                /*
+                 * if a sub-lock was destroyed while top-lock was in
+                 * CLS_CACHED state, top-lock would have been moved into
+                 * CLS_NEW state, so all sub-locks have to be in place.
+                 */
+                sub = lck->lls_sub[i].sub_lock;
+                LASSERT(sub != NULL);
+                sublock = sub->lss_cl.cls_lock;
+                rc = lov_sublock_lock(env, sub, closure);
+                if (rc == 0) {
+                        LASSERT(sublock->cll_state != CLS_FREEING);
+                        lov_sublock_hold(env, lck, i);
+                        if (sublock->cll_state == CLS_CACHED) {
+                                rc = cl_use_try(env, sublock);
+                                if (rc != 0)
+                                        rc = lov_sublock_release(env, lck,
+                                                                 i, 1, rc);
+                        } else
+                                rc = 0;
+                        lov_sublock_unlock(env, sub, closure);
+                }
+                result = lov_subresult(result, rc);
+                if (result < 0)
+                        break;
+        }
+        cl_lock_closure_fini(closure);
+        RETURN(result);
+}
+
+#if 0
+/*
+ * Compiled out. As written this sketch cannot be built: the function takes
+ * no parameters, yet refers to slice, env, need, lov and result, none of
+ * which is declared here, and it has no return statement although it is
+ * declared to return int.
+ *
+ * The loop checks, for every present sub-lock whose stripe intersects the
+ * extent of need, whether the granted sub-lock extent (sub_got) matches the
+ * part of need that falls on that stripe.
+ */
+static int lock_lock_multi_match()
+{
+        struct cl_lock          *lock    = slice->cls_lock;
+        struct cl_lock_descr    *subneed = &lov_env_info(env)->lti_ldescr;
+        struct lov_object       *loo     = cl2lov(lov->lls_cl.cls_obj);
+        struct lov_layout_raid0 *r0      = lov_r0(loo);
+        struct lov_lock_sub     *sub;
+        struct cl_object        *subobj;
+        obd_off  fstart;
+        obd_off  fend;
+        obd_off  start;
+        obd_off  end;
+        int i;
+
+        fstart = cl_offset(need->cld_obj, need->cld_start);
+        fend   = cl_offset(need->cld_obj, need->cld_end + 1) - 1;
+        subneed->cld_mode = need->cld_mode;
+        cl_lock_mutex_get(env, lock);
+        for (i = 0; i < lov->lls_nr; ++i) {
+                sub = &lov->lls_sub[i];
+                if (sub->sub_lock == NULL)
+                        continue;
+                subobj = sub->sub_descr.cld_obj;
+                if (!lov_stripe_intersects(r0->lo_lsm, sub->sub_stripe,
+                                           fstart, fend, &start, &end))
+                        continue;
+                subneed->cld_start = cl_index(subobj, start);
+                subneed->cld_end   = cl_index(subobj, end);
+                subneed->cld_obj   = subobj;
+                if (!cl_lock_ext_match(&sub->sub_got, subneed)) {
+                        result = 0;
+                        break;
+                }
+        }
+        cl_lock_mutex_put(env, lock);
+}
+#endif
+
+/**
+ * Checks whether the extent described by \a descr lies on stripe number
+ * \a stripe of object \a lov.
+ *
+ * The extent bounds are converted with cl_offset() (presumably from page
+ * indices to byte offsets -- confirm against cl_offset()); \a end is the
+ * offset of cld_end + 1, minus one.
+ *
+ * \return non-zero iff end - start does not exceed the stripe size and
+ *         lov_stripe_number() maps both start and end to \a stripe.
+ */
+static int lov_is_same_stripe(struct lov_object *lov, int stripe,
+                              const struct cl_lock_descr *descr)
+{
+        struct lov_stripe_md *lsm = lov_r0(lov)->lo_lsm;
+        obd_off start;
+        obd_off end;
+
+        start = cl_offset(&lov->lo_cl, descr->cld_start);
+        end   = cl_offset(&lov->lo_cl, descr->cld_end + 1) - 1;
+        return
+                end - start <= lsm->lsm_stripe_size &&
+                stripe == lov_stripe_number(lsm, start) &&
+                stripe == lov_stripe_number(lsm, end);
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_fits_into() method.
+ *
+ * Checks whether a lock (given by \a slice) is suitable for \a
+ * io. Multi-stripe locks can be used only for "quick" io, like truncate, or
+ * O_APPEND write.
+ *
+ * \a need must describe a lock on the same object as \a slice, and the lock
+ * must have at least one sub-lock (both asserted).
+ *
+ * \return non-zero if the lock can be used for \a need, 0 otherwise.
+ *
+ * \see ccc_lock_fits_into().
+ */
+static int lov_lock_fits_into(const struct lu_env *env,
+                              const struct cl_lock_slice *slice,
+                              const struct cl_lock_descr *need,
+                              const struct cl_io *io)
+{
+        struct lov_lock   *lov = cl2lov_lock(slice);
+        struct lov_object *obj = cl2lov(slice->cls_obj);
+        int result;
+
+        LASSERT(cl_object_same(need->cld_obj, slice->cls_obj));
+        LASSERT(lov->lls_nr > 0);
+
+        ENTRY;
+
+        if (lov->lls_nr == 1) {
+                /*
+                 * If a lock is on a single stripe, it's enough to check that
+                 * @need lock matches actually granted stripe lock, and...
+                 */
+                result = cl_lock_ext_match(&lov->lls_sub[0].sub_got, need);
+                if (result && lov_r0(obj)->lo_nr > 1)
+                        /*
+                         * ... @need is on the same stripe, if multiple
+                         * stripes are possible at all for this object.
+                         */
+                        result = lov_is_same_stripe(cl2lov(slice->cls_obj),
+                                                    lov->lls_sub[0].sub_stripe,
+                                                    need);
+        } else if (io->ci_type != CIT_TRUNC && io->ci_type != CIT_MISC &&
+                   !cl_io_is_append(io) && need->cld_mode != CLM_PHANTOM)
+                /*
+                 * Multi-stripe locks are only suitable for `quick' IO and for
+                 * glimpse.
+                 */
+                result = 0;
+        else
+                /*
+                 * Most general case: multi-stripe existing lock, and
+                 * (potentially) multi-stripe @need lock. Check that @need is
+                 * covered by @lov's sub-locks.
+                 *
+                 * For now, ignore lock expansions made by the server, and
+                 * match against original lock extent.
+                 */
+                result = cl_lock_ext_match(&lov->lls_orig, need);
+        /* trace: original extent, first sub-lock's granted extent and
+         * stripe, sub-lock count / object stripe count, and the verdict. */
+        CDEBUG(D_DLMTRACE, DDESCR"/"DDESCR" %i %i/%i: %i\n",
+               PDESCR(&lov->lls_orig), PDESCR(&lov->lls_sub[0].sub_got),
+               lov->lls_sub[0].sub_stripe, lov->lls_nr, lov_r0(obj)->lo_nr,
+               result);
+        RETURN(result);
+}
+
+/**
+ * Breaks the connection, described by \a link, between a top-lock and its
+ * sub-lock \a sub.
+ *
+ * Removes \a link from the list it is on, clears the top-lock's array slot
+ * that pointed to \a sub, decrements lls_nr_filled, drops the "lov-child"
+ * reference on the top-lock and frees \a link. The mutexes of both the
+ * top-lock and the sub-lock must be held by the caller (asserted).
+ */
+void lov_lock_unlink(const struct lu_env *env,
+                     struct lov_lock_link *link, struct lovsub_lock *sub)
+{
+        struct lov_lock *lck    = link->lll_super;
+        struct cl_lock  *parent = lck->lls_cl.cls_lock;
+
+        LASSERT(cl_lock_is_mutexed(parent));
+        LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock));
+        ENTRY;
+
+        list_del_init(&link->lll_list);
+        LASSERT(lck->lls_sub[link->lll_idx].sub_lock == sub);
+        /* yank this sub-lock from parent's array */
+        lck->lls_sub[link->lll_idx].sub_lock = NULL;
+        LASSERT(lck->lls_nr_filled > 0);
+        lck->lls_nr_filled--;
+        lu_ref_del(&parent->cll_reference, "lov-child", sub->lss_cl.cls_lock);
+        cl_lock_put(env, parent);
+        OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem);
+        EXIT;
+}
+
+/**
+ * Searches the lss_parents list of sub-lock \a sub for the link whose
+ * lll_super is top-lock \a lck. The sub-lock mutex must be held by the
+ * caller (asserted).
+ *
+ * \return the matching link, or NULL if \a lck is not on the list.
+ */
+struct lov_lock_link *lov_lock_link_find(const struct lu_env *env,
+                                         struct lov_lock *lck,
+                                         struct lovsub_lock *sub)
+{
+        struct lov_lock_link *scan;
+
+        LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock));
+        ENTRY;
+
+        list_for_each_entry(scan, &sub->lss_parents, lll_list) {
+                if (scan->lll_super == lck)
+                        RETURN(scan);
+        }
+        RETURN(NULL);
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_delete() method. This is
+ * invoked for "top-to-bottom" delete, when lock destruction starts from the
+ * top-lock, e.g., as a result of inode destruction.
+ *
+ * Unlinks top-lock from all its sub-locks. Sub-locks are not deleted there:
+ * this is done separately elsewhere:
+ *
+ *     - for inode destruction, lov_object_delete() calls cl_object_kill() for
+ *       each sub-object, purging its locks;
+ *
+ *     - in other cases (e.g., a fatal error with a top-lock) sub-locks are
+ *       left in the cache.
+ *
+ * The top-lock must already be in CLS_FREEING state (asserted). A failure to
+ * lock a sub-lock, other than CLO_REPEAT, is only logged; that sub-lock is
+ * then left linked.
+ */
+static void lov_lock_delete(const struct lu_env *env,
+                            const struct cl_lock_slice *slice)
+{
+        struct lov_lock        *lck     = cl2lov_lock(slice);
+        struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock);
+        int i;
+
+        LASSERT(slice->cls_lock->cll_state == CLS_FREEING);
+        ENTRY;
+
+        for (i = 0; i < lck->lls_nr; ++i) {
+                struct lovsub_lock *lsl;
+                struct cl_lock *sublock;
+                int rc;
+
+                lsl = lck->lls_sub[i].sub_lock;
+                if (lsl == NULL)
+                        continue;
+
+                sublock = lsl->lss_cl.cls_lock;
+                rc = lov_sublock_lock(env, lsl, closure);
+                if (rc == 0) {
+                        if (lck->lls_sub[i].sub_flags & LSF_HELD)
+                                lov_sublock_release(env, lck, i, 1, 0);
+                        /* a sub-lock that is already being freed is not
+                         * unlinked here. */
+                        if (sublock->cll_state < CLS_FREEING) {
+                                struct lov_lock_link *link;
+
+                                link = lov_lock_link_find(env, lck, lsl);
+                                LASSERT(link != NULL);
+                                lov_lock_unlink(env, link, lsl);
+                                LASSERT(lck->lls_sub[i].sub_lock == NULL);
+                        }
+                        lov_sublock_unlock(env, lsl, closure);
+                } else if (rc == CLO_REPEAT) {
+                        /* cancels the ++i of the for statement */
+                        --i; /* repeat with this lock */
+                } else {
+                        CL_LOCK_DEBUG(D_ERROR, env, sublock,
+                                      "Cannot get sub-lock for delete: %i\n",
+                                      rc);
+                }
+        }
+        cl_lock_closure_fini(closure);
+        EXIT;
+}
+
+/**
+ * Prints a lov lock slice through printer \a p: first the number of
+ * sub-lock slots, then, for every slot, its index and flags followed either
+ * by the sub-lock (via cl_lock_print()) or by "---" when the slot is empty.
+ * Installed as lov_lock_ops::clo_print.
+ *
+ * \return always 0.
+ */
+static int lov_lock_print(const struct lu_env *env, void *cookie,
+                          lu_printer_t p, const struct cl_lock_slice *slice)
+{
+        struct lov_lock *lck = cl2lov_lock(slice);
+        int              i;
+
+        (*p)(env, cookie, "%d\n", lck->lls_nr);
+        for (i = 0; i < lck->lls_nr; ++i) {
+                struct lov_lock_sub *sub;
+
+                sub = &lck->lls_sub[i];
+                (*p)(env, cookie, "    %d %x: ", i, sub->sub_flags);
+                if (sub->sub_lock != NULL)
+                        cl_lock_print(env, cookie, p,
+                                      sub->sub_lock->lss_cl.cls_lock);
+                else
+                        (*p)(env, cookie, "---\n");
+        }
+        return 0;
+}
+
+/**
+ * Lock operations vector of the lov layer; attached to a lock slice by
+ * lov_lock_init_raid0().
+ */
+static const struct cl_lock_operations lov_lock_ops = {
+        .clo_fini      = lov_lock_fini,
+        .clo_enqueue   = lov_lock_enqueue,
+        .clo_wait      = lov_lock_wait,
+        .clo_use       = lov_lock_use,
+        .clo_unuse     = lov_lock_unuse,
+        .clo_fits_into = lov_lock_fits_into,
+        .clo_delete    = lov_lock_delete,
+        .clo_print     = lov_lock_print
+};
+
+/**
+ * Allocates a lov lock slice, adds it to \a lock with lov_lock_ops as its
+ * operations, and initializes its sub-lock descriptions through
+ * lov_lock_sub_init().
+ *
+ * \retval -ENOMEM if the slice could not be allocated
+ * \retval otherwise the result of lov_lock_sub_init()
+ */
+int lov_lock_init_raid0(const struct lu_env *env, struct cl_object *obj,
+                        struct cl_lock *lock, const struct cl_io *io)
+{
+        struct lov_lock *lck;
+        int result;
+
+        ENTRY;
+        OBD_SLAB_ALLOC_PTR(lck, lov_lock_kmem);
+        if (lck != NULL) {
+                cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_lock_ops);
+                /* NOTE(review): if lov_lock_sub_init() fails, lck is not
+                 * freed here; presumably it is released later through
+                 * clo_fini, as it is already attached to the lock --
+                 * confirm. */
+                result = lov_lock_sub_init(env, lck, io);
+        } else
+                result = -ENOMEM;
+        RETURN(result);
+}
+
+/**
+ * Returns the closure kept in the lov per-environment info
+ * (lov_env_info(env)->lti_closure), initialized for \a parent by
+ * cl_lock_closure_init(). The closure list is expected to be empty on entry
+ * (checked by LINVRNT()).
+ *
+ * Callers in this file pair this with cl_lock_closure_fini().
+ */
+static struct cl_lock_closure *lov_closure_get(const struct lu_env *env,
+                                               struct cl_lock *parent)
+{
+        struct cl_lock_closure *closure;
+
+        closure = &lov_env_info(env)->lti_closure;
+        LINVRNT(list_empty(&closure->clc_list));
+        /* NOTE(review): meaning of the last argument (1) is not visible
+         * here -- see cl_lock_closure_init(). */
+        cl_lock_closure_init(env, closure, parent, 1);
+        return closure;
+}
+
+
+/** @} lov */
diff --git a/lustre/lov/lov_merge.c b/lustre/lov/lov_merge.c

index 47e87e0..20abe46 100644 (file)
--- a/lustre/lov/lov_merge.c
+++ b/lustre/lov/lov_merge.c
@@ -50,19 +50,16 @@
  
  #include "lov_internal.h"
  
-/* Merge the lock value block(&lvb) attributes from each of the stripes in a
- * file into a single lvb. It is expected that the caller initializes the
- * current atime, mtime, ctime to avoid regressing a more uptodate time on
- * the local client.
- *
- * If @kms_only is set then we do not consider the recently seen size (rss)
- * when updating the known minimum size (kms).  Even when merging RSS, we will
- * take the KMS value if it's larger.  This prevents getattr from stomping on
- * dirty cached pages which extend the file size. */
-int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm,
-                  struct ost_lvb *lvb, int kms_only)
+/** Merge the lock value block(&lvb) attributes and KMS from each of the
+ * stripes in a file into a single lvb. It is expected that the caller
+ * initializes the current atime, mtime, ctime to avoid regressing a more
+ * uptodate time on the local client.
+ */
+int lov_merge_lvb_kms(struct lov_stripe_md *lsm,
+                      struct ost_lvb *lvb, __u64 *kms_place)
  {
          __u64 size = 0;
+        __u64 kms = 0;
          __u64 blocks = 0;
          __u64 current_mtime = lvb->lvb_mtime;
          __u64 current_atime = lvb->lvb_atime;
@@ -85,7 +82,11 @@ int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm,
                  }
  
                  tmpsize = loi->loi_kms;
-                if (kms_only == 0 && loi->loi_lvb.lvb_size > tmpsize)
+                lov_size = lov_stripe_size(lsm, tmpsize, i);
+                if (lov_size > kms)
+                        kms = lov_size;
+
+                if (loi->loi_lvb.lvb_size > tmpsize)
                          tmpsize = loi->loi_lvb.lvb_size;
  
                  lov_size = lov_stripe_size(lsm, tmpsize, i);
@@ -98,7 +99,7 @@ int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm,
  
                  /* mtime is always updated with ctime, but can be set in past.
                     As write and utime(2) may happen within 1 second, and utime's
-                   mtime has a priority over write's one, leave mtime from mds 
+                   mtime has a priority over write's one, leave mtime from mds
                     for the same ctimes. */
                  if (loi->loi_lvb.lvb_ctime > current_ctime) {
                          current_ctime = loi->loi_lvb.lvb_ctime;
@@ -106,6 +107,7 @@ int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm,
                  }
          }
  
+        *kms_place = kms;
          lvb->lvb_size = size;
          lvb->lvb_blocks = blocks;
          lvb->lvb_mtime = current_mtime;
@@ -114,6 +116,31 @@ int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm,
          RETURN(rc);
  }
  
+/** Merge the lock value block(&lvb) attributes from each of the stripes in a
+ * file into a single lvb. It is expected that the caller initializes the
+ * current atime, mtime, ctime to avoid regressing a more uptodate time on
+ * the local client.
+ *
+ * If @kms_only is set then we do not consider the recently seen size (rss)
+ * when updating the known minimum size (kms).  Even when merging RSS, we will
+ * take the KMS value if it's larger.  This prevents getattr from stomping on
+ * dirty cached pages which extend the file size.
+ *
+ * This is a wrapper around lov_merge_lvb_kms(): the merge is always done in
+ * full, and with @kms_only set lvb->lvb_size is then overwritten with the
+ * merged KMS. @exp is not used by this function. Returns the result of
+ * lov_merge_lvb_kms(). */
+int lov_merge_lvb(struct obd_export *exp,
+                  struct lov_stripe_md *lsm, struct ost_lvb *lvb, int kms_only)
+{
+        int   rc;
+        __u64 kms;
+
+        ENTRY;
+        rc = lov_merge_lvb_kms(lsm, lvb, &kms);
+        if (kms_only)
+                lvb->lvb_size = kms;
+        /* trace: merged size, mtime, atime, ctime and blocks, in that
+         * order. */
+        CDEBUG(D_INODE, "merged: %llu %llu %llu %llu %llu\n",
+               lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime,
+               lvb->lvb_ctime, lvb->lvb_blocks);
+        RETURN(rc);
+}
+
  /* Must be called under the lov_stripe_lock() */
  int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm,
                     obd_off size, int shrink)
diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c

index 84835e5..f946a91 100644 (file)
--- a/lustre/lov/lov_obd.c
+++ b/lustre/lov/lov_obd.c
@@ -63,7 +63,7 @@
  #include <obd_ost.h>
  #include <lprocfs_status.h>
  #include <lustre_param.h>
-#include <lustre_cache.h>
+#include <cl_object.h>
  #include <lustre/ll_fiemap.h>
  
  #include "lov_internal.h"
@@ -104,97 +104,9 @@ void lov_putref(struct obd_device *obd)
          mutex_up(&lov->lov_lock);
  }
  
-static int lov_register_page_removal_cb(struct obd_export *exp,
-                                        obd_page_removal_cb_t func,
-                                        obd_pin_extent_cb pin_cb)
-{
-        struct lov_obd *lov = &exp->exp_obd->u.lov;
-        int i, rc = 0;
-
-        if (lov->lov_page_removal_cb && lov->lov_page_removal_cb != func)
-                return -EBUSY;
-
-        if (lov->lov_page_pin_cb && lov->lov_page_pin_cb != pin_cb)
-                return -EBUSY;
-
-        for (i = 0; i < lov->desc.ld_tgt_count; i++) {
-                if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp)
-                        continue;
-                rc |= obd_register_page_removal_cb(lov->lov_tgts[i]->ltd_exp,
-                                                   func, pin_cb);
-        }
-
-        lov->lov_page_removal_cb = func;
-        lov->lov_page_pin_cb = pin_cb;
-
-        return rc;
-}
-
-static int lov_unregister_page_removal_cb(struct obd_export *exp,
-                                        obd_page_removal_cb_t func)
-{
-        struct lov_obd *lov = &exp->exp_obd->u.lov;
-        int i, rc = 0;
-
-        if (lov->lov_page_removal_cb && lov->lov_page_removal_cb != func)
-                return -EINVAL;
-
-        lov->lov_page_removal_cb = NULL;
-        lov->lov_page_pin_cb = NULL;
-
-        for (i = 0; i < lov->desc.ld_tgt_count; i++) {
-                if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp)
-                        continue;
-                rc |= obd_unregister_page_removal_cb(lov->lov_tgts[i]->ltd_exp,
-                                                     func);
-        }
-
-        return rc;
-}
-
-static int lov_register_lock_cancel_cb(struct obd_export *exp,
-                                         obd_lock_cancel_cb func)
-{
-        struct lov_obd *lov = &exp->exp_obd->u.lov;
-        int i, rc = 0;
-
-        if (lov->lov_lock_cancel_cb && lov->lov_lock_cancel_cb != func)
-                return -EBUSY;
-
-        for (i = 0; i < lov->desc.ld_tgt_count; i++) {
-                if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp)
-                        continue;
-                rc |= obd_register_lock_cancel_cb(lov->lov_tgts[i]->ltd_exp,
-                                                  func);
-        }
-
-        lov->lov_lock_cancel_cb = func;
-
-        return rc;
-}
-
-static int lov_unregister_lock_cancel_cb(struct obd_export *exp,
-                                         obd_lock_cancel_cb func)
-{
-        struct lov_obd *lov = &exp->exp_obd->u.lov;
-        int i, rc = 0;
-
-        if (lov->lov_lock_cancel_cb && lov->lov_lock_cancel_cb != func)
-                return -EINVAL;
-
-        for (i = 0; i < lov->desc.ld_tgt_count; i++) {
-                if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp)
-                        continue;
-                rc |= obd_unregister_lock_cancel_cb(lov->lov_tgts[i]->ltd_exp,
-                                                    func);
-        }
-        lov->lov_lock_cancel_cb = NULL;
-        return rc;
-}
-
  #define MAX_STRING_SIZE 128
-static int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
-                           struct obd_connect_data *data)
+int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
+                    struct obd_connect_data *data)
  {
          struct lov_obd *lov = &obd->u.lov;
          struct obd_uuid tgt_uuid;
@@ -236,7 +148,7 @@ static int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
  
          if (activate) {
                  tgt_obd->obd_no_recov = 0;
-                /* FIXME this is probably supposed to be 
+                /* FIXME this is probably supposed to be
                     ptlrpc_set_import_active.  Horrible naming. */
                  ptlrpc_activate_import(imp);
          }
@@ -265,33 +177,10 @@ static int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
                  RETURN(-ENODEV);
          }
  
-        rc = obd_register_page_removal_cb(lov->lov_tgts[index]->ltd_exp,
-                                          lov->lov_page_removal_cb,
-                                          lov->lov_page_pin_cb);
-        if (rc) {
-                obd_disconnect(lov->lov_tgts[index]->ltd_exp);
-                lov->lov_tgts[index]->ltd_exp = NULL;
-                RETURN(rc);
-        }
-
-        rc = obd_register_lock_cancel_cb(lov->lov_tgts[index]->ltd_exp,
-                                         lov->lov_lock_cancel_cb);
-        if (rc) {
-                obd_unregister_page_removal_cb(lov->lov_tgts[index]->ltd_exp,
-                                               lov->lov_page_removal_cb);
-                obd_disconnect(lov->lov_tgts[index]->ltd_exp);
-                lov->lov_tgts[index]->ltd_exp = NULL;
-                RETURN(rc);
-        }
-
          rc = obd_register_observer(tgt_obd, obd);
          if (rc) {
                  CERROR("Target %s register_observer error %d\n",
                         obd_uuid2str(&tgt_uuid), rc);
-                obd_unregister_lock_cancel_cb(lov->lov_tgts[index]->ltd_exp,
-                                              lov->lov_lock_cancel_cb);
-                obd_unregister_page_removal_cb(lov->lov_tgts[index]->ltd_exp,
-                                               lov->lov_page_removal_cb);
                  obd_disconnect(lov->lov_tgts[index]->ltd_exp);
                  lov->lov_tgts[index]->ltd_exp = NULL;
                  RETURN(rc);
@@ -396,11 +285,6 @@ static int lov_disconnect_obd(struct obd_device *obd, __u32 index)
          CDEBUG(D_CONFIG, "%s: disconnecting target %s\n",
                 obd->obd_name, osc_obd->obd_name);
  
-        obd_unregister_lock_cancel_cb(lov->lov_tgts[index]->ltd_exp,
-                                      lov->lov_lock_cancel_cb);
-        obd_unregister_page_removal_cb(lov->lov_tgts[index]->ltd_exp,
-                                       lov->lov_page_removal_cb);
-
          if (lov->lov_tgts[index]->ltd_active) {
                  lov->lov_tgts[index]->ltd_active = 0;
                  lov->desc.ld_active_tgt_count--;
@@ -446,9 +330,6 @@ static int lov_disconnect_obd(struct obd_device *obd, __u32 index)
          RETURN(0);
  }
  
-static int lov_del_target(struct obd_device *obd, __u32 index,
-                          struct obd_uuid *uuidp, int gen);
-
  static int lov_disconnect(struct obd_export *exp)
  {
          struct obd_device *obd = class_exp2obd(exp);
@@ -608,8 +489,8 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched,
          RETURN(rc);
  }
  
-static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
-                          __u32 index, int gen, int active)
+int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
+                   __u32 index, int gen, int active)
  {
          struct lov_obd *lov = &obd->u.lov;
          struct lov_tgt_desc *tgt;
@@ -721,8 +602,8 @@ out:
  }
  
  /* Schedule a target for deletion */
-static int lov_del_target(struct obd_device *obd, __u32 index,
-                          struct obd_uuid *uuidp, int gen)
+int lov_del_target(struct obd_device *obd, __u32 index,
+                   struct obd_uuid *uuidp, int gen)
  {
          struct lov_obd *lov = &obd->u.lov;
          int count = lov->desc.ld_tgt_count;
@@ -841,7 +722,7 @@ void lov_fix_desc(struct lov_desc *desc)
          lov_fix_desc_qos_maxage(&desc->ld_qos_maxage);
  }
  
-static int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
  {
          struct lprocfs_static_vars lvars = { 0 };
          struct lov_desc *desc;
@@ -1005,9 +886,9 @@ static int lov_cleanup(struct obd_device *obd)
          RETURN(0);
  }
  
-static int lov_process_config(struct obd_device *obd, obd_count len, void *buf)
+int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg,
+                            __u32 *indexp, int *genp)
  {
-        struct lustre_cfg *lcfg = buf;
          struct obd_uuid obd_uuid;
          int cmd;
          int rc = 0;
@@ -1025,10 +906,12 @@ static int lov_process_config(struct obd_device *obd, obd_count len, void *buf)
  
                  obd_str2uuid(&obd_uuid,  lustre_cfg_buf(lcfg, 1));
  
-                if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", &index) != 1)
+                if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", indexp) != 1)
                          GOTO(out, rc = -EINVAL);
-                if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1)
+                if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", genp) != 1)
                          GOTO(out, rc = -EINVAL);
+                index = *indexp;
+                gen = *genp;
                  if (cmd == LCFG_LOV_ADD_OBD)
                          rc = lov_add_target(obd, &obd_uuid, index, gen, 1);
                  else if (cmd == LCFG_LOV_ADD_INA)
@@ -1678,7 +1561,7 @@ static int lov_brw_check(struct lov_obd *lov, struct obd_info *lov_oinfo,
                  obd_off start, end;
  
                  if (!lov_stripe_intersects(lov_oinfo->oi_md, i, pga[i].off,
-                                           pga[i].off + pga[i].count,
+                                           pga[i].off + pga[i].count - 1,
                                             &start, &end))
                          continue;
  
@@ -1737,330 +1620,6 @@ static int lov_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo,
          RETURN(rc);
  }
  
-static int lov_brw_interpret(struct ptlrpc_request_set *reqset, void *data,
-                             int rc)
-{
-        struct lov_request_set *lovset = (struct lov_request_set *)data;
-        ENTRY;
-
-        if (rc) {
-                lovset->set_completes = 0;
-                lov_fini_brw_set(lovset);
-        } else {
-                rc = lov_fini_brw_set(lovset);
-        }
-
-        RETURN(rc);
-}
-
-static int lov_brw_async(int cmd, struct obd_export *exp,
-                         struct obd_info *oinfo, obd_count oa_bufs,
-                         struct brw_page *pga, struct obd_trans_info *oti,
-                         struct ptlrpc_request_set *set)
-{
-        struct lov_request_set *lovset;
-        struct lov_request *req;
-        struct list_head *pos;
-        struct lov_obd *lov = &exp->exp_obd->u.lov;
-        int rc = 0;
-        ENTRY;
-
-        LASSERT(oinfo);
-        ASSERT_LSM_MAGIC(oinfo->oi_md);
-
-        if (cmd == OBD_BRW_CHECK) {
-                rc = lov_brw_check(lov, oinfo, oa_bufs, pga);
-                RETURN(rc);
-        }
-
-        rc = lov_prep_brw_set(exp, oinfo, oa_bufs, pga, oti, &lovset);
-        if (rc)
-                RETURN(rc);
-
-        list_for_each (pos, &lovset->set_list) {
-                struct obd_export *sub_exp;
-                struct brw_page *sub_pga;
-                req = list_entry(pos, struct lov_request, rq_link);
-
-                sub_exp = lov->lov_tgts[req->rq_idx]->ltd_exp;
-                sub_pga = lovset->set_pga + req->rq_pgaidx;
-                rc = obd_brw_async(cmd, sub_exp, &req->rq_oi, req->rq_oabufs,
-                                   sub_pga, oti, set);
-                if (rc)
-                        GOTO(out, rc);
-                lov_update_common_set(lovset, req, rc);
-        }
-        LASSERT(rc == 0);
-        LASSERT(set->set_interpret == NULL);
-        LASSERT(set->set_arg == NULL);
-        rc = ptlrpc_set_add_cb(set, lov_brw_interpret, lovset);
-        if (rc)
-                GOTO(out, rc);
-
-        RETURN(rc);
-out:
-        lov_fini_brw_set(lovset);
-        RETURN(rc);
-}
-
-static int lov_ap_make_ready(void *data, int cmd)
-{
-        struct lov_async_page *lap = lap_from_cookie(data);
-
-        return lap->lap_caller_ops->ap_make_ready(lap->lap_caller_data, cmd);
-}
-
-static int lov_ap_refresh_count(void *data, int cmd)
-{
-        struct lov_async_page *lap = lap_from_cookie(data);
-
-        return lap->lap_caller_ops->ap_refresh_count(lap->lap_caller_data,
-                                                     cmd);
-}
-
-static void lov_ap_fill_obdo(void *data, int cmd, struct obdo *oa)
-{
-        struct lov_async_page *lap = lap_from_cookie(data);
-
-        lap->lap_caller_ops->ap_fill_obdo(lap->lap_caller_data, cmd, oa);
-        /* XXX woah, shouldn't we be altering more here?  size? */
-        oa->o_id = lap->lap_loi_id;
-        oa->o_gr = lap->lap_loi_gr;
-        oa->o_valid |= OBD_MD_FLGROUP;
-        oa->o_stripe_idx = lap->lap_stripe;
-}
-
-static void lov_ap_update_obdo(void *data, int cmd, struct obdo *oa,
-                               obd_valid valid)
-{
-        struct lov_async_page *lap = lap_from_cookie(data);
-
-        lap->lap_caller_ops->ap_update_obdo(lap->lap_caller_data, cmd,oa,valid);
-}
-
-static int lov_ap_completion(void *data, int cmd, struct obdo *oa, int rc)
-{
-        struct lov_async_page *lap = lap_from_cookie(data);
-
-        /* in a raid1 regime this would down a count of many ios
-         * in flight, onl calling the caller_ops completion when all
-         * the raid1 ios are complete */
-        rc = lap->lap_caller_ops->ap_completion(lap->lap_caller_data,cmd,oa,rc);
-        return rc;
-}
-
-static struct obd_capa *lov_ap_lookup_capa(void *data, int cmd)
-{
-        struct lov_async_page *lap = lap_from_cookie(data);
-        return lap->lap_caller_ops->ap_lookup_capa(lap->lap_caller_data, cmd);
-}
-
-static struct obd_async_page_ops lov_async_page_ops = {
-        .ap_make_ready =        lov_ap_make_ready,
-        .ap_refresh_count =     lov_ap_refresh_count,
-        .ap_fill_obdo =         lov_ap_fill_obdo,
-        .ap_update_obdo =       lov_ap_update_obdo,
-        .ap_completion =        lov_ap_completion,
-        .ap_lookup_capa =       lov_ap_lookup_capa,
-};
-
-int lov_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm,
-                           struct lov_oinfo *loi, cfs_page_t *page,
-                           obd_off offset, struct obd_async_page_ops *ops,
-                           void *data, void **res, int nocache,
-                           struct lustre_handle *lockh)
-{
-        struct lov_obd *lov = &exp->exp_obd->u.lov;
-        struct lov_async_page *lap;
-        struct lov_lock_handles *lov_lockh = NULL;
-        int rc = 0;
-        ENTRY;
-
-        if (!page) {
-                int i = 0;
-                /* Find an existing osc so we can get it's stupid sizeof(*oap).
-                   Only because of this layering limitation will a client
-                   mount with no osts fail */
-                while (!lov->lov_tgts || !lov->lov_tgts[i] ||
-                       !lov->lov_tgts[i]->ltd_exp) {
-                        i++;
-                        if (i >= lov->desc.ld_tgt_count)
-                                RETURN(-ENOMEDIUM);
-                }
-                rc = size_round(sizeof(*lap)) +
-                        obd_prep_async_page(lov->lov_tgts[i]->ltd_exp, NULL,
-                                            NULL, NULL, 0, NULL, NULL, NULL, 0,
-                                            NULL);
-                RETURN(rc);
-        }
-        ASSERT_LSM_MAGIC(lsm);
-        LASSERT(loi == NULL);
-
-        lap = *res;
-        lap->lap_magic = LOV_AP_MAGIC;
-        lap->lap_caller_ops = ops;
-        lap->lap_caller_data = data;
-
-        /* for now only raid 0 which passes through */
-        lap->lap_stripe = lov_stripe_number(lsm, offset);
-        lov_stripe_offset(lsm, offset, lap->lap_stripe, &lap->lap_sub_offset);
-        loi = lsm->lsm_oinfo[lap->lap_stripe];
-
-        /* so the callback doesn't need the lsm */
-        lap->lap_loi_id = loi->loi_id;
-        lap->lap_loi_gr = lsm->lsm_object_gr;
-        LASSERT(lsm->lsm_object_gr > 0);
-        
-        lap->lap_sub_cookie = (void *)lap + size_round(sizeof(*lap));
-
-        if (lockh) {
-                lov_lockh = lov_handle2llh(lockh);
-                if (lov_lockh) {
-                        lockh = lov_lockh->llh_handles + lap->lap_stripe;
-                }
-        }
-
-        rc = obd_prep_async_page(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp,
-                                 lsm, loi, page, lap->lap_sub_offset,
-                                 &lov_async_page_ops, lap,
-                                 &lap->lap_sub_cookie, nocache, lockh);
-        if (lov_lockh)
-                lov_llh_put(lov_lockh);
-        if (rc)
-                RETURN(rc);
-        CDEBUG(D_CACHE, "lap %p page %p cookie %p off "LPU64"\n", lap, page,
-               lap->lap_sub_cookie, offset);
-        RETURN(0);
-}
-
-static int lov_queue_async_io(struct obd_export *exp,
-                              struct lov_stripe_md *lsm,
-                              struct lov_oinfo *loi, void *cookie,
-                              int cmd, obd_off off, int count,
-                              obd_flag brw_flags, obd_flag async_flags)
-{
-        struct lov_obd *lov = &exp->exp_obd->u.lov;
-        struct lov_async_page *lap;
-        int rc;
-
-        LASSERT(loi == NULL);
-
-        ASSERT_LSM_MAGIC(lsm);
-
-        lap = lap_from_cookie(cookie);
-
-        loi = lsm->lsm_oinfo[lap->lap_stripe];
-
-        rc = obd_queue_async_io(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp, lsm,
-                                loi, lap->lap_sub_cookie, cmd, off, count,
-                                brw_flags, async_flags);
-        RETURN(rc);
-}
-
-static int lov_set_async_flags(struct obd_export *exp,
-                               struct lov_stripe_md *lsm,
-                               struct lov_oinfo *loi, void *cookie,
-                               obd_flag async_flags)
-{
-        struct lov_obd *lov = &exp->exp_obd->u.lov;
-        struct lov_async_page *lap;
-        int rc;
-
-        LASSERT(loi == NULL);
-
-        ASSERT_LSM_MAGIC(lsm);
-
-        lap = lap_from_cookie(cookie);
-
-        loi = lsm->lsm_oinfo[lap->lap_stripe];
-
-        rc = obd_set_async_flags(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp,
-                                 lsm, loi, lap->lap_sub_cookie, async_flags);
-        RETURN(rc);
-}
-
-static int lov_queue_group_io(struct obd_export *exp,
-                              struct lov_stripe_md *lsm,
-                              struct lov_oinfo *loi,
-                              struct obd_io_group *oig, void *cookie,
-                              int cmd, obd_off off, int count,
-                              obd_flag brw_flags, obd_flag async_flags)
-{
-        struct lov_obd *lov = &exp->exp_obd->u.lov;
-        struct lov_async_page *lap;
-        int rc;
-
-        LASSERT(loi == NULL);
-
-        ASSERT_LSM_MAGIC(lsm);
-
-        lap = lap_from_cookie(cookie);
-
-        loi = lsm->lsm_oinfo[lap->lap_stripe];
-
-        rc = obd_queue_group_io(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp, lsm,
-                                loi, oig, lap->lap_sub_cookie, cmd, off, count,
-                                brw_flags, async_flags);
-        RETURN(rc);
-}
-
-/* this isn't exactly optimal.  we may have queued sync io in oscs on
- * all stripes, but we don't record that fact at queue time.  so we
- * trigger sync io on all stripes. */
-static int lov_trigger_group_io(struct obd_export *exp,
-                                struct lov_stripe_md *lsm,
-                                struct lov_oinfo *loi,
-                                struct obd_io_group *oig)
-{
-        struct lov_obd *lov = &exp->exp_obd->u.lov;
-        int rc = 0, i, err;
-
-        LASSERT(loi == NULL);
-
-        ASSERT_LSM_MAGIC(lsm);
-
-        for (i = 0; i < lsm->lsm_stripe_count; i++) {
-                loi = lsm->lsm_oinfo[i];
-                if (!lov->lov_tgts[loi->loi_ost_idx] ||
-                    !lov->lov_tgts[loi->loi_ost_idx]->ltd_active) {
-                        CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
-                        continue;
-                }
-
-                err = obd_trigger_group_io(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp,
-                                           lsm, loi, oig);
-                if (rc == 0 && err != 0)
-                        rc = err;
-        };
-        RETURN(rc);
-}
-
-static int lov_teardown_async_page(struct obd_export *exp,
-                                   struct lov_stripe_md *lsm,
-                                   struct lov_oinfo *loi, void *cookie)
-{
-        struct lov_obd *lov = &exp->exp_obd->u.lov;
-        struct lov_async_page *lap;
-        int rc;
-
-        LASSERT(loi == NULL);
-
-        ASSERT_LSM_MAGIC(lsm);
-
-        lap = lap_from_cookie(cookie);
-
-        loi = lsm->lsm_oinfo[lap->lap_stripe];
-
-        rc = obd_teardown_async_page(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp,
-                                     lsm, loi, lap->lap_sub_cookie);
-        if (rc) {
-                CERROR("unable to teardown sub cookie %p: %d\n",
-                       lap->lap_sub_cookie, rc);
-                RETURN(rc);
-        }
-        RETURN(rc);
-}
-
  static int lov_enqueue_interpret(struct ptlrpc_request_set *rqset,
                                   void *data, int rc)
  {
@@ -2118,50 +1677,6 @@ out:
          RETURN(rc);
  }
  
-static int lov_match(struct obd_export *exp, struct lov_stripe_md *lsm,
-                     __u32 type, ldlm_policy_data_t *policy, __u32 mode,
-                     int *flags, void *data, struct lustre_handle *lockh)
-{
-        struct lov_request_set *set;
-        struct obd_info oinfo;
-        struct lov_request *req;
-        struct list_head *pos;
-        struct lov_obd *lov = &exp->exp_obd->u.lov;
-        struct lustre_handle *lov_lockhp;
-        int lov_flags, rc = 0;
-        ENTRY;
-
-        ASSERT_LSM_MAGIC(lsm);
-        LASSERT((*flags & LDLM_FL_TEST_LOCK) || mode == (mode & -mode));
-
-        if (!exp || !exp->exp_obd)
-                RETURN(-ENODEV);
-
-        lov = &exp->exp_obd->u.lov;
-        rc = lov_prep_match_set(exp, &oinfo, lsm, policy, mode, lockh, &set);
-        if (rc)
-                RETURN(rc);
-
-        list_for_each (pos, &set->set_list) {
-                ldlm_policy_data_t sub_policy;
-                req = list_entry(pos, struct lov_request, rq_link);
-                lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
-                LASSERT(lov_lockhp);
-
-                lov_flags = *flags;
-                sub_policy.l_extent = req->rq_oi.oi_policy.l_extent;
-
-                rc = obd_match(lov->lov_tgts[req->rq_idx]->ltd_exp,
-                               req->rq_oi.oi_md, type, &sub_policy,
-                               mode, &lov_flags, data, lov_lockhp);
-                rc = lov_update_match_set(set, req, rc);
-                if (rc <= 0)
-                        break;
-        }
-        lov_fini_match_set(set, mode, *flags);
-        RETURN(rc);
-}
-
  static int lov_change_cbdata(struct obd_export *exp,
                               struct lov_stripe_md *lsm, ldlm_iterator_t it,
                               void *data)
@@ -2186,7 +1701,7 @@ static int lov_change_cbdata(struct obd_export *exp,
                          CDEBUG(D_HA, "lov idx %d NULL \n", loi->loi_ost_idx);
                          continue;
                  }
-                
+
                  submd.lsm_object_id = loi->loi_id;
                  submd.lsm_object_gr = lsm->lsm_object_gr;
                  submd.lsm_stripe_count = 0;
@@ -3149,7 +2664,7 @@ int lov_complete_many(struct obd_export *exp, struct lov_stripe_md *lsm,
                  struct ldlm_lock *lock;
                  struct obd_device *obd;
  
-                lock = ldlm_handle2lock(lov_lockhp);
+                lock = ldlm_handle2lock_long(lov_lockhp, 0);
                  if (lock == NULL) {
                          CDEBUG(D_HA, "lov idx %d subobj "LPX64" no lock?\n",
                                 loi->loi_ost_idx, loi->loi_id);
@@ -3207,93 +2722,13 @@ void lov_stripe_unlock(struct lov_stripe_md *md)
  }
  EXPORT_SYMBOL(lov_stripe_unlock);
  
-/**
- * Checks if requested extent lock is compatible with a lock under the page.
- *
- * Checks if the lock under \a page is compatible with a read or write lock
- * (specified by \a rw) for an extent [\a start , \a end].
- *
- * \param exp lov export
- * \param lsm striping information for the file
- * \param res lov_async_page placeholder
- * \param rw OBD_BRW_READ if requested for reading,
- *           OBD_BRW_WRITE if requested for writing
- * \param start start of the requested extent
- * \param end end of the requested extent
- * \param cookie transparent parameter for passing locking context
- *
- * \post result == 1, *cookie == context, appropriate lock is referenced or
- * \post result == 0
- *
- * \retval 1 owned lock is reused for the request
- * \retval 0 no lock reused for the request
- *
- * \see lov_release_short_lock
- */
-static int lov_reget_short_lock(struct obd_export *exp,
-                                struct lov_stripe_md *lsm,
-                                void **res, int rw,
-                                obd_off start, obd_off end,
-                                void **cookie)
-{
-        struct lov_async_page *l = *res;
-        obd_off stripe_start, stripe_end = start;
-
-        ENTRY;
-
-        /* ensure we don't cross stripe boundaries */
-        lov_extent_calc(exp, lsm, OBD_CALC_STRIPE_END, &stripe_end);
-        if (stripe_end <= end)
-                RETURN(0);
-
-        /* map the region limits to the object limits */
-        lov_stripe_offset(lsm, start, l->lap_stripe, &stripe_start);
-        lov_stripe_offset(lsm, end, l->lap_stripe, &stripe_end);
-
-        RETURN(obd_reget_short_lock(exp->exp_obd->u.lov.lov_tgts[lsm->
-                                    lsm_oinfo[l->lap_stripe]->loi_ost_idx]->
-                                    ltd_exp, NULL, &l->lap_sub_cookie,
-                                    rw, stripe_start, stripe_end, cookie));
-}
-
-/**
- * Releases a reference to a lock taken in a "fast" way.
- *
- * Releases a read or a write (specified by \a rw) lock
- * referenced by \a cookie.
- *
- * \param exp lov export
- * \param lsm striping information for the file
- * \param end end of the locked extent
- * \param rw OBD_BRW_READ if requested for reading,
- *           OBD_BRW_WRITE if requested for writing
- * \param cookie transparent parameter for passing locking context
- *
- * \post appropriate lock is dereferenced
- *
- * \see lov_reget_short_lock
- */
-static int lov_release_short_lock(struct obd_export *exp,
-                                  struct lov_stripe_md *lsm, obd_off end,
-                                  void *cookie, int rw)
-{
-        int stripe;
-
-        ENTRY;
-
-        stripe = lov_stripe_number(lsm, end);
-
-        RETURN(obd_release_short_lock(exp->exp_obd->u.lov.lov_tgts[lsm->
-                                      lsm_oinfo[stripe]->loi_ost_idx]->
-                                      ltd_exp, NULL, end, cookie, rw));
-}
  
  struct obd_ops lov_obd_ops = {
          .o_owner               = THIS_MODULE,
          .o_setup               = lov_setup,
          .o_precleanup          = lov_precleanup,
          .o_cleanup             = lov_cleanup,
-        .o_process_config      = lov_process_config,
+        //.o_process_config      = lov_process_config,
          .o_connect             = lov_connect,
          .o_disconnect          = lov_disconnect,
          .o_statfs              = lov_statfs,
@@ -3308,21 +2743,11 @@ struct obd_ops lov_obd_ops = {
          .o_setattr             = lov_setattr,
          .o_setattr_async       = lov_setattr_async,
          .o_brw                 = lov_brw,
-        .o_brw_async           = lov_brw_async,
-        .o_prep_async_page     = lov_prep_async_page,
-        .o_reget_short_lock    = lov_reget_short_lock,
-        .o_release_short_lock  = lov_release_short_lock,
-        .o_queue_async_io      = lov_queue_async_io,
-        .o_set_async_flags     = lov_set_async_flags,
-        .o_queue_group_io      = lov_queue_group_io,
-        .o_trigger_group_io    = lov_trigger_group_io,
-        .o_teardown_async_page = lov_teardown_async_page,
          .o_merge_lvb           = lov_merge_lvb,
          .o_adjust_kms          = lov_adjust_kms,
          .o_punch               = lov_punch,
          .o_sync                = lov_sync,
          .o_enqueue             = lov_enqueue,
-        .o_match               = lov_match,
          .o_change_cbdata       = lov_change_cbdata,
          .o_cancel              = lov_cancel,
          .o_cancel_unused       = lov_cancel_unused,
@@ -3333,10 +2758,6 @@ struct obd_ops lov_obd_ops = {
          .o_llog_init           = lov_llog_init,
          .o_llog_finish         = lov_llog_finish,
          .o_notify              = lov_notify,
-        .o_register_page_removal_cb = lov_register_page_removal_cb,
-        .o_unregister_page_removal_cb = lov_unregister_page_removal_cb,
-        .o_register_lock_cancel_cb = lov_register_lock_cancel_cb,
-        .o_unregister_lock_cancel_cb = lov_unregister_lock_cancel_cb,
          .o_pool_new            = lov_pool_new,
          .o_pool_rem            = lov_pool_remove,
          .o_pool_add            = lov_pool_add,
@@ -3348,17 +2769,30 @@ extern quota_interface_t lov_quota_interface;
  
  cfs_mem_cache_t *lov_oinfo_slab;
  
+extern struct lu_kmem_descr lov_caches[];
+
  int __init lov_init(void)
  {
          struct lprocfs_static_vars lvars = { 0 };
          int rc, rc2;
          ENTRY;
  
+        /* print an address of _any_ initialized kernel symbol from this
+         * module, to allow debugging with gdb that doesn't support data
+         * symbols from modules.*/
+        CDEBUG(D_CONSOLE, "Lustre LOV module (%p).\n", &lov_caches);
+
+        rc = lu_kmem_init(lov_caches);
+        if (rc)
+                return rc;
+
          lov_oinfo_slab = cfs_mem_cache_create("lov_oinfo",
-                                              sizeof(struct lov_oinfo), 
+                                              sizeof(struct lov_oinfo),
                                                0, SLAB_HWCACHE_ALIGN);
-        if (lov_oinfo_slab == NULL)
+        if (lov_oinfo_slab == NULL) {
+                lu_kmem_fini(lov_caches);
                  return -ENOMEM;
+        }
          lprocfs_lov_init_vars(&lvars);
  
          request_module("lquota");
@@ -3366,12 +2800,14 @@ int __init lov_init(void)
          init_obd_quota_ops(quota_interface, &lov_obd_ops);
  
          rc = class_register_type(&lov_obd_ops, NULL, lvars.module_vars,
-                                 LUSTRE_LOV_NAME, NULL);
+                                 LUSTRE_LOV_NAME, &lov_device_type);
+
          if (rc) {
                  if (quota_interface)
                          PORTAL_SYMBOL_PUT(lov_quota_interface);
                  rc2 = cfs_mem_cache_destroy(lov_oinfo_slab);
                  LASSERT(rc2 == 0);
+                lu_kmem_fini(lov_caches);
          }
  
          RETURN(rc);
@@ -3381,7 +2817,10 @@ int __init lov_init(void)
  static void /*__exit*/ lov_exit(void)
  {
          int rc;
-        
+
+        lu_device_type_fini(&lov_device_type);
+        lu_kmem_fini(lov_caches);
+
          if (quota_interface)
                  PORTAL_SYMBOL_PUT(lov_quota_interface);
  
diff --git a/lustre/lov/lov_object.c b/lustre/lov/lov_object.c

new file mode 100644 (file)

index 0000000..a38d22b
--- /dev/null
+++ b/lustre/lov/lov_object.c
@@ -0,0 +1,679 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+/** \addtogroup lov lov @{ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/*****************************************************************************
+ *
+ * Layout operations.
+ *
+ */
+
+/**
+ * Table of methods that are implemented separately for every layout type.
+ * Entries of lov_dispatch[] are of this type and are selected by
+ * lov_object::lo_type.
+ */
+struct lov_layout_operations {
+        /**
+         * Builds layout state in \a state. Callers in this file pass a
+         * scratch state obtained from lov_env_info() and follow up with
+         * ->llo_install() on success or ->llo_fini() on failure.
+         */
+        int (*llo_init)(const struct lu_env *env, struct lov_device *dev,
+                        struct lov_object *lov,
+                        const struct cl_object_conf *conf,
+                        union lov_layout_state *state);
+        /** Called from lov_object_delete() with the installed state. */
+        void (*llo_delete)(const struct lu_env *env, struct lov_object *lov,
+                           union lov_layout_state *state);
+        /**
+         * Releases \a state: either the installed one (object free, layout
+         * change) or a scratch one whose ->llo_init() failed.
+         */
+        void (*llo_fini)(const struct lu_env *env, struct lov_object *lov,
+                         union lov_layout_state *state);
+        /** Makes \a state the current layout state of \a lov. */
+        void (*llo_install)(const struct lu_env *env, struct lov_object *lov,
+                            union lov_layout_state *state);
+        /** Layout specific part of lov_object_print(). */
+        int  (*llo_print)(const struct lu_env *env, void *cookie,
+                          lu_printer_t p, const struct lu_object *o);
+        /** Layout specific part of lov_page_init(). */
+        struct cl_page *(*llo_page_init)(const struct lu_env *env,
+                                         struct cl_object *obj,
+                                         struct cl_page *page,
+                                         cfs_page_t *vmpage);
+        /** Layout specific part of lov_lock_init(). */
+        int  (*llo_lock_init)(const struct lu_env *env,
+                              struct cl_object *obj, struct cl_lock *lock,
+                              const struct cl_io *io);
+        /** Layout specific part of lov_io_init(). */
+        int  (*llo_io_init)(const struct lu_env *env,
+                            struct cl_object *obj, struct cl_io *io);
+        /** Layout specific part of lov_attr_get(). */
+        int  (*llo_getattr)(const struct lu_env *env, struct cl_object *obj,
+                            struct cl_attr *attr);
+};
+
+/*****************************************************************************
+ *
+ * Lov object layout operations.
+ *
+ */
+
+/**
+ * ->llo_install() method for LLT_EMPTY layout: there is nothing to install.
+ */
+static void lov_install_empty(const struct lu_env *env, struct lov_object *lov,
+                              union lov_layout_state *state)
+{
+        /* A file without objects carries no layout state. */
+}
+
+/**
+ * ->llo_init() method for LLT_EMPTY layout. Nothing is built in \a state;
+ * always succeeds.
+ */
+static int lov_init_empty(const struct lu_env *env, struct lov_device *dev,
+                          struct lov_object *lov,
+                          const struct cl_object_conf *conf,
+                          union lov_layout_state *state)
+{
+        /* no per-layout state to set up */
+        return 0;
+}
+
+/**
+ * ->llo_install() method for LLT_RAID0 layout: copies the state prepared by
+ * ->llo_init() into the object.
+ */
+static void lov_install_raid0(const struct lu_env *env, struct lov_object *lov,
+                              union lov_layout_state *state)
+{
+        /* structure copy of the whole union */
+        lov->u = *state;
+}
+
+/**
+ * Builds in \a fid the fid used to look up the stripe object described by
+ * \a oinfo.
+ *
+ * See idif definition in wiki:CMD3_interoperability_architecture
+ *
+ * NOTE(review): the object id is asserted to be below 2^49, while bit 16 of
+ * the sequence is also the lowest bit of the shifted group -- confirm the
+ * intended width of the id.
+ */
+static void oinfo_get_fid(const struct lov_oinfo *oinfo, struct lu_fid *fid)
+{
+        __u64 objid = oinfo->loi_id;
+
+        LASSERT(oinfo->loi_gr < 1ULL << 16);
+        LASSERT(oinfo->loi_id < 1ULL << 49);
+        ENTRY;
+
+        fid->f_ver = 0;
+        /* the assignment keeps only the low 32 bits of the object id */
+        fid->f_oid = objid;
+        /*
+         * The fid of a stripe is not unique by itself, so the ost index is
+         * mixed in to make it unique. This is fine, because stripe fids are
+         * only used on the client side (to locate the objects). -jay
+         */
+        fid->f_seq = ((__u64)oinfo->loi_ost_idx) << 32 |
+                     oinfo->loi_gr << 16 | objid >> 32;
+        EXIT;
+}
+
+/**
+ * Looks up (through lu_object_find_at()) the object with fid \a fid on the
+ * sub-device \a dev and returns it converted to cl_object. The result of the
+ * lookup is passed through lu2cl() as is, so callers check it with IS_ERR().
+ */
+static struct cl_object *lov_sub_find(const struct lu_env *env,
+                                      struct cl_device *dev,
+                                      const struct lu_fid *fid,
+                                      const struct cl_object_conf *conf)
+{
+        struct lu_device *lower;
+        struct lu_object *o;
+
+        ENTRY;
+        lower = cl2lu_dev(dev);
+        o = lu_object_find_at(env, lower, fid, &conf->coc_lu);
+        /* a successfully found object has to belong to a lovsub device */
+        LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type));
+        RETURN(lu2cl(o));
+}
+
+/**
+ * Attaches sub-object \a stripe as stripe number \a idx of \a lov.
+ *
+ * \retval 0    \a stripe had no parent and is now linked to \a lov
+ * \retval -EIO \a stripe already has a parent; the reference on \a stripe
+ *              is released here
+ */
+static int lov_init_sub(const struct lu_env *env, struct lov_object *lov,
+                        struct cl_object *stripe,
+                        struct lov_layout_raid0 *r0, int idx)
+{
+        struct cl_object_header *hdr    = cl_object_header(lov2cl(lov));
+        struct cl_object_header *subhdr = cl_object_header(stripe);
+        struct cl_object_header *parent = subhdr->coh_parent;
+        struct lov_oinfo        *oinfo  = r0->lo_lsm->lsm_oinfo[idx];
+        struct lovsub_object    *lso;
+
+        CDEBUG(D_INODE, DFID"@%p[%d] -> "DFID"@%p: id: "LPU64" gr: "LPU64
+               " idx: %d gen: %d\n",
+               PFID(&subhdr->coh_lu.loh_fid), subhdr, idx,
+               PFID(&hdr->coh_lu.loh_fid), hdr,
+               oinfo->loi_id, oinfo->loi_gr,
+               oinfo->loi_ost_idx, oinfo->loi_ost_gen);
+
+        if (parent != NULL) {
+                /* the stripe is claimed by somebody else: dump and fail */
+                CERROR("Stripe is already owned by other file (%i).\n", idx);
+                LU_OBJECT_DEBUG(D_ERROR, env, &stripe->co_lu, "\n");
+                LU_OBJECT_DEBUG(D_ERROR, env, lu_object_top(&parent->coh_lu),
+                                "old\n");
+                LU_OBJECT_HEADER(D_ERROR, env, lov2lu(lov), "new\n");
+                cl_object_put(env, stripe);
+                return -EIO;
+        }
+
+        subhdr->coh_parent = hdr;
+        subhdr->coh_nesting = hdr->coh_nesting + 1;
+        lu_object_ref_add(&stripe->co_lu, "lov-parent", lov);
+        lso = cl2lovsub(stripe);
+        r0->lo_sub[idx] = lso;
+        lso->lso_super = lov;
+        lso->lso_index = idx;
+        return 0;
+}
+
+/**
+ * ->llo_init() method for LLT_RAID0 layout. Allocates the array of
+ * sub-objects in \a state and fills it by finding a stripe object for every
+ * entry of the stripe md supplied in \a conf.
+ *
+ * On failure the partially built \a state is left as is; callers in this
+ * file release it through ->llo_fini().
+ */
+static int lov_init_raid0(const struct lu_env *env,
+                          struct lov_device *dev, struct lov_object *lov,
+                          const struct cl_object_conf *conf,
+                          union  lov_layout_state *state)
+{
+        struct lov_thread_info  *lti     = lov_env_info(env);
+        struct cl_object_conf   *subconf = &lti->lti_stripe_conf;
+        struct lov_stripe_md    *lsm     = conf->u.coc_md->lsm;
+        struct lu_fid           *ofid    = &lti->lti_fid;
+        struct lov_layout_raid0 *r0      = &state->raid0;
+        int                      result  = 0;
+        int                      i;
+
+        ENTRY;
+        r0->lo_nr  = lsm->lsm_stripe_count;
+        r0->lo_lsm = lsm;
+        LASSERT(r0->lo_nr <= lov_targets_nr(dev));
+
+        OBD_ALLOC(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
+        if (r0->lo_sub == NULL)
+                RETURN(-ENOMEM);
+
+        subconf->coc_inode = conf->coc_inode;
+        /*
+         * Create stripe cl_objects, stopping at the first failure.
+         */
+        for (i = 0; i < r0->lo_nr; ++i) {
+                struct lov_oinfo *oinfo   = lsm->lsm_oinfo[i];
+                int               ost_idx = oinfo->loi_ost_idx;
+                struct cl_device *subdev;
+                struct cl_object *stripe;
+
+                oinfo_get_fid(oinfo, ofid);
+                subdev = lovsub2cl_dev(dev->ld_target[ost_idx]);
+                subconf->u.coc_oinfo = oinfo;
+                stripe = lov_sub_find(env, subdev, ofid, subconf);
+                if (IS_ERR(stripe)) {
+                        result = PTR_ERR(stripe);
+                        break;
+                }
+                result = lov_init_sub(env, lov, stripe, r0, i);
+                if (result != 0)
+                        break;
+        }
+        RETURN(result);
+}
+
+/**
+ * ->llo_delete() method for LLT_EMPTY layout. Only checks the layout type.
+ */
+static void lov_delete_empty(const struct lu_env *env, struct lov_object *lov,
+                             union lov_layout_state *state)
+{
+        /* nothing to kill: an empty file has no sub-objects */
+        LASSERT(lov->lo_type == LLT_EMPTY);
+}
+
+/**
+ * ->llo_delete() method for LLT_RAID0 layout. When the top-level object is
+ * dying, kills every sub-object that is present in \a state.
+ */
+static void lov_delete_raid0(const struct lu_env *env, struct lov_object *lov,
+                             union lov_layout_state *state)
+{
+        struct lov_layout_raid0 *r0 = &state->raid0;
+        int                      i;
+
+        ENTRY;
+        if (r0->lo_sub == NULL ||
+            !lu_object_is_dying(lov->lo_cl.co_lu.lo_header)) {
+                EXIT;
+                return;
+        }
+
+        /*
+         * If top-level object is to be evicted from the cache, so are its
+         * sub-objects.
+         */
+        for (i = 0; i < r0->lo_nr; ++i) {
+                struct lovsub_object *sub = r0->lo_sub[i];
+
+                if (sub == NULL)
+                        continue;
+                cl_object_kill(env, lovsub2cl(sub));
+        }
+        EXIT;
+}
+
+/**
+ * ->llo_fini() method for LLT_EMPTY layout. Only checks the layout type.
+ */
+static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov,
+                           union lov_layout_state *state)
+{
+        /* nothing was allocated for an empty layout */
+        LASSERT(lov->lo_type == LLT_EMPTY);
+}
+
+/**
+ * ->llo_fini() method for LLT_RAID0 layout. Detaches and releases every
+ * sub-object recorded in \a state and frees the sub-object array. Does
+ * nothing when the array was never allocated.
+ */
+static void lov_fini_raid0(const struct lu_env *env, struct lov_object *lov,
+                           union lov_layout_state *state)
+{
+        struct lov_layout_raid0 *r0 = &state->raid0;
+        int                      i;
+
+        ENTRY;
+        if (r0->lo_sub == NULL) {
+                EXIT;
+                return;
+        }
+
+        for (i = 0; i < r0->lo_nr; ++i) {
+                struct cl_object *sub;
+
+                /* slots past a failed lov_init_raid0() iteration are empty */
+                if (r0->lo_sub[i] != NULL) {
+                        sub = lovsub2cl(r0->lo_sub[i]);
+                        cl_object_header(sub)->coh_parent = NULL;
+                        lu_object_ref_del(&sub->co_lu, "lov-parent", lov);
+                        cl_object_put(env, sub);
+                        r0->lo_sub[i] = NULL;
+                }
+        }
+        OBD_FREE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
+        r0->lo_sub = NULL;
+        EXIT;
+}
+
+/**
+ * ->llo_print() method for LLT_EMPTY layout.
+ */
+static int lov_print_empty(const struct lu_env *env, void *cookie,
+                           lu_printer_t p, const struct lu_object *o)
+{
+        p(env, cookie, "empty\n");
+        return 0;
+}
+
+/**
+ * ->llo_print() method for LLT_RAID0 layout: prints the stripe count and
+ * then every sub-object, or a placeholder line for an absent one.
+ */
+static int lov_print_raid0(const struct lu_env *env, void *cookie,
+                           lu_printer_t p, const struct lu_object *o)
+{
+        struct lov_layout_raid0 *r0 = lov_r0(lu2lov(o));
+        int                      i;
+
+        p(env, cookie, "stripes: %d:\n", r0->lo_nr);
+        for (i = 0; i < r0->lo_nr; ++i) {
+                if (r0->lo_sub[i] == NULL) {
+                        p(env, cookie, "sub %d absent\n", i);
+                        continue;
+                }
+                lu_object_print(env, cookie, p, lovsub2lu(r0->lo_sub[i]));
+        }
+        return 0;
+}
+
+/**
+ * Implements cl_object_operations::coo_attr_get() method for an object
+ * without stripes (LLT_EMPTY layout type).
+ *
+ * The only attribute this layer is authoritative for in this case is
+ * cl_attr::cat_blocks, and it is 0.
+ */
+static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj,
+                              struct cl_attr *attr)
+{
+        /* no objects, hence no blocks */
+        attr->cat_blocks = 0;
+        return 0;
+}
+
+/**
+ * ->llo_getattr() method for LLT_RAID0 layout.
+ *
+ * Returns the attributes cached in the layout state when they are marked
+ * valid. Otherwise merges stripe attributes into \a attr with
+ * lov_merge_lvb_kms() and, on success, caches the result.
+ */
+static int lov_attr_get_raid0(const struct lu_env *env, struct cl_object *obj,
+                              struct cl_attr *attr)
+{
+        struct lov_object       *lov = cl2lov(obj);
+        struct lov_layout_raid0 *r0  = lov_r0(lov);
+        struct lov_stripe_md    *lsm = lov->u.raid0.lo_lsm;
+        struct ost_lvb          *lvb = &lov_env_info(env)->lti_lvb;
+        __u64                    kms;
+        int                      result;
+
+        ENTRY;
+        if (r0->lo_attr_valid) {
+                /* fast path: serve the cached copy */
+                *attr = r0->lo_attr;
+                RETURN(0);
+        }
+
+        /*
+         * Fill LVB with attributes already initialized by the upper layer.
+         */
+        cl_attr2lvb(lvb, attr);
+        kms = attr->cat_kms;
+
+        /*
+         * XXX that should be replaced with a loop over sub-objects, doing
+         * cl_object_attr_get() on them. But for now, let's reuse old lov
+         * code.
+         *
+         * XXX take lsm spin-lock to keep lov_merge_lvb_kms() happy. It's
+         * not needed, because new code uses ->coh_attr_guard spin-lock to
+         * protect consistency of sub-object attributes.
+         */
+        lov_stripe_lock(lsm);
+        result = lov_merge_lvb_kms(lsm, lvb, &kms);
+        lov_stripe_unlock(lsm);
+        if (result == 0) {
+                cl_lvb2attr(attr, lvb);
+                attr->cat_kms = kms;
+                r0->lo_attr_valid = 1;
+                r0->lo_attr = *attr;
+        }
+        RETURN(result);
+}
+
+/**
+ * Per-layout method tables, indexed by enum lov_layout_type.
+ *
+ * NOTE(review): ->llo_lock_init is NULL for LLT_EMPTY while lov_lock_init()
+ * dispatches through it unconditionally; presumably locks are never
+ * requested on an object without stripes -- confirm with the callers.
+ */
+static const struct lov_layout_operations lov_dispatch[] = {
+        [LLT_EMPTY] = {
+                .llo_init      = lov_init_empty,
+                .llo_delete    = lov_delete_empty,
+                .llo_fini      = lov_fini_empty,
+                .llo_install   = lov_install_empty,
+                .llo_print     = lov_print_empty,
+                .llo_page_init = lov_page_init_empty,
+                .llo_lock_init = NULL,
+                .llo_io_init   = lov_io_init_empty,
+                .llo_getattr   = lov_attr_get_empty
+        },
+        [LLT_RAID0] = {
+                .llo_init      = lov_init_raid0,
+                .llo_delete    = lov_delete_raid0,
+                .llo_fini      = lov_fini_raid0,
+                .llo_install   = lov_install_raid0,
+                .llo_print     = lov_print_raid0,
+                .llo_page_init = lov_page_init_raid0,
+                .llo_lock_init = lov_lock_init_raid0,
+                .llo_io_init   = lov_io_init_raid0,
+                .llo_getattr   = lov_attr_get_raid0
+        }
+};
+
+
+/**
+ * Performs a double-dispatch based on the layout type of an object.
+ *
+ * Reads lov_object::lo_type of \a obj without taking lo_type_guard, asserts
+ * that it is a valid index into lov_dispatch[], and evaluates to the result
+ * of calling method \a op of that entry with the remaining arguments.
+ */
+#define LOV_2DISPATCH_NOLOCK(obj, op, ...)                              \
+({                                                                      \
+        struct lov_object                      *__obj = (obj);          \
+        enum lov_layout_type                    __llt;                  \
+                                                                        \
+        __llt = __obj->lo_type;                                         \
+        LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch));        \
+        lov_dispatch[__llt].op(__VA_ARGS__);                            \
+})
+
+/**
+ * Same as LOV_2DISPATCH_NOLOCK(), but holds lov_object::lo_type_guard for
+ * read around the call when \a lock is non-zero and the current thread is
+ * not recorded in lov_object::lo_owner (i.e., is not the thread that
+ * already holds the semaphore for write in lov_conf_set()).
+ *
+ * Note that \a obj is expanded twice: once here and once in the nested
+ * LOV_2DISPATCH_NOLOCK(), so it must be free of side effects.
+ */
+#define LOV_2DISPATCH_MAYLOCK(obj, op, lock, ...)                       \
+({                                                                      \
+        struct lov_object                      *__obj = (obj);          \
+        int                                     __lock = !!(lock);      \
+        typeof(lov_dispatch[0].op(__VA_ARGS__)) __result;               \
+                                                                        \
+        __lock &= __obj->lo_owner != cfs_current();                     \
+        if (__lock)                                                     \
+                down_read(&__obj->lo_type_guard);                       \
+        __result = LOV_2DISPATCH_NOLOCK(obj, op, __VA_ARGS__);          \
+        if (__lock)                                                     \
+                up_read(&__obj->lo_type_guard);                         \
+        __result;                                                       \
+})
+
+/**
+ * Performs a locked double-dispatch based on the layout type of an object.
+ *
+ * Shorthand for LOV_2DISPATCH_MAYLOCK() with locking requested.
+ */
+#define LOV_2DISPATCH(obj, op, ...)                     \
+        LOV_2DISPATCH_MAYLOCK(obj, op, 1, __VA_ARGS__)
+
+/**
+ * Locked double-dispatch for methods returning void.
+ *
+ * lov_object::lo_type_guard is taken for read around the call unless the
+ * current thread is recorded in lov_object::lo_owner. The owner check is
+ * evaluated separately before and after the call.
+ */
+#define LOV_2DISPATCH_VOID(obj, op, ...)                                \
+do {                                                                    \
+        struct lov_object                      *__obj = (obj);          \
+        enum lov_layout_type                    __llt;                  \
+                                                                        \
+        if (__obj->lo_owner != cfs_current())                           \
+                down_read(&__obj->lo_type_guard);                       \
+        __llt = __obj->lo_type;                                         \
+        LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch));        \
+        lov_dispatch[__llt].op(__VA_ARGS__);                            \
+        if (__obj->lo_owner != cfs_current())                           \
+                up_read(&__obj->lo_type_guard);                         \
+} while (0)
+
+/**
+ * Switches \a obj from its current layout type to \a llt.
+ *
+ * New layout state is built first in a scratch state taken from
+ * lov_env_info(). Then everything cached under the old layout is pruned in
+ * a nested environment, the old state is finalized and the new one is
+ * installed.
+ *
+ * If either building the new state or obtaining the nested environment
+ * fails, the new state is released and \a obj keeps its old layout.
+ *
+ * The only caller in this file, lov_conf_set(), holds lo_type_guard for
+ * write.
+ *
+ * \retval 0   layout has been changed
+ * \retval -ve error from ->llo_init() or from cl_env_get()
+ */
+static int lov_layout_change(const struct lu_env *env,
+                             struct lov_object *obj, enum lov_layout_type llt,
+                             const struct cl_object_conf *conf)
+{
+        int result;
+        union lov_layout_state       *state = &lov_env_info(env)->lti_state;
+        const struct lov_layout_operations *old_ops;
+        const struct lov_layout_operations *new_ops;
+        struct cl_object_header      *hdr;
+        struct lu_env                *nested;
+        void                         *cookie;
+        int                           refcheck;
+
+        LASSERT(0 <= obj->lo_type && obj->lo_type < ARRAY_SIZE(lov_dispatch));
+        LASSERT(0 <= llt && llt < ARRAY_SIZE(lov_dispatch));
+        ENTRY;
+
+        old_ops = &lov_dispatch[obj->lo_type];
+        new_ops = &lov_dispatch[llt];
+
+        result = new_ops->llo_init(env, lu2lov_dev(obj->lo_cl.co_lu.lo_dev),
+                                   obj, conf, state);
+        if (result != 0) {
+                new_ops->llo_fini(env, obj, state);
+                RETURN(result);
+        }
+
+        cookie = cl_env_reenter();
+        nested = cl_env_get(&refcheck);
+        if (!IS_ERR(nested)) {
+                cl_object_prune(nested, &obj->lo_cl);
+                /* only a successfully obtained environment may be put */
+                cl_env_put(nested, &refcheck);
+        } else
+                result = PTR_ERR(nested);
+        cl_env_reexit(cookie);
+
+        if (result != 0) {
+                /*
+                 * Nothing was pruned, so the old layout cannot be torn
+                 * down: drop the freshly built state and keep the object
+                 * as it was.
+                 */
+                new_ops->llo_fini(env, obj, state);
+                RETURN(result);
+        }
+
+        hdr = cl_object_header(&obj->lo_cl);
+        old_ops->llo_fini(env, obj, &obj->u);
+        /* after pruning no locks or pages may be left on the object */
+        LASSERT(list_empty(&hdr->coh_locks));
+        LASSERT(hdr->coh_tree.rnode == NULL);
+        LASSERT(hdr->coh_pages == 0);
+
+        new_ops->llo_install(env, obj, state);
+        obj->lo_type = llt;
+        RETURN(0);
+}
+
+/*****************************************************************************
+ *
+ * Lov object operations.
+ *
+ */
+
+/**
+ * Initializes a freshly allocated lov object: selects the layout type from
+ * the stripe md found in \a conf (LLT_RAID0 when one is present, LLT_EMPTY
+ * otherwise), builds the layout state and installs it.
+ *
+ * On failure of ->llo_init() the partially built state is released through
+ * ->llo_fini() and the error is returned.
+ */
+int lov_object_init(const struct lu_env *env, struct lu_object *obj,
+                    const struct lu_object_conf *conf)
+{
+        struct lov_device            *dev   = lu2lov_dev(obj->lo_dev);
+        struct lov_object            *lov   = lu2lov(obj);
+        const struct cl_object_conf  *cconf = lu2cl_conf(conf);
+        union  lov_layout_state      *set   = &lov_env_info(env)->lti_state;
+        const struct lov_layout_operations *ops;
+        int result;
+
+        ENTRY;
+        init_rwsem(&lov->lo_type_guard);
+
+        /* no locking is necessary, as object is being created */
+        if (cconf->u.coc_md->lsm != NULL)
+                lov->lo_type = LLT_RAID0;
+        else
+                lov->lo_type = LLT_EMPTY;
+
+        ops = &lov_dispatch[lov->lo_type];
+        result = ops->llo_init(env, dev, lov, cconf, set);
+        if (result != 0) {
+                ops->llo_fini(env, lov, set);
+                RETURN(result);
+        }
+        ops->llo_install(env, lov, set);
+        RETURN(result);
+}
+
+/**
+ * Implements cl_object_operations::coo_conf_set() for the lov layer.
+ *
+ * Takes lo_type_guard for write, records the current thread as the owner
+ * for the duration of the change (so that dispatch macros do not try to
+ * take the semaphore again), and changes the layout.
+ *
+ * \retval -EOPNOTSUPP for anything but an LLT_EMPTY object being given a
+ *                     stripe md
+ */
+static int lov_conf_set(const struct lu_env *env, struct cl_object *obj,
+                        const struct cl_object_conf *conf)
+{
+        struct lov_object *lov = cl2lov(obj);
+        int result = -EOPNOTSUPP;
+
+        ENTRY;
+        LASSERT(lov->lo_owner != cfs_current());
+        down_write(&lov->lo_type_guard);
+        LASSERT(lov->lo_owner == NULL);
+        lov->lo_owner = cfs_current();
+        /*
+         * Currently only LLT_EMPTY -> LLT_RAID0 transition is supported.
+         */
+        if (lov->lo_type == LLT_EMPTY && conf->u.coc_md->lsm != NULL)
+                result = lov_layout_change(env, lov, LLT_RAID0, conf);
+        lov->lo_owner = NULL;
+        up_write(&lov->lo_type_guard);
+        RETURN(result);
+}
+
+/**
+ * Implements lu_object_operations::loo_object_delete(): forwards to the
+ * ->llo_delete() method of the current layout.
+ */
+static void lov_object_delete(const struct lu_env *env, struct lu_object *obj)
+{
+        struct lov_object      *lov   = lu2lov(obj);
+        union lov_layout_state *state = &lov->u;
+
+        ENTRY;
+        LOV_2DISPATCH_VOID(lov, llo_delete, env, lov, state);
+        EXIT;
+}
+
+/**
+ * Implements lu_object_operations::loo_object_free(): releases the layout
+ * state through ->llo_fini(), finalizes the lu_object and returns the
+ * lov_object to its slab.
+ */
+static void lov_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+        struct lov_object      *lov   = lu2lov(obj);
+        union lov_layout_state *state = &lov->u;
+
+        ENTRY;
+        LOV_2DISPATCH_VOID(lov, llo_fini, env, lov, state);
+        lu_object_fini(obj);
+        OBD_SLAB_FREE_PTR(lov, lov_object_kmem);
+        EXIT;
+}
+
+/**
+ * Implements lu_object_operations::loo_object_print(): forwards to the
+ * ->llo_print() method of the current layout, under lo_type_guard.
+ */
+static int lov_object_print(const struct lu_env *env, void *cookie,
+                            lu_printer_t p, const struct lu_object *o)
+{
+        int rc;
+
+        rc = LOV_2DISPATCH(lu2lov(o), llo_print, env, cookie, p, o);
+        return rc;
+}
+
+/**
+ * Implements cl_object_operations::coo_page_init() for the lov layer:
+ * forwards to the ->llo_page_init() method of the current layout, under
+ * lo_type_guard.
+ */
+struct cl_page *lov_page_init(const struct lu_env *env, struct cl_object *obj,
+                              struct cl_page *page, cfs_page_t *vmpage)
+{
+        struct cl_page *result;
+
+        result = LOV_2DISPATCH(cl2lov(obj),
+                               llo_page_init, env, obj, page, vmpage);
+        return result;
+}
+
+/**
+ * Implements cl_object_operations::clo_io_init() method for lov
+ * layer. Dispatches to the appropriate layout io initialization method.
+ */
+int lov_io_init(const struct lu_env *env, struct cl_object *obj,
+                struct cl_io *io)
+{
+        int maylock;
+
+        CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl);
+        /*
+         * The layout lock is not taken for CIT_MISC io, because
+         *
+         *     - if this is an io for a glimpse, then we don't care;
+         *
+         *     - if this is not a glimpse (writepage or lock cancellation),
+         *       then layout change cannot happen because a page or a lock
+         *       already exists; and
+         *
+         *     - lock ordering (lock mutex nests within layout rw-semaphore)
+         *       is obeyed in case of lock cancellation.
+         */
+        maylock = io->ci_type != CIT_MISC;
+        return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init,
+                                     maylock, env, obj, io);
+}
+
+/**
+ * An implementation of cl_object_operations::clo_attr_get() method for lov
+ * layer. For raid0 layout this collects and merges attributes of all
+ * sub-objects.
+ */
+static int lov_attr_get(const struct lu_env *env, struct cl_object *obj,
+                        struct cl_attr *attr)
+{
+        int rc;
+
+        /*
+         * The layout lock is not taken, as this function is called under a
+         * spin-lock. Layout is protected from changing by ongoing IO.
+         */
+        rc = LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_getattr, env, obj, attr);
+        return rc;
+}
+
+/**
+ * Implements cl_object_operations::coo_attr_set() for the lov layer.
+ * Does nothing and reports success.
+ */
+static int lov_attr_set(const struct lu_env *env, struct cl_object *obj,
+                        const struct cl_attr *attr, unsigned valid)
+{
+        /* no layout implements this, so there is nothing to dispatch to */
+        return 0;
+}
+
+/**
+ * Implements cl_object_operations::coo_lock_init() for the lov layer:
+ * forwards to the ->llo_lock_init() method of the current layout, under
+ * lo_type_guard.
+ */
+int lov_lock_init(const struct lu_env *env, struct cl_object *obj,
+                  struct cl_lock *lock, const struct cl_io *io)
+{
+        int rc;
+
+        rc = LOV_2DISPATCH(cl2lov(obj), llo_lock_init, env, obj, lock, io);
+        return rc;
+}
+
+/**
+ * cl_object methods of a lov object; installed into cl_object::co_ops by
+ * lov_object_alloc(). Every method except ->coo_attr_set and ->coo_conf_set
+ * dispatches on the layout type.
+ */
+static const struct cl_object_operations lov_ops = {
+        .coo_page_init = lov_page_init,
+        .coo_lock_init = lov_lock_init,
+        .coo_io_init   = lov_io_init,
+        .coo_attr_get  = lov_attr_get,
+        .coo_attr_set  = lov_attr_set,
+        .coo_conf_set  = lov_conf_set
+};
+
+/**
+ * lu_object methods of a lov object; installed into lu_object::lo_ops by
+ * lov_object_alloc(). The release and invariant methods are not provided.
+ */
+static const struct lu_object_operations lov_lu_obj_ops = {
+        .loo_object_init      = lov_object_init,
+        .loo_object_delete    = lov_object_delete,
+        .loo_object_release   = NULL,
+        .loo_object_free      = lov_object_free,
+        .loo_object_print     = lov_object_print,
+        .loo_object_invariant = NULL
+};
+
+/**
+ * Allocates a lov object from lov_object_kmem and sets up its operation
+ * vectors. The layout type is left invalid until lov_object_init() runs.
+ *
+ * \retval NULL when the allocation fails
+ */
+struct lu_object *lov_object_alloc(const struct lu_env *env,
+                                   const struct lu_object_header *_,
+                                   struct lu_device *dev)
+{
+        struct lov_object *lov;
+        struct lu_object  *obj;
+
+        ENTRY;
+        OBD_SLAB_ALLOC_PTR(lov, lov_object_kmem);
+        if (lov == NULL)
+                RETURN(NULL);
+
+        obj = lov2lu(lov);
+        lu_object_init(obj, NULL, dev);
+        lov->lo_cl.co_ops = &lov_ops;
+        /* invalid value, to catch use of an uninitialized type */
+        lov->lo_type = -1;
+        /*
+         * object io operation vector (cl_object::co_iop) is installed
+         * later in lov_object_init(), as different vectors are used
+         * for object with different layouts.
+         */
+        obj->lo_ops = &lov_lu_obj_ops;
+        RETURN(obj);
+}
+
+/** @} lov */
diff --git a/lustre/lov/lov_page.c b/lustre/lov/lov_page.c

new file mode 100644 (file)

index 0000000..3efbc41
--- /dev/null
+++ b/lustre/lov/lov_page.c
@@ -0,0 +1,227 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for LOV layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov lov @{ */
+
+/*****************************************************************************
+ *
+ * Lov page operations.
+ *
+ */
+
+/**
+ * Consistency check for a lov page slice: when a sub-page is attached, the
+ * top page and the sub-page must point at each other and be in the same
+ * state. A slice without a sub-page is always consistent.
+ */
+static int lov_page_invariant(const struct cl_page_slice *slice)
+{
+        const struct cl_page  *page = slice->cpl_page;
+        const struct cl_page  *sub  = lov_sub_page(slice);
+
+        if (sub == NULL)
+                return 1;
+        return page->cp_child == sub &&
+               sub->cp_parent == page &&
+               page->cp_state == sub->cp_state;
+}
+
+/**
+ * cl_page_operations::cpo_fini() for a raid0 lov page: unlinks and releases
+ * the sub-page, if any, and frees the lov_page slice.
+ */
+static void lov_page_fini(const struct lu_env *env,
+                          struct cl_page_slice *slice)
+{
+        struct lov_page *lp  = cl2lov_page(slice);
+        struct cl_page  *sub = lov_sub_page(slice);
+        struct cl_page  *top = slice->cpl_page;
+
+        LINVRNT(lov_page_invariant(slice));
+        ENTRY;
+
+        if (sub != NULL) {
+                LASSERT(sub->cp_state == CPS_FREEING);
+                lu_ref_del(&sub->cp_reference, "lov", sub->cp_parent);
+                /* break the link in both directions before the put */
+                top->cp_child = NULL;
+                sub->cp_parent = NULL;
+                cl_page_put(env, sub);
+        }
+        OBD_SLAB_FREE_PTR(lp, lov_page_kmem);
+        EXIT;
+}
+
+/**
+ * cl_page_operations::cpo_own() for a raid0 lov page: makes the sub-io that
+ * corresponds to this page the owner of the sub-page. Failure to find the
+ * sub-io is fatal.
+ */
+static void lov_page_own(const struct lu_env *env,
+                         const struct cl_page_slice *slice, struct cl_io *io)
+{
+        struct lov_io     *lio = lov_env_io(env);
+        struct lov_io_sub *sub;
+
+        LINVRNT(lov_page_invariant(slice));
+        LINVRNT(!cl2lov_page(slice)->lps_invalid);
+        ENTRY;
+
+        sub = lov_page_subio(env, lio, slice);
+        if (IS_ERR(sub))
+                LBUG(); /* Arrgh */
+        else {
+                lov_sub_page(slice)->cp_owner = sub->sub_io;
+                lov_sub_put(sub);
+        }
+        EXIT;
+}
+
+/**
+ * cl_page_operations::cpo_assume() for a raid0 lov page. Ownership of the
+ * sub-page is set up exactly as in lov_page_own().
+ */
+static void lov_page_assume(const struct lu_env *env,
+                            const struct cl_page_slice *slice, struct cl_io *io)
+{
+        /* plain call: a void function may not return an expression */
+        lov_page_own(env, slice, io);
+}
+
+/**
+ * cl_page_operations::cpo_print() for lov pages: prints the address of the
+ * lov_page slice and returns whatever \a printer returned.
+ */
+static int lov_page_print(const struct lu_env *env,
+                          const struct cl_page_slice *slice,
+                          void *cookie, lu_printer_t printer)
+{
+        struct lov_page *lp = cl2lov_page(slice);
+        int              rc;
+
+        rc = printer(env, cookie, LUSTRE_LOV_NAME"-page@%p\n", lp);
+        return rc;
+}
+
+/**
+ * Page operations for pages of an object with raid0 layout; attached by
+ * lov_page_init_raid0(). ->cpo_assume delegates to the ->cpo_own
+ * implementation.
+ */
+static const struct cl_page_operations lov_page_ops = {
+        .cpo_fini   = lov_page_fini,
+        .cpo_own    = lov_page_own,
+        .cpo_assume = lov_page_assume,
+        .cpo_print  = lov_page_print
+};
+
+/**
+ * cl_page_operations::cpo_fini() for a page of an object without stripes:
+ * there is no sub-page, so only the lov_page slice is freed.
+ */
+static void lov_empty_page_fini(const struct lu_env *env,
+                                struct cl_page_slice *slice)
+{
+        struct lov_page *lp;
+
+        LASSERT(slice->cpl_page->cp_child == NULL);
+        ENTRY;
+        lp = cl2lov_page(slice);
+        OBD_SLAB_FREE_PTR(lp, lov_page_kmem);
+        EXIT;
+}
+
+/**
+ * ->llo_page_init() method for LLT_RAID0 layout.
+ *
+ * Allocates the lov slice of \a page, maps the page offset to a stripe and
+ * an offset within it, finds the matching sub-page on the stripe object and
+ * links it under \a page.
+ *
+ * \retval NULL       the slice was added and the sub-page attached
+ * \retval ERR_PTR()  allocation of the slice or lookup of the sub-page
+ *                    failed; nothing is left attached to \a page
+ * \retval other      the sub-page already belongs to another top-page (see
+ *                    the comment in the body); that top-page is returned
+ *                    and the slice added to \a page is marked invalid
+ */
+struct cl_page *lov_page_init_raid0(const struct lu_env *env,
+                                    struct cl_object *obj, struct cl_page *page,
+                                    cfs_page_t *vmpage)
+{
+        struct lov_object       *loo = cl2lov(obj);
+        struct lov_layout_raid0 *r0;
+        struct lov_page         *lpg;
+        struct cl_page          *subpage;
+        struct cl_object        *subobj;
+        /*
+         * Kept as a pointer all the way through: squeezing a page address
+         * through an int would truncate it on 64-bit platforms.
+         */
+        struct cl_page          *result = NULL;
+        loff_t                   offset;
+        obd_off                  suboff;
+        int                      stripe;
+        int                      rc;
+
+        ENTRY;
+        OBD_SLAB_ALLOC_PTR(lpg, lov_page_kmem);
+        if (lpg == NULL)
+                RETURN(ERR_PTR(-ENOMEM));
+
+        r0     = lov_r0(loo);
+        offset = cl_offset(obj, page->cp_index);
+        stripe = lov_stripe_number(r0->lo_lsm, offset);
+        rc     = lov_stripe_offset(r0->lo_lsm, offset, stripe, &suboff);
+        LASSERT(stripe < r0->lo_nr);
+        LASSERT(rc == 0);
+
+        subobj  = lovsub2cl(r0->lo_sub[stripe]);
+        subpage = cl_page_find(env, subobj,
+                               cl_index(subobj, suboff), vmpage,
+                               page->cp_type);
+        if (IS_ERR(subpage)) {
+                /*
+                 * The slice has not been added to the page yet, so no
+                 * ->cpo_fini() will ever see it: free it here.
+                 */
+                OBD_SLAB_FREE_PTR(lpg, lov_page_kmem);
+                RETURN(ERR_PTR(PTR_ERR(subpage)));
+        }
+
+        if (subpage->cp_parent != NULL) {
+                /*
+                 * This is only possible when TRANSIENT page is being
+                 * created, and CACHEABLE sub-page (attached to already
+                 * existing top-page) has been found. Tell cl_page_find()
+                 * to use existing page.
+                 */
+                LASSERT(subpage->cp_type == CPT_CACHEABLE);
+                LASSERT(page->cp_type == CPT_TRANSIENT);
+                lpg->lps_invalid = 1;
+                /*
+                 * XXX This assumes that lov is in the topmost cl_page.
+                 *
+                 * Look the top-page up before our reference on the
+                 * sub-page is dropped.
+                 */
+                result = cl_page_top(subpage);
+                cl_page_put(env, subpage);
+        } else {
+                lu_ref_add(&subpage->cp_reference, "lov", page);
+                subpage->cp_parent = page;
+                page->cp_child = subpage;
+        }
+        cl_page_slice_add(page, &lpg->lps_cl, obj, &lov_page_ops);
+        RETURN(result);
+}
+
+
+/**
+ * Page operations for pages of an object without stripes; attached by
+ * lov_page_init_empty(). No ownership methods are provided.
+ */
+static const struct cl_page_operations lov_empty_page_ops = {
+        .cpo_fini   = lov_empty_page_fini,
+        .cpo_print  = lov_page_print
+};
+
+/**
+ * ->llo_page_init() method for LLT_EMPTY layout. Adds a lov slice to
+ * \a page, fills the VM page with zeroes and exports the page.
+ *
+ * \retval NULL              on success
+ * \retval ERR_PTR(-ENOMEM)  when the slice cannot be allocated
+ */
+struct cl_page *lov_page_init_empty(const struct lu_env *env,
+                                    struct cl_object *obj, struct cl_page *page,
+                                    cfs_page_t *vmpage)
+{
+        struct lov_page *lpg;
+        void            *addr;
+
+        ENTRY;
+        OBD_SLAB_ALLOC_PTR(lpg, lov_page_kmem);
+        if (lpg == NULL)
+                RETURN(ERR_PTR(-ENOMEM));
+
+        cl_page_slice_add(page, &lpg->lps_cl,
+                          obj, &lov_empty_page_ops);
+        /* a file without objects reads as zeroes */
+        addr = cfs_kmap(vmpage);
+        memset(addr, 0, cl_page_size(obj));
+        cfs_kunmap(vmpage);
+        cl_page_export(env, page);
+        RETURN(NULL);
+}
+
+
+/** @} lov */
+
diff --git a/lustre/lov/lov_request.c b/lustre/lov/lov_request.c

index 00805bb..ba95f06 100644 (file)
--- a/lustre/lov/lov_request.c
+++ b/lustre/lov/lov_request.c
@@ -113,7 +113,7 @@ int lov_update_common_set(struct lov_request_set *set,
          lov_update_set(set, req, rc);
  
          /* grace error on inactive ost */
-        if (rc && !(lov->lov_tgts[req->rq_idx] && 
+        if (rc && !(lov->lov_tgts[req->rq_idx] &&
                      lov->lov_tgts[req->rq_idx]->ltd_active))
                  rc = 0;
  
@@ -127,18 +127,44 @@ void lov_set_add_req(struct lov_request *req, struct lov_request_set *set)
          set->set_count++;
  }
  
+extern void osc_update_enqueue(struct lustre_handle *lov_lockhp,
+                               struct lov_oinfo *loi, int flags,
+                               struct ost_lvb *lvb, __u32 mode, int rc);
+
+/**
+ * LOV part of enqueue completion for a single stripe.
+ *
+ * Nothing is done when \a rc is ELDLM_OK, or when it is ELDLM_LOCK_ABORTED
+ * and \a flags carries LDLM_FL_HAS_INTENT. For any other \a rc the lock
+ * handle \a lov_lockhp is zeroed; then, if target \a idx is missing or
+ * inactive, the error is replaced by ELDLM_OK, otherwise it is logged
+ * (except -EINTR and -EUSERS) and returned unchanged.
+ *
+ * \retval the possibly adjusted \a rc
+ */
+static int lov_update_enqueue_lov(struct obd_export *exp,
+                                  struct lustre_handle *lov_lockhp,
+                                  struct lov_oinfo *loi, int flags, int idx,
+                                  __u64 oid, int rc)
+{
+        struct lov_obd *lov = &exp->exp_obd->u.lov;
+
+        if (rc == ELDLM_OK)
+                return rc;
+        if (rc == ELDLM_LOCK_ABORTED && (flags & LDLM_FL_HAS_INTENT))
+                return rc;
+
+        memset(lov_lockhp, 0, sizeof(*lov_lockhp));
+
+        /* errors from a missing or inactive target are forgiven */
+        if (!lov->lov_tgts[idx] || !lov->lov_tgts[idx]->ltd_active)
+                return ELDLM_OK;
+
+        /* -EUSERS used by OST to report file contention */
+        if (rc != -EINTR && rc != -EUSERS)
+                CERROR("enqueue objid "LPX64" subobj "
+                       LPX64" on OST idx %d: rc %d\n",
+                       oid, loi->loi_id, loi->loi_ost_idx, rc);
+        return rc;
+}
+
  int lov_update_enqueue_set(struct lov_request *req, __u32 mode, int rc)
  {
          struct lov_request_set *set = req->rq_rqset;
          struct lustre_handle *lov_lockhp;
+        struct obd_info *oi = set->set_oi;
          struct lov_oinfo *loi;
          ENTRY;
  
-        LASSERT(set != NULL);
-        LASSERT(set->set_oi != NULL);
+        LASSERT(oi != NULL);
  
          lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe;
-        loi = set->set_oi->oi_md->lsm_oinfo[req->rq_stripe];
+        loi = oi->oi_md->lsm_oinfo[req->rq_stripe];
  
          /* XXX LOV STACKING: OSC gets a copy, created in lov_prep_enqueue_set
           * and that copy can be arbitrarily out of date.
@@ -146,65 +172,22 @@ int lov_update_enqueue_set(struct lov_request *req, __u32 mode, int rc)
           * The LOV API is due for a serious rewriting anyways, and this
           * can be addressed then. */
  
-        if (rc == ELDLM_OK) {
-                struct ldlm_lock *lock = ldlm_handle2lock(lov_lockhp);
-                __u64 tmp;
-
-                LASSERT(lock != NULL);
-                lov_stripe_lock(set->set_oi->oi_md);
-                loi->loi_lvb = req->rq_oi.oi_md->lsm_oinfo[0]->loi_lvb;
-                tmp = loi->loi_lvb.lvb_size;
-                /* Extend KMS up to the end of this lock and no further
-                 * A lock on [x,y] means a KMS of up to y + 1 bytes! */
-                if (tmp > lock->l_policy_data.l_extent.end)
-                        tmp = lock->l_policy_data.l_extent.end + 1;
-                if (tmp >= loi->loi_kms) {
-                        LDLM_DEBUG(lock, "lock acquired, setting rss="LPU64
-                                   ", kms="LPU64, loi->loi_lvb.lvb_size, tmp);
-                        loi->loi_kms = tmp;
-                        loi->loi_kms_valid = 1;
-                } else {
-                        LDLM_DEBUG(lock, "lock acquired, setting rss="
-                                   LPU64"; leaving kms="LPU64", end="LPU64,
-                                   loi->loi_lvb.lvb_size, loi->loi_kms,
-                                   lock->l_policy_data.l_extent.end);
-                }
-                lov_stripe_unlock(set->set_oi->oi_md);
-                ldlm_lock_allow_match(lock);
-                LDLM_LOCK_PUT(lock);
-        } else if ((rc == ELDLM_LOCK_ABORTED) &&
-                   (set->set_oi->oi_flags & LDLM_FL_HAS_INTENT)) {
-                memset(lov_lockhp, 0, sizeof(*lov_lockhp));
-                lov_stripe_lock(set->set_oi->oi_md);
-                loi->loi_lvb = req->rq_oi.oi_md->lsm_oinfo[0]->loi_lvb;
-                lov_stripe_unlock(set->set_oi->oi_md);
-                CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving"
-                       " kms="LPU64"\n", loi->loi_lvb.lvb_size, loi->loi_kms);
-                rc = ELDLM_OK;
-        } else {
-                struct obd_export *exp = set->set_exp;
-                struct lov_obd *lov = &exp->exp_obd->u.lov;
-
-                memset(lov_lockhp, 0, sizeof(*lov_lockhp));
-                if (lov->lov_tgts[req->rq_idx] && 
-                    lov->lov_tgts[req->rq_idx]->ltd_active) {
-                        /* -EUSERS used by OST to report file contention */
-                        if (rc != -EINTR && rc != -EUSERS)
-                                CERROR("enqueue objid "LPX64" subobj "
-                                       LPX64" on OST idx %d: rc %d\n",
-                                       set->set_oi->oi_md->lsm_object_id,
-                                       loi->loi_id, loi->loi_ost_idx, rc);
-                } else {
-                        rc = ELDLM_OK;
-                }
-        }
+        lov_stripe_lock(oi->oi_md);
+        osc_update_enqueue(lov_lockhp, loi, oi->oi_flags,
+                           &req->rq_oi.oi_md->lsm_oinfo[0]->loi_lvb, mode, rc);
+        if (rc == ELDLM_LOCK_ABORTED && (oi->oi_flags & LDLM_FL_HAS_INTENT))
+                memset(lov_lockhp, 0, sizeof *lov_lockhp);
+        rc = lov_update_enqueue_lov(set->set_exp, lov_lockhp, loi, oi->oi_flags,
+                                    req->rq_idx, oi->oi_md->lsm_object_id, rc);
+        lov_stripe_unlock(oi->oi_md);
          lov_update_set(set, req, rc);
          RETURN(rc);
  }
  
  /* The callback for osc_enqueue that updates lov info for every OSC request. */
-static int cb_update_enqueue(struct obd_info *oinfo, int rc)
+static int cb_update_enqueue(void *cookie, int rc)
  {
+        struct obd_info *oinfo = cookie;
          struct ldlm_enqueue_info *einfo;
          struct lov_request *lovreq;
  
@@ -877,9 +860,9 @@ int lov_prep_brw_set(struct obd_export *exp, struct obd_info *oinfo,
  
                  if (info[i].count == 0)
                          continue;
-                
+
                  loi = oinfo->oi_md->lsm_oinfo[i];
-                if (!lov->lov_tgts[loi->loi_ost_idx] || 
+                if (!lov->lov_tgts[loi->loi_ost_idx] ||
                      !lov->lov_tgts[loi->loi_ost_idx]->ltd_active) {
                          CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
                          GOTO(out, rc = -EIO);
@@ -972,8 +955,9 @@ int lov_fini_getattr_set(struct lov_request_set *set)
  
  /* The callback for osc_getattr_async that finilizes a request info when a
   * response is recieved. */
-static int cb_getattr_update(struct obd_info *oinfo, int rc)
+static int cb_getattr_update(void *cookie, int rc)
  {
+        struct obd_info *oinfo = cookie;
          struct lov_request *lovreq;
          lovreq = container_of(oinfo, struct lov_request, rq_oi);
          return lov_update_common_set(lovreq->rq_rqset, lovreq, rc);
@@ -1081,7 +1065,7 @@ int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo,
                  struct lov_request *req;
  
                  loi = lsm->lsm_oinfo[i];
-                if (!lov->lov_tgts[loi->loi_ost_idx] || 
+                if (!lov->lov_tgts[loi->loi_ost_idx] ||
                      !lov->lov_tgts[loi->loi_ost_idx]->ltd_active) {
                          CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx);
                          continue;
@@ -1140,7 +1124,7 @@ int lov_update_setattr_set(struct lov_request_set *set,
          lov_update_set(set, req, rc);
  
          /* grace error on inactive ost */
-        if (rc && !(lov->lov_tgts[req->rq_idx] && 
+        if (rc && !(lov->lov_tgts[req->rq_idx] &&
                      lov->lov_tgts[req->rq_idx]->ltd_active))
                  rc = 0;
  
@@ -1161,8 +1145,9 @@ int lov_update_setattr_set(struct lov_request_set *set,
  
  /* The callback for osc_setattr_async that finilizes a request info when a
   * response is recieved. */
-static int cb_setattr_update(struct obd_info *oinfo, int rc)
+static int cb_setattr_update(void *cookie, int rc)
  {
+        struct obd_info *oinfo = cookie;
          struct lov_request *lovreq;
          lovreq = container_of(oinfo, struct lov_request, rq_oi);
          return lov_update_setattr_set(lovreq->rq_rqset, lovreq, rc);
@@ -1212,7 +1197,7 @@ int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo,
                  memcpy(req->rq_oi.oi_oa, oinfo->oi_oa,
                         sizeof(*req->rq_oi.oi_oa));
                  req->rq_oi.oi_oa->o_id = loi->loi_id;
-                LASSERT(!(req->rq_oi.oi_oa->o_valid & OBD_MD_FLGROUP) 
+                LASSERT(!(req->rq_oi.oi_oa->o_valid & OBD_MD_FLGROUP)
                                  || req->rq_oi.oi_oa->o_gr>0);
                  req->rq_oi.oi_oa->o_stripe_idx = i;
                  req->rq_oi.oi_cb_up = cb_setattr_update;
@@ -1293,8 +1278,9 @@ int lov_update_punch_set(struct lov_request_set *set,
  
  /* The callback for osc_punch that finilizes a request info when a response
   * is recieved. */
-static int cb_update_punch(struct obd_info *oinfo, int rc)
+static int cb_update_punch(void *cookie, int rc)
  {
+        struct obd_info *oinfo = cookie;
          struct lov_request *lovreq;
          lovreq = container_of(oinfo, struct lov_request, rq_oi);
          return lov_update_punch_set(lovreq->rq_rqset, lovreq, rc);
@@ -1576,8 +1562,9 @@ void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs,
  
  /* The callback for osc_statfs_async that finilizes a request info when a
   * response is recieved. */
-static int cb_statfs_update(struct obd_info *oinfo, int rc)
+static int cb_statfs_update(void *cookie, int rc)
  {
+        struct obd_info *oinfo = cookie;
          struct lov_request *lovreq;
          struct obd_statfs *osfs, *lov_sfs;
          struct obd_device *obd;
diff --git a/lustre/lov/lovsub_dev.c b/lustre/lov/lovsub_dev.c

new file mode 100644 (file)

index 0000000..359def4
--- /dev/null
+++ b/lustre/lov/lovsub_dev.c
@@ -0,0 +1,212 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device and cl_device_type for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov lov @{ */
+
+/*****************************************************************************
+ *
+ * Lovsub transfer operations.
+ *
+ */
+
+/**
+ * cl_req_operations::cro_completion() method of lovsub_req_ops: returns the
+ * lovsub request slice to lovsub_req_kmem. \a ioret is not examined.
+ */
+static void lovsub_req_completion(const struct lu_env *env,
+                                  const struct cl_req_slice *slice, int ioret)
+{
+        struct lovsub_req *req_slice;
+
+        ENTRY;
+        req_slice = cl2lovsub_req(slice);
+        OBD_SLAB_FREE_PTR(req_slice, lovsub_req_kmem);
+        EXIT;
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_attr_set() for the lovsub
+ * layer. Lov and lovsub are responsible only for the struct
+ * obdo::o_stripe_idx field, which is filled here from the sub-object's
+ * lso_index.
+ */
+static void lovsub_req_attr_set(const struct lu_env *env,
+                                const struct cl_req_slice *slice,
+                                const struct cl_object *obj,
+                                struct cl_req_attr *attr, obd_valid flags)
+{
+        ENTRY;
+        /*
+         * obdo::o_stripe_idx has no OBD_MD_* flag of its own, so it is
+         * stored regardless of \a flags. It never changes anyway.
+         */
+        attr->cra_oa->o_stripe_idx = cl2lovsub(obj)->lso_index;
+        EXIT;
+}
+
+/**
+ * Request operations of the lovsub layer; installed on every request slice
+ * by lovsub_req_init().
+ */
+static const struct cl_req_operations lovsub_req_ops = {
+        .cro_attr_set   = lovsub_req_attr_set,
+        .cro_completion = lovsub_req_completion
+};
+
+/*****************************************************************************
+ *
+ * Lov-sub device and device type functions.
+ *
+ */
+
+/**
+ * ldto_device_init() method of lovsub_device_type_ops.
+ *
+ * Shares the site of \a d with \a next, initializes \a next through its own
+ * device type and, on success, takes a reference on it and records it in
+ * lovsub_device::acid_next. On failure the site assignment is undone.
+ * \a name is not used.
+ *
+ * \retval 0 on success, otherwise the error returned by the initialization
+ *         of \a next
+ */
+static int lovsub_device_init(const struct lu_env *env, struct lu_device *d,
+                              const char *name, struct lu_device *next)
+{
+        struct lovsub_device  *lsd = lu2lovsub_dev(d);
+        struct lu_device_type *next_type;
+        int result;
+
+        ENTRY;
+        next->ld_site = d->ld_site;
+        next_type = next->ld_type;
+        LASSERT(next_type != NULL);
+        result = next_type->ldt_ops->ldto_device_init(env, next,
+                                                      next_type->ldt_name,
+                                                      NULL);
+        if (result == 0) {
+                lu_device_get(next);
+                lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init);
+                lsd->acid_next = lu2cl_dev(next);
+        } else {
+                /* undo the site assignment made above */
+                next->ld_site = NULL;
+        }
+        RETURN(result);
+}
+
+/**
+ * ldto_device_fini() method of lovsub_device_type_ops: clears the acid_super
+ * and acid_next pointers of the lovsub device and returns the device that
+ * acid_next referred to.
+ */
+static struct lu_device *lovsub_device_fini(const struct lu_env *env,
+                                            struct lu_device *d)
+{
+        struct lovsub_device *lsd;
+        struct lu_device     *below;
+
+        ENTRY;
+        lsd   = lu2lovsub_dev(d);
+        below = cl2lu_dev(lsd->acid_next);
+        lsd->acid_next  = NULL;
+        lsd->acid_super = NULL;
+        RETURN(below);
+}
+
+/**
+ * ldto_device_free() method of lovsub_device_type_ops: finalizes the
+ * cl_device part of \a d, frees the lovsub_device and returns the device
+ * derived from lovsub_device::acid_next.
+ *
+ * NOTE(review): lovsub_device_fini() resets acid_next to NULL; if fini always
+ * runs before free, the value returned here is derived from NULL -- confirm
+ * that this is what callers expect.
+ */
+static struct lu_device *lovsub_device_free(const struct lu_env *env,
+                                            struct lu_device *d)
+{
+        struct lovsub_device *lsd  = lu2lovsub_dev(d);
+        /* captured up front: lsd is freed below */
+        struct lu_device     *next = cl2lu_dev(lsd->acid_next);
+
+        cl_device_fini(lu2cl_dev(d));
+        OBD_FREE_PTR(lsd);
+        return next;
+}
+
+/**
+ * cl_device_operations::cdo_req_init() method of lovsub_cl_ops: allocates a
+ * lovsub request slice from lovsub_req_kmem and adds it to \a req with
+ * lovsub_req_ops.
+ *
+ * \retval 0       on success
+ * \retval -ENOMEM when the slice cannot be allocated
+ */
+static int lovsub_req_init(const struct lu_env *env, struct cl_device *dev,
+                           struct cl_req *req)
+{
+        struct lovsub_req *lsr;
+
+        OBD_SLAB_ALLOC_PTR(lsr, lovsub_req_kmem);
+        if (lsr == NULL)
+                return -ENOMEM;
+
+        cl_req_slice_add(req, &lsr->lsrq_cl, dev, &lovsub_req_ops);
+        return 0;
+}
+
+/**
+ * lu_device operations of the lovsub device, installed by
+ * lovsub_device_alloc(). Only object allocation is implemented; the
+ * configuration and recovery hooks are left NULL.
+ */
+static const struct lu_device_operations lovsub_lu_ops = {
+        .ldo_object_alloc      = lovsub_object_alloc,
+        .ldo_process_config    = NULL,
+        .ldo_recovery_complete = NULL
+};
+
+/**
+ * cl_device operations of the lovsub device, installed by
+ * lovsub_device_alloc().
+ */
+static const struct cl_device_operations lovsub_cl_ops = {
+        .cdo_req_init = lovsub_req_init
+};
+
+/**
+ * ldto_device_alloc() method of lovsub_device_type_ops: allocates a
+ * lovsub_device, initializes its embedded cl_device for type \a t and
+ * installs lovsub_lu_ops and lovsub_cl_ops. \a cfg is not used.
+ *
+ * \retval the lu_device embedded in the new lovsub_device on success
+ * \retval ERR_PTR(-ENOMEM) when the device cannot be allocated
+ * \retval ERR_PTR(rc) when cl_device_init() fails with rc; the allocation is
+ *         released in that case
+ */
+static struct lu_device *lovsub_device_alloc(const struct lu_env *env,
+                                             struct lu_device_type *t,
+                                             struct lustre_cfg *cfg)
+{
+        struct lu_device     *d;
+        struct lovsub_device *lsd;
+        int                   result;
+
+        OBD_ALLOC_PTR(lsd);
+        if (lsd == NULL)
+                return ERR_PTR(-ENOMEM);
+
+        result = cl_device_init(&lsd->acid_cl, t);
+        if (result != 0) {
+                /*
+                 * The device has not been handed to anybody yet, so it has
+                 * to be released here, otherwise it is leaked.
+                 */
+                OBD_FREE_PTR(lsd);
+                return ERR_PTR(result);
+        }
+
+        d = lovsub2lu_dev(lsd);
+        d->ld_ops           = &lovsub_lu_ops;
+        lsd->acid_cl.cd_ops = &lovsub_cl_ops;
+        return d;
+}
+
+/**
+ * Device type operations of lovsub: allocation/freeing and
+ * initialization/finalization of lovsub devices.
+ */
+static const struct lu_device_type_operations lovsub_device_type_ops = {
+        .ldto_device_alloc = lovsub_device_alloc,
+        .ldto_device_free  = lovsub_device_free,
+
+        .ldto_device_init    = lovsub_device_init,
+        .ldto_device_fini    = lovsub_device_fini
+};
+
+#define LUSTRE_LOVSUB_NAME         "lovsub"
+
+/**
+ * The lovsub device type: a LU_DEVICE_CL device named LUSTRE_LOVSUB_NAME,
+ * driven by lovsub_device_type_ops.
+ */
+struct lu_device_type lovsub_device_type = {
+        .ldt_tags     = LU_DEVICE_CL,
+        .ldt_name     = LUSTRE_LOVSUB_NAME,
+        .ldt_ops      = &lovsub_device_type_ops,
+        .ldt_ctx_tags = LCT_CL_THREAD
+};
+
+
+/** @} lov */
+
diff --git a/lustre/include/obd_echo.h b/lustre/lov/lovsub_io.c

similarity index 58%

rename from lustre/include/obd_echo.h

rename to lustre/lov/lovsub_io.c

index 7465b68..d8cfe0e 100644 (file)
--- a/lustre/include/obd_echo.h
+++ b/lustre/lov/lovsub_io.c
@@ -26,40 +26,30 @@
   * GPL HEADER END
   */
  /*
- * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
   * Use is subject to license terms.
   */
  /*
   * This file is part of Lustre, http://www.lustre.org/
   * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
   */
  
-#ifndef _OBD_ECHO_H
-#define _OBD_ECHO_H
+#define DEBUG_SUBSYSTEM S_LOV
  
-/* The persistent object (i.e. actually stores stuff!) */
-#define ECHO_PERSISTENT_OBJID    1ULL
-#define ECHO_PERSISTENT_SIZE     ((__u64)(1<<20))
+#include "lov_cl_internal.h"
  
-/* block size to use for data verification */
-#define OBD_ECHO_BLOCK_SIZE    (4<<10)
+/** \addtogroup lov lov @{ */
  
-struct ec_object {
-        struct list_head       eco_obj_chain;
-        struct obd_device     *eco_device;
-        int                    eco_refcount;
-        int                    eco_deleted;
-        obd_id                 eco_id;
-        struct lov_stripe_md  *eco_lsm;
-};
+/*****************************************************************************
+ *
+ * Lovsub io operations.
+ *
+ */
  
-struct ec_lock {
-        struct list_head       ecl_exp_chain;
-        struct ec_object      *ecl_object;
-        __u64                  ecl_cookie;
-        struct lustre_handle   ecl_lock_handle;
-        ldlm_policy_data_t     ecl_policy;
-        __u32                  ecl_mode;
-};
+/* All trivial */
  
-#endif
+/** @} lov */
diff --git a/lustre/lov/lovsub_lock.c b/lustre/lov/lovsub_lock.c

new file mode 100644 (file)

index 0000000..f02a2ce
--- /dev/null
+++ b/lustre/lov/lovsub_lock.c
@@ -0,0 +1,430 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov lov @{ */
+
+/*****************************************************************************
+ *
+ * Lovsub lock operations.
+ *
+ */
+
+/**
+ * cl_lock_operations::clo_fini() method of lovsub_lock_ops: frees the lovsub
+ * lock slice. The slice must not be linked to any parent lock by this time.
+ */
+static void lovsub_lock_fini(const struct lu_env *env,
+                             struct cl_lock_slice *slice)
+{
+        struct lovsub_lock *sub;
+
+        ENTRY;
+        sub = cl2lovsub_lock(slice);
+        LASSERT(list_empty(&sub->lss_parents));
+        OBD_SLAB_FREE_PTR(sub, lovsub_lock_kmem);
+        EXIT;
+}
+
+/**
+ * Pins the top-lock of \a lov (the reference is registered in lu_ref as
+ * "lovsub-parent", keyed by cfs_current()) and takes its mutex. Paired with
+ * lovsub_parent_unlock().
+ */
+static void lovsub_parent_lock(const struct lu_env *env, struct lov_lock *lov)
+{
+        struct cl_lock *top = lov->lls_cl.cls_lock;
+
+        ENTRY;
+        cl_lock_get(top);
+        lu_ref_add(&top->cll_reference, "lovsub-parent", cfs_current());
+        cl_lock_mutex_get(env, top);
+        EXIT;
+}
+
+/**
+ * Reverses lovsub_parent_lock(): drops the mutex of the top-lock of \a lov,
+ * removes the "lovsub-parent" lu_ref entry and puts the reference.
+ */
+static void lovsub_parent_unlock(const struct lu_env *env, struct lov_lock *lov)
+{
+        struct cl_lock *top = lov->lls_cl.cls_lock;
+
+        ENTRY;
+        cl_lock_mutex_put(env, top);
+        lu_ref_del(&top->cll_reference, "lovsub-parent", cfs_current());
+        cl_lock_put(env, top);
+        EXIT;
+}
+
+/**
+ * Propagates a state change of sub-lock \a lovsub to a single top-lock
+ * \a lov. Nothing is done when the top-lock is the one recorded in
+ * lovsub_lock::lss_active. Otherwise, under the top-lock mutex, the
+ * sub-lock's error (if any) is passed to the top-lock, or the top-lock is
+ * signalled.
+ */
+static void lovsub_lock_state_one(const struct lu_env *env,
+                                  const struct lovsub_lock *lovsub,
+                                  struct lov_lock *lov)
+{
+        struct cl_lock       *top;
+        const struct cl_lock *sub;
+
+        ENTRY;
+        top = lov->lls_cl.cls_lock;
+        sub = lovsub->lss_cl.cls_lock;
+
+        if (lovsub->lss_active == top) {
+                EXIT;
+                return;
+        }
+
+        lovsub_parent_lock(env, lov);
+        if (sub->cll_error == 0)
+                cl_lock_signal(env, top);
+        else
+                cl_lock_error(env, top, sub->cll_error);
+        lovsub_parent_unlock(env, lov);
+        EXIT;
+}
+
+/**
+ * Implements the cl_lock_operations::clo_state() method for the lovsub
+ * layer, which is called whenever the sub-lock state changes. Propagates the
+ * change to every top-lock on the parent list. The new \a state itself is
+ * not examined.
+ */
+static void lovsub_lock_state(const struct lu_env *env,
+                              const struct cl_lock_slice *slice,
+                              enum cl_lock_state state)
+{
+        struct lovsub_lock   *sub = cl2lovsub_lock(slice);
+        struct lov_lock_link *link;
+        struct lov_lock_link *next;
+
+        LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+        ENTRY;
+
+        /*
+         * The _safe() iterator is required: the call chain
+         *
+         *     lovsub_lock_state_one()
+         *       ->cl_lock_error()
+         *         ->cl_lock_delete()
+         *           ->lov_lock_delete()
+         *
+         * may unlink the current parent from the parent list.
+         */
+        list_for_each_entry_safe(link, next, &sub->lss_parents, lll_list) {
+                lovsub_lock_state_one(env, sub, link->lll_super);
+        }
+        EXIT;
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_weigh(): estimates the lock
+ * weight by asking a parent lock. A sub-lock without parents weighs 0.
+ */
+static unsigned long lovsub_lock_weigh(const struct lu_env *env,
+                                       const struct cl_lock_slice *slice)
+{
+        struct lovsub_lock *lock = cl2lovsub_lock(slice);
+        struct lov_lock    *lov;
+        unsigned long       weight = 0;
+
+        ENTRY;
+
+        LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+
+        if (!list_empty(&lock->lss_parents)) {
+                /*
+                 * Whether every parent should be asked and the estimations
+                 * summed, or asking one suffices, is not clear. For the
+                 * current usages the first parent is always enough.
+                 */
+                lov = container_of(lock->lss_parents.next,
+                                   struct lov_lock_link, lll_list)->lll_super;
+
+                lovsub_parent_lock(env, lov);
+                weight = cl_lock_weigh(env, lov->lls_cl.cls_lock);
+                lovsub_parent_unlock(env, lov);
+        }
+
+        RETURN(weight);
+}
+
+/**
+ * Maps the start/end page offsets of \a in, which are relative to stripe
+ * number \a stripe, to offsets within the file, and stores them in \a out.
+ * Only cld_start and cld_end of \a out are written. With a single stripe the
+ * offsets are copied unchanged; an end of CL_PAGE_EOF is always preserved.
+ */
+static void lovsub_lock_descr_map(const struct cl_lock_descr *in,
+                                  struct lov_object *obj,
+                                  int stripe, struct cl_lock_descr *out)
+{
+        struct lov_stripe_md *lsm = lov_r0(obj)->lo_lsm;
+        pgoff_t first = in->cld_start;
+        pgoff_t last  = in->cld_end;
+
+        ENTRY;
+        /*
+         * XXX join file support.
+         */
+        if (lsm->lsm_stripe_count > 1) {
+                /* stripe size in pages */
+                pgoff_t size = cl_index(lov2cl(obj), lsm->lsm_stripe_size);
+                /* pages that "other" stripes occupy per stripe-sized unit */
+                pgoff_t skip = (lsm->lsm_stripe_count - 1) * size;
+
+                /* XXX overflow check here? */
+                first += first/size * skip + stripe * size;
+
+                if (last != CL_PAGE_EOF) {
+                        last += last/size * skip + stripe * size;
+                        /* wrapped around: clamp to end of file */
+                        if (last < in->cld_end)
+                                last = CL_PAGE_EOF;
+                }
+        }
+        out->cld_start = first;
+        out->cld_end   = last;
+        EXIT;
+}
+
+/**
+ * Adjusts the parent lock extent when a sub-lock is attached to a parent.
+ * There are two callers:
+ *
+ *     - the receive call-back, when the server returns the granted extent to
+ *       the client, and
+ *
+ *     - a top-lock that finds an existing sub-lock in the cache.
+ *
+ * The sub-lock description \a d is recorded as what stripe \a idx "got" and
+ * is mapped to file offsets; the top-lock is modified only if the mapped
+ * extent does not match its current one.
+ *
+ * Note that the lock mode is not propagated to the parent: i.e., if a
+ * CLM_READ top-lock matches a CLM_WRITE sub-lock, the top-lock is still
+ * CLM_READ.
+ *
+ * \retval 0 when no modification was needed, otherwise the result of
+ *         cl_lock_modify()
+ */
+int lov_sublock_modify(const struct lu_env *env, struct lov_lock *lov,
+                       struct lovsub_lock *sublock,
+                       const struct cl_lock_descr *d, int idx)
+{
+        struct cl_lock       *parent       = lov->lls_cl.cls_lock;
+        struct cl_lock_descr *parent_descr = &parent->cll_descr;
+        struct lovsub_object *subobj;
+        struct cl_lock_descr *mapped;
+
+        LASSERT(cl_lock_mode_match(d->cld_mode, parent_descr->cld_mode));
+
+        subobj = cl2lovsub(sublock->lss_cl.cls_obj);
+        mapped = &lov_env_info(env)->lti_ldescr;
+
+        /* object and mode come from the parent, the extent from \a d */
+        mapped->cld_obj  = parent_descr->cld_obj;
+        mapped->cld_mode = parent_descr->cld_mode;
+        lovsub_lock_descr_map(d, subobj->lso_super, subobj->lso_index, mapped);
+        lov->lls_sub[idx].sub_got = *d;
+
+        /*
+         * The top-lock is notified only when the lock description changes
+         * materially.
+         */
+        if (cl_lock_ext_match(parent_descr, mapped))
+                return 0;
+        return cl_lock_modify(env, parent, mapped);
+}
+
+/**
+ * cl_lock_operations::clo_modify() method of lovsub_lock_ops: applies the new
+ * sub-lock description \a d to every parent top-lock, each under its own
+ * mutex. All parents are visited even after a failure.
+ *
+ * \retval 0 when every parent was updated, otherwise the first non-zero
+ *         result of lov_sublock_modify()
+ */
+static int lovsub_lock_modify(const struct lu_env *env,
+                              const struct cl_lock_slice *s,
+                              const struct cl_lock_descr *d)
+{
+        struct lovsub_lock   *lock = cl2lovsub_lock(s);
+        struct lov_lock_link *link;
+        int first_error            = 0;
+
+        ENTRY;
+
+        LASSERT(cl_lock_mode_match(d->cld_mode,
+                                   s->cls_lock->cll_descr.cld_mode));
+        list_for_each_entry(link, &lock->lss_parents, lll_list) {
+                struct lov_lock *lov = link->lll_super;
+                int rc;
+
+                lovsub_parent_lock(env, lov);
+                rc = lov_sublock_modify(env, lov, lock, d, link->lll_idx);
+                lovsub_parent_unlock(env, lov);
+                /* remember only the first failure */
+                if (first_error == 0)
+                        first_error = rc;
+        }
+        RETURN(first_error);
+}
+
+/**
+ * cl_lock_operations::clo_closure() method of lovsub_lock_ops: adds every
+ * parent top-lock to \a closure through cl_lock_closure_build(), stopping at
+ * the first failure.
+ *
+ * \retval 0 on success (also when there are no parents), otherwise the first
+ *         non-zero result of cl_lock_closure_build()
+ */
+static int lovsub_lock_closure(const struct lu_env *env,
+                               const struct cl_lock_slice *slice,
+                               struct cl_lock_closure *closure)
+{
+        struct lovsub_lock   *sub;
+        struct lov_lock_link *link;
+        int                   rc = 0;
+
+        LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+        ENTRY;
+
+        sub = cl2lovsub_lock(slice);
+        list_for_each_entry(link, &sub->lss_parents, lll_list) {
+                struct cl_lock *top = link->lll_super->lls_cl.cls_lock;
+
+                rc = cl_lock_closure_build(env, top, closure);
+                if (rc != 0)
+                        break;
+        }
+        RETURN(rc);
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_delete() method. This is
+ * invoked in "bottom-to-top" delete, when lock destruction starts from the
+ * sub-lock (e.g, as a result of ldlm lock LRU policy).
+ *
+ * For every parent top-lock, under that top-lock's mutex, the sub-lock is
+ * unlinked from the parent and the parent is then handled according to its
+ * current state (see the switch below). The sub-lock itself must be mutexed
+ * by the caller.
+ */
+static void lovsub_lock_delete(const struct lu_env *env,
+                               const struct cl_lock_slice *slice)
+{
+        struct lovsub_lock   *sub = cl2lovsub_lock(slice);
+        struct lov_lock      *lov;
+        struct cl_lock       *parent;
+        struct lov_lock_link *scan;
+        struct lov_lock_link *temp;
+        struct lov_lock_sub  *subdata;
+
+        LASSERT(cl_lock_is_mutexed(slice->cls_lock));
+        ENTRY;
+
+        /* _safe() iteration: lov_lock_unlink() is called on scan below */
+        list_for_each_entry_safe(scan, temp, &sub->lss_parents, lll_list) {
+                lov     = scan->lll_super;
+                subdata = &lov->lls_sub[scan->lll_idx];
+                parent  = lov->lls_cl.cls_lock;
+                lovsub_parent_lock(env, lov);
+                /*
+                 * NOTE(review): presumably sub_descr is the extent originally
+                 * requested for this stripe, so that "got" is reset to it --
+                 * confirm against struct lov_lock_sub.
+                 */
+                subdata->sub_got = subdata->sub_descr;
+                lov_lock_unlink(env, scan, sub);
+                CDEBUG(D_DLMTRACE, "%p %p %i %i\n", parent, sub,
+                       lov->lls_nr_filled, parent->cll_state);
+                switch (parent->cll_state) {
+                case CLS_NEW:
+                case CLS_QUEUING:
+                case CLS_ENQUEUED:
+                case CLS_FREEING:
+                        /* in these states the top-lock is only signalled */
+                        cl_lock_signal(env, parent);
+                        break;
+                case CLS_UNLOCKING:
+                        /*
+                         * Here lies a problem: a sub-lock is canceled while
+                         * top-lock is being unlocked. Top-lock cannot be
+                         * moved into CLS_NEW state, because unlocking has to
+                         * succeed eventually by placing lock into CLS_CACHED
+                         * (or failing it), see cl_unuse_try(). Nor can
+                         * top-lock be left in CLS_CACHED state, because lov
+                         * maintains an invariant that all sub-locks exist in
+                         * CLS_CACHED (this allows cached top-lock to be
+                         * reused immediately). Nor can we wait for top-lock
+                         * state to change, because this can be synchronous to
+                         * the current thread.
+                         *
+                         * We know for sure that lov_lock_unuse() will be
+                         * called at least one more time to finish un-using,
+                         * so leave a mark on the top-lock, that will be seen
+                         * by the next call to lov_lock_unuse().
+                         */
+                        lov->lls_unuse_race = 1;
+                        break;
+                case CLS_CACHED:
+                        /*
+                         * A cached top-lock is moved back to CLS_NEW; if
+                         * lls_nr_filled is 0 at this point, the top-lock is
+                         * cancelled and deleted as well.
+                         */
+                        cl_lock_state_set(env, parent, CLS_NEW);
+                        if (lov->lls_nr_filled == 0) {
+                                cl_lock_cancel(env, parent);
+                                cl_lock_delete(env, parent);
+                                cl_lock_signal(env, parent);
+                        }
+                        break;
+                case CLS_HELD:
+                default:
+                        /* CLS_HELD and unknown states are treated as fatal */
+                        CERROR("Impossible state: %i\n", parent->cll_state);
+                        LBUG();
+                }
+                lovsub_parent_unlock(env, lov);
+        }
+        EXIT;
+}
+
+/**
+ * cl_lock_operations::clo_print() method of lovsub_lock_ops: for every
+ * parent link prints "[<stripe index> <top-lock pointer> <descriptor>] ",
+ * where the descriptor of the top-lock is omitted when the link has no
+ * top-lock. Always returns 0.
+ */
+static int lovsub_lock_print(const struct lu_env *env, void *cookie,
+                             lu_printer_t p, const struct cl_lock_slice *slice)
+{
+        struct lovsub_lock   *sub = cl2lovsub_lock(slice);
+        struct lov_lock_link *link;
+
+        list_for_each_entry(link, &sub->lss_parents, lll_list) {
+                struct lov_lock *top = link->lll_super;
+
+                (*p)(env, cookie, "[%d %p ", link->lll_idx, top);
+                if (top != NULL)
+                        cl_lock_descr_print(env, cookie, p,
+                                            &top->lls_cl.cls_lock->cll_descr);
+                (*p)(env, cookie, "] ");
+        }
+        return 0;
+}
+
+/**
+ * Lock operations of the lovsub layer; attached to every lovsub lock slice
+ * by lovsub_lock_init().
+ */
+static const struct cl_lock_operations lovsub_lock_ops = {
+        .clo_fini    = lovsub_lock_fini,
+        .clo_state   = lovsub_lock_state,
+        .clo_delete  = lovsub_lock_delete,
+        .clo_modify  = lovsub_lock_modify,
+        .clo_closure = lovsub_lock_closure,
+        .clo_weigh   = lovsub_lock_weigh,
+        .clo_print   = lovsub_lock_print
+};
+
+/**
+ * Allocates the lovsub slice of \a lock from lovsub_lock_kmem, initializes
+ * its (empty) list of parents and adds the slice to \a lock with
+ * lovsub_lock_ops. \a io is not used.
+ *
+ * \retval 0       on success
+ * \retval -ENOMEM when the slice cannot be allocated
+ */
+int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj,
+                     struct cl_lock *lock, const struct cl_io *io)
+{
+        struct lovsub_lock *slice;
+
+        ENTRY;
+        OBD_SLAB_ALLOC_PTR(slice, lovsub_lock_kmem);
+        if (slice == NULL)
+                RETURN(-ENOMEM);
+
+        CFS_INIT_LIST_HEAD(&slice->lss_parents);
+        cl_lock_slice_add(lock, &slice->lss_cl, obj, &lovsub_lock_ops);
+        RETURN(0);
+}
+
+/** @} lov */
diff --git a/lustre/lov/lovsub_object.c b/lustre/lov/lovsub_object.c

new file mode 100644 (file)

index 0000000..e49d43d
--- /dev/null
+++ b/lustre/lov/lovsub_object.c
@@ -0,0 +1,155 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov lov @{ */
+
+/*****************************************************************************
+ *
+ * Lovsub object operations.
+ *
+ */
+
+/*
+ * Allocate an object from the device that acid_next points to and add it
+ * to @obj with lu_object_add().  Returns 0, or -ENOMEM if that fails.
+ */
+int lovsub_object_init(const struct lu_env *env, struct lu_object *obj,
+                       const struct lu_object_conf *conf)
+{
+        struct lovsub_device *lsd  = lu2lovsub_dev(obj->lo_dev);
+        struct lu_device     *next = &lsd->acid_next->cd_lu_dev;
+        struct lu_object     *slice;
+
+        ENTRY;
+        /* @conf is unused; the new object is given @obj's lo_header. */
+        slice = next->ld_ops->ldo_object_alloc(env, obj->lo_header, next);
+        if (slice == NULL)
+                RETURN(-ENOMEM);
+
+        lu_object_add(obj, slice);
+        RETURN(0);
+}
+
+static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+        struct lovsub_object *los = lu2lovsub(obj);
+
+        ENTRY;
+        lu_object_fini(obj); /* finalize the lu_object first */
+        lu_object_header_fini(&los->lso_header.coh_lu); /* then lso_header */
+        OBD_SLAB_FREE_PTR(los, lovsub_object_kmem); /* frees lso_header too */
+        EXIT;
+}
+
+static int lovsub_object_print(const struct lu_env *env, void *cookie,
+                               lu_printer_t p, const struct lu_object *obj)
+{
+        /* Print "[<lso_index>]" through @p and pass the printer's result
+         * back to the caller. */
+        return p(env, cookie, "[%i]", lu2lovsub(obj)->lso_index);
+}
+
+static int lovsub_attr_set(const struct lu_env *env, struct cl_object *obj,
+                           const struct cl_attr *attr, unsigned valid)
+{
+        ENTRY;
+        /* Only effect: zero lo_attr_valid in lov_r0() of the lov object that
+         * lso_super points to; @attr and @valid are not looked at here. */
+        lov_r0(cl2lovsub(obj)->lso_super)->lo_attr_valid = 0;
+        RETURN(0);
+}
+
+static int lovsub_object_glimpse(const struct lu_env *env,
+                                 const struct cl_object *obj,
+                                 struct ost_lvb *lvb)
+{
+        struct lov_object *top = cl2lovsub(obj)->lso_super;
+
+        ENTRY;
+        RETURN(cl_object_glimpse(env, &top->lo_cl, lvb)); /* via lso_super */
+}
+
+
+/* cl_object methods; installed as co_ops by lovsub_object_alloc(). */
+static const struct cl_object_operations lovsub_ops = {
+        .coo_page_init = lovsub_page_init,
+        .coo_lock_init = lovsub_lock_init,
+        .coo_attr_set  = lovsub_attr_set,
+        .coo_glimpse   = lovsub_object_glimpse
+};
+
+/* lu_object methods for lovsub, installed as lo_ops by
+ * lovsub_object_alloc().  The delete, release and invariant hooks are not
+ * listed, which leaves them NULL. */
+static const struct lu_object_operations lovsub_lu_obj_ops = {
+        .loo_object_init  = lovsub_object_init,
+        .loo_object_free  = lovsub_object_free,
+        .loo_object_print = lovsub_object_print
+};
+
+struct lu_object *lovsub_object_alloc(const struct lu_env *env,
+                                      const struct lu_object_header *_,
+                                      struct lu_device *dev)
+{
+        struct lovsub_object    *sub;
+        struct cl_object_header *head;
+        struct lu_object        *top;
+
+        ENTRY;
+        OBD_SLAB_ALLOC_PTR(sub, lovsub_object_kmem);
+        if (sub == NULL)
+                RETURN(NULL);
+
+        /* Set up the embedded lso_header (not "_") and hook the object up. */
+        top  = lovsub2lu(sub);
+        head = &sub->lso_header;
+        cl_object_header_init(head);
+        lu_object_init(top, &head->coh_lu, dev);
+        lu_object_add_top(&head->coh_lu, top);
+        sub->lso_cl.co_ops = &lovsub_ops;
+        top->lo_ops = &lovsub_lu_obj_ops;
+        RETURN(top);
+}
+
+/** @} lov */
diff --git a/lustre/lov/lovsub_page.c b/lustre/lov/lovsub_page.c

new file mode 100644 (file)

index 0000000..70e1f56
--- /dev/null
+++ b/lustre/lov/lovsub_page.c
@@ -0,0 +1,83 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for LOVSUB layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOV
+
+#include "lov_cl_internal.h"
+
+/** \addtogroup lov lov @{ */
+
+/*****************************************************************************
+ *
+ * Lovsub page operations.
+ *
+ */
+
+static void lovsub_page_fini(const struct lu_env *env,
+                             struct cl_page_slice *slice)
+{
+        struct lovsub_page *lsb = cl2lovsub_page(slice);
+        ENTRY;
+        OBD_SLAB_FREE_PTR(lsb, lovsub_page_kmem); /* only action: free it */
+        EXIT;
+}
+
+static const struct cl_page_operations lovsub_page_ops = {
+        .cpo_fini   = lovsub_page_fini
+}; /* handed to cl_page_slice_add() by lovsub_page_init() */
+
+struct cl_page *lovsub_page_init(const struct lu_env *env,
+                                 struct cl_object *obj,
+                                 struct cl_page *page, cfs_page_t *_)
+{
+        struct lovsub_page *slice;
+
+        ENTRY;
+        OBD_SLAB_ALLOC_PTR(slice, lovsub_page_kmem);
+        if (slice == NULL)
+                RETURN(ERR_PTR(-ENOMEM));
+
+        /* Add the new slice to @page; the cfs_page_t argument is unused. */
+        cl_page_slice_add(page, &slice->lsb_cl, obj, &lovsub_page_ops);
+        /* ERR_PTR(0): the returned pointer encodes no error. */
+        RETURN(ERR_PTR(0));
+}
+
+/** @} lov */
diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c

index 1577be7..25ed99c 100644 (file)
--- a/lustre/lvfs/fsfilt_ext3.c
+++ b/lustre/lvfs/fsfilt_ext3.c
@@ -1804,13 +1804,13 @@ static int commit_chkquot(struct super_block *sb, struct qchk_ctxt *qctxt,
          if (cdqb->dqb_bsoftlimit &&
              toqb(cdqb->dqb_curspace) >= cdqb->dqb_bsoftlimit &&
              !cdqb->dqb_btime)
-                cdqb->dqb_btime = 
+                cdqb->dqb_btime =
                          now + qctxt->qckt_dqinfo[cdqb->dqb_type].dqi_bgrace;
  
          if (cdqb->dqb_isoftlimit &&
              cdqb->dqb_curinodes >= cdqb->dqb_isoftlimit &&
              !cdqb->dqb_itime)
-                cdqb->dqb_itime = 
+                cdqb->dqb_itime =
                          now + qctxt->qckt_dqinfo[cdqb->dqb_type].dqi_igrace;
  
          cdqb->dqb_valid = QIF_ALL;
@@ -1925,7 +1925,7 @@ static int fsfilt_ext3_quotacheck(struct super_block *sb,
  
                  LASSERT(sb_dqopt(sb)->files[i] != NULL);
                  INIT_LIST_HEAD(&id_list);
-#ifndef KERNEL_SUPPORTS_QUOTA_READ 
+#ifndef KERNEL_SUPPORTS_QUOTA_READ
                  rc = lustre_get_qids(sb_dqopt(sb)->files[i], NULL, i, &id_list);
  #else
                  rc = lustre_get_qids(NULL, sb_dqopt(sb)->files[i], i, &id_list);
diff --git a/lustre/lvfs/lvfs_linux.c b/lustre/lvfs/lvfs_linux.c

index 4c2b1a9..f855ca5 100644 (file)
--- a/lustre/lvfs/lvfs_linux.c
+++ b/lustre/lvfs/lvfs_linux.c
@@ -230,7 +230,7 @@ out_up:
  EXPORT_SYMBOL(simple_mknod);
  
  /* utility to make a directory */
-struct dentry *simple_mkdir(struct dentry *dir, struct vfsmount *mnt, 
+struct dentry *simple_mkdir(struct dentry *dir, struct vfsmount *mnt,
                              char *name, int mode, int fix)
  {
          struct dentry *dchild;
@@ -254,7 +254,7 @@ struct dentry *simple_mkdir(struct dentry *dir, struct vfsmount *mnt,
  
                  /* Fixup directory permissions if necessary */
                  if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) {
-                        CDEBUG(D_CONFIG, 
+                        CDEBUG(D_CONFIG,
                                 "fixing permissions on %s from %o to %o\n",
                                 name, old_mode, mode);
                          dchild->d_inode->i_mode = (mode & S_IALLUGO) |
@@ -279,7 +279,7 @@ out_up:
  EXPORT_SYMBOL(simple_mkdir);
  
  /* utility to rename a file */
-int lustre_rename(struct dentry *dir, struct vfsmount *mnt, 
+int lustre_rename(struct dentry *dir, struct vfsmount *mnt,
                    char *oldname, char *newname)
  {
          struct dentry *dchild_old, *dchild_new;
@@ -287,21 +287,21 @@ int lustre_rename(struct dentry *dir, struct vfsmount *mnt,
          ENTRY;
  
          ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n");
-        CDEBUG(D_INODE, "renaming file %.*s to %.*s\n", 
+        CDEBUG(D_INODE, "renaming file %.*s to %.*s\n",
                 (int)strlen(oldname), oldname, (int)strlen(newname), newname);
  
          dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname));
          if (IS_ERR(dchild_old))
                  RETURN(PTR_ERR(dchild_old));
  
-        if (!dchild_old->d_inode) 
+        if (!dchild_old->d_inode)
                  GOTO(put_old, err = -ENOENT);
  
          dchild_new = ll_lookup_one_len(newname, dir, strlen(newname));
          if (IS_ERR(dchild_new))
                  GOTO(put_old, err = PTR_ERR(dchild_new));
  
-        err = ll_vfs_rename(dir->d_inode, dchild_old, mnt, 
+        err = ll_vfs_rename(dir->d_inode, dchild_old, mnt,
                              dir->d_inode, dchild_new, mnt);
  
          dput(dchild_new);
@@ -481,7 +481,7 @@ void obd_update_maxusage()
          if (max2 > obd_max_alloc)
                  obd_max_alloc = max2;
          spin_unlock(&obd_updatemax_lock);
-        
+
  }
  
  __u64 obd_memory_max(void)
diff --git a/lustre/lvfs/quotafmt_test.c b/lustre/lvfs/quotafmt_test.c

index 2b37387..de6c32e 100644 (file)
--- a/lustre/lvfs/quotafmt_test.c
+++ b/lustre/lvfs/quotafmt_test.c
@@ -84,7 +84,7 @@ static int quotfmt_initialize(struct lustre_quota_info *lqi,
                  LOCK_INODE_MUTEX_PARENT(parent_inode);
                  de = lookup_one_len(name, tgt->obd_lvfs_ctxt.pwd, namelen);
                  if (!IS_ERR(de) && de->d_inode)
-                        ll_vfs_unlink(parent_inode, de, 
+                        ll_vfs_unlink(parent_inode, de,
                                        tgt->obd_lvfs_ctxt.pwdmnt);
                  if (!IS_ERR(de))
                          dput(de);
@@ -380,7 +380,7 @@ static int quotfmt_test_4(struct lustre_quota_info *lqi)
  
  static int quotfmt_test_5(struct lustre_quota_info *lqi)
  {
-#ifndef KERNEL_SUPPORTS_QUOTA_READ 
+#ifndef KERNEL_SUPPORTS_QUOTA_READ
          int i, rc = 0;
  
          for (i = USRQUOTA; i < MAXQUOTAS && !rc; i++) {
diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c

index da91c7e..bf90727 100644 (file)
--- a/lustre/mdc/mdc_locks.c
+++ b/lustre/mdc/mdc_locks.c
@@ -155,7 +155,7 @@ ldlm_mode_t mdc_lock_match(struct obd_export *exp, int flags,
  
          fid_build_reg_res_name(fid, &res_id);
          rc = ldlm_lock_match(class_exp2obd(exp)->obd_namespace, flags,
-                             &res_id, type, policy, mode, lockh);
+                             &res_id, type, policy, mode, lockh, 0);
          RETURN(rc);
  }
  
@@ -241,7 +241,7 @@ static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp,
          struct ptlrpc_request *req;
          struct obd_device     *obddev = class_exp2obd(exp);
          struct ldlm_intent    *lit;
-        int                    joinfile = !!((it->it_flags & O_JOIN_FILE) && 
+        int                    joinfile = !!((it->it_flags & O_JOIN_FILE) &&
                                                op_data->op_data);
          CFS_LIST_HEAD(cancels);
          int                    count = 0;
@@ -812,7 +812,7 @@ static int mdc_finish_intent_lock(struct obd_export *exp,
  
                  memcpy(&old_lock, lockh, sizeof(*lockh));
                  if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL,
-                                    LDLM_IBITS, &policy, LCK_NL, &old_lock)) {
+                                    LDLM_IBITS, &policy, LCK_NL, &old_lock, 0)) {
                          ldlm_lock_decref_and_cancel(lockh,
                                                      it->d.lustre.it_lock_mode);
                          memcpy(lockh, &old_lock, sizeof(old_lock));
@@ -1024,7 +1024,7 @@ int mdc_intent_getattr_async(struct obd_export *exp,
          req->rq_async_args.pointer_arg[1] = minfo;
          req->rq_async_args.pointer_arg[2] = einfo;
          req->rq_interpret_reply = mdc_intent_getattr_async_interpret;
-        ptlrpcd_add_req(req);
+        ptlrpcd_add_req(req, PSCOPE_OTHER);
  
          RETURN(0);
  }
@@ -1043,8 +1043,8 @@ int mdc_revalidate_lock(struct obd_export *exp,
          ENTRY;
  
          fid_build_reg_res_name(fid, &res_id);
-        /* As not all attributes are kept under update lock, e.g. 
-           owner/group/acls are under lookup lock, we need both 
+        /* As not all attributes are kept under update lock, e.g.
+           owner/group/acls are under lookup lock, we need both
             ibits for GETATTR. */
          policy.l_inodebits.bits = (it->it_op == IT_GETATTR) ?
                  MDS_INODELOCK_UPDATE | MDS_INODELOCK_LOOKUP :
@@ -1052,7 +1052,7 @@ int mdc_revalidate_lock(struct obd_export *exp,
  
          mode = ldlm_lock_match(exp->exp_obd->obd_namespace,
                                 LDLM_FL_BLOCK_GRANTED, &res_id, LDLM_IBITS,
-                               &policy, LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh);
+                               &policy, LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh, 0);
          if (mode) {
                  it->d.lustre.it_lock_handle = lockh.cookie;
                  it->d.lustre.it_lock_mode = mode;
diff --git a/lustre/mdc/mdc_reint.c b/lustre/mdc/mdc_reint.c

index 12465cd..881a223 100644 (file)
--- a/lustre/mdc/mdc_reint.c
+++ b/lustre/mdc/mdc_reint.c
@@ -131,7 +131,7 @@ int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data,
          bits = MDS_INODELOCK_UPDATE;
          if (op_data->op_attr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID))
                  bits |= MDS_INODELOCK_LOOKUP;
-        if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && 
+        if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
              (fid_is_sane(&op_data->op_fid1)))
                  count = mdc_resource_get_unused(exp, &op_data->op_fid1,
                                                  &cancels, LCK_EX, bits);
@@ -228,7 +228,7 @@ int mdc_create(struct obd_export *exp, struct md_op_data *op_data,
                  }
          }
  
-        if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && 
+        if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
              (fid_is_sane(&op_data->op_fid1)))
                  count = mdc_resource_get_unused(exp, &op_data->op_fid1,
                                                  &cancels, LCK_EX,
@@ -264,7 +264,7 @@ int mdc_create(struct obd_export *exp, struct md_op_data *op_data,
          level = LUSTRE_IMP_FULL;
   resend:
          rc = mdc_reint(req, exp->exp_obd->u.cli.cl_rpc_lock, level);
-        
+
          /* Resend if we were told to. */
          if (rc == -ERESTARTSYS) {
                  level = LUSTRE_IMP_RECOVER;
@@ -298,12 +298,12 @@ int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data,
  
          LASSERT(req == NULL);
  
-        if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && 
+        if ((op_data->op_flags & MF_MDC_CANCEL_FID1) &&
              (fid_is_sane(&op_data->op_fid1)))
                  count = mdc_resource_get_unused(exp, &op_data->op_fid1,
                                                  &cancels, LCK_EX,
                                                  MDS_INODELOCK_UPDATE);
-        if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && 
+        if ((op_data->op_flags & MF_MDC_CANCEL_FID3) &&
              (fid_is_sane(&op_data->op_fid3)))
                  count += mdc_resource_get_unused(exp, &op_data->op_fid3,
                                                   &cancels, LCK_EX,
@@ -407,7 +407,7 @@ int mdc_rename(struct obd_export *exp, struct md_op_data *op_data,
                  count += mdc_resource_get_unused(exp, &op_data->op_fid2,
                                                   &cancels, LCK_EX,
                                                   MDS_INODELOCK_UPDATE);
-        if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && 
+        if ((op_data->op_flags & MF_MDC_CANCEL_FID3) &&
              (fid_is_sane(&op_data->op_fid3)))
                  count += mdc_resource_get_unused(exp, &op_data->op_fid3,
                                                   &cancels, LCK_EX,
diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c

index 75bc4b6..4789295 100644 (file)
--- a/lustre/mdc/mdc_request.c
+++ b/lustre/mdc/mdc_request.c
@@ -57,11 +57,10 @@
  #include <lustre_param.h>
  #include "mdc_internal.h"
  
-static quota_interface_t *quota_interface;
+quota_interface_t *quota_interface;
  
  #define REQUEST_MINOR 244
  
-static quota_interface_t *quota_interface;
  extern quota_interface_t mdc_quota_interface;
  
  static int mdc_cleanup(struct obd_device *obd);
@@ -150,7 +149,7 @@ int mdc_getstatus(struct obd_export *exp, struct lu_fid *rootfid,
   * from server. Even for cases when acl_size and md_size is zero, RPC header
   * will contain 4 fields and RPC itself will contain zero size fields. This is
   * because mdt_getattr*() _always_ returns 4 fields, but if acl is not needed
- * and thus zero, it shirinks it, making zero size. The same story about
+ * and thus zero, it shrinks it, making zero size. The same story about
   * md_size. And this is course of problem when client waits for smaller number
   * of fields. This issue will be fixed later when client gets aware of RPC
   * layouts.  --umka
@@ -1683,7 +1682,6 @@ static int mdc_process_config(struct obd_device *obd, obd_count len, void *buf)
          int rc = 0;
  
          lprocfs_mdc_init_vars(&lvars);
-
          switch (lcfg->lcfg_command) {
          case LCFG_SPTLRPC_CONF:
                  rc = sptlrpc_cliobd_process_config(obd, lcfg);
@@ -1785,7 +1783,7 @@ static int mdc_renew_capa(struct obd_export *exp, struct obd_capa *oc,
          req->rq_async_args.pointer_arg[0] = oc;
          req->rq_async_args.pointer_arg[1] = cb;
          req->rq_interpret_reply = mdc_interpret_renew_capa;
-        ptlrpcd_add_req(req);
+        ptlrpcd_add_req(req, PSCOPE_OTHER);
          RETURN(0);
  }
  
diff --git a/lustre/mdd/mdd_device.c b/lustre/mdd/mdd_device.c

index 92b1077..b84f3b4 100644 (file)
--- a/lustre/mdd/mdd_device.c
+++ b/lustre/mdd/mdd_device.c
@@ -87,7 +87,7 @@ static int mdd_device_init(const struct lu_env *env, struct lu_device *d,
  static struct lu_device *mdd_device_fini(const struct lu_env *env,
                                           struct lu_device *d)
  {
-       struct mdd_device *mdd = lu2mdd_dev(d);
+        struct mdd_device *mdd = lu2mdd_dev(d);
          struct lu_device *next = &mdd->mdd_child->dd_lu_dev;
          int rc;
  
@@ -244,7 +244,7 @@ static int mdd_recovery_complete(const struct lu_env *env,
  }
  
  const struct lu_device_operations mdd_lu_ops = {
-       .ldo_object_alloc      = mdd_object_alloc,
+        .ldo_object_alloc      = mdd_object_alloc,
          .ldo_process_config    = mdd_process_config,
          .ldo_recovery_complete = mdd_recovery_complete
  };
@@ -268,7 +268,7 @@ static int mdd_root_get(const struct lu_env *env,
  static int mdd_statfs(const struct lu_env *env, struct md_device *m,
                        struct kstatfs *sfs)
  {
-       struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev);
+        struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev);
          int rc;
  
          ENTRY;
@@ -284,7 +284,7 @@ static int mdd_statfs(const struct lu_env *env, struct md_device *m,
  static int mdd_maxsize_get(const struct lu_env *env, struct md_device *m,
                             int *md_size, int *cookie_size)
  {
-       struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev);
+        struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev);
          ENTRY;
  
          *md_size = mdd_lov_mdsize(env, mdd);
@@ -297,7 +297,7 @@ static int mdd_init_capa_ctxt(const struct lu_env *env, struct md_device *m,
                                int mode, unsigned long timeout, __u32 alg,
                                struct lustre_capa_key *keys)
  {
-       struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev);
+        struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev);
          struct mds_obd    *mds = &mdd2obd_dev(mdd)->u.mds;
          int rc;
          ENTRY;
@@ -312,7 +312,7 @@ static int mdd_update_capa_key(const struct lu_env *env,
                                 struct md_device *m,
                                 struct lustre_capa_key *key)
  {
-       struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev);
+        struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev);
          struct obd_export *lov_exp = mdd2obd_dev(mdd)->u.mds.mds_osc_exp;
          int rc;
          ENTRY;
diff --git a/lustre/mdd/mdd_dir.c b/lustre/mdd/mdd_dir.c

index b67784a..3d96f45 100644 (file)
--- a/lustre/mdd/mdd_dir.c
+++ b/lustre/mdd/mdd_dir.c
@@ -269,7 +269,7 @@ static int __mdd_may_link(const struct lu_env *env, struct mdd_object *obj)
  
          /*
           * Subdir count limitation can be broken through.
-         */ 
+         */
          if (la->la_nlink >= m->mdd_dt_conf.ddp_max_nlink &&
              !S_ISDIR(la->la_mode))
                  RETURN(-EMLINK);
@@ -348,19 +348,19 @@ static inline int mdd_is_sticky(const struct lu_env *env,
                  rc = mdd_la_get(env, pobj, tmp_la, BYPASS_CAPA);
                  if (rc)
                          return rc;
-        
+
                  if (!(tmp_la->la_mode & S_ISVTX) ||
                       (tmp_la->la_uid == uc->mu_fsuid))
                          return 0;
          }
  
          rc = mdd_la_get(env, cobj, tmp_la, BYPASS_CAPA);
-        if (rc) 
+        if (rc)
                  return rc;
-        
+
          if (tmp_la->la_uid == uc->mu_fsuid)
                  return 0;
-        
+
          return !mdd_capable(uc, CFS_CAP_FOWNER);
  }
  
@@ -978,7 +978,7 @@ static int mdd_rename_tgt(const struct lu_env *env,
          if (rc)
                  GOTO(cleanup, rc);
  
-        /* 
+        /*
           * For tobj is remote case cmm layer has processed
           * and pass NULL tobj to here. So when tobj is NOT NULL,
           * it must be local one.
@@ -1658,7 +1658,7 @@ static int mdd_rename(const struct lu_env *env,
                          GOTO(cleanup, rc);
          }
  
-        /* 
+        /*
           * For tobj is remote case cmm layer has processed
           * and set tobj to NULL then. So when tobj is NOT NULL,
           * it must be local one.
diff --git a/lustre/mdd/mdd_internal.h b/lustre/mdd/mdd_internal.h

index 205cce1..a2cdc61 100644 (file)
--- a/lustre/mdd/mdd_internal.h
+++ b/lustre/mdd/mdd_internal.h
@@ -108,12 +108,12 @@ enum mdd_object_role {
  };
  
  struct mdd_object {
-        struct md_object  mod_obj;
+        struct md_object   mod_obj;
          /* open count */
-        __u32             mod_count;
-        __u32             mod_valid;
-        unsigned long     mod_flags;
-        struct dynlock    mod_pdlock;
+        __u32              mod_count;
+        __u32              mod_valid;
+        unsigned long      mod_flags;
+        struct dynlock     mod_pdlock;
  #ifdef CONFIG_LOCKDEP
          /* "dep_map" name is assumed by lockdep.h macros. */
          struct lockdep_map dep_map;
diff --git a/lustre/mdd/mdd_lov.c b/lustre/mdd/mdd_lov.c

index 0cd36e1..3789019 100644 (file)
--- a/lustre/mdd/mdd_lov.c
+++ b/lustre/mdd/mdd_lov.c
@@ -150,7 +150,6 @@ int mdd_init_obd(const struct lu_env *env, struct mdd_device *mdd,
          obd->obd_upcall.onu_upcall = mdd_notify;
          obd->obd_upcall.onu_owner = mdd;
          mdd->mdd_obd_dev = obd;
-
          EXIT;
  class_detach:
          if (rc)
@@ -185,7 +184,7 @@ int mdd_fini_obd(const struct lu_env *env, struct mdd_device *mdd,
          if (rc)
                  GOTO(lcfg_cleanup, rc);
          mdd->mdd_obd_dev = NULL;
-        
+
          EXIT;
  lcfg_cleanup:
          return rc;
@@ -673,10 +672,10 @@ int mdd_setattr_log(const struct lu_env *env, struct mdd_device *mdd,
          /* journal chown/chgrp in llog, just like unlink */
          if (lmm_size > 0) {
                  CDEBUG(D_INFO, "setattr llog for uid/gid=%lu/%lu\n",
-                        (unsigned long)ma->ma_attr.la_uid, 
+                        (unsigned long)ma->ma_attr.la_uid,
                          (unsigned long)ma->ma_attr.la_gid);
                  return mdd_log_op_setattr(obd, ma->ma_attr.la_uid,
-                                          ma->ma_attr.la_gid, lmm, 
+                                          ma->ma_attr.la_gid, lmm,
                                            lmm_size, logcookies,
                                            cookies_size);
          } else
@@ -746,7 +745,7 @@ out:
  }
  
  int mdd_lov_setattr_async(const struct lu_env *env, struct mdd_object *obj,
-                          struct lov_mds_md *lmm, int lmm_size, 
+                          struct lov_mds_md *lmm, int lmm_size,
                            struct llog_cookie *logcookies)
  {
          struct mdd_device   *mdd = mdo2mdd(&obj->mod_obj);
diff --git a/lustre/mdd/mdd_object.c b/lustre/mdd/mdd_object.c

index 96425cd..5cf0a15 100644 (file)
--- a/lustre/mdd/mdd_object.c
+++ b/lustre/mdd/mdd_object.c
@@ -180,16 +180,16 @@ struct lu_object *mdd_object_alloc(const struct lu_env *env,
  static int mdd_object_init(const struct lu_env *env, struct lu_object *o,
                             const struct lu_object_conf *_)
  {
-       struct mdd_device *d = lu2mdd_dev(o->lo_dev);
-       struct lu_object  *below;
+        struct mdd_device *d = lu2mdd_dev(o->lo_dev);
+        struct lu_object  *below;
          struct lu_device  *under;
          ENTRY;
  
-       under = &d->mdd_child->dd_lu_dev;
-       below = under->ld_ops->ldo_object_alloc(env, o->lo_header, under);
+        under = &d->mdd_child->dd_lu_dev;
+        below = under->ld_ops->ldo_object_alloc(env, o->lo_header, under);
          mdd_pdlock_init(lu2mdd_obj(o));
          if (below == NULL)
-               RETURN(-ENOMEM);
+                RETURN(-ENOMEM);
  
          lu_object_add(o, below);
          RETURN(0);
@@ -206,7 +206,7 @@ static int mdd_object_start(const struct lu_env *env, struct lu_object *o)
  static void mdd_object_free(const struct lu_env *env, struct lu_object *o)
  {
          struct mdd_object *mdd = lu2mdd_obj(o);
-       
+
          lu_object_fini(o);
          OBD_FREE_PTR(mdd);
  }
@@ -239,9 +239,9 @@ static void mdd_object_delete(const struct lu_env *env, struct lu_object *o)
  }
  
  static const struct lu_object_operations mdd_lu_obj_ops = {
-       .loo_object_init    = mdd_object_init,
-       .loo_object_start   = mdd_object_start,
-       .loo_object_free    = mdd_object_free,
+        .loo_object_init    = mdd_object_init,
+        .loo_object_start   = mdd_object_start,
+        .loo_object_free    = mdd_object_free,
          .loo_object_delete  = mdd_object_delete
  };
  
@@ -669,7 +669,7 @@ static int mdd_fix_attr(const struct lu_env *env, struct mdd_object *obj,
                          la->la_valid &= ~LA_ATIME;
                  RETURN(0);
          }
- 
+
          /* Check if flags change. */
          if (la->la_valid & LA_FLAGS) {
                  unsigned int oldflags = 0;
@@ -685,7 +685,7 @@ static int mdd_fix_attr(const struct lu_env *env, struct mdd_object *obj,
                  if (mdd_is_immutable(obj))
                          oldflags |= LUSTRE_IMMUTABLE_FL;
                  if (mdd_is_append(obj))
-                        oldflags |= LUSTRE_APPEND_FL; 
+                        oldflags |= LUSTRE_APPEND_FL;
                  if ((oldflags ^ newflags) &&
                      !mdd_capable(uc, CFS_CAP_LINUX_IMMUTABLE))
                          RETURN(-EPERM);
@@ -1354,7 +1354,7 @@ static int mdd_close(const struct lu_env *env, struct md_object *obj,
                  rc = mdd_object_kill(env, mdd_obj, ma);
          else
                  ma->ma_valid &= ~(MA_LOV | MA_COOKIE);
-        
+
          mdd_write_unlock(env, mdd_obj);
          mdd_trans_stop(env, mdo2mdd(obj), rc, handle);
          RETURN(rc);
diff --git a/lustre/mdd/mdd_permission.c b/lustre/mdd/mdd_permission.c

index 7714e61..80e5e83 100644 (file)
--- a/lustre/mdd/mdd_permission.c
+++ b/lustre/mdd/mdd_permission.c
@@ -65,7 +65,7 @@
   * Get default acl EA only.
   * Hold read_lock for mdd_obj.
   */
-int mdd_def_acl_get(const struct lu_env *env, struct mdd_object *mdd_obj, 
+int mdd_def_acl_get(const struct lu_env *env, struct mdd_object *mdd_obj,
                      struct md_attr *ma)
  {
          struct lu_buf *buf;
@@ -74,7 +74,7 @@ int mdd_def_acl_get(const struct lu_env *env, struct mdd_object *mdd_obj,
  
          if (ma->ma_valid & MA_ACL_DEF)
                  RETURN(0);
-        
+
          buf = mdd_buf_get(env, ma->ma_acl, ma->ma_acl_size);
          rc = mdo_xattr_get(env, mdd_obj, buf, XATTR_NAME_ACL_DEFAULT,
                             BYPASS_CAPA);
@@ -91,7 +91,7 @@ int mdd_def_acl_get(const struct lu_env *env, struct mdd_object *mdd_obj,
  /*
   * Hold write_lock for o.
   */
-int mdd_acl_chmod(const struct lu_env *env, struct mdd_object *o, __u32 mode, 
+int mdd_acl_chmod(const struct lu_env *env, struct mdd_object *o, __u32 mode,
                    struct thandle *handle)
  {
          struct lu_buf           *buf;
@@ -102,9 +102,9 @@ int mdd_acl_chmod(const struct lu_env *env, struct mdd_object *o, __u32 mode,
  
          ENTRY;
  
-        buf = mdd_buf_get(env, mdd_env_info(env)->mti_xattr_buf, 
+        buf = mdd_buf_get(env, mdd_env_info(env)->mti_xattr_buf,
                            sizeof(mdd_env_info(env)->mti_xattr_buf));
-        
+
          rc = mdo_xattr_get(env, o, buf, XATTR_NAME_ACL_ACCESS, BYPASS_CAPA);
          if ((rc == -EOPNOTSUPP) || (rc == -ENODATA))
                  RETURN(0);
@@ -118,7 +118,7 @@ int mdd_acl_chmod(const struct lu_env *env, struct mdd_object *o, __u32 mode,
                        sizeof(posix_acl_xattr_entry);
          if (entry_count <= 0)
                  RETURN(0);
-       
+
          rc = lustre_posix_acl_chmod_masq(entry, mode, entry_count);
          if (rc)
                  RETURN(rc);
@@ -147,13 +147,13 @@ int __mdd_acl_init(const struct lu_env *env, struct mdd_object *obj,
                        sizeof(posix_acl_xattr_entry);
          if (entry_count <= 0)
                  RETURN(0);
-       
-       if (S_ISDIR(*mode)) {
-                rc = mdo_xattr_set(env, obj, buf, XATTR_NAME_ACL_DEFAULT, 0, 
+
+        if (S_ISDIR(*mode)) {
+                rc = mdo_xattr_set(env, obj, buf, XATTR_NAME_ACL_DEFAULT, 0,
                                     handle, BYPASS_CAPA);
                  if (rc)
                          RETURN(rc);
-       }
+        }
  
          rc = lustre_posix_acl_create_masq(entry, mode, entry_count);
          if (rc <= 0)
@@ -180,7 +180,7 @@ static int mdd_check_acl(const struct lu_env *env, struct mdd_object *obj,
          int rc;
          ENTRY;
  
-        buf = mdd_buf_get(env, mdd_env_info(env)->mti_xattr_buf, 
+        buf = mdd_buf_get(env, mdd_env_info(env)->mti_xattr_buf,
                            sizeof(mdd_env_info(env)->mti_xattr_buf));
          rc = mdo_xattr_get(env, obj, buf, XATTR_NAME_ACL_ACCESS,
                             mdd_object_capa(env, obj));
@@ -270,7 +270,7 @@ check_capabilities:
          RETURN(-EACCES);
  }
  
-int mdd_permission(const struct lu_env *env, 
+int mdd_permission(const struct lu_env *env,
                     struct md_object *pobj, struct md_object *cobj,
                     struct md_attr *ma, int mask)
  {
diff --git a/lustre/mds/lproc_mds.c b/lustre/mds/lproc_mds.c

index 138eafa..f11796d 100644 (file)
--- a/lustre/mds/lproc_mds.c
+++ b/lustre/mds/lproc_mds.c
@@ -110,7 +110,7 @@ static int lprocfs_mds_wr_evict_client(struct file *file, const char *buffer,
                  ptlrpc_check_set(NULL, set);
          }
  
-        /* See the comments in function lprocfs_wr_evict_client() 
+        /* See the comments in function lprocfs_wr_evict_client()
           * in ptlrpc/lproc_ptlrpc.c for details. - jay */
          class_incref(obd, __FUNCTION__, cfs_current());
          LPROCFS_EXIT();
diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c

index ec83e30..cecf56d 100644 (file)
--- a/lustre/mds/mds_fs.c
+++ b/lustre/mds/mds_fs.c
@@ -84,7 +84,7 @@ int mds_obd_create(struct obd_export *exp, struct obdo *oa,
                                     strlen(MDD_OBD_NAME))) {
                  RETURN(0);
          }
-        
+
          push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, &ucred);
  
          sprintf(fidname, "OBJECTS/%u.%u", tmpname, current->pid);
@@ -126,7 +126,7 @@ int mds_obd_create(struct obd_export *exp, struct obdo *oa,
  
          lock_kernel();
          rc = ll_vfs_rename(mds->mds_objects_dir->d_inode, filp->f_dentry,
-                           filp->f_vfsmnt, mds->mds_objects_dir->d_inode, 
+                           filp->f_vfsmnt, mds->mds_objects_dir->d_inode,
                             new_child, filp->f_vfsmnt);
          unlock_kernel();
          if (rc)
diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c

index bd9295a..7d0238e 100644 (file)
--- a/lustre/mds/mds_lov.c
+++ b/lustre/mds/mds_lov.c
@@ -310,7 +310,7 @@ int mds_lov_clear_orphans(struct mds_obd *mds, struct obd_uuid *ost_uuid)
          oa.o_gr = FILTER_GROUP_MDS0 + mds->mds_id;
          oa.o_valid = OBD_MD_FLFLAGS | OBD_MD_FLGROUP;
          if (ost_uuid != NULL)
-                oti.oti_ost_uuid = ost_uuid;       
+                oti.oti_ost_uuid = ost_uuid;
          rc = obd_create(mds->mds_osc_exp, &oa, &empty_ea, &oti);
  
          RETURN(rc);
@@ -646,11 +646,11 @@ static int __mds_lov_synchronize(void *data)
                  GOTO(out, rc);
  
          ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT);
-        if (!ctxt) 
+        if (!ctxt)
                  GOTO(out, rc = -ENODEV);
  
          OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT, 60);
-        rc = llog_connect(ctxt, NULL, NULL, uuid); 
+        rc = llog_connect(ctxt, NULL, NULL, uuid);
          llog_ctxt_put(ctxt);
          if (rc != 0) {
                  CERROR("%s failed at llog_origin_connect: %d\n",
diff --git a/lustre/mdt/mdt_capa.c b/lustre/mdt/mdt_capa.c

index 9abc6df..3f911de 100644 (file)
--- a/lustre/mdt/mdt_capa.c
+++ b/lustre/mdt/mdt_capa.c
@@ -222,12 +222,13 @@ static int mdt_ck_thread_main(void *args)
          thread->t_flags = SVC_RUNNING;
          cfs_waitq_signal(&thread->t_ctl_waitq);
  
-        rc = lu_env_init(&env, NULL, LCT_MD_THREAD);
+        rc = lu_env_init(&env, LCT_MD_THREAD|LCT_REMEMBER|LCT_NOREF);
          if (rc)
                  RETURN(rc);
  
          thread->t_env = &env;
          env.le_ctx.lc_thread = thread;
+        env.le_ctx.lc_cookie = 0x1;
  
          info = lu_context_key_get(&env.le_ctx, &mdt_thread_key);
          LASSERT(info != NULL);
diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c

index 9194314..fd12681 100644 (file)
--- a/lustre/mdt/mdt_handler.c
+++ b/lustre/mdt/mdt_handler.c
@@ -1117,9 +1117,13 @@ static int mdt_connect(struct mdt_thread_info *info)
                  LASSERT(req->rq_export != NULL);
                  info->mti_mdt = mdt_dev(req->rq_export->exp_obd->obd_lu_dev);
                  rc = mdt_init_idmap(info);
-                if (rc != 0)
+                if (rc != 0) {
+                        struct obd_export *exp;
+
+                        exp = req->rq_export;
                          /* if mdt_init_idmap failed, revocation for connect */
-                        obd_disconnect(class_export_get(req->rq_export));
+                        obd_disconnect(class_export_get(exp));
+                }
          } else
                  rc = err_serious(rc);
          return rc;
@@ -1867,7 +1871,7 @@ int mdt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc,
          if (lock->l_req_mode == LCK_COS && lock->l_blocking_lock != NULL) {
                  struct lu_env env;
  
-                rc = lu_env_init(&env, NULL, LCT_MD_THREAD);
+                rc = lu_env_init(&env, LCT_MD_THREAD);
                  if (unlikely(rc != 0))
                          CWARN("lu_env initialization failed with rc = %d,"
                                "cannot start asynchronous commit\n", rc);
@@ -3851,20 +3855,12 @@ static struct lu_device *mdt_layer_setup(struct lu_env *env,
                  GOTO(out, rc = -ENODEV);
          }
  
-        rc = lu_context_refill(&env->le_ctx);
+        rc = lu_env_refill((struct lu_env *)env);
          if (rc != 0) {
-                CERROR("Failure to refill context: '%d'\n", rc);
+                CERROR("Failure to refill session: '%d'\n", rc);
                  GOTO(out_type, rc);
          }
  
-        if (env->le_ses != NULL) {
-                rc = lu_context_refill(env->le_ses);
-                if (rc != 0) {
-                        CERROR("Failure to refill session: '%d'\n", rc);
-                        GOTO(out_type, rc);
-                }
-        }
-
          ldt = type->typ_lu;
          if (ldt == NULL) {
                  CERROR("type: '%s'\n", typename);
@@ -3960,7 +3956,7 @@ static void mdt_fini(const struct lu_env *env, struct mdt_device *m)
  
          /* At this point, obd exports might still be on the "obd_zombie_exports"
           * list, and obd_zombie_impexp_thread() is trying to destroy them.
-         * We wait a little bit until all exports (except the self-export) 
+         * We wait a little bit until all exports (except the self-export)
           * have been destroyed, because the whole mdt stack might be accessed
           * in mdt_destroy_export(). This will not be a long time, maybe one or
           * two seconds are enough. This is not a problem while umounting.
@@ -3975,7 +3971,7 @@ static void mdt_fini(const struct lu_env *env, struct mdt_device *m)
  
          target_recovery_fini(obd);
          mdt_stop_ptlrpc_service(m);
-
+        obd_zombie_barrier();
          mdt_fs_cleanup(env, m);
  
          upcall_cache_cleanup(m->mdt_identity_cache);
@@ -4014,7 +4010,6 @@ static void mdt_fini(const struct lu_env *env, struct mdt_device *m)
                  d->ld_site = NULL;
          }
          LASSERT(atomic_read(&d->ld_ref) == 0);
-        md_device_fini(&m->mdt_md_dev);
  
          EXIT;
  }
@@ -4078,6 +4073,19 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m,
          int                        rc;
          ENTRY;
  
+        md_device_init(&m->mdt_md_dev, ldt);
+        /*
+         * Environment (env) might be missing mdt_thread_key values at that
+         * point, if device is allocated when mdt_thread_key is in QUIESCENT
+         * mode.
+         *
+         * Usually device allocation path doesn't use module key values, but
+         * mdt has to do a lot of work here, so allocate key value.
+         */
+        rc = lu_env_refill((struct lu_env *)env);
+        if (rc != 0)
+                RETURN(rc);
+
          info = lu_context_key_get(&env->le_ctx, &mdt_thread_key);
          LASSERT(info != NULL);
  
@@ -4117,7 +4125,6 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m,
          if (mite == NULL)
                  RETURN(-ENOMEM);
  
-        md_device_init(&m->mdt_md_dev, ldt);
          s = &mite->ms_lu;
  
          m->mdt_md_dev.md_lu_dev.ld_ops = &mdt_lu_ops;
@@ -4277,8 +4284,6 @@ err_fini_site:
          lu_site_fini(s);
  err_free_site:
          OBD_FREE_PTR(mite);
-
-        md_device_fini(&m->mdt_md_dev);
          return (rc);
  }
  
@@ -4714,7 +4719,7 @@ static int mdt_destroy_export(struct obd_export *export)
          mdt = mdt_dev(obd->obd_lu_dev);
          LASSERT(mdt != NULL);
  
-        rc = lu_env_init(&env, NULL, LCT_MD_THREAD);
+        rc = lu_env_init(&env, LCT_MD_THREAD);
          if (rc)
                  RETURN(rc);
  
@@ -4752,7 +4757,7 @@ static int mdt_destroy_export(struct obd_export *export)
                  list_del_init(&mfd->mfd_list);
                  mdt_mfd_close(info, mfd);
                  /* TODO: if we close the unlinked file,
-                 * we need to remove it's objects from OST */
+                 * we need to remove its objects from OST */
                  memset(&ma->ma_attr, 0, sizeof(ma->ma_attr));
                  spin_lock(&med->med_open_lock);
                  ma->ma_lmm_size = lmm_size;
@@ -4854,7 +4859,7 @@ static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len,
  
          ENTRY;
          CDEBUG(D_IOCTL, "handling ioctl cmd %#x\n", cmd);
-        rc = lu_env_init(&env, NULL, LCT_MD_THREAD);
+        rc = lu_env_init(&env, LCT_MD_THREAD);
          if (rc)
                  RETURN(rc);
  
@@ -4900,7 +4905,7 @@ int mdt_obd_postrecov(struct obd_device *obd)
          struct lu_env env;
          int rc;
  
-        rc = lu_env_init(&env, NULL, LCT_MD_THREAD);
+        rc = lu_env_init(&env, LCT_MD_THREAD);
          if (rc)
                  RETURN(rc);
          rc = mdt_postrecov(&env, mdt_dev(obd->obd_lu_dev));
@@ -4936,6 +4941,7 @@ static struct lu_device *mdt_device_free(const struct lu_env *env,
          struct mdt_device *m = mdt_dev(d);
          ENTRY;
  
+        md_device_fini(&m->mdt_md_dev);
          OBD_FREE_PTR(m);
          RETURN(NULL);
  }
@@ -4954,7 +4960,7 @@ static struct lu_device *mdt_device_alloc(const struct lu_env *env,
                  l = &m->mdt_md_dev.md_lu_dev;
                  rc = mdt_init0(env, m, t, cfg);
                  if (rc != 0) {
-                        OBD_FREE_PTR(m);
+                        mdt_device_free(env, l);
                          l = ERR_PTR(rc);
                          return l;
                  }
@@ -4998,7 +5004,7 @@ void mdt_enable_cos(struct mdt_device *mdt, int val)
          int rc;
  
          mdt->mdt_opts.mo_cos = !!val;
-        rc = lu_env_init(&env, NULL, LCT_MD_THREAD);
+        rc = lu_env_init(&env, LCT_MD_THREAD);
          if (unlikely(rc != 0)) {
                  CWARN("lu_env initialization failed with rc = %d,"
                        "cannot sync\n", rc);
diff --git a/lustre/mdt/mdt_recovery.c b/lustre/mdt/mdt_recovery.c

index 74e2402..4853c3e 100644 (file)
--- a/lustre/mdt/mdt_recovery.c
+++ b/lustre/mdt/mdt_recovery.c
@@ -913,7 +913,7 @@ static int mdt_txn_stop_cb(const struct lu_env *env,
          return mdt_last_rcvd_update(mti, txn);
  }
  
-/* commit callback, need to update last_commited value */
+/* commit callback, need to update last_committed value */
  static int mdt_txn_commit_cb(const struct lu_env *env,
                               struct thandle *txn, void *cookie)
  {
diff --git a/lustre/mdt/mdt_reint.c b/lustre/mdt/mdt_reint.c

index 4c565ab..d42e20f 100644 (file)
--- a/lustre/mdt/mdt_reint.c
+++ b/lustre/mdt/mdt_reint.c
@@ -470,9 +470,9 @@ static int mdt_reint_unlink(struct mdt_thread_info *info,
          if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_UNLINK))
                  RETURN(err_serious(-ENOENT));
  
-        /* 
+        /*
           * step 1: lock the parent. Note, this may be child in case of
-         * remote operation denoted by ->mti_cross_ref flag. 
+         * remote operation denoted by ->mti_cross_ref flag.
           */
          parent_lh = &info->mti_lh[MDT_LH_PARENT];
          if (info->mti_cross_ref) {
diff --git a/lustre/mgs/mgs_llog.c b/lustre/mgs/mgs_llog.c

index bb4ce90..11d310d 100644 (file)
--- a/lustre/mgs/mgs_llog.c
+++ b/lustre/mgs/mgs_llog.c
@@ -117,14 +117,14 @@ static inline int name_create(char **newname, char *prefix, char *suffix)
  {
          LASSERT(newname);
          OBD_ALLOC(*newname, strlen(prefix) + strlen(suffix) + 1);
-        if (!*newname) 
+        if (!*newname)
                  return -ENOMEM;
          sprintf(*newname, "%s%s", prefix, suffix);
          return 0;
  }
  
  static inline void name_destroy(char **name)
-{        
+{
          if (*name)
                  OBD_FREE(*name, strlen(*name) + 1);
          *name = NULL;
@@ -135,11 +135,11 @@ static inline void name_destroy(char **name)
          2. what the last config step is
          3. COMPAT_146 lov name
          4. COMPAT_146 mdt lov name
-        5. COMPAT_146 mdc name 
+        5. COMPAT_146 mdc name
  */
  /* It might be better to have a separate db file, instead of parsing the info
     out of the client log.  This is slow and potentially error-prone. */
-static int mgs_fsdb_handler(struct llog_handle *llh, struct llog_rec_hdr *rec, 
+static int mgs_fsdb_handler(struct llog_handle *llh, struct llog_rec_hdr *rec,
                              void *data)
  {
          struct fs_db *fsdb = (struct fs_db *)data;
@@ -201,9 +201,9 @@ static int mgs_fsdb_handler(struct llog_handle *llh, struct llog_rec_hdr *rec,
              (strcmp(lustre_cfg_string(lcfg, 1), LUSTRE_LOV_NAME) == 0)) {
                  fsdb->fsdb_flags |= FSDB_OLDLOG14;
                  name_destroy(&fsdb->fsdb_clilov);
-                rc = name_create(&fsdb->fsdb_clilov, 
+                rc = name_create(&fsdb->fsdb_clilov,
                                   lustre_cfg_string(lcfg, 0), "");
-                if (rc) 
+                if (rc)
                          RETURN(rc);
                  CDEBUG(D_MGS, "client lov name is %s\n", fsdb->fsdb_clilov);
          }
@@ -215,20 +215,20 @@ static int mgs_fsdb_handler(struct llog_handle *llh, struct llog_rec_hdr *rec,
                  fsdb->fsdb_flags |= FSDB_OLDLOG14;
                  ptr = strstr(lustre_cfg_string(lcfg, 1), "_UUID");
                  if (!ptr) {
-                        CERROR("Can't parse MDT uuid %s\n", 
+                        CERROR("Can't parse MDT uuid %s\n",
                                 lustre_cfg_string(lcfg, 1));
                          RETURN(-EINVAL);
                  }
                  *ptr = '\0';
                  name_destroy(&fsdb->fsdb_mdtlov);
-                rc = name_create(&fsdb->fsdb_mdtlov, 
+                rc = name_create(&fsdb->fsdb_mdtlov,
                                   "lov_", lustre_cfg_string(lcfg, 1));
-                if (rc) 
+                if (rc)
                          RETURN(rc);
                  name_destroy(&fsdb->fsdb_mdc);
-                rc = name_create(&fsdb->fsdb_mdc, 
+                rc = name_create(&fsdb->fsdb_mdc,
                                   lustre_cfg_string(lcfg, 0), "");
-                if (rc) 
+                if (rc)
                          RETURN(rc);
                  CDEBUG(D_MGS, "MDT lov name is %s\n", fsdb->fsdb_mdtlov);
          }
@@ -341,17 +341,17 @@ static struct fs_db *mgs_new_fsdb(struct obd_device *obd, char *fsname)
          strncpy(fsdb->fsdb_name, fsname, sizeof(fsdb->fsdb_name));
          fsdb->fsdb_name[sizeof(fsdb->fsdb_name) - 1] = 0;
          rc = name_create(&fsdb->fsdb_mdtlov, fsname, "-mdtlov");
-        if (rc) 
+        if (rc)
                  GOTO(err, rc);
          rc = name_create(&fsdb->fsdb_mdtlmv, fsname, "-mdtlmv");
-        if (rc) 
+        if (rc)
                  GOTO(err, rc);
          rc = name_create(&fsdb->fsdb_clilov, fsname, "-clilov");
-        if (rc) 
+        if (rc)
                  GOTO(err, rc);
  
          rc = name_create(&fsdb->fsdb_clilmv, fsname, "-clilmv");
-        if (rc) 
+        if (rc)
                  GOTO(err, rc);
  
          fsdb->fsdb_srpc_fl_udesc = 1;
@@ -367,8 +367,8 @@ err:
                  OBD_FREE(fsdb->fsdb_mdt_index_map, INDEX_MAP_SIZE);
          name_destroy(&fsdb->fsdb_clilov);
          name_destroy(&fsdb->fsdb_clilmv);
-        name_destroy(&fsdb->fsdb_mdtlov); 
-        name_destroy(&fsdb->fsdb_mdtlmv); 
+        name_destroy(&fsdb->fsdb_mdtlov);
+        name_destroy(&fsdb->fsdb_mdtlmv);
          OBD_FREE_PTR(fsdb);
          RETURN(NULL);
  }
@@ -381,11 +381,11 @@ static void mgs_free_fsdb(struct obd_device *obd, struct fs_db *fsdb)
          list_del(&fsdb->fsdb_list);
          OBD_FREE(fsdb->fsdb_ost_index_map, INDEX_MAP_SIZE);
          OBD_FREE(fsdb->fsdb_mdt_index_map, INDEX_MAP_SIZE);
-        name_destroy(&fsdb->fsdb_clilov); 
-        name_destroy(&fsdb->fsdb_clilmv); 
-        name_destroy(&fsdb->fsdb_mdtlov); 
-        name_destroy(&fsdb->fsdb_mdtlmv); 
-        name_destroy(&fsdb->fsdb_mdc); 
+        name_destroy(&fsdb->fsdb_clilov);
+        name_destroy(&fsdb->fsdb_clilmv);
+        name_destroy(&fsdb->fsdb_mdtlov);
+        name_destroy(&fsdb->fsdb_mdtlmv);
+        name_destroy(&fsdb->fsdb_mdc);
          mgs_free_fsdb_srpc(fsdb);
          OBD_FREE_PTR(fsdb);
  }
@@ -411,7 +411,7 @@ int mgs_cleanup_fsdb_list(struct obd_device *obd)
          return 0;
  }
  
-static int mgs_find_or_make_fsdb(struct obd_device *obd, char *name, 
+static int mgs_find_or_make_fsdb(struct obd_device *obd, char *name,
                                 struct fs_db **dbh)
  {
          struct mgs_obd *mgs = &obd->u.mgs;
@@ -568,13 +568,13 @@ struct mgs_modify_lookup {
          int               mml_modified;
  };
  
-static int mgs_modify_handler(struct llog_handle *llh, struct llog_rec_hdr *rec, 
+static int mgs_modify_handler(struct llog_handle *llh, struct llog_rec_hdr *rec,
                                void *data)
  {
          struct mgs_modify_lookup *mml = (struct mgs_modify_lookup *)data;
          struct cfg_marker *marker;
          struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1);
-        int cfg_len = rec->lrh_len - sizeof(struct llog_rec_hdr) - 
+        int cfg_len = rec->lrh_len - sizeof(struct llog_rec_hdr) -
                  sizeof(struct llog_rec_tail);
          int rc;
          ENTRY;
@@ -592,27 +592,27 @@ static int mgs_modify_handler(struct llog_handle *llh, struct llog_rec_hdr *rec,
  
          /* We only care about markers */
          if (lcfg->lcfg_command != LCFG_MARKER)
-                RETURN(0); 
-        
+                RETURN(0);
+
          marker = lustre_cfg_buf(lcfg, 1);
-        if ((strcmp(mml->mml_marker.cm_comment, marker->cm_comment) == 0) && 
+        if ((strcmp(mml->mml_marker.cm_comment, marker->cm_comment) == 0) &&
              (strcmp(mml->mml_marker.cm_tgtname, marker->cm_tgtname) == 0) &&
              !(marker->cm_flags & CM_SKIP)) {
                  /* Found a non-skipped marker match */
                  CDEBUG(D_MGS, "Changing rec %u marker %d %x->%x: %s %s\n",
-                       rec->lrh_index, marker->cm_step, 
+                       rec->lrh_index, marker->cm_step,
                         marker->cm_flags, mml->mml_marker.cm_flags,
                         marker->cm_tgtname, marker->cm_comment);
                  /* Overwrite the old marker llog entry */
                  marker->cm_flags &= ~CM_EXCLUDE; /* in case we're unexcluding */
                  marker->cm_flags |= mml->mml_marker.cm_flags;
                  marker->cm_canceltime = mml->mml_marker.cm_canceltime;
-                /* Header and tail are added back to lrh_len in 
+                /* Header and tail are added back to lrh_len in
                     llog_lvfs_write_rec */
-                rec->lrh_len = cfg_len; 
-                rc = llog_write_rec(llh, rec, NULL, 0, (void *)lcfg, 
+                rec->lrh_len = cfg_len;
+                rc = llog_write_rec(llh, rec, NULL, 0, (void *)lcfg,
                                      rec->lrh_index);
-                if (!rc) 
+                if (!rc)
                           mml->mml_modified++;
          }
  
@@ -621,7 +621,7 @@ static int mgs_modify_handler(struct llog_handle *llh, struct llog_rec_hdr *rec,
  
  /* Modify an existing config log record (for CM_SKIP or CM_EXCLUDE) */
  static int mgs_modify(struct obd_device *obd, struct fs_db *fsdb,
-                      struct mgs_target_info *mti, char *logname, 
+                      struct mgs_target_info *mti, char *logname,
                        char *devname, char *comment, int flags)
  {
          struct llog_handle *loghandle;
@@ -634,7 +634,7 @@ static int mgs_modify(struct obd_device *obd, struct fs_db *fsdb,
          CDEBUG(D_MGS, "modify %s/%s/%s\n", logname, devname, comment);
  
          push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-        
+
          ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
          LASSERT(ctxt != NULL);
          rc = llog_create(ctxt, &loghandle, NULL, logname);
@@ -649,7 +649,7 @@ static int mgs_modify(struct obd_device *obd, struct fs_db *fsdb,
                  GOTO(out_close, rc = 0);
  
          OBD_ALLOC_PTR(mml);
-        if (!mml) 
+        if (!mml)
                  GOTO(out_close, rc = -ENOMEM);
          strcpy(mml->mml_marker.cm_comment, comment);
          strcpy(mml->mml_marker.cm_tgtname, devname);
@@ -658,7 +658,7 @@ static int mgs_modify(struct obd_device *obd, struct fs_db *fsdb,
          mml->mml_marker.cm_canceltime = flags ? cfs_time_current_sec() : 0;
          mml->mml_modified = 0;
          rc = llog_process(loghandle, mgs_modify_handler, (void *)mml, NULL);
-        if (!rc && !mml->mml_modified) 
+        if (!rc && !mml->mml_modified)
                  rc = -ENODEV;
          OBD_FREE_PTR(mml);
  
@@ -668,7 +668,7 @@ out_close:
                  rc = rc2;
  out_pop:
          pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-        if (rc && rc != -ENODEV) 
+        if (rc && rc != -ENODEV)
                  CERROR("modify %s/%s failed %d\n",
                         mti->mti_svname, comment, rc);
          llog_ctxt_put(ctxt);
@@ -684,10 +684,10 @@ static int record_lcfg(struct obd_device *obd, struct llog_handle *llh,
          struct llog_rec_hdr    rec;
          int buflen, rc;
  
-        if (!lcfg || !llh) 
+        if (!lcfg || !llh)
                  return -ENOMEM;
  
-        LASSERT(llh->lgh_ctxt);        
+        LASSERT(llh->lgh_ctxt);
  
          buflen = lustre_cfg_len(lcfg->lcfg_bufcount,
                                  lcfg->lcfg_buflens);
@@ -698,7 +698,7 @@ static int record_lcfg(struct obd_device *obd, struct llog_handle *llh,
          /* idx = -1 means append */
          rc = llog_write_rec(llh, &rec, NULL, 0, (void *)lcfg, -1);
          pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-        if (rc) 
+        if (rc)
                  CERROR("failed %d\n", rc);
          return rc;
  }
@@ -725,7 +725,7 @@ static int record_base(struct obd_device *obd, struct llog_handle *llh,
                  lustre_cfg_bufs_set_string(&bufs, 4, s4);
  
          lcfg = lustre_cfg_new(cmd, &bufs);
-        if (!lcfg) 
+        if (!lcfg)
                  return -ENOMEM;
          lcfg->lcfg_nid = nid;
  
@@ -799,7 +799,7 @@ static int record_lov_setup(struct obd_device *obd, struct llog_handle *llh,
          lustre_cfg_bufs_reset(&bufs, devname);
          lustre_cfg_bufs_set(&bufs, 1, desc, sizeof(*desc));
          lcfg = lustre_cfg_new(LCFG_SETUP, &bufs);
-        if (!lcfg) 
+        if (!lcfg)
                  return -ENOMEM;
          rc = record_lcfg(obd, llh, lcfg);
  
@@ -866,14 +866,14 @@ static int record_marker(struct obd_device *obd, struct llog_handle *llh,
          marker.cm_step = fsdb->fsdb_gen;
          marker.cm_flags = flags;
          marker.cm_vers = LUSTRE_VERSION_CODE;
-        strncpy(marker.cm_tgtname, tgtname, sizeof(marker.cm_tgtname)); 
-        strncpy(marker.cm_comment, comment, sizeof(marker.cm_comment)); 
+        strncpy(marker.cm_tgtname, tgtname, sizeof(marker.cm_tgtname));
+        strncpy(marker.cm_comment, comment, sizeof(marker.cm_comment));
          marker.cm_createtime = cfs_time_current_sec();
          marker.cm_canceltime = 0;
          lustre_cfg_bufs_reset(&bufs, NULL);
          lustre_cfg_bufs_set(&bufs, 1, &marker, sizeof(marker));
          lcfg = lustre_cfg_new(LCFG_MARKER, &bufs);
-        if (!lcfg) 
+        if (!lcfg)
                  return -ENOMEM;
          rc = record_lcfg(obd, llh, lcfg);
  
@@ -889,7 +889,7 @@ static int record_start_log(struct obd_device *obd,
          struct llog_ctxt *ctxt;
          int rc = 0;
  
-        if (*llh) 
+        if (*llh)
                  GOTO(out, rc = -EBUSY);
  
          ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
@@ -953,23 +953,23 @@ static int mgs_log_is_empty(struct obd_device *obd, char *name)
  
  /* write an lcfg directly into a log (with markers) */
  static int mgs_write_log_direct(struct obd_device *obd, struct fs_db *fsdb,
-                                char *logname, struct lustre_cfg *lcfg, 
+                                char *logname, struct lustre_cfg *lcfg,
                                  char *devname, char *comment)
  {
          struct llog_handle *llh = NULL;
          int rc;
          ENTRY;
  
-        if (!lcfg) 
+        if (!lcfg)
                  RETURN(-ENOMEM);
  
          rc = record_start_log(obd, &llh, logname);
-        if (rc) 
+        if (rc)
                  RETURN(rc);
  
          /* FIXME These should be a single journal transaction */
-        rc = record_marker(obd, llh, fsdb, CM_START, devname, comment); 
-        
+        rc = record_marker(obd, llh, fsdb, CM_START, devname, comment);
+
          rc = record_lcfg(obd, llh, lcfg);
  
          rc = record_marker(obd, llh, fsdb, CM_END, devname, comment);
@@ -980,7 +980,7 @@ static int mgs_write_log_direct(struct obd_device *obd, struct fs_db *fsdb,
  
  /* write the lcfg in all logs for the given fs */
  int mgs_write_log_direct_all(struct obd_device *obd, struct fs_db *fsdb,
-                             struct mgs_target_info *mti, 
+                             struct mgs_target_info *mti,
                               struct lustre_cfg *lcfg,
                               char *devname, char *comment)
  {
@@ -991,9 +991,9 @@ int mgs_write_log_direct_all(struct obd_device *obd, struct fs_db *fsdb,
          char *logname;
          int rc = 0, len = strlen(fsname);
          ENTRY;
-        
-        /* We need to set params for any future logs 
-           as well. FIXME Append this file to every new log. 
+
+        /* We need to set params for any future logs
+           as well. FIXME Append this file to every new log.
             Actually, we should store as params (text), not llogs.  Or
             in a database. */
          name_create(&logname, fsname, "-params");
@@ -1003,7 +1003,7 @@ int mgs_write_log_direct_all(struct obd_device *obd, struct fs_db *fsdb,
                  record_end_log(obd, &llh);
          }
          name_destroy(&logname);
-        if (rc) 
+        if (rc)
                  RETURN(rc);
  
          /* Find all the logs in the CONFIGS directory */
@@ -1022,13 +1022,13 @@ int mgs_write_log_direct_all(struct obd_device *obd, struct fs_db *fsdb,
                      strstr(dirent->lld_name, "-sptlrpc") == NULL) {
                          CDEBUG(D_MGS, "Changing log %s\n", dirent->lld_name);
                          /* Erase any old settings of this same parameter */
-                        mgs_modify(obd, fsdb, mti, dirent->lld_name, devname, 
+                        mgs_modify(obd, fsdb, mti, dirent->lld_name, devname,
                                     comment, CM_SKIP);
                          /* Write the new one */
                          rc = mgs_write_log_direct(obd, fsdb, dirent->lld_name,
                                                    lcfg, devname, comment);
                          if (rc)
-                                CERROR("err %d writing log %s\n", rc, 
+                                CERROR("err %d writing log %s\n", rc,
                                         dirent->lld_name);
                  }
                  OBD_FREE(dirent, sizeof(*dirent));
@@ -1131,11 +1131,11 @@ static int mgs_steal_llog_handler(struct llog_handle *llh,
  
          if (got_an_osc_or_mdc == 0 || last_step < 0)
                  RETURN(rc);
-        
+
          if (lcfg->lcfg_command == LCFG_ADD_UUID) {
                  uint64_t nodenid;
                  nodenid = lcfg->lcfg_nid;
-                
+
                  tmti->mti_nids[tmti->mti_nid_count] = nodenid;
                  tmti->mti_nid_count++;
  
@@ -1288,10 +1288,10 @@ static int mgs_write_log_lov(struct obd_device *obd, struct fs_db *fsdb,
          /* This should always be the first entry in a log.
          rc = mgs_clear_log(obd, logname); */
          rc = record_start_log(obd, &llh, logname);
-        if (rc) 
+        if (rc)
                  GOTO(out, rc);
          /* FIXME these should be a single journal transaction */
-        rc = record_marker(obd, llh, fsdb, CM_START, lovname, "lov setup"); 
+        rc = record_marker(obd, llh, fsdb, CM_START, lovname, "lov setup");
          rc = record_attach(obd, llh, lovname, "lov", uuid);
          rc = record_lov_setup(obd, llh, lovname, lovdesc);
          rc = record_marker(obd, llh, fsdb, CM_END, lovname, "lov setup");
@@ -1331,7 +1331,7 @@ static int mgs_write_log_failnids(struct obd_device *obd,
                                     so just use the first nid as the uuid */
                                  rc = name_create(&failnodeuuid,
                                                   libcfs_nid2str(nid), "");
-                                if (rc) 
+                                if (rc)
                                          return rc;
                          }
                          CDEBUG(D_MGS, "add nid %s for failover uuid %s, "
@@ -1359,7 +1359,7 @@ static int mgs_write_log_mdc_to_lmv(struct obd_device *obd, struct fs_db *fsdb,
          char index[5];
          int i, rc;
          ENTRY;
-        
+
          if (mgs_log_is_empty(obd, logname)) {
                  CERROR("log is empty! Logical error\n");
                  RETURN(-EINVAL);
@@ -1388,9 +1388,9 @@ static int mgs_write_log_mdc_to_lmv(struct obd_device *obd, struct fs_db *fsdb,
                             "add mdc");
  
          for (i = 0; i < mti->mti_nid_count; i++) {
-                CDEBUG(D_MGS, "add nid %s for mdt\n", 
+                CDEBUG(D_MGS, "add nid %s for mdt\n",
                         libcfs_nid2str(mti->mti_nids[i]));
-                       
+
                  rc = record_add_uuid(obd, llh, mti->mti_nids[i], nodeuuid);
          }
  
@@ -1402,7 +1402,7 @@ static int mgs_write_log_mdc_to_lmv(struct obd_device *obd, struct fs_db *fsdb,
          rc = record_mdc_add(obd, llh, lmvname, mdcuuid, mti->mti_uuid,
                              index, "1");
          rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname,
-                           "add mdc"); 
+                           "add mdc");
          rc = record_end_log(obd, &llh);
  
          name_destroy(&lmvuuid);
@@ -1464,7 +1464,7 @@ static int mgs_write_log_mdc_to_mdt(struct obd_device *obd, struct fs_db *fsdb,
  
          rc = record_mdc_add(obd, llh, logname, mdcuuid, mti->mti_uuid,
                              index, "1");
-        rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, "add mdc"); 
+        rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, "add mdc");
          rc = record_end_log(obd, &llh);
  
          name_destroy(&mdcuuid);
@@ -1502,7 +1502,7 @@ static int mgs_write_log_mdt0(struct obd_device *obd, struct fs_db *fsdb,
          if (uuid == NULL)
                  GOTO(out_srpc, rc = -ENOMEM);
  
-        if (class_find_param(ptr, PARAM_FAILMODE, &ptr) == 0) 
+        if (class_find_param(ptr, PARAM_FAILMODE, &ptr) == 0)
                  failout = (strncmp(ptr, "failout", 7) == 0);
  
          name_create(&lovname, log, "-mdtlov");
@@ -1510,18 +1510,18 @@ static int mgs_write_log_mdt0(struct obd_device *obd, struct fs_db *fsdb,
                  rc = mgs_write_log_lov(obd, fsdb, mti, log, lovname);
  
          sprintf(uuid, "%s_UUID", log);
-        sprintf(mdt_index,"%d",mti->mti_stripe_index);        
+        sprintf(mdt_index,"%d",mti->mti_stripe_index);
  
          /* add MDT itself */
          rc = record_start_log(obd, &llh, log);
-        if (rc) 
+        if (rc)
                  GOTO(out, rc);
-        
+
          /* FIXME this whole fn should be a single journal transaction */
          rc = record_marker(obd, llh, fsdb, CM_START, log, "add mdt");
          rc = record_attach(obd, llh, log, LUSTRE_MDT_NAME, uuid);
          rc = record_mount_opt(obd, llh, log, lovname, NULL);
-        rc = record_setup(obd, llh, log, uuid, mdt_index, lovname, 
+        rc = record_setup(obd, llh, log, uuid, mdt_index, lovname,
                          failout ? "n" : "f");
          rc = record_sptlrpc_conf(obd, llh, log, srpc_log);
          rc = record_marker(obd, llh, fsdb, CM_END, log, "add mdt");
@@ -1553,10 +1553,10 @@ static int mgs_write_log_mdt(struct obd_device *obd, struct fs_db *fsdb,
                  /* We're starting with an old uuid.  Assume old name for lov
                     as well since the lov entry already exists in the log. */
                  CDEBUG(D_MGS, "old mds uuid %s\n", mti->mti_uuid);
-                if (strncmp(mti->mti_uuid, fsdb->fsdb_mdtlov + 4, 
+                if (strncmp(mti->mti_uuid, fsdb->fsdb_mdtlov + 4,
                              strlen(fsdb->fsdb_mdtlov) - 4) != 0) {
                          CERROR("old mds uuid %s doesn't match log %s (%s)\n",
-                               mti->mti_uuid, fsdb->fsdb_mdtlov, 
+                               mti->mti_uuid, fsdb->fsdb_mdtlov,
                                 fsdb->fsdb_mdtlov + 4);
                          RETURN(-EINVAL);
                  }
@@ -1571,19 +1571,19 @@ static int mgs_write_log_mdt(struct obd_device *obd, struct fs_db *fsdb,
  
          /* add mdt */
          rc = mgs_write_log_mdt0(obd, fsdb, mti);
-        
+
          /* Append the mdt info to the client log */
          name_create(&cliname, mti->mti_fsname, "-client");
-        
-        if (mgs_log_is_empty(obd, cliname)) { 
+
+        if (mgs_log_is_empty(obd, cliname)) {
                  /* Start client log */
-                rc = mgs_write_log_lov(obd, fsdb, mti, cliname, 
+                rc = mgs_write_log_lov(obd, fsdb, mti, cliname,
                                         fsdb->fsdb_clilov);
-                rc = mgs_write_log_lmv(obd, fsdb, mti, cliname, 
+                rc = mgs_write_log_lmv(obd, fsdb, mti, cliname,
                                         fsdb->fsdb_clilmv);
          }
  
-        /* 
+        /*
          #09 L add_uuid nid=uml1@tcp(0x20000c0a80201) 0:  1:uml1_UUID
          #10 L attach   0:MDC_uml1_mdsA_MNT_client  1:mdc  2:1d834_MNT_client_03f
          #11 L setup    0:MDC_uml1_mdsA_MNT_client  1:mdsA_UUID  2:uml1_UUID
@@ -1591,27 +1591,27 @@ static int mgs_write_log_mdt(struct obd_device *obd, struct fs_db *fsdb,
          #13 L add_conn 0:MDC_uml1_mdsA_MNT_client  1:uml2_UUID
          #14 L mount_option 0:  1:client  2:lov1  3:MDC_uml1_mdsA_MNT_client
          */
-        
+
  #if 0
          /* COMPAT_146 */
-        if (mti->mti_flags & LDD_F_UPGRADE14) { 
+        if (mti->mti_flags & LDD_F_UPGRADE14) {
                  rc = record_start_log(obd, &llh, cliname);
-                if (rc) 
+                if (rc)
                          GOTO(out, rc);
-        
-                rc = record_marker(obd, llh, fsdb, CM_START, 
+
+                rc = record_marker(obd, llh, fsdb, CM_START,
                                     mti->mti_svname,"add mdc");
-                                   
-                /* Old client log already has MDC entry, but needs mount opt 
+
+                /* Old client log already has MDC entry, but needs mount opt
                     for new client name (lustre-client) */
-                /* FIXME Old MDT log already has an old mount opt 
+                /* FIXME Old MDT log already has an old mount opt
                     which we should remove (currently handled by
                     class_del_profiles()) */
                  rc = record_mount_opt(obd, llh, cliname, fsdb->fsdb_clilov,
                                        fsdb->fsdb_mdc);
                  /* end COMPAT_146 */
-                
-                rc = record_marker(obd, llh, fsdb, CM_END, 
+
+                rc = record_marker(obd, llh, fsdb, CM_END,
                                     mti->mti_svname, "add mdc");
          } else
  #endif
@@ -1619,42 +1619,42 @@ static int mgs_write_log_mdt(struct obd_device *obd, struct fs_db *fsdb,
                  /* copy client info about lov/lmv */
                  comp.comp_mti = mti;
                  comp.comp_fsdb = fsdb;
-                
-                rc = mgs_steal_llog_for_mdt_from_client(obd, cliname, 
+
+                rc = mgs_steal_llog_for_mdt_from_client(obd, cliname,
                                                          &comp);
  
                  rc = mgs_write_log_mdc_to_lmv(obd, fsdb, mti, cliname,
                                                fsdb->fsdb_clilmv);
                  /* add mountopts */
                  rc = record_start_log(obd, &llh, cliname);
-                if (rc) 
+                if (rc)
                          GOTO(out, rc);
  
-                rc = record_marker(obd, llh, fsdb, CM_START, cliname, 
+                rc = record_marker(obd, llh, fsdb, CM_START, cliname,
                                     "mount opts");
                  rc = record_mount_opt(obd, llh, cliname, fsdb->fsdb_clilov,
                                        fsdb->fsdb_clilmv);
-                rc = record_marker(obd, llh, fsdb, CM_END, cliname, 
-                                   "mount opts"); 
+                rc = record_marker(obd, llh, fsdb, CM_END, cliname,
+                                   "mount opts");
          }
-                           
+
          rc = record_end_log(obd, &llh);
  out:
          name_destroy(&cliname);
-        
+
          // for_all_existing_mdt except current one
          for (i = 0; i < INDEX_MAP_SIZE * 8; i++){
                  char *mdtname;
                  if (i !=  mti->mti_stripe_index &&
                      test_bit(i,  fsdb->fsdb_mdt_index_map)) {
                          sprintf(mdt_index,"-MDT%04x",i);
-                        
+
                          name_create(&mdtname, mti->mti_fsname, mdt_index);
                          rc = mgs_write_log_mdc_to_mdt(obd, fsdb, mti, mdtname);
                          name_destroy(&mdtname);
                  }
          }
-        
+
          RETURN(rc);
  }
  
@@ -1673,7 +1673,7 @@ static int mgs_write_log_osc_to_lov(struct obd_device *obd, struct fs_db *fsdb,
          ENTRY;
          CDEBUG(D_INFO, "adding osc for %s to log %s\n",
                 mti->mti_svname, logname);
-        
+
          srpc_log = sptlrpc_conf_log_alloc();
          if (IS_ERR(srpc_log))
                  RETURN(PTR_ERR(srpc_log));
@@ -1689,7 +1689,7 @@ static int mgs_write_log_osc_to_lov(struct obd_device *obd, struct fs_db *fsdb,
                     somewhere to add our osc. */
                  rc = mgs_write_log_lov(obd, fsdb, mti, logname, lovname);
          }
-  
+
          name_create(&nodeuuid, libcfs_nid2str(mti->mti_nids[0]), "");
          name_create(&svname, mti->mti_svname, "-osc");
          name_create(&oscname, svname, suffix);
@@ -1707,13 +1707,13 @@ static int mgs_write_log_osc_to_lov(struct obd_device *obd, struct fs_db *fsdb,
          #07 L add_conn 0:OSC_uml1_ost1_MNT_client  1:uml2_UUID
          #08 L lov_modify_tgts add 0:lov1  1:ost1_UUID  2(index):0  3(gen):1
          */
-        
+
          rc = record_start_log(obd, &llh, logname);
-        if (rc) 
+        if (rc)
                  GOTO(out, rc);
          /* FIXME these should be a single journal transaction */
          rc = record_marker(obd, llh, fsdb, CM_START | flags, mti->mti_svname,
-                           "add osc"); 
+                           "add osc");
          for (i = 0; i < mti->mti_nid_count; i++) {
                  CDEBUG(D_MGS, "add nid %s\n", libcfs_nid2str(mti->mti_nids[i]));
                  rc = record_add_uuid(obd, llh, mti->mti_nids[i], nodeuuid);
@@ -1725,9 +1725,9 @@ static int mgs_write_log_osc_to_lov(struct obd_device *obd, struct fs_db *fsdb,
          snprintf(index, sizeof(index), "%d", mti->mti_stripe_index);
          rc = record_lov_add(obd, llh, lovname, mti->mti_uuid, index, "1");
          rc = record_marker(obd, llh, fsdb, CM_END | flags, mti->mti_svname,
-                           "add osc"); 
+                           "add osc");
          rc = record_end_log(obd, &llh);
-out:        
+out:
          name_destroy(&lovuuid);
          name_destroy(&oscuuid);
          name_destroy(&oscname);
@@ -1748,7 +1748,7 @@ static int mgs_write_log_ost(struct obd_device *obd, struct fs_db *fsdb,
          char *ptr = mti->mti_params;
          int rc, flags = 0, failout = 0, i;
          ENTRY;
-        
+
          CDEBUG(D_MGS, "writing new ost %s\n", mti->mti_svname);
  
          /* The ost startup log */
@@ -1778,14 +1778,14 @@ static int mgs_write_log_ost(struct obd_device *obd, struct fs_db *fsdb,
          attach obdfilter ost1 ost1_UUID
          setup /dev/loop2 ldiskfs f|n errors=remount-ro,user_xattr
          */
-        if (class_find_param(ptr, PARAM_FAILMODE, &ptr) == 0) 
+        if (class_find_param(ptr, PARAM_FAILMODE, &ptr) == 0)
                  failout = (strncmp(ptr, "failout", 7) == 0);
          rc = record_start_log(obd, &llh, mti->mti_svname);
-        if (rc) 
+        if (rc)
                  RETURN(rc);
          /* FIXME these should be a single journal transaction */
-        rc = record_marker(obd, llh, fsdb, CM_START, mti->mti_svname,"add ost"); 
-        if (*mti->mti_uuid == '\0') 
+        rc = record_marker(obd, llh, fsdb, CM_START, mti->mti_svname,"add ost");
+        if (*mti->mti_uuid == '\0')
                  snprintf(mti->mti_uuid, sizeof(mti->mti_uuid),
                           "%s_UUID", mti->mti_svname);
          rc = record_attach(obd, llh, mti->mti_svname,
@@ -1794,10 +1794,10 @@ static int mgs_write_log_ost(struct obd_device *obd, struct fs_db *fsdb,
                            "dev"/*ignored*/, "type"/*ignored*/,
                            failout ? "n" : "f", 0/*options*/);
          rc = record_sptlrpc_conf(obd, llh, mti->mti_svname, srpc_log);
-        rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, "add ost"); 
+        rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, "add ost");
          rc = record_end_log(obd, &llh);
  
-        /* We also have to update the other logs where this osc is part of 
+        /* We also have to update the other logs where this osc is part of
             the lov */
  
          if (fsdb->fsdb_flags & FSDB_OLDLOG14) {
@@ -1806,7 +1806,7 @@ static int mgs_write_log_ost(struct obd_device *obd, struct fs_db *fsdb,
                  /* Note that we can't add any new failnids, since we don't
                     know the old osc names. */
                  flags = CM_SKIP | CM_UPGRADE146;
-        
+
          } else if ((mti->mti_flags & LDD_F_UPDATE) != LDD_F_UPDATE) {
                  /* If the update flag isn't set, don't update client/mdt
                     logs. */
@@ -1829,7 +1829,7 @@ static int mgs_write_log_ost(struct obd_device *obd, struct fs_db *fsdb,
                          name_destroy(&lovname);
                  }
          }
-    
+
          /* Append ost info to the client log */
          name_create(&logname, mti->mti_fsname, "-client");
          mgs_write_log_osc_to_lov(obd, fsdb, mti, logname, "",
@@ -1840,7 +1840,7 @@ out_srpc:
          RETURN(rc);
  }
  
-/* Add additional failnids to an existing log.  
+/* Add additional failnids to an existing log.
     The mdc/osc must have been added to logs first */
  /* tcp nids must be in dotted-quad ascii -
     we can't resolve hostnames from the kernel. */
@@ -1853,7 +1853,7 @@ static int mgs_write_log_add_failnid(struct obd_device *obd, struct fs_db *fsdb,
          ENTRY;
  
          /* FIXME how do we delete a failnid? Currently --writeconf is the
-           only way.  Maybe make --erase-params pass a flag to really 
+           only way.  Maybe make --erase-params pass a flag to really
             erase all params from logs - except it can't erase the failnids
             given when a target first registers, since they aren't processed
             as params... */
@@ -1874,17 +1874,17 @@ static int mgs_write_log_add_failnid(struct obd_device *obd, struct fs_db *fsdb,
          } else {
                  RETURN(-EINVAL);
          }
-        
+
          /* Add failover nids to client log */
          name_create(&logname, mti->mti_fsname, "-client");
          rc = record_start_log(obd, &llh, logname);
-        if (!rc) { 
+        if (!rc) {
                  /* FIXME this fn should be a single journal transaction */
                  rc = record_marker(obd, llh, fsdb, CM_START, mti->mti_svname,
                                     "add failnid");
                  rc = mgs_write_log_failnids(obd, mti, llh, cliname);
                  rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname,
-                                   "add failnid"); 
+                                   "add failnid");
                  rc = record_end_log(obd, &llh);
          }
          name_destroy(&logname);
@@ -1894,11 +1894,11 @@ static int mgs_write_log_add_failnid(struct obd_device *obd, struct fs_db *fsdb,
                  name_create(&logname, mti->mti_fsname, "-MDT0000");
                  rc = record_start_log(obd, &llh, logname);
                  if (!rc) {
-                        rc = record_marker(obd, llh, fsdb, CM_START, 
+                        rc = record_marker(obd, llh, fsdb, CM_START,
                                             mti->mti_svname, "add failnid");
                          rc = mgs_write_log_failnids(obd, mti, llh, cliname);
-                        rc = record_marker(obd, llh, fsdb, CM_END, 
-                                           mti->mti_svname, "add failnid"); 
+                        rc = record_marker(obd, llh, fsdb, CM_END,
+                                           mti->mti_svname, "add failnid");
                          rc = record_end_log(obd, &llh);
                  }
                  name_destroy(&logname);
@@ -1908,7 +1908,7 @@ static int mgs_write_log_add_failnid(struct obd_device *obd, struct fs_db *fsdb,
          RETURN(rc);
  }
  
-static int mgs_wlp_lcfg(struct obd_device *obd, struct fs_db *fsdb, 
+static int mgs_wlp_lcfg(struct obd_device *obd, struct fs_db *fsdb,
                          struct mgs_target_info *mti,
                          char *logname, struct lustre_cfg_bufs *bufs,
                          char *tgtname, char *ptr)
@@ -1917,7 +1917,7 @@ static int mgs_wlp_lcfg(struct obd_device *obd, struct fs_db *fsdb,
          char *tmp;
          struct lustre_cfg *lcfg;
          int rc;
-        
+
          /* Erase any old settings of this same parameter */
          memcpy(comment, ptr, MTI_NAME_MAXLEN);
          comment[MTI_NAME_MAXLEN - 1] = 0;
@@ -1932,7 +1932,7 @@ static int mgs_wlp_lcfg(struct obd_device *obd, struct fs_db *fsdb,
          lustre_cfg_bufs_reset(bufs, tgtname);
          lustre_cfg_bufs_set_string(bufs, 1, ptr);
          lcfg = lustre_cfg_new(LCFG_PARAM, bufs);
-        if (!lcfg) 
+        if (!lcfg)
                  return -ENOMEM;
          rc = mgs_write_log_direct(obd, fsdb, logname, lcfg, tgtname, comment);
          lustre_cfg_free(lcfg);
@@ -2012,7 +2012,7 @@ static int mgs_msl_tgt_uuid2name(char *tgtname, char *tgtuuid)
  }
  
  static int mgs_modify_srpc_log_handler(struct llog_handle *llh,
-                                       struct llog_rec_hdr *rec, 
+                                       struct llog_rec_hdr *rec,
                                         void *data)
  {
          struct mgs_msl_data *mmd = (struct mgs_msl_data *)data;
@@ -2026,7 +2026,7 @@ static int mgs_modify_srpc_log_handler(struct llog_handle *llh,
                  RETURN(-EINVAL);
          }
  
-        cfg_len = rec->lrh_len - sizeof(struct llog_rec_hdr) - 
+        cfg_len = rec->lrh_len - sizeof(struct llog_rec_hdr) -
                    sizeof(struct llog_rec_tail);
  
          rc = lustre_cfg_sanity_check(lcfg, cfg_len);
@@ -2130,8 +2130,8 @@ static int mgs_modify_srpc_log_handler(struct llog_handle *llh,
                  }
  
                  /* Overwrite the log */
-                rec->lrh_len = cfg_len; 
-                rc = llog_write_rec(llh, rec, NULL, 0, (void *)lcfg, 
+                rec->lrh_len = cfg_len;
+                rc = llog_write_rec(llh, rec, NULL, 0, (void *)lcfg,
                                      rec->lrh_index);
                  if (rc)
                          CERROR("overwrite sptlrpc conf log failed: %d\n", rc);
@@ -2170,7 +2170,7 @@ static int mgs_modify_srpc_log(struct obd_device *obd,
          CDEBUG(D_MGS, "modify sptlrpc log for %s\n", logname);
  
          push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-        
+
          ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT);
          LASSERT(ctxt != NULL);
          rc = llog_create(ctxt, &llh, NULL, logname);
@@ -2185,7 +2185,7 @@ static int mgs_modify_srpc_log(struct obd_device *obd,
                  GOTO(out_close, rc = 0);
  
          OBD_ALLOC_PTR(mmd);
-        if (!mmd) 
+        if (!mmd)
                  GOTO(out_close, rc = -ENOMEM);
  
          mmd->mmd_obd = obd;
@@ -2205,7 +2205,7 @@ out_pop:
          pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
          llog_ctxt_put(ctxt);
  
-        if (rc) 
+        if (rc)
                  CERROR("modify sptlrpc log %s failed %d\n", logname, rc);
          RETURN(rc);
  }
@@ -2305,7 +2305,7 @@ static int mgs_srpc_set_param_disk(struct obd_device *obd,
          if (mgs_log_is_empty(obd, logname)) {
                  rc = record_start_log(obd, &llh, logname);
                  record_end_log(obd, &llh);
-                if (rc) 
+                if (rc)
                          GOTO(out, rc);
          }
  
@@ -2486,7 +2486,7 @@ struct mgs_srpc_read_data {
  };
  
  static int mgs_srpc_read_handler(struct llog_handle *llh,
-                                 struct llog_rec_hdr *rec, 
+                                 struct llog_rec_hdr *rec,
                                   void *data)
  {
          struct mgs_srpc_read_data *msrd = (struct mgs_srpc_read_data *) data;
@@ -2501,7 +2501,7 @@ static int mgs_srpc_read_handler(struct llog_handle *llh,
                  RETURN(-EINVAL);
          }
  
-        cfg_len = rec->lrh_len - sizeof(struct llog_rec_hdr) - 
+        cfg_len = rec->lrh_len - sizeof(struct llog_rec_hdr) -
                    sizeof(struct llog_rec_tail);
  
          rc = lustre_cfg_sanity_check(lcfg, cfg_len);
@@ -2613,26 +2613,26 @@ static int mgs_write_log_params(struct obd_device *obd, struct fs_db *fsdb,
          int rc = 0;
          ENTRY;
  
-        if (!mti->mti_params) 
+        if (!mti->mti_params)
                  RETURN(0);
  
          /* For various parameter settings, we have to figure out which logs
             care about them (e.g. both mdt and client for lov settings) */
          while (ptr) {
-                while (*ptr == ' ') 
+                while (*ptr == ' ')
                          ptr++;
                  if (*ptr == '\0')
                          break;
                  endptr = strchr(ptr, ' ');
-                if (endptr) 
+                if (endptr)
                          *endptr = '\0';
                  CDEBUG(D_MGS, "next param '%s'\n", ptr);
  
-                /* The params are stored in MOUNT_DATA_FILE and modified 
+                /* The params are stored in MOUNT_DATA_FILE and modified
                     via tunefs.lustre, or set using lctl conf_param */
  
                  /* Processed in lustre_start_mgc */
-                if (class_match_param(ptr, PARAM_MGSNODE, NULL) == 0) 
+                if (class_match_param(ptr, PARAM_MGSNODE, NULL) == 0)
                          GOTO(end_while, rc);
  
                  /* Processed in mgs_write_log_ost */
@@ -2675,12 +2675,12 @@ static int mgs_write_log_params(struct obd_device *obd, struct fs_db *fsdb,
                          /* modify all servers and clients */
                          rc = mgs_write_log_direct_all(obd, fsdb, mti, lcfg,
                                                        mti->mti_fsname,
-                                                      "timeout"); 
+                                                      "timeout");
                          lustre_cfg_free(lcfg);
                          GOTO(end_while, rc);
                  }
  
-                if (class_match_param(ptr, PARAM_OSC""PARAM_ACTIVE, &tmp) == 0) { 
+                if (class_match_param(ptr, PARAM_OSC""PARAM_ACTIVE, &tmp) == 0) {
                          /* active=0 means off, anything else means on */
                          char mdt_index[16];
                          int flag = (*tmp == '0') ? CM_EXCLUDE : 0;
@@ -2697,10 +2697,10 @@ static int mgs_write_log_params(struct obd_device *obd, struct fs_db *fsdb,
                                        flag ? "de": "re", mti->mti_svname);
                          /* Modify clilov */
                          name_create(&logname, mti->mti_fsname, "-client");
-                        rc = mgs_modify(obd, fsdb, mti, logname, 
+                        rc = mgs_modify(obd, fsdb, mti, logname,
                                          mti->mti_svname, "add osc", flag);
                          name_destroy(&logname);
-                        if (rc) 
+                        if (rc)
                                  goto active_err;
                          /* Modify mdtlov */
                          /* FIXME add to all MDT logs for CMD */
@@ -2709,7 +2709,7 @@ static int mgs_write_log_params(struct obd_device *obd, struct fs_db *fsdb,
                                          continue;
                                  sprintf(mdt_index,"-MDT%04x", i);
                                  name_create(&logname, mti->mti_fsname, mdt_index);
-                                rc = mgs_modify(obd, fsdb, mti, logname, 
+                                rc = mgs_modify(obd, fsdb, mti, logname,
                                                  mti->mti_svname, "add osc", flag);
                                  name_destroy(&logname);
                                  if (rc)
@@ -2722,7 +2722,7 @@ active_err:
                                                     "changes were made to the "
                                                     "config log.\n",
                                                     mti->mti_svname, rc);
-                                if (fsdb->fsdb_flags & FSDB_OLDLOG14) 
+                                if (fsdb->fsdb_flags & FSDB_OLDLOG14)
                                          LCONSOLE_ERROR_MSG(0x146, "This may be"
                                                             " because the log "
                                                             "is in the old 1.4"
@@ -2731,21 +2731,21 @@ active_err:
                                                             "update the logs.\n");
                                  goto end_while;
                          }
-                        /* Fall through to osc proc for deactivating 
+                        /* Fall through to osc proc for deactivating
                             live OSC on running MDT / clients. */
                  }
                  /* Below here, let obd's XXX_process_config methods handle it */
- 
+
                  /* All lov. in proc */
                  if (class_match_param(ptr, PARAM_LOV, NULL) == 0) {
                          char mdt_index[16];
                          char *mdtlovname;
-                        
+
                          CDEBUG(D_MGS, "lov param %s\n", ptr);
                          if (!(mti->mti_flags & LDD_F_SV_TYPE_MDT)) {
                                  LCONSOLE_ERROR_MSG(0x147, "LOV params must be "
                                                     "set on the MDT, not %s. "
-                                                   "Ignoring.\n", 
+                                                   "Ignoring.\n",
                                                     mti->mti_svname);
                                  rc = 0;
                                  goto end_while;
@@ -2758,7 +2758,7 @@ active_err:
                          sprintf(mdt_index,"-MDT%04x", mti->mti_stripe_index);
                          name_create(&logname, mti->mti_fsname, mdt_index);
                          name_create(&mdtlovname, logname, "-mdtlov");
-                        rc = mgs_wlp_lcfg(obd, fsdb, mti, mti->mti_svname, 
+                        rc = mgs_wlp_lcfg(obd, fsdb, mti, mti->mti_svname,
                                            &bufs, mdtlovname, ptr);
                          name_destroy(&logname);
                          name_destroy(&mdtlovname);
@@ -2774,13 +2774,13 @@ active_err:
                  }
  
                  /* All osc., mdc., llite. params in proc */
-                if ((class_match_param(ptr, PARAM_OSC, NULL) == 0) || 
+                if ((class_match_param(ptr, PARAM_OSC, NULL) == 0) ||
                      (class_match_param(ptr, PARAM_MDC, NULL) == 0) ||
                      (class_match_param(ptr, PARAM_LLITE, NULL) == 0)) {
                          char *cname;
                          if (memcmp(ptr, PARAM_LLITE, strlen(PARAM_LLITE)) == 0) {
                                  name_create(&cname, mti->mti_fsname, "-client");
-                        /* Add the client type to match the obdname 
+                        /* Add the client type to match the obdname
                             in class_config_llog_handler */
                          } else if (mti->mti_flags & LDD_F_SV_TYPE_MDT) {
                                  /* COMPAT_146 */
@@ -2806,13 +2806,13 @@ active_err:
                                          goto end_while;
                                  }
                                  name_create(&cname, mti->mti_svname, "-osc");
-                        } else {       
+                        } else {
                                  rc = -EINVAL;
                                  goto end_while;
                          }
  
                          CDEBUG(D_MGS, "%.3s param %s\n", ptr, ptr + 4);
-                        
+
                          /* Modify client */
                          name_create(&logname, mti->mti_fsname, "-client");
                          rc = mgs_wlp_lcfg(obd, fsdb, mti, logname, &bufs,
@@ -2849,7 +2849,7 @@ active_err:
                  }
  
                  /* All mdt., ost. params in proc */
-                if ((class_match_param(ptr, PARAM_MDT, NULL) == 0) || 
+                if ((class_match_param(ptr, PARAM_MDT, NULL) == 0) ||
                      (class_match_param(ptr, PARAM_MDD, NULL) == 0) ||
                      (class_match_param(ptr, PARAM_OST, NULL) == 0)) {
                          CDEBUG(D_MGS, "%.3s param %s\n", ptr, ptr + 4);
@@ -2869,11 +2869,11 @@ end_while:
                          CERROR("err %d on param '%s\n", rc, ptr);
                          break;
                  }
-                
+
                  if (!endptr)
                          /* last param */
                          break;
-                 
+
                  *endptr = ' ';
                  ptr = endptr + 1;
          }
@@ -2889,20 +2889,20 @@ int mgs_check_failnid(struct obd_device *obd, struct mgs_target_info *mti)
          int rc;
          ENTRY;
  
-        rc = mgs_find_or_make_fsdb(obd, fsname, &fsdb); 
-        if (rc) 
+        rc = mgs_find_or_make_fsdb(obd, fsname, &fsdb);
+        if (rc)
                  RETURN(rc);
  
-        if (mgs_log_is_empty(obd, mti->mti_svname)) 
+        if (mgs_log_is_empty(obd, mti->mti_svname))
                  /* should never happen */
                  RETURN(-ENOENT);
  
          CDEBUG(D_MGS, "Checking for new failnids for %s\n", mti->mti_svname);
  
          /* FIXME We can just check mti->params to see if we're already in
-           the failover list.  Modify mti->params for rewriting back at 
+           the failover list.  Modify mti->params for rewriting back at
             server_register_target(). */
-        
+
          down(&fsdb->fsdb_sem);
          rc = mgs_write_log_add_failnid(obd, fsdb, mti);
          up(&fsdb->fsdb_sem);
@@ -2930,7 +2930,7 @@ int mgs_write_log_target(struct obd_device *obd,
          if (mti->mti_flags & LDD_F_UPGRADE14) {
                  if (rc == EALREADY) {
                          LCONSOLE_INFO("Found index %d for %s 1.4 log, "
-                                      "upgrading\n", mti->mti_stripe_index, 
+                                      "upgrading\n", mti->mti_stripe_index,
                                        mti->mti_svname);
                  } else {
                          LCONSOLE_ERROR_MSG(0x149, "Failed to find %s in the old"
@@ -2948,9 +2948,9 @@ int mgs_write_log_target(struct obd_device *obd,
                  /* end COMPAT_146 */
          } else {
                  if (rc == EALREADY) {
-                        LCONSOLE_WARN("Found index %d for %s, updating log\n", 
+                        LCONSOLE_WARN("Found index %d for %s, updating log\n",
                                        mti->mti_stripe_index, mti->mti_svname);
-                        /* We would like to mark old log sections as invalid 
+                        /* We would like to mark old log sections as invalid
                             and add new log sections in the client and mdt logs.
                             But if we add new sections, then live clients will
                             get repeat setup instructions for already running
@@ -2959,7 +2959,7 @@ int mgs_write_log_target(struct obd_device *obd,
                  }
          }
  
-        rc = mgs_find_or_make_fsdb(obd, mti->mti_fsname, &fsdb); 
+        rc = mgs_find_or_make_fsdb(obd, mti->mti_fsname, &fsdb);
          if (rc) {
                  CERROR("Can't get db for %s\n", mti->mti_fsname);
                  RETURN(rc);
@@ -2967,7 +2967,7 @@ int mgs_write_log_target(struct obd_device *obd,
  
          down(&fsdb->fsdb_sem);
  
-        if (mti->mti_flags & 
+        if (mti->mti_flags &
              (LDD_F_VIRGIN | LDD_F_UPGRADE14 | LDD_F_WRITECONF)) {
                  /* Generate a log from scratch */
                  if (mti->mti_flags & LDD_F_SV_TYPE_MDT) {
@@ -2988,7 +2988,7 @@ int mgs_write_log_target(struct obd_device *obd,
                  CDEBUG(D_MGS, "Update params for %s\n", mti->mti_svname);
                  mti->mti_flags |= LDD_F_PARAM;
          }
-        
+
          rc = mgs_write_log_params(obd, fsdb, mti);
  
  out_up:
@@ -2997,30 +2997,30 @@ out_up:
  }
  
  /* COMPAT_146 */
-/* verify that we can handle the old config logs */ 
+/* verify that we can handle the old config logs */
  int mgs_upgrade_sv_14(struct obd_device *obd, struct mgs_target_info *mti)
  {
          struct fs_db *fsdb;
          int rc = 0;
          ENTRY;
  
-        /* Create ost log normally, as servers register.  Servers 
+        /* Create ost log normally, as servers register.  Servers
             register with their old uuids (from last_rcvd), so old
             (MDT and client) logs should work.
-         - new MDT won't know about old OSTs, only the ones that have 
-           registered, so we need the old MDT log to get the LOV right 
-           in order for old clients to work. 
-         - Old clients connect to the MDT, not the MGS, for their logs, and 
-           will therefore receive the old client log from the MDT /LOGS dir. 
+         - new MDT won't know about old OSTs, only the ones that have
+           registered, so we need the old MDT log to get the LOV right
+           in order for old clients to work.
+         - Old clients connect to the MDT, not the MGS, for their logs, and
+           will therefore receive the old client log from the MDT /LOGS dir.
           - Old clients can continue to use and connect to old or new OSTs
-         - New clients will contact the MGS for their log 
+         - New clients will contact the MGS for their log
          */
  
-        LCONSOLE_INFO("upgrading server %s from pre-1.6\n", mti->mti_svname); 
+        LCONSOLE_INFO("upgrading server %s from pre-1.6\n", mti->mti_svname);
          server_mti_print("upgrade", mti);
-        
+
          rc = mgs_find_or_make_fsdb(obd, mti->mti_fsname, &fsdb);
-        if (rc) 
+        if (rc)
                  RETURN(rc);
  
          if (fsdb->fsdb_flags & FSDB_LOG_EMPTY) {
@@ -3031,7 +3031,7 @@ int mgs_upgrade_sv_14(struct obd_device *obd, struct mgs_target_info *mti)
          }
  
          if (fsdb->fsdb_gen == 0) {
-                /* There were no markers in the client log, meaning we have 
+                /* There were no markers in the client log, meaning we have
                     not updated the logs for this fs */
                  CDEBUG(D_MGS, "found old, unupdated client log\n");
          }
@@ -3047,10 +3047,10 @@ int mgs_upgrade_sv_14(struct obd_device *obd, struct mgs_target_info *mti)
                  /* We're starting with an old uuid.  Assume old name for lov
                     as well since the lov entry already exists in the log. */
                  CDEBUG(D_MGS, "old mds uuid %s\n", mti->mti_uuid);
-                if (strncmp(mti->mti_uuid, fsdb->fsdb_mdtlov + 4, 
+                if (strncmp(mti->mti_uuid, fsdb->fsdb_mdtlov + 4,
                              strlen(fsdb->fsdb_mdtlov) - 4) != 0) {
                          CERROR("old mds uuid %s doesn't match log %s (%s)\n",
-                               mti->mti_uuid, fsdb->fsdb_mdtlov, 
+                               mti->mti_uuid, fsdb->fsdb_mdtlov,
                                 fsdb->fsdb_mdtlov + 4);
                          RETURN(-EINVAL);
                  }
@@ -3102,7 +3102,7 @@ int mgs_erase_logs(struct obd_device *obd, char *fsname)
          struct l_linux_dirent *dirent, *n;
          int rc, len = strlen(fsname);
          ENTRY;
-        
+
          /* Find all the logs in the CONFIGS directory */
          rc = class_dentry_readdir(obd, mgs->mgs_configs_dir,
                                    mgs->mgs_vfsmnt, &dentry_list);
@@ -3110,12 +3110,12 @@ int mgs_erase_logs(struct obd_device *obd, char *fsname)
                  CERROR("Can't read %s dir\n", MOUNT_CONFIGS_DIR);
                  RETURN(rc);
          }
-                                                                                
+
          down(&mgs->mgs_sem);
-        
+
          /* Delete the fs db */
          fsdb = mgs_find_fsdb(obd, fsname);
-        if (fsdb) 
+        if (fsdb)
                  mgs_free_fsdb(obd, fsdb);
  
          list_for_each_entry_safe(dirent, n, &dentry_list, lld_list) {
@@ -3126,7 +3126,7 @@ int mgs_erase_logs(struct obd_device *obd, char *fsname)
                  }
                  OBD_FREE(dirent, sizeof(*dirent));
          }
-        
+
          up(&mgs->mgs_sem);
  
          RETURN(rc);
@@ -3150,7 +3150,7 @@ static void print_lustre_cfg(struct lustre_cfg *lcfg)
          if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT)
                  for (i = 0; i < lcfg->lcfg_bufcount; i++) {
                          CDEBUG(D_MGS, "\tlcfg->lcfg_buflens[%d]: %d %s\n",
-                               i, lcfg->lcfg_buflens[i], 
+                               i, lcfg->lcfg_buflens[i],
                                 lustre_cfg_string(lcfg, i));
                  }
          EXIT;
@@ -3168,7 +3168,7 @@ int mgs_setparam(struct obd_device *obd, struct lustre_cfg *lcfg, char *fsname)
          ENTRY;
  
          print_lustre_cfg(lcfg);
-        
+
          /* lustre, lustre-mdtlov, lustre-client, lustre-MDT0000 */
          devname = lustre_cfg_string(lcfg, 0);
          param = lustre_cfg_string(lcfg, 1);
@@ -3199,8 +3199,8 @@ int mgs_setparam(struct obd_device *obd, struct lustre_cfg *lcfg, char *fsname)
          fsname[MTI_NAME_MAXLEN - 1] = 0;
          CDEBUG(D_MGS, "setparam on fs %s device %s\n", fsname, devname);
  
-        rc = mgs_find_or_make_fsdb(obd, fsname, &fsdb); 
-        if (rc) 
+        rc = mgs_find_or_make_fsdb(obd, fsname, &fsdb);
+        if (rc)
                  RETURN(rc);
          if (fsdb->fsdb_flags & FSDB_LOG_EMPTY) {
                  CERROR("No filesystem targets for %s.  cfg_device from lctl "
@@ -3211,25 +3211,25 @@ int mgs_setparam(struct obd_device *obd, struct lustre_cfg *lcfg, char *fsname)
  
          /* Create a fake mti to hold everything */
          OBD_ALLOC_PTR(mti);
-        if (!mti) 
+        if (!mti)
                  GOTO(out, rc = -ENOMEM);
          strncpy(mti->mti_fsname, fsname, MTI_NAME_MAXLEN);
          strncpy(mti->mti_svname, devname, MTI_NAME_MAXLEN);
          strncpy(mti->mti_params, param, sizeof(mti->mti_params));
          rc = server_name2index(mti->mti_svname, &mti->mti_stripe_index, &tmp);
-        if (rc < 0) 
+        if (rc < 0)
                  /* Not a valid server; may be only fsname */
                  rc = 0;
          else
                  /* Strip -osc or -mdc suffix from svname */
-                if (server_make_name(rc, mti->mti_stripe_index, mti->mti_fsname, 
-                                     mti->mti_svname)) 
+                if (server_make_name(rc, mti->mti_stripe_index, mti->mti_fsname,
+                                     mti->mti_svname))
                          GOTO(out, rc = -EINVAL);
  
          mti->mti_flags = rc | LDD_F_PARAM;
  
          down(&fsdb->fsdb_sem);
-        rc = mgs_write_log_params(obd, fsdb, mti); 
+        rc = mgs_write_log_params(obd, fsdb, mti);
          up(&fsdb->fsdb_sem);
  
  out:
@@ -3412,10 +3412,10 @@ static int mgs_backup_llog(struct obd_device *obd, char* fsname)
  
          if (len >= PATH_MAX - 1) {
                  GOTO(out, -ENAMETOOLONG);
-        } 
+        }
  
          push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
-                
+
          bak_filp = l_filp_open(logname, O_RDWR|O_CREAT|O_TRUNC, 0660);
          if (IS_ERR(bak_filp)) {
                  rc = PTR_ERR(bak_filp);
diff --git a/lustre/obdclass/Makefile.in b/lustre/obdclass/Makefile.in

index 50da9e8..1bbd3c3 100644 (file)
--- a/lustre/obdclass/Makefile.in
+++ b/lustre/obdclass/Makefile.in
@@ -25,7 +25,7 @@ obdclass-all-objs += debug.o genops.o uuid.o llog_ioctl.o
  obdclass-all-objs += lprocfs_status.o lustre_handles.o lustre_peer.o
  obdclass-all-objs += statfs_pack.o obdo.o obd_config.o obd_mount.o mea.o
  obdclass-all-objs += lu_object.o dt_object.o hash.o capa.o lu_time.o
-obdclass-all-objs += lu_ref.o
+obdclass-all-objs += cl_object.o cl_page.o cl_lock.o cl_io.o lu_ref.o
  obdclass-all-objs += acl.o idmap.o
  
  obdclass-objs := $(obdclass-linux-objs) $(obdclass-all-objs)
diff --git a/lustre/obdclass/autoMakefile.am b/lustre/obdclass/autoMakefile.am

index 778dba0..b7fb43e 100644 (file)
--- a/lustre/obdclass/autoMakefile.am
+++ b/lustre/obdclass/autoMakefile.am
@@ -11,7 +11,8 @@ liblustreclass_a_SOURCES = class_obd.c debug.c genops.c statfs_pack.c mea.c uuid
  liblustreclass_a_SOURCES += lustre_handles.c lustre_peer.c lprocfs_status.c class_hash.c
  liblustreclass_a_SOURCES += obdo.c obd_config.c llog.c llog_obd.c llog_cat.c 
  liblustreclass_a_SOURCES += llog_lvfs.c llog_swab.c capa.c
-liblustreclass_a_SOURCES += lu_object.c lu_ref.c lu_time.c
+liblustreclass_a_SOURCES += lu_object.c cl_object.c lu_time.c lu_ref.c
+liblustreclass_a_SOURCES += cl_page.c cl_lock.c cl_io.c
  liblustreclass_a_SOURCES += #llog_ioctl.c rbtree.c
  liblustreclass_a_CPPFLAGS = $(LLCPPFLAGS)
  liblustreclass_a_CFLAGS = $(LLCFLAGS)
@@ -53,4 +54,4 @@ install-data-hook: $(install_data_hook)
  
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@  llog-test.c
  MOSTLYCLEANFILES += linux/*.o darwin/*.o
-DIST_SOURCES = $(filter-out llog-test.c,$(obdclass-all-objs:.o=.c)) $(llog-test-objs:.o=.c) llog_test.c llog_internal.h
+DIST_SOURCES = $(filter-out llog-test.c,$(obdclass-all-objs:.o=.c)) $(llog-test-objs:.o=.c) llog_test.c llog_internal.h cl_internal.h
diff --git a/lustre/obdclass/cl_internal.h b/lustre/obdclass/cl_internal.h

new file mode 100644 (file)

index 0000000..578fdc7
--- /dev/null
+++ b/lustre/obdclass/cl_internal.h
@@ -0,0 +1,97 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal cl interfaces.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+#ifndef _CL_INTERNAL_H
+#define _CL_INTERNAL_H
+
+#define CLT_PVEC_SIZE (14)
+
+/**
+ * Thread local state internal for generic cl-code.
+ */
+struct cl_thread_info {
+        /*
+         * Common fields.
+         */
+        struct cl_io         clt_io;
+        struct cl_2queue     clt_queue;
+
+        /*
+         * Fields used by cl_lock.c
+         */
+        struct cl_lock_descr clt_descr;
+        struct cl_page_list  clt_list;
+        /**
+         * \name debugging.
+         *
+         * Counters used to check correctness of cl_lock interface usage.
+         * @{
+         */
+        /**
+         * Number of outstanding calls to cl_lock_mutex_get() made by the
+         * current thread. For debugging.
+         */
+        int                  clt_nr_locks_locked;
+        /** List of locked locks. */
+        struct lu_ref        clt_locks_locked;
+        /** Number of outstanding holds on the top-level locks. */
+        int                  clt_nr_held;
+        /** Number of outstanding uses on the top-level locks. */
+        int                  clt_nr_used;
+        /** Number of held top-level extent locks. */
+        int                  clt_nr_locks_acquired;
+        /** @} debugging */
+
+        /*
+         * Fields used by cl_page.c
+         */
+        struct cl_page      *clt_pvec[CLT_PVEC_SIZE];
+
+        /*
+         * Fields used by cl_io.c
+         */
+        /**
+         * Pointer to the topmost ongoing IO in this thread.
+         */
+        struct cl_io        *clt_current_io;
+};
+
+struct cl_thread_info *cl_env_info(const struct lu_env *env);
+
+#endif /* _CL_INTERNAL_H */
diff --git a/lustre/obdclass/cl_io.c b/lustre/obdclass/cl_io.c

new file mode 100644 (file)

index 0000000..1be2176
--- /dev/null
+++ b/lustre/obdclass/cl_io.c
@@ -0,0 +1,1623 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client IO.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <libcfs/list.h>
+/* lu_time_global_{init,fini}() */
+#include <lu_time.h>
+
+#include <cl_object.h>
+#include "cl_internal.h"
+
+/*****************************************************************************
+ *
+ * cl_io interface.
+ *
+ */
+
+#define cl_io_for_each(slice, io) \
+        list_for_each_entry((slice), &io->ci_layers, cis_linkage)
+#define cl_io_for_each_reverse(slice, io)                 \
+        list_for_each_entry_reverse((slice), &io->ci_layers, cis_linkage)
+
+static inline int cl_io_type_is_valid(enum cl_io_type type)
+{
+        return CIT_READ <= type && type < CIT_OP_NR;
+}
+
+static inline int cl_io_is_loopable(const struct cl_io *io)
+{
+        return cl_io_type_is_valid(io->ci_type) && io->ci_type != CIT_MISC;
+}
+
+/**
+ * True, iff \a io is a sendfile().
+ */
+int cl_io_is_sendfile(const struct cl_io *io)
+{
+        return io->ci_type == CIT_READ && io->u.ci_rd.rd_is_sendfile;
+}
+EXPORT_SYMBOL(cl_io_is_sendfile);
+
+/**
+ * Returns true iff there is an IO ongoing in the given environment.
+ */
+int cl_io_is_going(const struct lu_env *env)
+{
+        return cl_env_info(env)->clt_current_io != NULL;
+}
+EXPORT_SYMBOL(cl_io_is_going);
+
+/**
+ * cl_io invariant that holds at all times when exported cl_io_*() functions
+ * are entered and left.
+ */
+static int cl_io_invariant(const struct cl_io *io)
+{
+        struct cl_io *up;
+
+        up = io->ci_parent;
+        return
+                /*
+                 * io can own pages only when it is ongoing. Sub-io might
+                 * still be in CIS_LOCKED state when top-io is in
+                 * CIS_IO_GOING.
+                 */
+                ergo(io->ci_owned_nr > 0, io->ci_state == CIS_IO_GOING ||
+                     (io->ci_state == CIS_LOCKED && up != NULL));
+}
+
+/**
+ * Finalize \a io, by calling cl_io_operations::cio_fini() bottom-to-top.
+ */
+void cl_io_fini(const struct lu_env *env, struct cl_io *io)
+{
+        struct cl_io_slice    *slice;
+        struct cl_thread_info *info;
+
+        LINVRNT(cl_io_type_is_valid(io->ci_type));
+        LINVRNT(cl_io_invariant(io));
+        ENTRY;
+
+        while (!list_empty(&io->ci_layers)) {
+                slice = container_of(io->ci_layers.next, struct cl_io_slice,
+                                     cis_linkage);
+                list_del_init(&slice->cis_linkage);
+                if (slice->cis_iop->op[io->ci_type].cio_fini != NULL)
+                        slice->cis_iop->op[io->ci_type].cio_fini(env, slice);
+                /*
+                 * Invalidate slice to catch use after free. This assumes that
+                 * slices are allocated within session and can be touched
+                 * after ->cio_fini() returns.
+                 */
+                slice->cis_io = NULL;
+        }
+        io->ci_state = CIS_FINI;
+        info = cl_env_info(env);
+        if (info->clt_current_io == io)
+                info->clt_current_io = NULL;
+        EXIT;
+}
+EXPORT_SYMBOL(cl_io_fini);
+
+static int cl_io_init0(const struct lu_env *env, struct cl_io *io,
+                       enum cl_io_type iot, struct cl_object *obj)
+{
+        struct cl_object *scan;
+        int result;
+
+        LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI);
+        LINVRNT(cl_io_type_is_valid(iot));
+        LINVRNT(cl_io_invariant(io));
+        ENTRY;
+
+        io->ci_type = iot;
+        CFS_INIT_LIST_HEAD(&io->ci_lockset.cls_todo);
+        CFS_INIT_LIST_HEAD(&io->ci_lockset.cls_curr);
+        CFS_INIT_LIST_HEAD(&io->ci_lockset.cls_done);
+        CFS_INIT_LIST_HEAD(&io->ci_layers);
+
+        result = 0;
+        cl_object_for_each(scan, obj) {
+                if (scan->co_ops->coo_io_init != NULL) {
+                        result = scan->co_ops->coo_io_init(env, scan, io);
+                        if (result != 0)
+                                break;
+                }
+        }
+        if (result == 0)
+                io->ci_state = CIS_INIT;
+        RETURN(result);
+}
+
+/**
+ * Initialize sub-io, calling cl_object_operations::coo_io_init() top-to-bottom.
+ *
+ * \pre obj != cl_object_top(obj)
+ */
+int cl_io_sub_init(const struct lu_env *env, struct cl_io *io,
+                   enum cl_io_type iot, struct cl_object *obj)
+{
+        struct cl_thread_info *info = cl_env_info(env);
+
+        LASSERT(obj != cl_object_top(obj));
+        if (info->clt_current_io == NULL)
+                info->clt_current_io = io;
+        return cl_io_init0(env, io, iot, obj);
+}
+EXPORT_SYMBOL(cl_io_sub_init);
+
+/**
+ * Initialize \a io, calling cl_object_operations::coo_io_init() top-to-bottom.
+ *
+ * Caller has to call cl_io_fini() after a call to cl_io_init(), no matter
+ * what the latter returned.
+ *
+ * \pre obj == cl_object_top(obj)
+ * \pre cl_io_type_is_valid(iot)
+ * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot
+ */
+int cl_io_init(const struct lu_env *env, struct cl_io *io,
+               enum cl_io_type iot, struct cl_object *obj)
+{
+        struct cl_thread_info *info = cl_env_info(env);
+
+        LASSERT(obj == cl_object_top(obj));
+        LASSERT(info->clt_current_io == NULL);
+
+        info->clt_current_io = io;
+        return cl_io_init0(env, io, iot, obj);
+}
+EXPORT_SYMBOL(cl_io_init);
+
+/**
+ * Initialize read or write io.
+ *
+ * \pre iot == CIT_READ || iot == CIT_WRITE
+ */
+int cl_io_rw_init(const struct lu_env *env, struct cl_io *io,
+                  enum cl_io_type iot, loff_t pos, size_t count)
+{
+        LINVRNT(iot == CIT_READ || iot == CIT_WRITE);
+        LINVRNT(io->ci_obj != NULL);
+        ENTRY;
+
+        LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu,
+                         "io range: %i [%llu, %llu) %i %i\n",
+                         iot, (__u64)pos, (__u64)pos + count,
+                         io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append);
+        io->u.ci_rw.crw_pos    = pos;
+        io->u.ci_rw.crw_count  = count;
+        RETURN(cl_io_init(env, io, iot, io->ci_obj));
+}
+EXPORT_SYMBOL(cl_io_rw_init);
+
+static inline const struct lu_fid *
+cl_lock_descr_fid(const struct cl_lock_descr *descr)
+{
+        return lu_object_fid(&descr->cld_obj->co_lu);
+}
+
+static int cl_lock_descr_cmp(const struct cl_lock_descr *d0,
+                             const struct cl_lock_descr *d1)
+{
+        return lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1)) ?:
+                __diff_normalize(d0->cld_start, d1->cld_start);
+}
+
+/*
+ * Sort locks in lexicographical order of their (fid, start-offset) pairs.
+ */
+static void cl_io_locks_sort(struct cl_io *io)
+{
+        int done = 0;
+
+        ENTRY;
+        /* hidden treasure: bubble sort for now. */
+        do {
+                struct cl_io_lock_link *curr;
+                struct cl_io_lock_link *prev;
+                struct cl_io_lock_link *temp;
+
+                done = 1;
+                prev = NULL;
+
+                list_for_each_entry_safe(curr, temp, &io->ci_lockset.cls_todo,
+                                         cill_linkage) {
+                        if (prev != NULL) {
+                                switch (cl_lock_descr_cmp(&prev->cill_descr,
+                                                          &curr->cill_descr)) {
+                                case 0:
+                                        /*
+                                         * IMPOSSIBLE: Identical locks are
+                                         *             already removed at
+                                         *             this point.
+                                         */
+                                default:
+                                        LBUG();
+                                case +1:
+                                        list_move_tail(&curr->cill_linkage,
+                                                       &prev->cill_linkage);
+                                        done = 0;
+                                        continue; /* don't change prev: it's
+                                                   * still "previous" */
+                                case -1: /* already in order */
+                                        break;
+                                }
+                        }
+                        prev = curr;
+                }
+        } while (!done);
+        EXIT;
+}
+
+/**
+ * Check whether \a queue contains locks matching \a need.
+ *
+ * \retval +ve there is a matching lock in the \a queue
+ * \retval   0 there are no matching locks in the \a queue
+ */
+int cl_queue_match(const struct list_head *queue,
+                   const struct cl_lock_descr *need)
+{
+        struct cl_io_lock_link *scan;
+
+        ENTRY;
+        list_for_each_entry(scan, queue, cill_linkage) {
+                if (cl_lock_descr_match(&scan->cill_descr, need))
+                        RETURN(+1);
+        }
+        RETURN(0);
+}
+EXPORT_SYMBOL(cl_queue_match);
+
+static int cl_lockset_match(const struct cl_lockset *set,
+                            const struct cl_lock_descr *need, int all_queues)
+{
+        return (all_queues ? cl_queue_match(&set->cls_todo, need) : 0) ||
+                cl_queue_match(&set->cls_curr, need) ||
+                cl_queue_match(&set->cls_done, need);
+}
+
+static int cl_lockset_lock_one(const struct lu_env *env,
+                               struct cl_io *io, struct cl_lockset *set,
+                               struct cl_io_lock_link *link)
+{
+        struct cl_lock *lock;
+        int             result;
+
+        ENTRY;
+
+        lock = cl_lock_request(env, io, &link->cill_descr, link->cill_enq_flags,
+                               "io", io);
+        if (!IS_ERR(lock)) {
+                link->cill_lock = lock;
+                list_move(&link->cill_linkage, &set->cls_curr);
+                if (!(link->cill_enq_flags & CEF_ASYNC)) {
+                        result = cl_wait(env, lock);
+                        if (result == 0)
+                                list_move(&link->cill_linkage, &set->cls_done);
+                } else
+                        result = 0;
+        } else
+                result = PTR_ERR(lock);
+        RETURN(result);
+}
+
+static void cl_lock_link_fini(const struct lu_env *env, struct cl_io *io,
+                              struct cl_io_lock_link *link)
+{
+        struct cl_lock *lock = link->cill_lock;
+
+        ENTRY;
+        list_del_init(&link->cill_linkage);
+        if (lock != NULL) {
+                cl_lock_release(env, lock, "io", io);
+                link->cill_lock = NULL;
+        }
+        if (link->cill_fini != NULL)
+                link->cill_fini(env, link);
+        EXIT;
+}
+
+static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io,
+                           struct cl_lockset *set)
+{
+        struct cl_io_lock_link *link;
+        struct cl_io_lock_link *temp;
+        struct cl_lock         *lock;
+        int result;
+
+        ENTRY;
+        result = 0;
+        list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) {
+                if (!cl_lockset_match(set, &link->cill_descr, 0)) {
+                        /* XXX some locking to guarantee that locks aren't
+                         * expanded in between. */
+                        result = cl_lockset_lock_one(env, io, set, link);
+                        if (result != 0)
+                                break;
+                } else
+                        cl_lock_link_fini(env, io, link);
+        }
+        if (result == 0) {
+                list_for_each_entry_safe(link, temp,
+                                         &set->cls_curr, cill_linkage) {
+                        lock = link->cill_lock;
+                        result = cl_wait(env, lock);
+                        if (result == 0)
+                                list_move(&link->cill_linkage, &set->cls_done);
+                        else
+                                break;
+                }
+        }
+        RETURN(result);
+}
+
+/**
+ * Takes locks necessary for the current iteration of io.
+ *
+ * Calls cl_io_operations::cio_lock() top-to-bottom to collect locks required
+ * by layers for the current iteration. Then sort locks (to avoid dead-locks),
+ * and acquire them.
+ */
+int cl_io_lock(const struct lu_env *env, struct cl_io *io)
+{
+        const struct cl_io_slice *scan;
+        int result = 0;
+
+        LINVRNT(cl_io_is_loopable(io));
+        LINVRNT(io->ci_state == CIS_IT_STARTED);
+        LINVRNT(cl_io_invariant(io));
+
+        ENTRY;
+        cl_io_for_each(scan, io) {
+                if (scan->cis_iop->op[io->ci_type].cio_lock == NULL)
+                        continue;
+                result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan);
+                if (result != 0)
+                        break;
+        }
+        if (result == 0) {
+                cl_io_locks_sort(io);
+                result = cl_lockset_lock(env, io, &io->ci_lockset);
+        }
+        if (result != 0)
+                cl_io_unlock(env, io);
+        else
+                io->ci_state = CIS_LOCKED;
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_lock);
+
+/**
+ * Release locks taken by io.
+ */
+void cl_io_unlock(const struct lu_env *env, struct cl_io *io)
+{
+        struct cl_lockset        *set;
+        struct cl_io_lock_link   *link;
+        struct cl_io_lock_link   *temp;
+        const struct cl_io_slice *scan;
+
+        LASSERT(cl_io_is_loopable(io));
+        LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED);
+        LINVRNT(cl_io_invariant(io));
+
+        ENTRY;
+        set = &io->ci_lockset;
+
+        list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage)
+                cl_lock_link_fini(env, io, link);
+
+        list_for_each_entry_safe(link, temp, &set->cls_curr, cill_linkage)
+                cl_lock_link_fini(env, io, link);
+
+        list_for_each_entry_safe(link, temp, &set->cls_done, cill_linkage) {
+                cl_unuse(env, link->cill_lock);
+                cl_lock_link_fini(env, io, link);
+        }
+        cl_io_for_each_reverse(scan, io) {
+                if (scan->cis_iop->op[io->ci_type].cio_unlock != NULL)
+                        scan->cis_iop->op[io->ci_type].cio_unlock(env, scan);
+        }
+        io->ci_state = CIS_UNLOCKED;
+        LASSERT(cl_env_info(env)->clt_nr_locks_acquired == 0);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_io_unlock);
+
+/**
+ * Prepares next iteration of io.
+ *
+ * Calls cl_io_operations::cio_iter_init() top-to-bottom. This exists to give
+ * layers a chance to modify io parameters, e.g., so that lov can restrict io
+ * to a single stripe.
+ */
+int cl_io_iter_init(const struct lu_env *env, struct cl_io *io)
+{
+        const struct cl_io_slice *scan;
+        int result;
+
+        LINVRNT(cl_io_is_loopable(io));
+        LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED);
+        LINVRNT(cl_io_invariant(io));
+
+        ENTRY;
+        result = 0;
+        cl_io_for_each(scan, io) {
+                if (scan->cis_iop->op[io->ci_type].cio_iter_init == NULL)
+                        continue;
+                result = scan->cis_iop->op[io->ci_type].cio_iter_init(env,
+                                                                      scan);
+                if (result != 0)
+                        break;
+        }
+        if (result == 0)
+                io->ci_state = CIS_IT_STARTED;
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_iter_init);
+
+/**
+ * Finalizes io iteration.
+ *
+ * Calls cl_io_operations::cio_iter_fini() bottom-to-top.
+ */
+void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io)
+{
+        const struct cl_io_slice *scan;
+
+        LINVRNT(cl_io_is_loopable(io));
+        LINVRNT(io->ci_state == CIS_UNLOCKED);
+        LINVRNT(cl_io_invariant(io));
+
+        ENTRY;
+        cl_io_for_each_reverse(scan, io) {
+                if (scan->cis_iop->op[io->ci_type].cio_iter_fini != NULL)
+                        scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan);
+        }
+        io->ci_state = CIS_IT_ENDED;
+        EXIT;
+}
+EXPORT_SYMBOL(cl_io_iter_fini);
+
+/**
+ * Moves read or write io forward by \a nob bytes.
+ *
+ * The position of the io grows and its remaining byte count shrinks by \a nob,
+ * after which every layer defining cl_io_operations::cio_advance() is told
+ * about the progress, in reverse (bottom-to-top) layer order.
+ */
+void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob)
+{
+        const struct cl_io_slice *ios;
+
+        LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE ||
+                nob == 0);
+        LINVRNT(cl_io_is_loopable(io));
+        LINVRNT(cl_io_invariant(io));
+
+        ENTRY;
+
+        io->u.ci_rw.crw_pos   += nob;
+        io->u.ci_rw.crw_count -= nob;
+
+        /* Propagate the new position to the layers. */
+        cl_io_for_each_reverse(ios, io) {
+                const struct cl_io_operations *ops = ios->cis_iop;
+
+                if (ops->op[io->ci_type].cio_advance == NULL)
+                        continue;
+                ops->op[io->ci_type].cio_advance(env, ios, nob);
+        }
+        EXIT;
+}
+EXPORT_SYMBOL(cl_io_rw_advance);
+
+/**
+ * Adds a lock to a lockset.
+ *
+ * \retval +1 cl_lockset_match() found a match for link->cill_descr in the
+ *            lockset of \a io; \a link is not queued in this case
+ * \retval  0 \a link was put on the cls_todo list of the lockset
+ */
+int cl_io_lock_add(const struct lu_env *env, struct cl_io *io,
+                   struct cl_io_lock_link *link)
+{
+        ENTRY;
+        if (cl_lockset_match(&io->ci_lockset, &link->cill_descr, 1)) {
+                RETURN(+1);
+        }
+
+        list_add(&link->cill_linkage, &io->ci_lockset.cls_todo);
+        RETURN(0);
+}
+EXPORT_SYMBOL(cl_io_lock_add);
+
+static void cl_free_io_lock_link(const struct lu_env *env,
+                                 struct cl_io_lock_link *link)
+{
+        OBD_FREE_PTR(link); /* installed as cill_fini by
+                             * cl_io_lock_alloc_add(); releases the link
+                             * itself, \a env is not used */
+}
+
+/**
+ * Allocates new lock link, and uses it to add a lock to a lockset.
+ *
+ * The link gets a copy of \a descr and cl_free_io_lock_link() as its
+ * cill_fini method. When cl_io_lock_add() returns non-zero (lock match), the
+ * freshly allocated link is released again through that method.
+ *
+ * \retval -ENOMEM the link could not be allocated
+ * \retval other   whatever cl_io_lock_add() returned
+ */
+int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
+                         struct cl_lock_descr *descr)
+{
+        struct cl_io_lock_link *link;
+        int rc;
+
+        ENTRY;
+        OBD_ALLOC_PTR(link);
+        if (link == NULL) {
+                RETURN(-ENOMEM);
+        }
+
+        link->cill_descr = *descr;
+        link->cill_fini  = cl_free_io_lock_link;
+        rc = cl_io_lock_add(env, io, link);
+        if (rc != 0) /* lock match */
+                link->cill_fini(env, link);
+
+        RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_lock_alloc_add);
+
+/**
+ * Starts io by calling cl_io_operations::cio_start() top-to-bottom.
+ *
+ * The io is moved into CIS_IO_GOING state first. The walk over the layers
+ * stops at the first non-zero result.
+ *
+ * \retval 0   no layer returned a negative value
+ * \retval -ve error returned by the layer that stopped the walk
+ */
+int cl_io_start(const struct lu_env *env, struct cl_io *io)
+{
+        const struct cl_io_slice *ios;
+        int rc = 0;
+
+        LINVRNT(cl_io_is_loopable(io));
+        LINVRNT(io->ci_state == CIS_LOCKED);
+        LINVRNT(cl_io_invariant(io));
+        ENTRY;
+
+        io->ci_state = CIS_IO_GOING;
+        cl_io_for_each(ios, io) {
+                const struct cl_io_operations *ops = ios->cis_iop;
+
+                if (ops->op[io->ci_type].cio_start != NULL) {
+                        rc = ops->op[io->ci_type].cio_start(env, ios);
+                        if (rc != 0)
+                                break;
+                }
+        }
+        /* Positive results are not errors: collapse them to 0. */
+        RETURN(rc < 0 ? rc : 0);
+}
+EXPORT_SYMBOL(cl_io_start);
+
+/**
+ * Waits until the current io iteration is finished by calling
+ * cl_io_operations::cio_end() bottom-to-top, then moves the io into
+ * CIS_IO_FINISHED state.
+ */
+void cl_io_end(const struct lu_env *env, struct cl_io *io)
+{
+        const struct cl_io_slice *ios;
+
+        LINVRNT(cl_io_is_loopable(io));
+        LINVRNT(io->ci_state == CIS_IO_GOING);
+        LINVRNT(cl_io_invariant(io));
+        ENTRY;
+
+        cl_io_for_each_reverse(ios, io) {
+                const struct cl_io_operations *ops = ios->cis_iop;
+
+                /* TODO: error handling. */
+                if (ops->op[io->ci_type].cio_end == NULL)
+                        continue;
+                ops->op[io->ci_type].cio_end(env, ios);
+        }
+        io->ci_state = CIS_IO_FINISHED;
+        EXIT;
+}
+EXPORT_SYMBOL(cl_io_end);
+
+/*
+ * Looks up, with cl_page_at(), the slice of \a page for the device type of
+ * the object that io slice \a ios is attached to. The slice is expected to
+ * exist.
+ */
+static const struct cl_page_slice *
+cl_io_slice_page(const struct cl_io_slice *ios, struct cl_page *page)
+{
+        const struct cl_page_slice *pslice;
+
+        pslice = cl_page_at(page, ios->cis_obj->co_lu.lo_dev->ld_type);
+        LINVRNT(pslice != NULL);
+        return pslice;
+}
+
+/**
+ * True iff \a page is within \a io range.
+ *
+ * For read and write io the byte extent of the page has to overlap the
+ * [pos, pos + count) extent of the io; for fault io the page index has to be
+ * the faulted index. Any other io type is a bug.
+ */
+static int cl_page_in_io(const struct cl_page *page, const struct cl_io *io)
+{
+        pgoff_t index = page->cp_index;
+        loff_t  first;
+        loff_t  past;
+        int     inside;
+
+        switch (io->ci_type) {
+        case CIT_READ:
+        case CIT_WRITE:
+                /* [first, past) is the byte extent covered by the page. */
+                first  = cl_offset(page->cp_obj, index);
+                past   = cl_offset(page->cp_obj, index + 1);
+                inside = io->u.ci_rw.crw_pos < past &&
+                        first < io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count;
+                break;
+        case CIT_FAULT:
+                inside = io->u.ci_fault.ft_index == index;
+                break;
+        default:
+                LBUG();
+        }
+        return inside;
+}
+
+/**
+ * Called by read io, when page has to be read from the server.
+ *
+ * Initializes io->ci_queue, calls cio_read_page() of every layer that defines
+ * it (top-to-bottom, stopping at the first non-zero result) and, if all of
+ * them succeeded, submits the queue with cl_io_submit_rw(CRT_READ). Pages
+ * left in the incoming list are disowned and the queue is finalized before
+ * returning.
+ *
+ * \see cl_io_operations::cio_read_page()
+ */
+int cl_io_read_page(const struct lu_env *env, struct cl_io *io,
+                    struct cl_page *page)
+{
+        const struct cl_io_slice *ios;
+        struct cl_2queue         *q2 = &io->ci_queue;
+        int                       rc = 0;
+
+        LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT);
+        LINVRNT(cl_page_is_owned(page, io));
+        LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+        LINVRNT(cl_page_in_io(page, io));
+        LINVRNT(cl_io_invariant(io));
+        ENTRY;
+
+        cl_2queue_init(q2);
+        /*
+         * ->cio_read_page() methods called in the loop below are supposed to
+         * never block waiting for network (the only subtle point is the
+         * creation of new pages for read-ahead that might result in cache
+         * shrinking, but currently only clean pages are shrunk and this
+         * requires no network io).
+         *
+         * Should this ever start blocking, a retry loop would be needed for
+         * "parallel io" (see CLO_REPEAT loops in cl_lock.c).
+         */
+        cl_io_for_each(ios, io) {
+                const struct cl_page_slice *pslice;
+
+                if (ios->cis_iop->cio_read_page == NULL)
+                        continue;
+
+                pslice = cl_io_slice_page(ios, page);
+                LINVRNT(pslice != NULL);
+                rc = ios->cis_iop->cio_read_page(env, ios, pslice);
+                if (rc != 0)
+                        break;
+        }
+        if (rc == 0)
+                rc = cl_io_submit_rw(env, io, CRT_READ, q2);
+        /*
+         * Unlock unsent pages in case of error.
+         */
+        cl_page_list_disown(env, io, &q2->c2_qin);
+        cl_2queue_fini(env, q2);
+        RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_read_page);
+
+/**
+ * Called by write io to prepare page to receive data from user buffer.
+ *
+ * Calls cio_prepare_write() of every layer that defines it, walking the
+ * layers in reverse order and stopping at the first non-zero result, which
+ * is returned.
+ *
+ * \see cl_io_operations::cio_prepare_write()
+ */
+int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
+                        struct cl_page *page, unsigned from, unsigned to)
+{
+        const struct cl_io_slice *ios;
+        int rc = 0;
+
+        LINVRNT(io->ci_type == CIT_WRITE);
+        LINVRNT(cl_page_is_owned(page, io));
+        LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+        LINVRNT(cl_io_invariant(io));
+        LASSERT(cl_page_in_io(page, io));
+        ENTRY;
+
+        cl_io_for_each_reverse(ios, io) {
+                const struct cl_page_slice *pslice;
+
+                if (ios->cis_iop->cio_prepare_write == NULL)
+                        continue;
+
+                pslice = cl_io_slice_page(ios, page);
+                rc = ios->cis_iop->cio_prepare_write(env, ios, pslice,
+                                                     from, to);
+                if (rc != 0)
+                        break;
+        }
+        RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_prepare_write);
+
+/**
+ * Called by write io after user data were copied into a page.
+ *
+ * Calls cio_commit_write() of every layer that defines it, top-to-bottom,
+ * stopping at the first non-zero result, which is returned. The result is
+ * expected to be zero or negative.
+ *
+ * \see cl_io_operations::cio_commit_write()
+ */
+int cl_io_commit_write(const struct lu_env *env, struct cl_io *io,
+                       struct cl_page *page, unsigned from, unsigned to)
+{
+        const struct cl_io_slice *ios;
+        int rc = 0;
+
+        LINVRNT(io->ci_type == CIT_WRITE);
+        LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+        LINVRNT(cl_io_invariant(io));
+        /*
+         * XXX Uh... not nice. Top level cl_io_commit_write() call (vvp->lov)
+         * already called cl_page_cache_add(), moving page into CPS_CACHED
+         * state. Better (and more general) way of dealing with such situation
+         * is needed.
+         */
+        LASSERT(cl_page_is_owned(page, io) || page->cp_parent != NULL);
+        LASSERT(cl_page_in_io(page, io));
+        ENTRY;
+
+        cl_io_for_each(ios, io) {
+                const struct cl_page_slice *pslice;
+
+                if (ios->cis_iop->cio_commit_write == NULL)
+                        continue;
+
+                pslice = cl_io_slice_page(ios, page);
+                rc = ios->cis_iop->cio_commit_write(env, ios, pslice,
+                                                    from, to);
+                if (rc != 0)
+                        break;
+        }
+        LINVRNT(rc <= 0);
+        RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_commit_write);
+
+/**
+ * Submits a list of pages for immediate io.
+ *
+ * After the function returns, the submitted pages have been moved to the
+ * queue->c2_qout list, while queue->c2_qin holds both the pages that did not
+ * need to be submitted and the pages that could not be submitted.
+ *
+ * \returns 0 if at least one page was submitted, error code otherwise.
+ * \see cl_io_operations::cio_submit()
+ */
+int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io,
+                    enum cl_req_type crt, struct cl_2queue *queue)
+{
+        const struct cl_io_slice *ios;
+        int rc = 0;
+
+        LINVRNT(crt < ARRAY_SIZE(ios->cis_iop->req_op));
+        ENTRY;
+
+        cl_io_for_each(ios, io) {
+                const struct cl_io_operations *ops = ios->cis_iop;
+
+                if (ops->req_op[crt].cio_submit != NULL) {
+                        rc = ops->req_op[crt].cio_submit(env, ios, crt, queue);
+                        if (rc != 0)
+                                break;
+                }
+        }
+        /*
+         * If ->cio_submit() failed, no pages were sent.
+         */
+        LASSERT(ergo(rc != 0, list_empty(&queue->c2_qout.pl_pages)));
+        RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_submit_rw);
+
+/**
+ * Cancel an IO which has been submitted by cl_io_submit_rw.
+ *
+ * cl_page_cancel() is called for every page of \a queue, whether or not an
+ * earlier page failed.
+ *
+ * \returns the first non-zero value returned by cl_page_cancel(), 0 if all
+ * calls returned 0.
+ */
+int cl_io_cancel(const struct lu_env *env, struct cl_io *io,
+                 struct cl_page_list *queue)
+{
+        struct cl_page *pg;
+        int first_err = 0;
+
+        CERROR("Canceling ongoing page trasmission\n");
+        cl_page_list_for_each(pg, queue) {
+                int rc;
+
+                LINVRNT(cl_page_in_io(pg, io));
+                rc = cl_page_cancel(env, pg);
+                /* Remember only the first failure. */
+                if (first_err == 0)
+                        first_err = rc;
+        }
+        return first_err;
+}
+EXPORT_SYMBOL(cl_io_cancel);
+
+/**
+ * Main io loop.
+ *
+ * Pumps io through iterations calling
+ *
+ *    - cl_io_iter_init()
+ *
+ *    - cl_io_lock()
+ *
+ *    - cl_io_start()
+ *
+ *    - cl_io_end()
+ *
+ *    - cl_io_unlock()
+ *
+ *    - cl_io_iter_fini()
+ *
+ * repeatedly until there is no more io to do.
+ *
+ * cl_io::ci_continue is cleared at the start of every iteration; another
+ * iteration is made only if the current one ended with a zero result and the
+ * flag is found set at its end.
+ *
+ * cl_io_iter_fini() is called on every pass, including the ones where
+ * cl_io_iter_init() or cl_io_lock() failed. Once cl_io_lock() succeeded,
+ * cl_io_end(), cl_io_unlock() and cl_io_rw_advance() are called whatever
+ * cl_io_start() returned.
+ *
+ * \retval 0   the last iteration ended with a non-negative result
+ * \retval -ve error from the last iteration
+ */
+int cl_io_loop(const struct lu_env *env, struct cl_io *io)
+{
+        int result   = 0;
+
+        LINVRNT(cl_io_is_loopable(io));
+        ENTRY;
+
+        do {
+                size_t nob;
+
+                io->ci_continue = 0;
+                result = cl_io_iter_init(env, io);
+                if (result == 0) {
+                        nob    = io->ci_nob; /* ci_nob before this pass */
+                        result = cl_io_lock(env, io);
+                        if (result == 0) {
+                                /*
+                                 * Notify layers that locks have been taken,
+                                 * and do actual i/o.
+                                 *
+                                 *   - llite: kms, short read;
+                                 *   - llite: generic_file_read();
+                                 */
+                                result = cl_io_start(env, io);
+                                /*
+                                 * Send any remaining pending
+                                 * io, etc.
+                                 *
+                                 *   - llite: ll_rw_stats_tally.
+                                 */
+                                cl_io_end(env, io);
+                                cl_io_unlock(env, io);
+                                /* advance by the growth of ci_nob */
+                                cl_io_rw_advance(env, io, io->ci_nob - nob);
+                        }
+                }
+                cl_io_iter_fini(env, io);
+        } while (result == 0 && io->ci_continue);
+        RETURN(result < 0 ? result : 0);
+}
+EXPORT_SYMBOL(cl_io_loop);
+
+/**
+ * Adds io slice to the cl_io.
+ *
+ * This is called by cl_object_operations::coo_io_init() methods to add a
+ * per-layer state to the io. New state is added at the end of
+ * cl_io::ci_layers list, that is, it is at the bottom of the stack.
+ *
+ * The linkage of \a slice has to be either zero-filled or an empty list head
+ * on entry. Besides linking, the slice is pointed at \a io, \a obj and
+ * \a ops.
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add()
+ */
+void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
+                     struct cl_object *obj,
+                     const struct cl_io_operations *ops)
+{
+        LASSERT((slice->cis_linkage.prev == NULL &&
+                 slice->cis_linkage.next == NULL) ||
+                list_empty(&slice->cis_linkage));
+        ENTRY;
+
+        list_add_tail(&slice->cis_linkage, &io->ci_layers);
+        slice->cis_iop = ops;
+        slice->cis_obj = obj;
+        slice->cis_io  = io;
+        EXIT;
+}
+EXPORT_SYMBOL(cl_io_slice_add);
+
+
+/**
+ * Initializes page list: no pages, empty list head, owned by the calling
+ * thread (as reported by cfs_current()).
+ */
+void cl_page_list_init(struct cl_page_list *plist)
+{
+        ENTRY;
+        CFS_INIT_LIST_HEAD(&plist->pl_pages);
+        plist->pl_nr    = 0;
+        plist->pl_owner = cfs_current();
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_init);
+
+/**
+ * Adds a page to a page list.
+ *
+ * The page must have an owner and must not be on another list (its cp_batch
+ * linkage has to be empty). cl_page::cp_mutex is taken here and is still
+ * held on return; cl_page_list_del() and cl_page_list_disown() are the
+ * places in this file that release it. The list also takes a reference on
+ * the page (cl_page_get()) and records it in the page's lu_ref under the
+ * "queue" scope.
+ */
+void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page)
+{
+        ENTRY;
+        /* it would be better to check that page is owned by "current" io, but
+         * it is not passed here. */
+        LASSERT(page->cp_owner != NULL);
+        LINVRNT(plist->pl_owner == cfs_current());
+
+        lockdep_off(); /* lock validator is off around this acquisition */
+        mutex_lock(&page->cp_mutex);
+        lockdep_on();
+        LASSERT(list_empty(&page->cp_batch));
+        list_add_tail(&page->cp_batch, &plist->pl_pages);
+        ++plist->pl_nr;
+        page->cp_queue_ref = lu_ref_add(&page->cp_reference, "queue", plist);
+        cl_page_get(page);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_add);
+
+/**
+ * Removes a page from a page list.
+ *
+ * Undoes cl_page_list_add(): unlinks the page, releases cl_page::cp_mutex,
+ * decrements the list counter, drops the "queue" lu_ref entry remembered in
+ * cl_page::cp_queue_ref and puts the page reference held by the list.
+ */
+void cl_page_list_del(const struct lu_env *env,
+                      struct cl_page_list *plist, struct cl_page *page)
+{
+        LASSERT(plist->pl_nr > 0);
+        LINVRNT(plist->pl_owner == cfs_current());
+
+        ENTRY;
+        list_del_init(&page->cp_batch);
+        lockdep_off(); /* lock validator is off around this release */
+        mutex_unlock(&page->cp_mutex);
+        lockdep_on();
+        --plist->pl_nr;
+        lu_ref_del_at(&page->cp_reference, page->cp_queue_ref, "queue", plist);
+        cl_page_put(env, page);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_del);
+
+/**
+ * Moves a page from one page list to another.
+ *
+ * The page goes to the tail of \a dst, the counters of both lists are
+ * adjusted and the "queue" lu_ref entry of the page is re-targeted from
+ * \a src to \a dst. Both lists have to belong to the calling thread.
+ */
+void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src,
+                       struct cl_page *page)
+{
+        LASSERT(src->pl_nr > 0);
+        LINVRNT(dst->pl_owner == cfs_current());
+        LINVRNT(src->pl_owner == cfs_current());
+
+        ENTRY;
+        list_move_tail(&page->cp_batch, &dst->pl_pages);
+        src->pl_nr--;
+        dst->pl_nr++;
+        lu_ref_set_at(&page->cp_reference, page->cp_queue_ref,
+                      "queue", src, dst);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_move);
+
+/**
+ * Splices one cl_page_list into another, the way list heads are spliced:
+ * every page of \a list is moved, in order, to the tail of \a head.
+ */
+void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head)
+{
+        struct cl_page *pg;
+        struct cl_page *next;
+
+        LINVRNT(list->pl_owner == cfs_current());
+        LINVRNT(head->pl_owner == cfs_current());
+
+        ENTRY;
+        cl_page_list_for_each_safe(pg, next, list) {
+                cl_page_list_move(head, list, pg);
+        }
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_splice);
+
+void cl_page_disown0(const struct lu_env *env,
+                     struct cl_io *io, struct cl_page *pg); /* forward
+                     * declaration for cl_page_list_disown() below */
+
+/**
+ * Disowns pages in a queue.
+ *
+ * Every page is taken off \a plist exactly as cl_page_list_del() does
+ * (unlink, release cl_page::cp_mutex, decrement the counter), then disowned
+ * with cl_page_disown0(), and finally the list's "queue" lu_ref entry and
+ * page reference are dropped. Unlike cl_page_list_del(), the lu_ref entry is
+ * removed with lu_ref_del() rather than lu_ref_del_at().
+ */
+void cl_page_list_disown(const struct lu_env *env,
+                         struct cl_io *io, struct cl_page_list *plist)
+{
+        struct cl_page *page;
+        struct cl_page *temp;
+
+        LINVRNT(plist->pl_owner == cfs_current());
+
+        ENTRY;
+        cl_page_list_for_each_safe(page, temp, plist) {
+                LASSERT(plist->pl_nr > 0);
+
+                list_del_init(&page->cp_batch);
+                lockdep_off();
+                mutex_unlock(&page->cp_mutex);
+                lockdep_on();
+                --plist->pl_nr;
+                /*
+                 * cl_page_disown0 rather than usual cl_page_disown() is used,
+                 * because pages are possibly in CPS_FREEING state already due
+                 * to the call to cl_page_list_discard().
+                 */
+                /*
+                 * XXX cl_page_disown0() will fail if page is not locked.
+                 */
+                cl_page_disown0(env, io, page);
+                lu_ref_del(&page->cp_reference, "queue", plist);
+                cl_page_put(env, page);
+        }
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_disown);
+
+/**
+ * Releases pages from queue: every page is removed with cl_page_list_del(),
+ * after which the list must be empty.
+ */
+void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist)
+{
+        struct cl_page *pg;
+        struct cl_page *next;
+
+        LINVRNT(plist->pl_owner == cfs_current());
+
+        ENTRY;
+        cl_page_list_for_each_safe(pg, next, plist) {
+                cl_page_list_del(env, plist, pg);
+        }
+        LASSERT(plist->pl_nr == 0);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_fini);
+
+/**
+ * Owns all pages in a queue.
+ *
+ * Calls cl_page_own() on every page of \a plist.
+ *
+ * \returns the first non-zero cl_page::cp_error met while walking the list,
+ * 0 if there was none.
+ *
+ * NOTE(review): the value returned by cl_page_own() is stored in \a rc but
+ * never examined, so a failure to own a page is not reported to the caller
+ * unless cp_error happens to be set too -- confirm that this is intended.
+ */
+int cl_page_list_own(const struct lu_env *env,
+                     struct cl_io *io, struct cl_page_list *plist)
+{
+        struct cl_page *page;
+        int result;
+        int rc;
+
+        LINVRNT(plist->pl_owner == cfs_current());
+
+        ENTRY;
+        result = 0;
+        cl_page_list_for_each(page, plist) {
+                rc = cl_page_own(env, io, page);
+                result = result ?: page->cp_error;
+        }
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_list_own);
+
+/**
+ * Assumes all pages in a queue, i.e., calls cl_page_assume() on behalf of
+ * \a io for every page of \a plist.
+ */
+void cl_page_list_assume(const struct lu_env *env,
+                         struct cl_io *io, struct cl_page_list *plist)
+{
+        struct cl_page *pg;
+
+        LINVRNT(plist->pl_owner == cfs_current());
+
+        cl_page_list_for_each(pg, plist) {
+                cl_page_assume(env, io, pg);
+        }
+}
+EXPORT_SYMBOL(cl_page_list_assume);
+
+/**
+ * Discards all pages in a queue, i.e., calls cl_page_discard() on behalf of
+ * \a io for every page of \a plist. Pages stay on the list.
+ */
+void cl_page_list_discard(const struct lu_env *env, struct cl_io *io,
+                          struct cl_page_list *plist)
+{
+        struct cl_page *pg;
+
+        LINVRNT(plist->pl_owner == cfs_current());
+        ENTRY;
+        cl_page_list_for_each(pg, plist) {
+                cl_page_discard(env, io, pg);
+        }
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_discard);
+
+/**
+ * Unmaps all pages in a queue from user virtual memory.
+ *
+ * Pages are processed in list order with cl_page_unmap(); processing stops
+ * at the first page for which it returns non-zero.
+ *
+ * \returns 0 if every page was unmapped, otherwise the value returned by the
+ * failed cl_page_unmap() call.
+ */
+int cl_page_list_unmap(const struct lu_env *env, struct cl_io *io,
+                        struct cl_page_list *plist)
+{
+        struct cl_page *pg;
+        int rc = 0;
+
+        LINVRNT(plist->pl_owner == cfs_current());
+        ENTRY;
+        cl_page_list_for_each(pg, plist) {
+                rc = cl_page_unmap(env, io, pg);
+                if (rc != 0)
+                        break;
+        }
+        RETURN(rc);
+}
+EXPORT_SYMBOL(cl_page_list_unmap);
+
+/**
+ * Initialize dual page queue: both the incoming (c2_qin) and the outgoing
+ * (c2_qout) page lists are initialized as empty.
+ */
+void cl_2queue_init(struct cl_2queue *queue)
+{
+        struct cl_page_list *qin  = &queue->c2_qin;
+        struct cl_page_list *qout = &queue->c2_qout;
+
+        ENTRY;
+        cl_page_list_init(qin);
+        cl_page_list_init(qout);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_init);
+
+/**
+ * Add a page to the incoming page list of 2-queue.
+ *
+ * Thin wrapper around cl_page_list_add() on queue->c2_qin, so the
+ * requirements and side effects of that function apply.
+ */
+void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page)
+{
+        ENTRY;
+        cl_page_list_add(&queue->c2_qin, page);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_add);
+
+/**
+ * Disown pages in both lists of a 2-queue: the incoming list is processed
+ * first, the outgoing list second.
+ */
+void cl_2queue_disown(const struct lu_env *env,
+                      struct cl_io *io, struct cl_2queue *queue)
+{
+        struct cl_page_list *qin  = &queue->c2_qin;
+        struct cl_page_list *qout = &queue->c2_qout;
+
+        ENTRY;
+        cl_page_list_disown(env, io, qin);
+        cl_page_list_disown(env, io, qout);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_disown);
+
+/**
+ * Discard (truncate) pages in both lists of a 2-queue: the incoming list is
+ * processed first, the outgoing list second.
+ */
+void cl_2queue_discard(const struct lu_env *env,
+                       struct cl_io *io, struct cl_2queue *queue)
+{
+        struct cl_page_list *qin  = &queue->c2_qin;
+        struct cl_page_list *qout = &queue->c2_qout;
+
+        ENTRY;
+        cl_page_list_discard(env, io, qin);
+        cl_page_list_discard(env, io, qout);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_discard);
+
+/**
+ * Assume ownership of the pages in both lists of a cl_2queue: the incoming
+ * list is processed first, the outgoing list second.
+ */
+void cl_2queue_assume(const struct lu_env *env,
+                      struct cl_io *io, struct cl_2queue *queue)
+{
+        struct cl_page_list *qin  = &queue->c2_qin;
+        struct cl_page_list *qout = &queue->c2_qout;
+
+        cl_page_list_assume(env, io, qin);
+        cl_page_list_assume(env, io, qout);
+}
+EXPORT_SYMBOL(cl_2queue_assume);
+
+/**
+ * Finalize both page lists of a 2-queue: the outgoing list is emptied first,
+ * the incoming list second.
+ */
+void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue)
+{
+        struct cl_page_list *qout = &queue->c2_qout;
+        struct cl_page_list *qin  = &queue->c2_qin;
+
+        ENTRY;
+        cl_page_list_fini(env, qout);
+        cl_page_list_fini(env, qin);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_fini);
+
+/**
+ * Initialize a 2-queue to contain \a page in its incoming page list.
+ *
+ * Equivalent to cl_2queue_init() followed by cl_2queue_add(); the outgoing
+ * list is left empty.
+ */
+void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page)
+{
+        ENTRY;
+        cl_2queue_init(queue);
+        cl_2queue_add(queue, page);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_init_page);
+
+/**
+ * Returns top-level io, that is, the io reached from \a io by following
+ * cl_io::ci_parent links until one without a parent is found.
+ *
+ * \see cl_object_top(), cl_page_top().
+ */
+struct cl_io *cl_io_top(struct cl_io *io)
+{
+        struct cl_io *top;
+
+        ENTRY;
+        for (top = io; top->ci_parent != NULL; top = top->ci_parent)
+                ; /* climb */
+        RETURN(top);
+}
+EXPORT_SYMBOL(cl_io_top);
+
+/**
+ * Intended to print a human readable representation of \a io through
+ * \a printer.
+ *
+ * Not implemented yet: the body is empty, so nothing is printed and none of
+ * the arguments is used.
+ */
+void cl_io_print(const struct lu_env *env, void *cookie,
+                 lu_printer_t printer, const struct cl_io *io)
+{
+}
+
+/**
+ * Adds request slice to the compound request.
+ *
+ * This is called by cl_device_operations::cdo_req_init() methods to add a
+ * per-layer state to the request. New state is added at the end of
+ * cl_req::crq_layers list, that is, it is at the bottom of the stack.
+ *
+ * Besides linking, the slice is pointed at \a req, \a dev and \a ops.
+ *
+ * \see cl_lock_slice_add(), cl_page_slice_add(), cl_io_slice_add()
+ */
+void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
+                      struct cl_device *dev,
+                      const struct cl_req_operations *ops)
+{
+        ENTRY;
+        list_add_tail(&slice->crs_linkage, &req->crq_layers);
+        slice->crs_req = req;
+        slice->crs_dev = dev;
+        slice->crs_ops = ops;
+        EXIT;
+}
+EXPORT_SYMBOL(cl_req_slice_add);
+
+/*
+ * Frees \a req. The request must have no pages and no layers left. Every
+ * object still recorded in the crq_o[] array loses its "cl_req" lu_ref entry
+ * and its reference, then the array and the request itself are freed.
+ */
+static void cl_req_free(const struct lu_env *env, struct cl_req *req)
+{
+        unsigned idx;
+
+        LASSERT(list_empty(&req->crq_pages));
+        LASSERT(req->crq_nrpages == 0);
+        LINVRNT(list_empty(&req->crq_layers));
+        LINVRNT(equi(req->crq_nrobjs > 0, req->crq_o != NULL));
+        ENTRY;
+
+        if (req->crq_o != NULL) {
+                for (idx = 0; idx < req->crq_nrobjs; ++idx) {
+                        struct cl_req_obj *rqo = &req->crq_o[idx];
+                        struct cl_object  *obj = rqo->ro_obj;
+
+                        /* Unused slots have nothing to release. */
+                        if (obj == NULL)
+                                continue;
+                        lu_object_ref_del_at(&obj->co_lu, rqo->ro_obj_ref,
+                                             "cl_req", req);
+                        cl_object_put(env, obj);
+                }
+                OBD_FREE(req->crq_o, req->crq_nrobjs * sizeof req->crq_o[0]);
+        }
+        OBD_FREE_PTR(req);
+        EXIT;
+}
+
+/*
+ * Lets devices attach their state to \a req.
+ *
+ * Starting from the top page of \a page and following cl_page::cp_child
+ * links, calls cl_device_operations::cdo_req_init() (where defined) for the
+ * device behind every layer of every page visited. Stops at the first
+ * non-zero result, which is returned.
+ */
+static int cl_req_init(const struct lu_env *env, struct cl_req *req,
+                       struct cl_page *page)
+{
+        struct cl_page_slice *pslice;
+        struct cl_device     *cdev;
+        int rc = 0;
+
+        ENTRY;
+        page = cl_page_top(page);
+        do {
+                list_for_each_entry(pslice, &page->cp_layers, cpl_linkage) {
+                        cdev = lu2cl_dev(pslice->cpl_obj->co_lu.lo_dev);
+                        if (cdev->cd_ops->cdo_req_init == NULL)
+                                continue;
+                        rc = cdev->cd_ops->cdo_req_init(env, cdev, req);
+                        if (rc != 0)
+                                break;
+                }
+                page = page->cp_child;
+        } while (page != NULL && rc == 0);
+        RETURN(rc);
+}
+
+/**
+ * Invokes per-request transfer completion call-backs
+ * (cl_req_operations::cro_completion()) bottom-to-top.
+ *
+ * Every slice is unlinked from the request before its call-back (if any) is
+ * invoked with \a rc. When no slices are left the request is freed, so
+ * \a req must not be used after this call.
+ */
+void cl_req_completion(const struct lu_env *env, struct cl_req *req, int rc)
+{
+        struct cl_req_slice *last;
+
+        ENTRY;
+        /*
+         * for the lack of list_for_each_entry_reverse_safe()...
+         */
+        for (;;) {
+                if (list_empty(&req->crq_layers))
+                        break;
+                last = list_entry(req->crq_layers.prev,
+                                  struct cl_req_slice, crs_linkage);
+                list_del_init(&last->crs_linkage);
+                if (last->crs_ops->cro_completion == NULL)
+                        continue;
+                last->crs_ops->cro_completion(env, last, rc);
+        }
+        cl_req_free(env, req);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_req_completion);
+
+/**
+ * Allocates new transfer request.
+ *
+ * The request gets room for \a nr_objects objects and is then passed to
+ * cl_req_init(), with \a page, so that layers can attach their slices.
+ *
+ * \returns the new request, or ERR_PTR(-ENOMEM) when either allocation
+ * fails, or ERR_PTR() of the error returned by cl_req_init(). On failure
+ * nothing is left allocated: a partially built request is torn down with
+ * cl_req_completion().
+ */
+struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
+                            enum cl_req_type crt, int nr_objects)
+{
+        struct cl_req *req;
+
+        LINVRNT(nr_objects > 0);
+        ENTRY;
+
+        OBD_ALLOC_PTR(req);
+        if (req != NULL) {
+                int result;
+
+                /*
+                 * Set up everything that cl_req_completion() and
+                 * cl_req_free() look at before the first point of failure:
+                 * the error path below walks crq_layers and asserts that
+                 * crq_pages is empty, which only works on initialized list
+                 * heads.
+                 */
+                req->crq_type = crt;
+                CFS_INIT_LIST_HEAD(&req->crq_pages);
+                CFS_INIT_LIST_HEAD(&req->crq_layers);
+
+                OBD_ALLOC(req->crq_o, nr_objects * sizeof req->crq_o[0]);
+                if (req->crq_o != NULL) {
+                        req->crq_nrobjs = nr_objects;
+                        result = cl_req_init(env, req, page);
+                } else
+                        result = -ENOMEM;
+                if (result != 0) {
+                        /* this also frees req */
+                        cl_req_completion(env, req, result);
+                        req = ERR_PTR(result);
+                }
+        } else
+                req = ERR_PTR(-ENOMEM);
+        RETURN(req);
+}
+EXPORT_SYMBOL(cl_req_alloc);
+
+/**
+ * Adds a page to a request.
+ *
+ * The top page of \a page is linked into cl_req::crq_pages and pointed at
+ * \a req. The top object of the page is then looked up in the crq_o[] array
+ * of the request; if it is not there yet, it is recorded in the first free
+ * slot, with a reference and a "cl_req" lu_ref entry taken on it.
+ *
+ * It is a bug (asserted) if the array has neither a matching nor a free
+ * slot, i.e., if the caller of cl_req_alloc() reserved too few objects.
+ */
+void cl_req_page_add(const struct lu_env *env,
+                     struct cl_req *req, struct cl_page *page)
+{
+        struct cl_object  *obj;
+        struct cl_req_obj *rqo;
+        int i;
+
+        ENTRY;
+        page = cl_page_top(page);
+
+        LINVRNT(cl_page_is_vmlocked(env, page));
+        LASSERT(list_empty(&page->cp_flight));
+        LASSERT(page->cp_req == NULL);
+
+        list_add_tail(&page->cp_flight, &req->crq_pages);
+        ++req->crq_nrpages;
+        page->cp_req = req;
+        obj = cl_object_top(page->cp_obj);
+        /*
+         * The scan is bounded by crq_nrobjs so that a full array without a
+         * match trips the assertion below instead of reading past the end of
+         * crq_o[].
+         */
+        for (i = 0, rqo = req->crq_o;
+             i < req->crq_nrobjs && obj != rqo->ro_obj; ++i, ++rqo) {
+                if (rqo->ro_obj == NULL) {
+                        /* First free slot: remember the object here. */
+                        rqo->ro_obj = obj;
+                        cl_object_get(obj);
+                        rqo->ro_obj_ref = lu_object_ref_add(&obj->co_lu,
+                                                            "cl_req", req);
+                        break;
+                }
+        }
+        LASSERT(i < req->crq_nrobjs);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_req_page_add);
+
+/**
+ * Removes a page from a request.
+ *
+ * Undoes the page part of cl_req_page_add(): the top page of \a page is
+ * unlinked from cl_req::crq_pages, the page counter of the request is
+ * decremented and cl_page::cp_req is reset. The object slot taken by
+ * cl_req_page_add() is not touched here.
+ */
+void cl_req_page_done(const struct lu_env *env, struct cl_page *page)
+{
+        struct cl_req *req;
+
+        ENTRY;
+        page = cl_page_top(page);
+        /*
+         * cl_req_page_add() stores the request in the top page, so it has to
+         * be fetched from the top page too, not from whatever page in the
+         * stack the caller happened to pass.
+         */
+        req = page->cp_req;
+
+        LINVRNT(cl_page_is_vmlocked(env, page));
+        LASSERT(req != NULL);
+        LASSERT(!list_empty(&page->cp_flight));
+        LASSERT(req->crq_nrpages > 0);
+
+        list_del_init(&page->cp_flight);
+        --req->crq_nrpages;
+        page->cp_req = NULL;
+        EXIT;
+}
+EXPORT_SYMBOL(cl_req_page_done);
+
+/**
+ * Notifies layers that request is about to depart by calling
+ * cl_req_operations::cro_prep() top-to-bottom.
+ *
+ * Every slot of the crq_o[] array has to be populated by now. The walk over
+ * the layers stops at the first non-zero result, which is returned.
+ */
+int cl_req_prep(const struct lu_env *env, struct cl_req *req)
+{
+        const struct cl_req_slice *rslice;
+        int idx;
+        int rc = 0;
+
+        ENTRY;
+        /*
+         * Check that the caller of cl_req_alloc() didn't lie about the number
+         * of objects.
+         */
+        for (idx = 0; idx < req->crq_nrobjs; ++idx)
+                LASSERT(req->crq_o[idx].ro_obj != NULL);
+
+        list_for_each_entry(rslice, &req->crq_layers, crs_linkage) {
+                if (rslice->crs_ops->cro_prep == NULL)
+                        continue;
+                rc = rslice->crs_ops->cro_prep(env, rslice);
+                if (rc != 0)
+                        break;
+        }
+        RETURN(rc);
+}
+EXPORT_SYMBOL(cl_req_prep);
+
+/**
+ * Fills in attributes that are passed to server together with transfer. Only
+ * attributes from \a flags may be touched. This can be called multiple times
+ * for the same request.
+ *
+ * For every object slot i of the request, cro_attr_set() of every layer
+ * that defines it is called with attr + i. The object handed to a layer is
+ * the one behind that layer's slice of the first page of the request.
+ */
+void cl_req_attr_set(const struct lu_env *env, struct cl_req *req,
+                     struct cl_req_attr *attr, obd_valid flags)
+{
+        const struct cl_req_slice *rslice;
+        struct cl_page            *model;
+        int idx;
+
+        LASSERT(!list_empty(&req->crq_pages));
+        ENTRY;
+
+        /* Any page will do as a model, so use the first one. */
+        model = list_entry(req->crq_pages.next, struct cl_page, cp_flight);
+
+        for (idx = 0; idx < req->crq_nrobjs; ++idx) {
+                list_for_each_entry(rslice, &req->crq_layers, crs_linkage) {
+                        const struct cl_page_slice *pslice;
+                        const struct cl_object     *obj;
+
+                        pslice = cl_page_at(model,
+                                        rslice->crs_dev->cd_lu_dev.ld_type);
+                        LASSERT(pslice != NULL);
+                        obj = pslice->cpl_obj;
+                        if (rslice->crs_ops->cro_attr_set == NULL)
+                                continue;
+                        rslice->crs_ops->cro_attr_set(env, rslice, obj,
+                                                      attr + idx, flags);
+                }
+        }
+        EXIT;
+}
+EXPORT_SYMBOL(cl_req_attr_set);
+
+/* XXX complete(), init_completion(), and wait_for_completion(), until they are
+ * implemented in libcfs. */
+#ifdef __KERNEL__
+# include <linux/sched.h>
+#else /* __KERNEL__ */
+# include <liblustre.h>
+#endif
+
+/**
+ * Initialize synchronous io wait anchor, for transfer of \a nrpages pages:
+ * no error recorded, \a nrpages notifications outstanding, completion not
+ * signalled.
+ */
+void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages)
+{
+        ENTRY;
+        anchor->csi_sync_rc = 0;
+        atomic_set(&anchor->csi_sync_nr, nrpages);
+        init_completion(&anchor->csi_sync_completion);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_sync_io_init);
+
+/**
+ * Wait until all transfer completes. Transfer completion routine has to call
+ * cl_sync_io_note() for every page.
+ *
+ * The wait is interruptible. If it is interrupted, the transfer of the pages
+ * in \a queue is cancelled with cl_io_cancel(); should that fail with a
+ * negative result, the function falls back to an uninterruptible wait.
+ *
+ * On return the page counter of \a anchor must be zero (asserted), the pages
+ * of \a queue have been assumed with cl_page_list_assume(), and \a anchor
+ * has been poisoned, so its contents are meaningless afterwards.
+ *
+ * \returns the negative value returned by the interrupted wait, otherwise
+ * anchor->csi_sync_rc, i.e., the first error reported through
+ * cl_sync_io_note() or 0.
+ */
+int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
+                    struct cl_page_list *queue, struct cl_sync_io *anchor)
+{
+        int rc;
+        ENTRY;
+
+        rc = wait_for_completion_interruptible(&anchor->csi_sync_completion);
+        if (rc < 0) {
+                int rc2;
+                rc2 = cl_io_cancel(env, io, queue);
+                if (rc2 < 0) {
+                        /* Too bad, some pages are still in IO. */
+                        CDEBUG(D_VFSTRACE, "Failed to cancel transfer (%i). "
+                               "Waiting for %i pages\n",
+                               rc2, atomic_read(&anchor->csi_sync_nr));
+                        wait_for_completion(&anchor->csi_sync_completion);
+                }
+        } else
+                rc = anchor->csi_sync_rc;
+        LASSERT(atomic_read(&anchor->csi_sync_nr) == 0);
+        cl_page_list_assume(env, io, queue);
+        POISON(anchor, 0x5a, sizeof *anchor);
+        RETURN(rc);
+}
+EXPORT_SYMBOL(cl_sync_io_wait);
+
+/**
+ * Indicate that transfer of a single page completed.
+ *
+ * The first negative \a ioret seen is remembered in anchor->csi_sync_rc.
+ * The count of outstanding pages is decremented, and when it reaches zero
+ * the completion that cl_sync_io_wait() waits on is signalled.
+ */
+void cl_sync_io_note(struct cl_sync_io *anchor, int ioret)
+{
+        ENTRY;
+        if (anchor->csi_sync_rc == 0 && ioret < 0)
+                anchor->csi_sync_rc = ioret;
+        /*
+         * Synchronous IO done without releasing page lock (e.g., as a part of
+         * ->{prepare,commit}_write()). Completion is used to signal the end
+         * of IO.
+         */
+        if (atomic_dec_and_test(&anchor->csi_sync_nr))
+                complete(&anchor->csi_sync_completion);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_sync_io_note);
diff --git a/lustre/obdclass/cl_lock.c b/lustre/obdclass/cl_lock.c

new file mode 100644 (file)

index 0000000..89077c2
--- /dev/null
+++ b/lustre/obdclass/cl_lock.c
@@ -0,0 +1,2073 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Extent Lock.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <libcfs/list.h>
+/* lu_time_global_{init,fini}() */
+#include <lu_time.h>
+
+#include <cl_object.h>
+#include "cl_internal.h"
+
+/** Lock class of cl_lock::cll_guard */
+static struct lock_class_key cl_lock_guard_class;
+/** Slab cache from which struct cl_lock instances are allocated. */
+static cfs_mem_cache_t *cl_lock_kmem;
+
+/**
+ * Descriptors of the slab caches used by this file. The trailing entry with
+ * NULL lu_kmem_descr::ckd_cache presumably terminates the array (confirm
+ * against the lu_kmem_descr consumers).
+ */
+static struct lu_kmem_descr cl_lock_caches[] = {
+        {
+                .ckd_cache = &cl_lock_kmem,
+                .ckd_name  = "cl_lock_kmem",
+                .ckd_size  = sizeof (struct cl_lock)
+        },
+        {
+                .ckd_cache = NULL
+        }
+};
+
+/**
+ * Basic lock invariant, expected to hold at all times. Caller either owns a
+ * reference to \a lock, or otherwise guarantees that \a lock cannot be freed
+ * while it is being checked.
+ *
+ * \retval 1 invariant holds
+ * \retval 0 invariant is broken
+ *
+ * \see cl_lock_invariant()
+ */
+static int cl_lock_invariant_trusted(const struct lu_env *env,
+                                     const struct cl_lock *lock)
+{
+        if (!cl_is_lock(lock))
+                return 0;
+        /* a lock in CLS_FREEING state has no holds left */
+        if (!ergo(lock->cll_state == CLS_FREEING, lock->cll_holds == 0))
+                return 0;
+        /* references >= holds >= users */
+        if (!(atomic_read(&lock->cll_ref) >= lock->cll_holds))
+                return 0;
+        if (!(lock->cll_holds >= lock->cll_users))
+                return 0;
+        /* none of the counters is negative */
+        return lock->cll_holds >= 0 &&
+                lock->cll_users >= 0 &&
+                lock->cll_depth >= 0;
+}
+
+/**
+ * Stronger lock invariant: in addition to cl_lock_invariant_trusted() it
+ * checks that \a lock has a positive reference count, i.e., that somebody
+ * holds a reference to it. A violation is logged when \a env is not NULL.
+ *
+ * \see cl_lock_invariant_trusted()
+ */
+static int cl_lock_invariant(const struct lu_env *env,
+                             const struct cl_lock *lock)
+{
+        int ok;
+
+        ok = atomic_read(&lock->cll_ref) > 0 &&
+                cl_lock_invariant_trusted(env, lock);
+        if (ok)
+                return ok;
+
+        /* debugging output needs an environment */
+        if (env != NULL)
+                CL_LOCK_DEBUG(D_ERROR, env, lock, "invariant broken");
+        return ok;
+}
+
+/**
+ * Return address of the function in which the macro is expanded, as an
+ * integer. Used in this file as the "ip" argument of lockdep calls and in
+ * reference-tracing debug messages.
+ */
+#define RETIP ((unsigned long)__builtin_return_address(0))
+
+#ifdef CONFIG_LOCKDEP
+/** Lockdep class key shared by all cl_lock instances. */
+static struct lock_class_key cl_lock_key;
+
+/** Assigns the lockdep class and the name "EXT" to \a lock. */
+static void cl_lock_lockdep_init(struct cl_lock *lock)
+{
+        lockdep_set_class_and_name(lock, &cl_lock_key, "EXT");
+}
+
+/**
+ * Tells lockdep that \a lock is being acquired and bumps the per-thread
+ * counter cl_thread_info::clt_nr_locks_acquired. Locks whose mode is at most
+ * CLM_READ are reported to lockdep as read acquisitions.
+ */
+static void cl_lock_lockdep_acquire(const struct lu_env *env,
+                                    struct cl_lock *lock, __u32 enqflags)
+{
+        cl_env_info(env)->clt_nr_locks_acquired++;
+        lock_acquire(&lock->dep_map, !!(enqflags & CEF_ASYNC),
+                     /* try: */ 0, lock->cll_descr.cld_mode <= CLM_READ,
+                     /* check: */ 2, RETIP);
+}
+
+/**
+ * Tells lockdep that \a lock is released and decrements the per-thread
+ * counter incremented by cl_lock_lockdep_acquire().
+ */
+static void cl_lock_lockdep_release(const struct lu_env *env,
+                                    struct cl_lock *lock)
+{
+        cl_env_info(env)->clt_nr_locks_acquired--;
+        lock_release(&lock->dep_map, 0, RETIP);
+}
+
+#else /* !CONFIG_LOCKDEP */
+
+/* Without lockdep all three helpers are empty stubs. */
+static void cl_lock_lockdep_init(struct cl_lock *lock)
+{}
+static void cl_lock_lockdep_acquire(const struct lu_env *env,
+                                    struct cl_lock *lock, __u32 enqflags)
+{}
+static void cl_lock_lockdep_release(const struct lu_env *env,
+                                    struct cl_lock *lock)
+{}
+
+#endif /* !CONFIG_LOCKDEP */
+
+/**
+ * Attaches a per-layer slice to the compound lock.
+ *
+ * Called by cl_object_operations::coo_lock_init() methods to add the state
+ * of a layer to \a lock. The slice is appended to the tail of the
+ * cl_lock::cll_layers list, that is, it becomes the bottom of the stack.
+ *
+ * \param lock  lock the slice is added to
+ * \param slice slice to initialize and link
+ * \param obj   object of the layer owning \a slice
+ * \param ops   operations vector of \a slice
+ *
+ * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add()
+ */
+void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
+                       struct cl_object *obj,
+                       const struct cl_lock_operations *ops)
+{
+        ENTRY;
+        /* fill the slice in first ... */
+        slice->cls_obj  = obj;
+        slice->cls_ops  = ops;
+        slice->cls_lock = lock;
+        /* ... then link it below the layers already present */
+        list_add_tail(&slice->cls_linkage, &lock->cll_layers);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_slice_add);
+
+/**
+ * Returns true iff a lock of mode \a has provides at least the guarantees
+ * of a lock of mode \a need.
+ *
+ * Relies on the numerical ordering CLM_PHANTOM < CLM_READ < CLM_WRITE, which
+ * is checked at compile time.
+ */
+int cl_lock_mode_match(enum cl_lock_mode has, enum cl_lock_mode need)
+{
+        CLASSERT(CLM_PHANTOM < CLM_READ);
+        CLASSERT(CLM_READ < CLM_WRITE);
+        LINVRNT(need == CLM_READ || need == CLM_WRITE || need == CLM_PHANTOM);
+        LINVRNT(has == CLM_READ || has == CLM_WRITE || has == CLM_PHANTOM);
+
+        return has >= need;
+}
+EXPORT_SYMBOL(cl_lock_mode_match);
+
+/**
+ * Returns true iff the extent of \a has covers the extent of \a need and the
+ * mode of \a has matches the mode of \a need (see cl_lock_mode_match()).
+ * Objects of the two descriptions are not compared here.
+ */
+int cl_lock_ext_match(const struct cl_lock_descr *has,
+                      const struct cl_lock_descr *need)
+{
+        int covers;
+
+        covers = has->cld_start <= need->cld_start &&
+                has->cld_end >= need->cld_end;
+        return covers && cl_lock_mode_match(has->cld_mode, need->cld_mode);
+}
+EXPORT_SYMBOL(cl_lock_ext_match);
+
+/**
+ * Returns true iff a lock described by \a has provides at least the same
+ * guarantees as a lock described by \a need: both refer to the same object
+ * and the extent and mode of \a has match those of \a need.
+ */
+int cl_lock_descr_match(const struct cl_lock_descr *has,
+                        const struct cl_lock_descr *need)
+{
+        int same_object = cl_object_same(has->cld_obj, need->cld_obj);
+
+        return same_object && cl_lock_ext_match(has, need);
+}
+EXPORT_SYMBOL(cl_lock_descr_match);
+
+/**
+ * Destroys \a lock: finalizes every layer slice, updates site statistics,
+ * drops the reference the lock holds on its object and returns the memory
+ * to the slab cache. May sleep. The lock mutex must not be held.
+ */
+static void cl_lock_free(const struct lu_env *env, struct cl_lock *lock)
+{
+        struct cl_object *obj = lock->cll_descr.cld_obj;
+        struct cl_site   *site;
+
+        LASSERT(cl_is_lock(lock));
+        LINVRNT(!cl_lock_is_mutexed(lock));
+        LINVRNT(!mutex_is_locked(&lock->cll_guard));
+
+        ENTRY;
+        might_sleep();
+        /* peel the layers off one at a time, from the head of the list */
+        while (!list_empty(&lock->cll_layers)) {
+                struct cl_lock_slice *first;
+
+                first = list_entry(lock->cll_layers.next,
+                                   struct cl_lock_slice, cls_linkage);
+                list_del_init(&first->cls_linkage);
+                first->cls_ops->clo_fini(env, first);
+        }
+        site = cl_object_site(obj);
+        atomic_dec(&site->cs_locks.cs_total);
+        atomic_dec(&site->cs_locks_state[lock->cll_state]);
+        /* release the object reference taken when the lock was allocated */
+        lu_object_ref_del_at(&obj->co_lu, lock->cll_obj_ref, "cl_lock", lock);
+        cl_object_put(env, obj);
+        lu_ref_fini(&lock->cll_reference);
+        lu_ref_fini(&lock->cll_holders);
+        mutex_destroy(&lock->cll_guard);
+        OBD_SLAB_FREE_PTR(lock, cl_lock_kmem);
+        EXIT;
+}
+
+/**
+ * Releases a reference on a lock.
+ *
+ * When the last reference is released, the lock is returned to the cache,
+ * unless it is in cl_lock_state::CLS_FREEING state, in which case it is
+ * destroyed immediately. In either case the site's counter of busy locks is
+ * decremented.
+ *
+ * \param env  execution environment
+ * \param lock lock to release; caller must own a reference to it
+ *
+ * \see cl_object_put(), cl_page_put()
+ */
+void cl_lock_put(const struct lu_env *env, struct cl_lock *lock)
+{
+        struct cl_object *obj;
+        struct cl_site   *site;
+
+        LINVRNT(cl_lock_invariant(env, lock));
+        ENTRY;
+        obj = lock->cll_descr.cld_obj;
+        LINVRNT(obj != NULL);
+        /*
+         * Resolve the site before the reference is dropped: cl_lock_free()
+         * below releases both the lock and its reference on the object.
+         */
+        site = cl_object_site(obj);
+
+        CDEBUG(D_DLMTRACE, "releasing reference: %d %p %lu\n",
+               atomic_read(&lock->cll_ref), lock, RETIP);
+
+        if (atomic_dec_and_test(&lock->cll_ref)) {
+                if (lock->cll_state == CLS_FREEING) {
+                        /* a lock being freed is no longer in the cache */
+                        LASSERT(list_empty(&lock->cll_linkage));
+                        cl_lock_free(env, lock);
+                }
+                atomic_dec(&site->cs_locks.cs_busy);
+        }
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_put);
+
+/**
+ * Acquires an additional reference to a lock.
+ *
+ * This can be called only by a caller already possessing a reference to
+ * \a lock: the invariant check requires a positive reference count. Unlike
+ * cl_lock_get_trust(), the busy-lock statistics are not touched.
+ *
+ * \see cl_object_get(), cl_page_get()
+ */
+void cl_lock_get(struct cl_lock *lock)
+{
+        LINVRNT(cl_lock_invariant(NULL, lock));
+        CDEBUG(D_DLMTRACE|D_TRACE, "acquiring reference: %d %p %lu\n",
+               atomic_read(&lock->cll_ref), lock, RETIP);
+        atomic_inc(&lock->cll_ref);
+}
+EXPORT_SYMBOL(cl_lock_get);
+
+/**
+ * Acquires a reference to a lock.
+ *
+ * Much like cl_lock_get(), except that it can also be used to take the
+ * initial reference on a cached lock, i.e., one whose reference counter is
+ * zero. In that case the lock is accounted as busy in the site statistics.
+ * Caller has to deal with all possible races. Use with care!
+ *
+ * \see cl_page_get_trust()
+ */
+void cl_lock_get_trust(struct cl_lock *lock)
+{
+        struct cl_site *site = cl_object_site(lock->cll_descr.cld_obj);
+        int             refs;
+
+        LASSERT(cl_is_lock(lock));
+        CDEBUG(D_DLMTRACE|D_TRACE, "acquiring trusted reference: %d %p %lu\n",
+               atomic_read(&lock->cll_ref), lock, RETIP);
+        refs = atomic_inc_return(&lock->cll_ref);
+        if (refs == 1) {
+                /* 0 -> 1 transition: the lock has just become busy */
+                atomic_inc(&site->cs_locks.cs_busy);
+        }
+}
+EXPORT_SYMBOL(cl_lock_get_trust);
+
+/**
+ * Helper function destroying the lock that wasn't completely initialized.
+ *
+ * Other threads can acquire references to the top-lock through its
+ * sub-locks. Hence, it cannot be cl_lock_free()-ed immediately: instead the
+ * lock is deleted under its mutex and the caller's reference is dropped, so
+ * that the lock is freed when the last reference goes away.
+ */
+static void cl_lock_finish(const struct lu_env *env, struct cl_lock *lock)
+{
+        cl_lock_mutex_get(env, lock);
+        cl_lock_delete(env, lock);
+        cl_lock_mutex_put(env, lock);
+        /* drops the caller's reference; may free the lock */
+        cl_lock_put(env, lock);
+}
+
+/**
+ * Allocates a new lock on \a obj with description \a descr and lets every
+ * layer of the object attach its slice through
+ * cl_object_operations::coo_lock_init().
+ *
+ * The new lock starts in CLS_NEW state with a single reference owned by the
+ * caller, and holds a reference on \a obj. It is not yet linked into the
+ * per-object lock list and is not yet accounted as busy; cl_lock_find()
+ * does both.
+ *
+ * \retval lock               on success
+ * \retval ERR_PTR(-ENOMEM)   if the slab allocation failed
+ * \retval ERR_PTR(err)       if a layer's coo_lock_init() returned \a err
+ */
+static struct cl_lock *cl_lock_alloc(const struct lu_env *env,
+                                     struct cl_object *obj,
+                                     const struct cl_io *io,
+                                     const struct cl_lock_descr *descr)
+{
+        struct cl_lock          *lock;
+        struct cl_object        *layer;
+        struct lu_object_header *head;
+        struct cl_site          *site = cl_object_site(obj);
+
+        ENTRY;
+        OBD_SLAB_ALLOC_PTR(lock, cl_lock_kmem);
+        if (lock == NULL) {
+                lock = ERR_PTR(-ENOMEM);
+                RETURN(lock);
+        }
+
+        atomic_set(&lock->cll_ref, 1);
+        lock->cll_descr = *descr;
+        lock->cll_state = CLS_NEW;
+        cl_object_get(obj);
+        lock->cll_obj_ref = lu_object_ref_add(&obj->co_lu, "cl_lock", lock);
+        CFS_INIT_LIST_HEAD(&lock->cll_layers);
+        CFS_INIT_LIST_HEAD(&lock->cll_linkage);
+        CFS_INIT_LIST_HEAD(&lock->cll_inclosure);
+        lu_ref_init(&lock->cll_reference);
+        lu_ref_init(&lock->cll_holders);
+        mutex_init(&lock->cll_guard);
+        lockdep_set_class(&lock->cll_guard, &cl_lock_guard_class);
+        cfs_waitq_init(&lock->cll_wq);
+        head = obj->co_lu.lo_header;
+        atomic_inc(&site->cs_locks_state[CLS_NEW]);
+        atomic_inc(&site->cs_locks.cs_total);
+        atomic_inc(&site->cs_locks.cs_created);
+        cl_lock_lockdep_init(lock);
+        /*
+         * A dedicated iterator is used so that the \a obj parameter keeps
+         * referring to the object the lock was created for.
+         */
+        list_for_each_entry(layer, &head->loh_layers, co_lu.lo_linkage) {
+                int err;
+
+                err = layer->co_ops->coo_lock_init(env, layer, lock, io);
+                if (err != 0) {
+                        /*
+                         * cl_lock_finish() ends in cl_lock_put(), which
+                         * decrements cs_busy when the last reference goes
+                         * away. This lock was never accounted as busy (that
+                         * is done by cl_lock_find() on insertion), so
+                         * account for it here to keep the counter balanced.
+                         */
+                        atomic_inc(&site->cs_locks.cs_busy);
+                        cl_lock_finish(env, lock);
+                        lock = ERR_PTR(err);
+                        break;
+                }
+        }
+        RETURN(lock);
+}
+
+/**
+ * Returns true iff \a lock is "suitable" for the given \a io. E.g., locks
+ * acquired by truncate and O_APPEND cannot be reused for
+ * read/non-append-write, as they cover multiple stripes and can trigger
+ * cascading timeouts.
+ *
+ * Every layer that implements cl_lock_operations::clo_fits_into() is asked;
+ * the first one to refuse decides the outcome.
+ */
+static int cl_lock_fits_into(const struct lu_env *env,
+                             const struct cl_lock *lock,
+                             const struct cl_lock_descr *need,
+                             const struct cl_io *io)
+{
+        const struct cl_lock_slice *layer;
+
+        LINVRNT(cl_lock_invariant_trusted(env, lock));
+        ENTRY;
+        list_for_each_entry(layer, &lock->cll_layers, cls_linkage) {
+                /* layers without the method have no objections */
+                if (layer->cls_ops->clo_fits_into == NULL)
+                        continue;
+                if (!layer->cls_ops->clo_fits_into(env, layer, need, io))
+                        RETURN(0);
+        }
+        RETURN(1);
+}
+
+/**
+ * Searches the per-object lock list for a lock matching \a need.
+ *
+ * A lock matches when its extent and mode cover \a need, it is not in (or
+ * past) CLS_FREEING state, it is not cancelled, and all layers agree that
+ * it fits \a io. On a hit a trusted reference is taken and the lock is
+ * moved to the head of the list.
+ *
+ * \pre cl_object_header::coh_lock_guard is held by the caller
+ *
+ * \retval matching lock with an additional reference
+ * \retval NULL if nothing matches
+ */
+static struct cl_lock *cl_lock_lookup(const struct lu_env *env,
+                                      struct cl_object *obj,
+                                      const struct cl_io *io,
+                                      const struct cl_lock_descr *need)
+{
+        struct cl_object_header *head = cl_object_header(obj);
+        struct cl_site          *site = cl_object_site(obj);
+        struct cl_lock          *lock;
+
+        ENTRY;
+
+        LINVRNT(spin_is_locked(&head->coh_lock_guard));
+        atomic_inc(&site->cs_locks.cs_lookup);
+        list_for_each_entry(lock, &head->coh_locks, cll_linkage) {
+                int matched;
+
+                LASSERT(cl_is_lock(lock));
+                matched = cl_lock_ext_match(&lock->cll_descr, need) &&
+                        lock->cll_state < CLS_FREEING &&
+                        !(lock->cll_flags & CLF_CANCELLED) &&
+                        cl_lock_fits_into(env, lock, need, io);
+                CDEBUG(D_DLMTRACE, "has: "DDESCR"(%i) need: "DDESCR": %d\n",
+                       PDESCR(&lock->cll_descr), lock->cll_state, PDESCR(need),
+                       matched);
+                if (!matched)
+                        continue;
+
+                cl_lock_get_trust(lock);
+                /* move the lock to the LRU head */
+                list_move(&lock->cll_linkage, &head->coh_locks);
+                atomic_inc(&site->cs_locks.cs_hit);
+                RETURN(lock);
+        }
+        RETURN(NULL);
+}
+
+/**
+ * Returns a lock matching description \a need.
+ *
+ * This is the main entry point into the cl_lock caching interface. First, a
+ * cache (implemented as a per-object linked list) is consulted. If a lock is
+ * found there, it is returned immediately. Otherwise a new lock is allocated
+ * and, unless another thread inserted a matching lock in the meantime,
+ * added to the cache. In any case, an additional reference to the returned
+ * lock is acquired.
+ *
+ * \retval lock with a reference owned by the caller
+ * \retval ERR_PTR() value if a new lock could not be allocated
+ *
+ * \see cl_object_find(), cl_page_find()
+ */
+static struct cl_lock *cl_lock_find(const struct lu_env *env,
+                                    const struct cl_io *io,
+                                    const struct cl_lock_descr *need)
+{
+        struct cl_object_header *head;
+        struct cl_object        *obj;
+        struct cl_lock          *lock;
+        struct cl_lock          *ghost;
+        struct cl_site          *site;
+
+        ENTRY;
+
+        obj  = need->cld_obj;
+        head = cl_object_header(obj);
+        site = cl_object_site(obj);
+
+        spin_lock(&head->coh_lock_guard);
+        lock = cl_lock_lookup(env, obj, io, need);
+        spin_unlock(&head->coh_lock_guard);
+        if (lock != NULL)
+                RETURN(lock);
+
+        /* allocation may sleep, hence it is done without the spin-lock */
+        lock = cl_lock_alloc(env, obj, io, need);
+        if (IS_ERR(lock))
+                RETURN(lock);
+
+        /* re-check: a matching lock may have been added while we slept */
+        spin_lock(&head->coh_lock_guard);
+        ghost = cl_lock_lookup(env, obj, io, need);
+        if (ghost == NULL)
+                list_add(&lock->cll_linkage, &head->coh_locks);
+        spin_unlock(&head->coh_lock_guard);
+        /*
+         * The freshly allocated lock carries a reference that was not yet
+         * accounted in cs_busy. Account for it on both paths: when the lock
+         * is discarded below, cl_lock_put() (called from cl_lock_finish())
+         * decrements cs_busy once the last reference is dropped, and
+         * without this increment the counter would become unbalanced.
+         */
+        atomic_inc(&site->cs_locks.cs_busy);
+        if (ghost != NULL) {
+                /*
+                 * Other threads can acquire references to the top-lock
+                 * through its sub-locks. Hence, it cannot be
+                 * cl_lock_free()-ed immediately.
+                 */
+                cl_lock_finish(env, lock);
+                lock = ghost;
+        }
+        RETURN(lock);
+}
+
+/**
+ * Returns an existing lock matching the given description. This is similar
+ * to cl_lock_find() except that no new lock is created, and the returned
+ * lock is guaranteed to be in enum cl_lock_state::CLS_HELD state.
+ *
+ * A cached lock is first pulled out of the cache with cl_use_try(). On
+ * success a hold (tagged with \a scope and \a source) and a user are added
+ * to the lock.
+ *
+ * \retval held lock
+ * \retval NULL if no matching lock exists or it could not be brought into
+ *         CLS_HELD state
+ */
+struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
+                             const struct cl_lock_descr *need,
+                             const char *scope, const void *source)
+{
+        struct cl_object        *obj  = need->cld_obj;
+        struct cl_object_header *head = cl_object_header(obj);
+        struct cl_lock          *lock;
+        int                      held;
+
+        spin_lock(&head->coh_lock_guard);
+        lock = cl_lock_lookup(env, obj, io, need);
+        spin_unlock(&head->coh_lock_guard);
+
+        if (lock == NULL)
+                return NULL;
+
+        cl_lock_mutex_get(env, lock);
+        if (lock->cll_state == CLS_CACHED)
+                cl_use_try(env, lock);
+        /* the outcome of cl_use_try() is judged by the resulting state */
+        held = lock->cll_state == CLS_HELD;
+        if (held) {
+                cl_lock_hold_add(env, lock, scope, source);
+                cl_lock_user_add(env, lock);
+        }
+        cl_lock_mutex_put(env, lock);
+        if (held)
+                return lock;
+
+        /* not usable: give back the reference taken by the lookup */
+        cl_lock_put(env, lock);
+        return NULL;
+}
+EXPORT_SYMBOL(cl_lock_peek);
+
+/**
+ * Returns the slice of \a lock that belongs to the layer whose device type
+ * is \a dtype, or NULL if the lock has no slice for that layer.
+ *
+ * \see cl_page_at()
+ */
+const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
+                                       const struct lu_device_type *dtype)
+{
+        const struct cl_lock_slice *scan;
+
+        LINVRNT(cl_lock_invariant_trusted(NULL, lock));
+        ENTRY;
+
+        list_for_each_entry(scan, &lock->cll_layers, cls_linkage) {
+                if (scan->cls_obj->co_lu.lo_dev->ld_type != dtype)
+                        continue;
+                RETURN(scan);
+        }
+        RETURN(NULL);
+}
+EXPORT_SYMBOL(cl_lock_at);
+
+/**
+ * Emits a debugging message about a mutex operation on \a lock. The message
+ * consists of \a prefix, the lock reference count, the lock address, the
+ * current mutex owner, the recursion depth and the number of cl_lock
+ * mutexes recorded in \a info.
+ */
+static void cl_lock_trace(struct cl_thread_info *info,
+                          const char *prefix, const struct cl_lock *lock)
+{
+        CDEBUG(D_DLMTRACE|D_TRACE, "%s: %i@%p %p %i %i\n", prefix,
+               atomic_read(&lock->cll_ref), lock, lock->cll_guarder,
+               lock->cll_depth, info->clt_nr_locks_locked);
+}
+
+/**
+ * Book-keeping common to cl_lock_mutex_get() and cl_lock_mutex_try(), done
+ * once the mutex is owned by the current thread: bumps the recursion depth
+ * of \a lock and the per-thread counter of locked mutexes, records a
+ * reference in cl_thread_info::clt_locks_locked and traces the event.
+ */
+static void cl_lock_mutex_tail(const struct lu_env *env, struct cl_lock *lock)
+{
+        struct cl_thread_info *cti = cl_env_info(env);
+
+        lock->cll_depth += 1;
+        cti->clt_nr_locks_locked += 1;
+        lu_ref_add(&cti->clt_locks_locked, "cll_guard", lock);
+        cl_lock_trace(cti, "got mutex", lock);
+}
+
+/**
+ * Locks cl_lock object.
+ *
+ * This is used to manipulate cl_lock fields, and to serialize state
+ * transitions in the lock state machine.
+ *
+ * The mutex is recursive: when the current thread already owns it, only the
+ * recursion depth is increased (in cl_lock_mutex_tail()).
+ *
+ * \post cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_mutex_put()
+ */
+void cl_lock_mutex_get(const struct lu_env *env, struct cl_lock *lock)
+{
+        LINVRNT(cl_lock_invariant(env, lock));
+
+        if (lock->cll_guarder == cfs_current()) {
+                /* recursive acquisition by the owner */
+                LINVRNT(cl_lock_is_mutexed(lock));
+                LINVRNT(lock->cll_depth > 0);
+        } else {
+                struct cl_object_header *hdr;
+
+                LINVRNT(lock->cll_guarder != cfs_current());
+                hdr = cl_object_header(lock->cll_descr.cld_obj);
+                /* coh_nesting of the object is used as the lockdep subclass */
+                mutex_lock_nested(&lock->cll_guard, hdr->coh_nesting);
+                lock->cll_guarder = cfs_current();
+                LINVRNT(lock->cll_depth == 0);
+        }
+        cl_lock_mutex_tail(env, lock);
+}
+EXPORT_SYMBOL(cl_lock_mutex_get);
+
+/**
+ * Try-locks cl_lock object.
+ *
+ * Never blocks. A thread that already owns the mutex always succeeds and
+ * merely increases the recursion depth.
+ *
+ * \retval 0 \a lock was successfully locked
+ *
+ * \retval -EBUSY \a lock cannot be locked right now
+ *
+ * \post ergo(result == 0, cl_lock_is_mutexed(lock))
+ *
+ * \see cl_lock_mutex_get()
+ */
+int cl_lock_mutex_try(const struct lu_env *env, struct cl_lock *lock)
+{
+        LINVRNT(cl_lock_invariant_trusted(env, lock));
+        ENTRY;
+
+        if (lock->cll_guarder == cfs_current()) {
+                /* already ours: recursive acquisition */
+                LINVRNT(lock->cll_depth > 0);
+        } else {
+                if (!mutex_trylock(&lock->cll_guard))
+                        RETURN(-EBUSY);
+                LINVRNT(lock->cll_depth == 0);
+                lock->cll_guarder = cfs_current();
+        }
+        cl_lock_mutex_tail(env, lock);
+        RETURN(0);
+}
+EXPORT_SYMBOL(cl_lock_mutex_try);
+
+/**
+ * Unlocks cl_lock object.
+ *
+ * Undoes one cl_lock_mutex_get() or successful cl_lock_mutex_try(). The
+ * underlying mutex is released only when the recursion depth drops to zero.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_mutex_get()
+ */
+void cl_lock_mutex_put(const struct lu_env *env, struct cl_lock *lock)
+{
+        struct cl_thread_info *info;
+
+        LINVRNT(cl_lock_invariant(env, lock));
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(lock->cll_guarder == cfs_current());
+        LINVRNT(lock->cll_depth > 0);
+
+        info = cl_env_info(env);
+        LINVRNT(info->clt_nr_locks_locked > 0);
+
+        cl_lock_trace(info, "put mutex", lock);
+        lu_ref_del(&info->clt_locks_locked, "cll_guard", lock);
+        info->clt_nr_locks_locked--;
+        if (--lock->cll_depth == 0) {
+                /* outermost put: clear the owner before releasing the mutex */
+                lock->cll_guarder = NULL;
+                mutex_unlock(&lock->cll_guard);
+        }
+}
+EXPORT_SYMBOL(cl_lock_mutex_put);
+
+/**
+ * Returns true iff the mutex of \a lock is owned by the current thread,
+ * i.e., the recorded owner cl_lock::cll_guarder is the current task.
+ */
+int cl_lock_is_mutexed(struct cl_lock *lock)
+{
+        return cfs_current() == lock->cll_guarder;
+}
+EXPORT_SYMBOL(cl_lock_is_mutexed);
+
+/**
+ * Returns the number of cl_lock mutexes held by the current thread
+ * (environment), as counted in cl_thread_info::clt_nr_locks_locked.
+ * Recursive acquisitions are counted individually.
+ */
+int cl_lock_nr_mutexed(const struct lu_env *env)
+{
+        const struct cl_thread_info *cti = cl_env_info(env);
+
+        return cti->clt_nr_locks_locked;
+}
+EXPORT_SYMBOL(cl_lock_nr_mutexed);
+
+/**
+ * Cancels \a lock, unless it is already cancelled: sets CLF_CANCELLED and
+ * calls cl_lock_operations::clo_cancel() of every layer that has one,
+ * walking the layer list in reverse order.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ */
+static void cl_lock_cancel0(const struct lu_env *env, struct cl_lock *lock)
+{
+        const struct cl_lock_slice *layer;
+
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+        ENTRY;
+        if (lock->cll_flags & CLF_CANCELLED) {
+                /* cancellation is done at most once */
+                EXIT;
+                return;
+        }
+
+        lock->cll_flags |= CLF_CANCELLED;
+        list_for_each_entry_reverse(layer, &lock->cll_layers, cls_linkage) {
+                if (layer->cls_ops->clo_cancel == NULL)
+                        continue;
+                layer->cls_ops->clo_cancel(env, layer);
+        }
+        EXIT;
+}
+
+/**
+ * Moves \a lock into CLS_FREEING state, unless it is already there: the lock
+ * is unlinked from the per-object lock list and
+ * cl_lock_operations::clo_delete() of every layer that has one is called,
+ * walking the layer list in reverse order. The lock itself is not freed
+ * here.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ */
+static void cl_lock_delete0(const struct lu_env *env, struct cl_lock *lock)
+{
+        struct cl_object_header    *head;
+        const struct cl_lock_slice *slice;
+
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+
+        ENTRY;
+        if (lock->cll_state < CLS_FREEING) {
+                cl_lock_state_set(env, lock, CLS_FREEING);
+
+                head = cl_object_header(lock->cll_descr.cld_obj);
+
+                /* the per-object lock list is protected by coh_lock_guard */
+                spin_lock(&head->coh_lock_guard);
+                list_del_init(&lock->cll_linkage);
+                /*
+                 * No locks, no pages. This is only valid for bottom sub-locks
+                 * and head->coh_nesting == 1 check assumes two level top-sub
+                 * hierarchy.
+                 */
+                LASSERT(ergo(head->coh_nesting == 1 &&
+                             list_empty(&head->coh_locks), !head->coh_pages));
+                spin_unlock(&head->coh_lock_guard);
+                /*
+                 * From now on, no new references to this lock can be acquired
+                 * by cl_lock_lookup().
+                 */
+                list_for_each_entry_reverse(slice, &lock->cll_layers,
+                                            cls_linkage) {
+                        if (slice->cls_ops->clo_delete != NULL)
+                                slice->cls_ops->clo_delete(env, slice);
+                }
+                /*
+                 * From now on, no new references to this lock can be acquired
+                 * by layer-specific means (like a pointer from struct
+                 * ldlm_lock in osc, or a pointer from top-lock to sub-lock in
+                 * lov).
+                 *
+                 * Lock will be finally freed in cl_lock_put() when last of
+                 * existing references goes away.
+                 */
+        }
+        EXIT;
+}
+
+/**
+ * Adjusts the number of holds on \a lock by \a delta. For locks on objects
+ * with zero cl_object_header::coh_nesting the per-thread counter
+ * cl_thread_info::clt_nr_held is adjusted as well.
+ */
+static void cl_lock_hold_mod(const struct lu_env *env, struct cl_lock *lock,
+                             int delta)
+{
+        struct cl_thread_info   *info = cl_env_info(env);
+        struct cl_object_header *head;
+
+        head = cl_object_header(lock->cll_descr.cld_obj);
+        lock->cll_holds += delta;
+        if (head->coh_nesting != 0)
+                return;
+
+        info->clt_nr_held += delta;
+        LASSERT(info->clt_nr_held >= 0);
+}
+
+/**
+ * Adjusts the number of users of \a lock by \a delta. For locks on objects
+ * with zero cl_object_header::coh_nesting the per-thread counter
+ * cl_thread_info::clt_nr_used is adjusted as well.
+ */
+static void cl_lock_used_mod(const struct lu_env *env, struct cl_lock *lock,
+                             int delta)
+{
+        struct cl_thread_info   *info = cl_env_info(env);
+        struct cl_object_header *head;
+
+        head = cl_object_header(lock->cll_descr.cld_obj);
+        lock->cll_users += delta;
+        if (head->coh_nesting != 0)
+                return;
+
+        info->clt_nr_used += delta;
+        LASSERT(info->clt_nr_used >= 0);
+}
+
+/**
+ * Drops one hold, tagged with \a scope and \a source, from \a lock.
+ *
+ * When the last hold goes away, deferred work is carried out: a lock that
+ * is still phantom is marked for cancellation and destruction, a pending
+ * cancellation (CLF_CANCELPEND) is executed, and a doomed lock (CLF_DOOMED)
+ * is deleted.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ * \pre lock->cll_holds > 0
+ */
+static void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+                                 const char *scope, const void *source)
+{
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+        LASSERT(lock->cll_holds > 0);
+
+        ENTRY;
+        lu_ref_del(&lock->cll_holders, scope, source);
+        cl_lock_hold_mod(env, lock, -1);
+        if (lock->cll_holds != 0) {
+                /* other holds remain, nothing else to do */
+                EXIT;
+                return;
+        }
+
+        /*
+         * If lock is still phantom when user is done with it---destroy the
+         * lock.
+         */
+        if (lock->cll_descr.cld_mode == CLM_PHANTOM)
+                lock->cll_flags |= CLF_CANCELPEND|CLF_DOOMED;
+
+        if (lock->cll_flags & CLF_CANCELPEND) {
+                lock->cll_flags &= ~CLF_CANCELPEND;
+                cl_lock_cancel0(env, lock);
+        }
+
+        if (lock->cll_flags & CLF_DOOMED) {
+                /* no longer doomed: it's dead... Jim. */
+                lock->cll_flags &= ~CLF_DOOMED;
+                cl_lock_delete0(env, lock);
+        }
+        EXIT;
+}
+
+
+/**
+ * Waits until lock state is changed.
+ *
+ * This function is called with cl_lock mutex locked, atomically releases
+ * mutex and goes to sleep, waiting for a lock state change (signaled by
+ * cl_lock_signal()), and re-acquires the mutex before return.
+ *
+ * This function is used to wait until lock state machine makes some progress
+ * and to emulate synchronous operations on top of asynchronous lock
+ * interface.
+ *
+ * No sleep happens when the lock already carries an error or when a state
+ * change was signalled since the previous wait (CLF_STATE is set). The
+ * CLF_STATE flag is cleared on return.
+ *
+ * \retval -EINTR wait was interrupted
+ *
+ * \retval 0 wait wasn't interrupted
+ *
+ * \retval lock->cll_error if it is non-zero on entry
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_signal()
+ */
+int cl_lock_state_wait(const struct lu_env *env, struct cl_lock *lock)
+{
+        cfs_waitlink_t waiter;
+        int result;
+
+        ENTRY;
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+        /* the mutex has to be really released below, so no recursion here */
+        LASSERT(lock->cll_depth == 1);
+        LASSERT(lock->cll_state != CLS_FREEING); /* too late to wait */
+
+        result = lock->cll_error;
+        if (result == 0 && !(lock->cll_flags & CLF_STATE)) {
+                /*
+                 * Queue the waiter and set the task state while the mutex is
+                 * still held, and only then drop the mutex (presumably so
+                 * that a wake-up arriving right after the release is not
+                 * missed).
+                 */
+                cfs_waitlink_init(&waiter);
+                cfs_waitq_add(&lock->cll_wq, &waiter);
+                set_current_state(CFS_TASK_INTERRUPTIBLE);
+                cl_lock_mutex_put(env, lock);
+
+                LASSERT(cl_lock_nr_mutexed(env) == 0);
+                cfs_waitq_wait(&waiter, CFS_TASK_INTERRUPTIBLE);
+
+                cl_lock_mutex_get(env, lock);
+                set_current_state(CFS_TASK_RUNNING);
+                cfs_waitq_del(&lock->cll_wq, &waiter);
+                result = cfs_signal_pending() ? -EINTR : 0;
+        }
+        /* the state change, if any, has been consumed */
+        lock->cll_flags &= ~CLF_STATE;
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_state_wait);
+
+/**
+ * Announces \a state to the layers of \a lock and to the threads waiting on
+ * it: calls cl_lock_operations::clo_state() of every layer that has one
+ * (top-to-bottom), sets CLF_STATE and wakes up everybody sleeping on
+ * cl_lock::cll_wq. cl_lock::cll_state itself is not modified here.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ */
+static void cl_lock_state_signal(const struct lu_env *env, struct cl_lock *lock,
+                                 enum cl_lock_state state)
+{
+        const struct cl_lock_slice *layer;
+
+        ENTRY;
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+
+        list_for_each_entry(layer, &lock->cll_layers, cls_linkage) {
+                if (layer->cls_ops->clo_state == NULL)
+                        continue;
+                layer->cls_ops->clo_state(env, layer, state);
+        }
+        lock->cll_flags |= CLF_STATE;
+        cfs_waitq_broadcast(&lock->cll_wq);
+        EXIT;
+}
+
+/**
+ * Notifies waiters that lock state changed.
+ *
+ * Wakes up all waiters sleeping in cl_lock_state_wait(), also notifies all
+ * layers about state change by calling cl_lock_operations::clo_state()
+ * top-to-bottom. The current state of \a lock is what gets reported; the
+ * state itself is not changed.
+ */
+void cl_lock_signal(const struct lu_env *env, struct cl_lock *lock)
+{
+        ENTRY;
+        cl_lock_state_signal(env, lock, lock->cll_state);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_signal);
+
+/**
+ * Changes lock state.
+ *
+ * This function is invoked to notify layers that lock state changed, possibly
+ * as a result of an asynchronous event such as call-back reception.
+ *
+ * Nothing is done when \a lock is already in \a state. Otherwise per-state
+ * site statistics are updated and layers and waiters are notified before
+ * cl_lock::cll_state is assigned, so clo_state() methods are called while
+ * the lock still holds its previous state.
+ *
+ * \post lock->cll_state == state
+ *
+ * \see cl_lock_operations::clo_state()
+ */
+void cl_lock_state_set(const struct lu_env *env, struct cl_lock *lock,
+                       enum cl_lock_state state)
+{
+        struct cl_site *site = cl_object_site(lock->cll_descr.cld_obj);
+
+        ENTRY;
+        /* only forward transitions are legal, with the exceptions below */
+        LASSERT(lock->cll_state <= state ||
+                (lock->cll_state == CLS_CACHED &&
+                 (state == CLS_HELD || /* lock found in cache */
+                  state == CLS_NEW     /* sub-lock canceled */)) ||
+                /* sub-lock canceled during unlocking */
+                (lock->cll_state == CLS_UNLOCKING && state == CLS_NEW));
+
+        if (lock->cll_state != state) {
+                atomic_dec(&site->cs_locks_state[lock->cll_state]);
+                atomic_inc(&site->cs_locks_state[state]);
+
+                cl_lock_state_signal(env, lock, state);
+                lock->cll_state = state;
+        }
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_state_set);
+
+/**
+ * Yanks lock from the cache (cl_lock_state::CLS_CACHED state) by calling
+ * cl_lock_operations::clo_use() top-to-bottom to notify layers.
+ *
+ * Iteration stops at the first layer whose clo_use() returns non-zero. At
+ * least one layer has to implement clo_use(): the result starts out as
+ * -ENOSYS and is asserted to be different after the loop.
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_HELD)
+ *
+ * \return 0 on success, or the first non-zero value returned by a layer.
+ */
+int cl_use_try(const struct lu_env *env, struct cl_lock *lock)
+{
+        int result;
+        const struct cl_lock_slice *slice;
+
+        ENTRY;
+        result = -ENOSYS;
+        list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+                if (slice->cls_ops->clo_use != NULL) {
+                        result = slice->cls_ops->clo_use(env, slice);
+                        if (result != 0)
+                                break;
+                }
+        }
+        LASSERT(result != -ENOSYS);
+        if (result == 0)
+                cl_lock_state_set(env, lock, CLS_HELD);
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_use_try);
+
+/**
+ * Helper for cl_enqueue_try() that calls ->clo_enqueue() across all layers
+ * top-to-bottom.
+ *
+ * Iteration stops at the first layer whose ->clo_enqueue() returns non-zero.
+ * At least one layer has to implement ->clo_enqueue(): the result starts out
+ * as -ENOSYS and is asserted to be different after the loop.
+ *
+ * \return 0 if every implementing layer returned 0, otherwise the first
+ *         non-zero value returned by a layer.
+ */
+static int cl_enqueue_kick(const struct lu_env *env,
+                           struct cl_lock *lock,
+                           struct cl_io *io, __u32 flags)
+{
+        int result;
+        const struct cl_lock_slice *slice;
+
+        ENTRY;
+        result = -ENOSYS;
+        list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+                if (slice->cls_ops->clo_enqueue != NULL) {
+                        result = slice->cls_ops->clo_enqueue(env,
+                                                             slice, io, flags);
+                        if (result != 0)
+                                break;
+                }
+        }
+        LASSERT(result != -ENOSYS);
+        RETURN(result);
+}
+
+/**
+ * Tries to enqueue a lock.
+ *
+ * This function is called repeatedly by cl_enqueue() until either lock is
+ * enqueued, or error occurs. This function does not block waiting for
+ * networking communication to complete.
+ *
+ * The step taken depends on the current lock state; the whole step is
+ * repeated for as long as it yields CLO_REPEAT. If lock->cll_error is already
+ * set, no step is taken.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ *                         lock->cll_state == CLS_HELD)
+ *
+ * \return the result of the last step, or lock->cll_error when that result
+ *         is 0. A negative step result is also recorded in the lock through
+ *         cl_lock_error(). For a lock in CLS_UNLOCKING the step result is
+ *         CLO_WAIT.
+ *
+ * \see cl_enqueue() cl_lock_operations::clo_enqueue()
+ * \see cl_lock_state::CLS_ENQUEUED
+ */
+int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
+                   struct cl_io *io, __u32 flags)
+{
+        int result;
+
+        ENTRY;
+        do {
+                result = 0;
+
+                LINVRNT(cl_lock_is_mutexed(lock));
+
+                if (lock->cll_error != 0)
+                        break;
+                switch (lock->cll_state) {
+                case CLS_NEW:
+                        cl_lock_state_set(env, lock, CLS_QUEUING);
+                        /* fall-through */
+                case CLS_QUEUING:
+                        /* kick layers. */
+                        result = cl_enqueue_kick(env, lock, io, flags);
+                        if (result == 0)
+                                cl_lock_state_set(env, lock, CLS_ENQUEUED);
+                        break;
+                case CLS_UNLOCKING:
+                        /* wait until unlocking finishes, and enqueue lock
+                         * afresh. */
+                        result = CLO_WAIT;
+                        break;
+                case CLS_CACHED:
+                        /* yank lock from the cache. */
+                        result = cl_use_try(env, lock);
+                        break;
+                case CLS_ENQUEUED:
+                case CLS_HELD:
+                        /* already enqueued or granted, nothing to do. */
+                        result = 0;
+                        break;
+                default:
+                case CLS_FREEING:
+                        /*
+                         * impossible, only held locks with increased
+                         * ->cll_holds can be enqueued, and they cannot be
+                         * freed.
+                         */
+                        LBUG();
+                }
+        } while (result == CLO_REPEAT);
+        if (result < 0)
+                cl_lock_error(env, lock, result);
+        RETURN(result ?: lock->cll_error);
+}
+EXPORT_SYMBOL(cl_enqueue_try);
+
+static int cl_enqueue_locked(const struct lu_env *env, struct cl_lock *lock,
+                             struct cl_io *io, __u32 enqflags)
+{
+        int result;
+
+        ENTRY;
+
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+        LASSERT(lock->cll_holds > 0); /* caller must already own a hold */
+
+        cl_lock_user_add(env, lock); /* undone below if enqueue fails */
+        do {
+                result = cl_enqueue_try(env, lock, io, enqflags);
+                if (result == CLO_WAIT) {
+                        result = cl_lock_state_wait(env, lock);
+                        if (result == 0)
+                                continue; /* wait succeeded, try again */
+                }
+                break;
+        } while (1);
+        if (result != 0) {
+                cl_lock_user_del(env, lock);
+                if (result != -EINTR) /* -EINTR is not stored in the lock */
+                        cl_lock_error(env, lock, result);
+        }
+        LASSERT(ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+                     lock->cll_state == CLS_HELD));
+        RETURN(result);
+}
+
+/**
+ * Enqueues a lock.
+ *
+ * Takes the lock mutex around cl_enqueue_locked(). The lockdep acquisition
+ * made on entry is released again if enqueue fails; on success it is kept.
+ *
+ * \pre current thread or io owns a hold on lock.
+ *
+ * \post ergo(result == 0, lock->users increased)
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ *                         lock->cll_state == CLS_HELD)
+ *
+ * \return the result of cl_enqueue_locked().
+ */
+int cl_enqueue(const struct lu_env *env, struct cl_lock *lock,
+               struct cl_io *io, __u32 enqflags)
+{
+        int result;
+
+        ENTRY;
+
+        cl_lock_lockdep_acquire(env, lock, enqflags);
+        cl_lock_mutex_get(env, lock);
+        result = cl_enqueue_locked(env, lock, io, enqflags);
+        cl_lock_mutex_put(env, lock);
+        if (result != 0)
+                cl_lock_lockdep_release(env, lock);
+        LASSERT(ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+                     lock->cll_state == CLS_HELD));
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_enqueue);
+
+/**
+ * Tries to unlock a lock.
+ *
+ * This function is called repeatedly by cl_unuse() until either lock is
+ * unlocked, or error occurs.
+ *
+ * If the lock is not yet in CLS_UNLOCKING and has more than one user, only
+ * one user is removed and 0 is returned. Otherwise the lock is moved to
+ * CLS_UNLOCKING and cl_lock_operations::clo_unuse() is called bottom-to-top,
+ * repeating the pass for as long as it yields CLO_REPEAT.
+ *
+ * \pre lock->cll_state <= CLS_HELD || lock->cll_state == CLS_UNLOCKING
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_CACHED)
+ *
+ * \return the result of the last pass (with -ESTALE turned into 0), or
+ *         lock->cll_error when that result is 0. A negative value is also
+ *         recorded in the lock through cl_lock_error().
+ *
+ * \see cl_unuse() cl_lock_operations::clo_unuse()
+ * \see cl_lock_state::CLS_CACHED
+ */
+int cl_unuse_try(const struct lu_env *env, struct cl_lock *lock)
+{
+        const struct cl_lock_slice *slice;
+        int                         result;
+
+        ENTRY;
+        if (lock->cll_state != CLS_UNLOCKING) {
+                if (lock->cll_users > 1) {
+                        cl_lock_user_del(env, lock);
+                        RETURN(0);
+                }
+                /*
+                 * New lock users (->cll_users) are not protecting unlocking
+                 * from proceeding. From this point, lock eventually reaches
+                 * CLS_CACHED, is reinitialized to CLS_NEW or fails into
+                 * CLS_FREEING.
+                 */
+                cl_lock_state_set(env, lock, CLS_UNLOCKING);
+        }
+        do {
+                result = 0;
+
+                if (lock->cll_error != 0)
+                        break;
+
+                LINVRNT(cl_lock_is_mutexed(lock));
+                LINVRNT(cl_lock_invariant(env, lock));
+                LASSERT(lock->cll_state == CLS_UNLOCKING);
+                LASSERT(lock->cll_users > 0);
+                LASSERT(lock->cll_holds > 0);
+
+                result = -ENOSYS;
+                list_for_each_entry_reverse(slice, &lock->cll_layers,
+                                            cls_linkage) {
+                        if (slice->cls_ops->clo_unuse != NULL) {
+                                result = slice->cls_ops->clo_unuse(env, slice);
+                                if (result != 0)
+                                        break;
+                        }
+                }
+                LASSERT(result != -ENOSYS);
+        } while (result == CLO_REPEAT);
+        if (result != CLO_WAIT)
+                /*
+                 * Once there is no more need to iterate ->clo_unuse() calls,
+                 * remove lock user. This is done even if unrecoverable error
+                 * happened during unlocking, because nothing else can be
+                 * done.
+                 */
+                cl_lock_user_del(env, lock);
+        if (result == 0 || result == -ESTALE) {
+                enum cl_lock_state state;
+
+                /*
+                 * Return lock back to the cache. This is the only
+                 * place where lock is moved into CLS_CACHED state.
+                 *
+                 * If one of ->clo_unuse() methods returned -ESTALE, lock
+                 * cannot be placed into cache and has to be
+                 * re-initialized. This happens e.g., when a sub-lock was
+                 * canceled while unlocking was in progress.
+                 */
+                state = result == 0 ? CLS_CACHED : CLS_NEW;
+                cl_lock_state_set(env, lock, state);
+
+                /*
+                 * Hide -ESTALE error.
+                 * Consider a glimpse lock that has multiple stripes, where
+                 * one of its sub-locks returned -ENAVAIL and the other
+                 * sub-locks are matched write locks. In this case we can't
+                 * set this lock to error, because otherwise some of its
+                 * sub-locks may not be canceled, and as a result some dirty
+                 * pages would not be written to OSTs. -jay
+                 */
+                result = 0;
+        }
+        result = result ?: lock->cll_error;
+        if (result < 0)
+                cl_lock_error(env, lock, result);
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_unuse_try);
+
+static void cl_unuse_locked(const struct lu_env *env, struct cl_lock *lock)
+{
+        ENTRY;
+        LASSERT(lock->cll_state <= CLS_HELD);
+        do {
+                int result;
+
+                result = cl_unuse_try(env, lock);
+                if (result == CLO_WAIT) {
+                        result = cl_lock_state_wait(env, lock);
+                        if (result == 0)
+                                continue; /* wait succeeded, try again */
+                }
+                break; /* any other result ends the loop and is dropped */
+        } while (1);
+        EXIT;
+}
+
+/**
+ * Unlocks a lock.
+ *
+ * Runs cl_unuse_locked() under the lock mutex, then releases the lockdep
+ * acquisition of the lock. No status is returned to the caller.
+ */
+void cl_unuse(const struct lu_env *env, struct cl_lock *lock)
+{
+        ENTRY;
+        cl_lock_mutex_get(env, lock);
+        cl_unuse_locked(env, lock);
+        cl_lock_mutex_put(env, lock);
+        cl_lock_lockdep_release(env, lock);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_unuse);
+
+/**
+ * Tries to wait for a lock.
+ *
+ * This function is called repeatedly by cl_wait() until either lock is
+ * granted, or error occurs. This function does not block waiting for network
+ * communication to complete.
+ *
+ * Nothing is done if lock->cll_error is set or the lock is already in
+ * CLS_HELD. Otherwise cl_lock_operations::clo_wait() is called top-to-bottom,
+ * stopping at the first layer that returns non-zero, and the lock is moved
+ * to CLS_HELD when every implementing layer returned 0. The pass is repeated
+ * for as long as it yields CLO_REPEAT.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ * \pre lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD
+ * \pre lock->cll_users > 0 && lock->cll_holds > 0
+ *
+ * \return the result of the last pass, or lock->cll_error when that result
+ *         is 0.
+ *
+ * \see cl_wait() cl_lock_operations::clo_wait()
+ * \see cl_lock_state::CLS_HELD
+ */
+int cl_wait_try(const struct lu_env *env, struct cl_lock *lock)
+{
+        const struct cl_lock_slice *slice;
+        int                         result;
+
+        ENTRY;
+        do {
+                LINVRNT(cl_lock_is_mutexed(lock));
+                LINVRNT(cl_lock_invariant(env, lock));
+                LASSERT(lock->cll_state == CLS_ENQUEUED ||
+                        lock->cll_state == CLS_HELD);
+                LASSERT(lock->cll_users > 0);
+                LASSERT(lock->cll_holds > 0);
+
+                result = 0;
+                if (lock->cll_error != 0)
+                        break;
+                if (lock->cll_state == CLS_HELD)
+                        /* nothing to do */
+                        break;
+
+                result = -ENOSYS;
+                list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+                        if (slice->cls_ops->clo_wait != NULL) {
+                                result = slice->cls_ops->clo_wait(env, slice);
+                                if (result != 0)
+                                        break;
+                        }
+                }
+                LASSERT(result != -ENOSYS);
+                if (result == 0)
+                        cl_lock_state_set(env, lock, CLS_HELD);
+        } while (result == CLO_REPEAT);
+        RETURN(result ?: lock->cll_error);
+}
+EXPORT_SYMBOL(cl_wait_try);
+
+/**
+ * Waits until enqueued lock is granted.
+ *
+ * Calls cl_wait_try() under the lock mutex, sleeping in cl_lock_state_wait()
+ * each time CLO_WAIT is returned. On a negative result the lock user and the
+ * lockdep acquisition are dropped, and the error is recorded in the lock
+ * through cl_lock_error() unless it is -EINTR.
+ *
+ * \pre current thread or io owns a hold on the lock
+ * \pre lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_HELD)
+ */
+int cl_wait(const struct lu_env *env, struct cl_lock *lock)
+{
+        int result;
+
+        ENTRY;
+        cl_lock_mutex_get(env, lock);
+
+        LINVRNT(cl_lock_invariant(env, lock));
+        LASSERT(lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD);
+        LASSERT(lock->cll_holds > 0);
+
+        do {
+                result = cl_wait_try(env, lock);
+                if (result == CLO_WAIT) {
+                        result = cl_lock_state_wait(env, lock);
+                        if (result == 0)
+                                continue;
+                }
+                break;
+        } while (1);
+        if (result < 0) {
+                cl_lock_user_del(env, lock);
+                if (result != -EINTR)
+                        cl_lock_error(env, lock, result);
+                cl_lock_lockdep_release(env, lock);
+        }
+        cl_lock_mutex_put(env, lock);
+        LASSERT(ergo(result == 0, lock->cll_state == CLS_HELD));
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_wait);
+
+/**
+ * Executes cl_lock_operations::clo_weigh(), and sums results to estimate lock
+ * value.
+ *
+ * Layers are visited bottom-to-top. If the running sum wraps around, it is
+ * reset to ~0UL; later layers are still visited and added on top of that.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \return the accumulated weight, 0 if no layer implements clo_weigh().
+ */
+unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock)
+{
+        const struct cl_lock_slice *slice;
+        unsigned long pound;
+        unsigned long ounce;
+
+        ENTRY;
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+
+        pound = 0;
+        list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) {
+                if (slice->cls_ops->clo_weigh != NULL) {
+                        ounce = slice->cls_ops->clo_weigh(env, slice);
+                        pound += ounce;
+                        if (pound < ounce) /* over-weight^Wflow */
+                                pound = ~0UL;
+                }
+        }
+        RETURN(pound);
+}
+EXPORT_SYMBOL(cl_lock_weigh);
+
+/**
+ * Notifies layers that lock description changed.
+ *
+ * The server can grant client a lock different from one that was requested
+ * (e.g., larger in extent). This method is called when the actually granted
+ * lock description becomes known, to let layers accommodate the changed
+ * lock description.
+ *
+ * Layers are notified bottom-to-top. If any layer returns non-zero, that
+ * value is returned immediately and lock->cll_descr is left unchanged.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ * \pre desc->cld_obj == lock->cll_descr.cld_obj
+ *
+ * \return 0 on success, or the first non-zero value returned by a layer.
+ *
+ * \see cl_lock_operations::clo_modify()
+ */
+int cl_lock_modify(const struct lu_env *env, struct cl_lock *lock,
+                   const struct cl_lock_descr *desc)
+{
+        const struct cl_lock_slice *slice;
+        struct cl_object           *obj = lock->cll_descr.cld_obj;
+        struct cl_object_header    *hdr = cl_object_header(obj);
+        int result;
+
+        ENTRY;
+        /* don't allow object to change */
+        LASSERT(obj == desc->cld_obj);
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+
+        list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) {
+                if (slice->cls_ops->clo_modify != NULL) {
+                        result = slice->cls_ops->clo_modify(env, slice, desc);
+                        if (result != 0)
+                                RETURN(result);
+                }
+        }
+        CL_LOCK_DEBUG(D_DLMTRACE, env, lock, " -> "DDESCR"@"DFID"\n",
+                      PDESCR(desc), PFID(lu_object_fid(&desc->cld_obj->co_lu)));
+        /*
+         * Just replace description in place. Nothing more is needed for
+         * now. If locks were indexed according to their extent and/or mode,
+         * that index would have to be updated here.
+         */
+        spin_lock(&hdr->coh_lock_guard);
+        lock->cll_descr = *desc;
+        spin_unlock(&hdr->coh_lock_guard);
+        RETURN(0);
+}
+EXPORT_SYMBOL(cl_lock_modify);
+
+/**
+ * Initializes lock closure with a given origin.
+ *
+ * The closure starts out empty (clc_nr == 0, empty clc_list); \a origin and
+ * \a wait are stored in clc_origin and clc_wait respectively.
+ *
+ * \pre cl_lock_is_mutexed(origin)
+ *
+ * \see cl_lock_closure
+ */
+void cl_lock_closure_init(const struct lu_env *env,
+                          struct cl_lock_closure *closure,
+                          struct cl_lock *origin, int wait)
+{
+        LINVRNT(cl_lock_is_mutexed(origin));
+        LINVRNT(cl_lock_invariant(env, origin));
+
+        CFS_INIT_LIST_HEAD(&closure->clc_list);
+        closure->clc_origin = origin;
+        closure->clc_wait   = wait;
+        closure->clc_nr     = 0;
+}
+EXPORT_SYMBOL(cl_lock_closure_init);
+
+/**
+ * Builds a closure of \a lock.
+ *
+ * Building of a closure consists of adding initial lock (\a lock) into it,
+ * and calling cl_lock_operations::clo_closure() methods of \a lock. These
+ * methods might call cl_lock_closure_build() recursively again, adding more
+ * locks to the closure, etc.
+ *
+ * On any non-zero result the closure is emptied again through
+ * cl_lock_disclosure().
+ *
+ * \pre cl_lock_is_mutexed(closure->clc_origin)
+ *
+ * \return 0 on success, otherwise the non-zero result of cl_lock_enclosure()
+ *         or of the first failing clo_closure() method.
+ *
+ * \see cl_lock_closure
+ */
+int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
+                          struct cl_lock_closure *closure)
+{
+        const struct cl_lock_slice *slice;
+        int result;
+
+        ENTRY;
+        LINVRNT(cl_lock_is_mutexed(closure->clc_origin));
+        LINVRNT(cl_lock_invariant(env, closure->clc_origin));
+
+        result = cl_lock_enclosure(env, lock, closure);
+        if (result == 0) {
+                list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+                        if (slice->cls_ops->clo_closure != NULL) {
+                                result = slice->cls_ops->clo_closure(env, slice,
+                                                                     closure);
+                                if (result != 0)
+                                        break;
+                        }
+                }
+        }
+        if (result != 0)
+                cl_lock_disclosure(env, closure);
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_closure_build);
+
+/**
+ * Adds new lock to a closure.
+ *
+ * Try-locks \a lock and if succeeded, adds it to the closure (never more than
+ * once). If try-lock failed, returns CLO_REPEAT, after optionally waiting
+ * until next try-lock is likely to succeed.
+ *
+ * A zero return from cl_lock_mutex_try() is the success path here. On
+ * failure the closure is emptied through cl_lock_disclosure() first; then,
+ * if closure->clc_wait is set, the origin mutex is dropped, the mutex of
+ * \a lock is taken and released once, and the origin mutex is re-taken.
+ *
+ * \return 0 if \a lock is in the closure on return, CLO_REPEAT otherwise.
+ */
+int cl_lock_enclosure(const struct lu_env *env, struct cl_lock *lock,
+                      struct cl_lock_closure *closure)
+{
+        int result;
+        ENTRY;
+        if (!cl_lock_mutex_try(env, lock)) {
+                /*
+                 * If lock->cll_inclosure is not empty, lock is already in
+                 * this closure.
+                 */
+                if (list_empty(&lock->cll_inclosure)) {
+                        cl_lock_get_trust(lock);
+                        lu_ref_add(&lock->cll_reference, "closure", closure);
+                        list_add(&lock->cll_inclosure, &closure->clc_list);
+                        closure->clc_nr++;
+                } else
+                        cl_lock_mutex_put(env, lock);
+                result = 0;
+        } else {
+                cl_lock_disclosure(env, closure);
+                if (closure->clc_wait) {
+                        cl_lock_get_trust(lock);
+                        lu_ref_add(&lock->cll_reference, "closure-w", closure);
+                        cl_lock_mutex_put(env, closure->clc_origin);
+
+                        LASSERT(cl_lock_nr_mutexed(env) == 0);
+                        cl_lock_mutex_get(env, lock);
+                        cl_lock_mutex_put(env, lock);
+
+                        cl_lock_mutex_get(env, closure->clc_origin);
+                        lu_ref_del(&lock->cll_reference, "closure-w", closure);
+                        cl_lock_put(env, lock);
+                }
+                result = CLO_REPEAT;
+        }
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_enclosure);
+
+/**
+ * Releases mutexes of enclosed locks.
+ *
+ * Every lock on closure->clc_list is unlinked from the list, its mutex is
+ * released, and the "closure" lu_ref and the lock reference are dropped.
+ *
+ * \post closure->clc_nr == 0
+ */
+void cl_lock_disclosure(const struct lu_env *env,
+                        struct cl_lock_closure *closure)
+{
+        struct cl_lock *scan;
+        struct cl_lock *temp;
+
+        list_for_each_entry_safe(scan, temp, &closure->clc_list, cll_inclosure){
+                list_del_init(&scan->cll_inclosure);
+                cl_lock_mutex_put(env, scan);
+                lu_ref_del(&scan->cll_reference, "closure", closure);
+                cl_lock_put(env, scan);
+                closure->clc_nr--;
+        }
+        LASSERT(closure->clc_nr == 0);
+}
+EXPORT_SYMBOL(cl_lock_disclosure);
+
+/**
+ * Finalizes a closure.
+ *
+ * Only asserts that the closure is empty; nothing is released here.
+ */
+void cl_lock_closure_fini(struct cl_lock_closure *closure)
+{
+        LASSERT(closure->clc_nr == 0);
+        LASSERT(list_empty(&closure->clc_list));
+}
+EXPORT_SYMBOL(cl_lock_closure_fini);
+
+/**
+ * Destroys this lock. Notifies layers (bottom-to-top) that lock is being
+ * destroyed, then destroy the lock. If there are holds on the lock, postpone
+ * destruction until all holds are released. This is called when a decision is
+ * made to destroy the lock in the future. E.g., when a blocking AST is
+ * received on it, or fatal communication error happens.
+ *
+ * Postponing is done by setting CLF_DOOMED in lock->cll_flags; with no holds
+ * cl_lock_delete0() is called right away.
+ *
+ * Caller must have a reference on this lock to prevent a situation, when
+ * deleted lock lingers in memory for indefinite time, because nobody calls
+ * cl_lock_put() to finish it.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_operations::clo_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock)
+{
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+
+        ENTRY;
+        if (lock->cll_holds == 0)
+                cl_lock_delete0(env, lock);
+        else
+                lock->cll_flags |= CLF_DOOMED;
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_delete);
+
+/**
+ * Mark lock as irrecoverably failed, and mark it for destruction. This
+ * happens when, e.g., server fails to grant a lock to us, or networking
+ * time-out happens.
+ *
+ * Only the first non-zero error is recorded: if lock->cll_error is already
+ * set, or \a error is 0, nothing is done. Otherwise the error is stored,
+ * waiters are signalled, and the lock is cancelled and deleted.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_error(const struct lu_env *env, struct cl_lock *lock, int error)
+{
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+
+        ENTRY;
+        if (lock->cll_error == 0 && error != 0) {
+                lock->cll_error = error;
+                cl_lock_signal(env, lock);
+                cl_lock_cancel(env, lock);
+                cl_lock_delete(env, lock);
+        }
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_error);
+
+/**
+ * Cancels this lock. Notifies layers
+ * (bottom-to-top) that lock is being cancelled, then destroy the lock. If
+ * there are holds on the lock, postpone cancellation until
+ * all holds are released.
+ *
+ * Postponing is done by setting CLF_CANCELPEND in lock->cll_flags; with no
+ * holds cl_lock_cancel0() is called right away.
+ *
+ * NOTE(review): "then destroy the lock" above looks copied from
+ * cl_lock_delete(); this function itself only calls cl_lock_cancel0() or
+ * sets the flag - confirm against cl_lock_cancel0().
+ *
+ * Cancellation notification is delivered to layers at most once.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_operations::clo_cancel()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock)
+{
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+        ENTRY;
+        if (lock->cll_holds == 0)
+                cl_lock_cancel0(env, lock);
+        else
+                lock->cll_flags |= CLF_CANCELPEND;
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_cancel);
+
+/**
+ * Finds an existing lock covering given page and optionally different from a
+ * given \a except lock.
+ *
+ * The object's lock list is scanned under coh_lock_guard and the first
+ * matching lock whose state is below CLS_FREEING is returned.
+ *
+ * \param except  lock to skip during the scan
+ * \param pending when zero, locks with CLF_CANCELPEND set are skipped
+ * \param canceld when zero, locks with CLF_CANCELLED set are skipped
+ *
+ * \return the matching lock, with a reference taken through
+ *         cl_lock_get_trust(), or NULL if there is none.
+ */
+struct cl_lock *cl_lock_at_page(const struct lu_env *env, struct cl_object *obj,
+                                struct cl_page *page, struct cl_lock *except,
+                                int pending, int canceld)
+{
+        struct cl_object_header *head;
+        struct cl_lock          *scan;
+        struct cl_lock          *lock;
+        struct cl_lock_descr    *need;
+
+        ENTRY;
+
+        head = cl_object_header(obj);
+        need = &cl_env_info(env)->clt_descr;
+        lock = NULL;
+
+        need->cld_mode = CLM_READ; /* CLM_READ matches both READ & WRITE, but
+                                    * not PHANTOM */
+        need->cld_start = need->cld_end = page->cp_index;
+
+        spin_lock(&head->coh_lock_guard);
+        list_for_each_entry(scan, &head->coh_locks, cll_linkage) {
+                if (scan != except &&
+                    cl_lock_ext_match(&scan->cll_descr, need) &&
+                    scan->cll_state < CLS_FREEING &&
+                    /*
+                     * This check is racy as the lock can be canceled right
+                     * after it is done, but this is fine, because page exists
+                     * already.
+                     */
+                    (canceld || !(scan->cll_flags & CLF_CANCELLED)) &&
+                    (pending || !(scan->cll_flags & CLF_CANCELPEND))) {
+                        /* Don't increase cs_hit here since this
+                         * is just a helper function. */
+                        cl_lock_get_trust(scan);
+                        lock = scan;
+                        break;
+                }
+        }
+        spin_unlock(&head->coh_lock_guard);
+        RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_at_page);
+
+/**
+ * Returns a list of pages protected (only) by a given lock.
+ *
+ * Scans an extent of page radix tree, corresponding to the \a lock and queues
+ * all pages that are not protected by locks other than \a lock into \a queue.
+ *
+ * As implemented here, \a queue is the input: every page in it that is also
+ * covered by a lock other than \a lock is moved to a per-thread list and,
+ * when \a io is not NULL, disowned. The pages left in \a queue on return are
+ * the ones for which no other covering lock was found.
+ */
+void cl_lock_page_list_fixup(const struct lu_env *env,
+                             struct cl_io *io, struct cl_lock *lock,
+                             struct cl_page_list *queue)
+{
+        struct cl_page        *page;
+        struct cl_page        *temp;
+        struct cl_page_list   *plist = &cl_env_info(env)->clt_list;
+
+        LINVRNT(cl_lock_invariant(env, lock));
+        ENTRY;
+
+        /* Now, we have a list of cl_pages under the \a lock, we need
+         * to check if some of the pages are covered by another ldlm lock.
+         * If this is the case, they don't need to be written out this time.
+         *
+         * For example, we have A:[0,200] & B:[100,300] PW locks on client, now
+         * the latter is to be canceled, this means other client is
+         * reading/writing [200,300] since A won't be canceled. Actually
+         * we just need to write the pages covered by [200,300]. This is safe,
+         * since [100,200] is also protected by lock A.
+         */
+
+        cl_page_list_init(plist);
+        cl_page_list_for_each_safe(page, temp, queue) {
+                pgoff_t                idx = page->cp_index;
+                struct cl_lock        *found;
+                struct cl_lock_descr  *descr;
+
+                /* The algorithm counts on pages being sorted by ascending
+                 * index. */
+                LASSERT(ergo(&temp->cp_batch != &queue->pl_pages,
+                        page->cp_index < temp->cp_index));
+
+                found = cl_lock_at_page(env, lock->cll_descr.cld_obj,
+                                        page, lock, 0, 0);
+                if (found == NULL)
+                        continue;
+
+                descr = &found->cll_descr;
+                list_for_each_entry_safe_from(page, temp, &queue->pl_pages,
+                                              cp_batch) {
+                        idx = page->cp_index;
+                        if (descr->cld_start > idx || descr->cld_end < idx)
+                                break;
+                        cl_page_list_move(plist, queue, page);
+                }
+                cl_lock_put(env, found);
+        }
+
+        /* The pages in plist are covered by other locks, don't handle them
+         * this time.
+         */
+        if (io != NULL)
+                cl_page_list_disown(env, io, plist);
+        cl_page_list_fini(env, plist);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_page_list_fixup);
+
+/**
+ * Invalidate pages protected by the given lock, sending them out to the
+ * server first, if necessary.
+ *
+ * This function does the following:
+ *
+ *     - collects a list of pages to be invalidated,
+ *
+ *     - unmaps them from the user virtual memory,
+ *
+ *     - sends dirty pages to the server,
+ *
+ *     - waits for transfer completion,
+ *
+ *     - discards pages, and throws them out of memory.
+ *
+ * If \a discard is set, pages are discarded without sending them to the
+ * server.
+ *
+ * If error happens on any step, the process continues anyway (the reasoning
+ * behind this being that lock cancellation cannot be delayed indefinitely).
+ *
+ * The io and page queues used are the per-thread ones from cl_env_info().
+ * cl_io_fini() is called even when cl_io_init() failed.
+ *
+ * \return the cl_io_init() error, if any; otherwise the first non-zero
+ *         result among unmapping, submission and re-owning of the pages
+ *         (only unmapping when \a discard is set); 0 if there were no pages
+ *         or every step succeeded.
+ */
+int cl_lock_page_out(const struct lu_env *env, struct cl_lock *lock,
+                     int discard)
+{
+        struct cl_thread_info *info  = cl_env_info(env);
+        struct cl_io          *io    = &info->clt_io;
+        struct cl_2queue      *queue = &info->clt_queue;
+        struct cl_lock_descr  *descr = &lock->cll_descr;
+        int                      result;
+        int                      rc0;
+        int                      rc1;
+
+        LINVRNT(cl_lock_invariant(env, lock));
+        ENTRY;
+
+        io->ci_obj = cl_object_top(descr->cld_obj);
+        result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+        if (result == 0) {
+
+                cl_2queue_init(queue);
+                cl_page_gang_lookup(env, descr->cld_obj, io, descr->cld_start,
+                                    descr->cld_end, &queue->c2_qin);
+                if (queue->c2_qin.pl_nr > 0) {
+                        result = cl_page_list_unmap(env, io, &queue->c2_qin);
+                        if (!discard) {
+                                rc0 = cl_io_submit_rw(env, io,
+                                                      CRT_WRITE, queue);
+                                rc1 = cl_page_list_own(env, io,
+                                                       &queue->c2_qout);
+                                result = result ?: rc0 ?: rc1;
+                        }
+                        cl_lock_page_list_fixup(env, io, lock, &queue->c2_qout);
+                        cl_2queue_discard(env, io, queue);
+                        cl_2queue_disown(env, io, queue);
+                }
+                cl_2queue_fini(env, queue);
+        }
+        cl_io_fini(env, io);
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_page_out);
+
+/**
+ * Eliminate all locks for a given object.
+ *
+ * Caller has to guarantee that no lock is in active use.
+ *
+ * The first lock on the object's coh_locks list is handled repeatedly until
+ * the list is empty. coh_lock_guard is dropped while a lock is processed
+ * under its own mutex; a reference taken beforehand keeps the lock alive.
+ *
+ * NOTE(review): loop termination presumably relies on cl_lock_delete()
+ * unlinking the lock from coh_locks - confirm against cl_lock_delete0().
+ *
+ * \param cancel when this is set, cl_locks_prune() cancels locks before
+ *               destroying.
+ */
+void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int cancel)
+{
+        struct cl_object_header *head;
+        struct cl_lock          *lock;
+
+        ENTRY;
+        head = cl_object_header(obj);
+        /*
+         * If locks are destroyed without cancellation, all pages must be
+         * already destroyed (as otherwise they will be left unprotected).
+         */
+        LASSERT(ergo(!cancel,
+                     head->coh_tree.rnode == NULL && head->coh_pages == 0));
+
+        spin_lock(&head->coh_lock_guard);
+        while (!list_empty(&head->coh_locks)) {
+                lock = container_of(head->coh_locks.next,
+                                    struct cl_lock, cll_linkage);
+                cl_lock_get_trust(lock);
+                spin_unlock(&head->coh_lock_guard);
+                lu_ref_add(&lock->cll_reference, "prune", cfs_current());
+                cl_lock_mutex_get(env, lock);
+                if (lock->cll_state < CLS_FREEING) {
+                        LASSERT(lock->cll_holds == 0);
+                        LASSERT(lock->cll_users == 0);
+                        if (cancel)
+                                cl_lock_cancel(env, lock);
+                        cl_lock_delete(env, lock);
+                }
+                cl_lock_mutex_put(env, lock);
+                lu_ref_del(&lock->cll_reference, "prune", cfs_current());
+                cl_lock_put(env, lock);
+                spin_lock(&head->coh_lock_guard);
+        }
+        spin_unlock(&head->coh_lock_guard);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_locks_prune);
+
+/**
+ * Returns true if \a addr is an address of an allocated cl_lock. Used in
+ * assertions. This check is optimistically imprecise, i.e., it occasionally
+ * returns true for the incorrect addresses, but if it returns false, then the
+ * address is guaranteed to be incorrect. (Should be named cl_lockp().)
+ *
+ * \see cl_is_page()
+ */
+int cl_is_lock(const void *addr)
+{
+        int in_lock_slab;
+
+        /* the only test made: does the address fall into the cl_lock slab? */
+        in_lock_slab = cfs_mem_is_in_cache(addr, cl_lock_kmem);
+        return in_lock_slab;
+}
+EXPORT_SYMBOL(cl_is_lock);
+
+/*
+ * Finds a lock matching \a need and returns it with its mutex taken and a
+ * hold added. Locks found in CLS_FREEING state (or later) are dropped and the
+ * search is repeated. An error pointer from cl_lock_find() is returned as is.
+ */
+static struct cl_lock *cl_lock_hold_mutex(const struct lu_env *env,
+                                          const struct cl_io *io,
+                                          const struct cl_lock_descr *need,
+                                          const char *scope, const void *source)
+{
+        struct cl_lock *found;
+
+        ENTRY;
+
+        for (;;) {
+                found = cl_lock_find(env, io, need);
+                if (IS_ERR(found))
+                        break;
+                cl_lock_mutex_get(env, found);
+                if (found->cll_state >= CLS_FREEING) {
+                        /* dying lock: let go of it and search again */
+                        cl_lock_mutex_put(env, found);
+                        cl_lock_put(env, found);
+                        continue;
+                }
+                /* usable lock: add the hold and leave with mutex taken */
+                cl_lock_hold_mod(env, found, +1);
+                lu_ref_add(&found->cll_holders, scope, source);
+                lu_ref_add(&found->cll_reference, scope, source);
+                break;
+        }
+        RETURN(found);
+}
+
+/**
+ * Returns a lock matching \a need description with a reference and a hold on
+ * it.
+ *
+ * This is much like cl_lock_find(), except that cl_lock_hold() additionally
+ * guarantees that lock is not in the CLS_FREEING state on return.
+ */
+struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io,
+                             const struct cl_lock_descr *need,
+                             const char *scope, const void *source)
+{
+        struct cl_lock *held;
+
+        ENTRY;
+
+        held = cl_lock_hold_mutex(env, io, need, scope, source);
+        if (IS_ERR(held))
+                RETURN(held);
+
+        /* callers get the hold only; the mutex is released here */
+        cl_lock_mutex_put(env, held);
+        RETURN(held);
+}
+EXPORT_SYMBOL(cl_lock_hold);
+
+/**
+ * Main high-level entry point of cl_lock interface that finds existing or
+ * enqueues new lock matching given description.
+ */
+struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
+                                const struct cl_lock_descr *need,
+                                __u32 enqflags,
+                                const char *scope, const void *source)
+{
+        struct cl_lock       *lock;
+        const struct lu_fid  *fid;
+        int                   rc;
+        int                   iter;
+
+        ENTRY;
+        fid = lu_object_fid(&io->ci_obj->co_lu);
+        iter = 0;
+        /*
+         * Retry loop: it continues only while rc == 0, that is, while a lock
+         * was enqueued successfully but turned out not to fit \a need. Any
+         * error (from finding or from enqueuing) ends the loop, and so does
+         * the break on success.
+         */
+        do {
+                /*
+                 * Escalate the message to D_WARNING on iteration 16 and
+                 * later when IS_PO2(iter) holds (presumably: iter is a
+                 * power of two), so long retry chains become visible
+                 * without flooding the log.
+                 */
+                CDEBUG(iter >= 16 && IS_PO2(iter) ? D_WARNING : D_DLMTRACE,
+                       DDESCR"@"DFID" %i %08x `%s'\n",
+                       PDESCR(need), PFID(fid), iter, enqflags, scope);
+                lock = cl_lock_hold_mutex(env, io, need, scope, source);
+                if (!IS_ERR(lock)) {
+                        rc = cl_enqueue_locked(env, lock, io, enqflags);
+                        if (rc == 0) {
+                                if (cl_lock_fits_into(env, lock, need, io)) {
+                                        /*
+                                         * Success: leave with the hold and
+                                         * the reference, without the mutex.
+                                         */
+                                        cl_lock_mutex_put(env, lock);
+                                        cl_lock_lockdep_acquire(env,
+                                                                lock, enqflags);
+                                        break;
+                                }
+                                cl_unuse_locked(env, lock);
+                        }
+                        /* undo what cl_lock_hold_mutex() acquired */
+                        cl_lock_hold_release(env, lock, scope, source);
+                        cl_lock_mutex_put(env, lock);
+                        lu_ref_del(&lock->cll_reference, scope, source);
+                        cl_lock_put(env, lock);
+                        /*
+                         * With rc == 0 this stores ERR_PTR(0); the value is
+                         * overwritten on the next iteration.
+                         */
+                        lock = ERR_PTR(rc);
+                } else
+                        rc = PTR_ERR(lock);
+                iter++;
+        } while (rc == 0);
+        RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_request);
+
+/**
+ * Adds a hold to a known lock.
+ */
+void cl_lock_hold_add(const struct lu_env *env, struct cl_lock *lock,
+                      const char *scope, const void *source)
+{
+        /* the caller must already own the lock mutex */
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+        LASSERT(lock->cll_state != CLS_FREEING);
+
+        ENTRY;
+        /* one more hold, plus a reference that goes together with it */
+        cl_lock_hold_mod(env, lock, +1);
+        cl_lock_get(lock);
+        /* record (scope, source) in both lu_ref trackers */
+        lu_ref_add(&lock->cll_holders, scope, source);
+        lu_ref_add(&lock->cll_reference, scope, source);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_hold_add);
+
+/**
+ * Releases a hold and a reference on a lock, on which caller acquired a
+ * mutex.
+ */
+void cl_lock_unhold(const struct lu_env *env, struct cl_lock *lock,
+                    const char *scope, const void *source)
+{
+        LINVRNT(cl_lock_invariant(env, lock));
+        ENTRY;
+        /*
+         * Unlike cl_lock_release(), the lock mutex is not taken here: the
+         * caller is expected to own it already.
+         */
+        cl_lock_hold_release(env, lock, scope, source);
+        lu_ref_del(&lock->cll_reference, scope, source);
+        cl_lock_put(env, lock);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_unhold);
+
+/**
+ * Releases a hold and a reference on a lock, obtained by cl_lock_hold().
+ */
+void cl_lock_release(const struct lu_env *env, struct cl_lock *lock,
+                     const char *scope, const void *source)
+{
+        LINVRNT(cl_lock_invariant(env, lock));
+        ENTRY;
+        /* the hold is dropped under the lock mutex, taken here */
+        cl_lock_mutex_get(env, lock);
+        cl_lock_hold_release(env, lock, scope, source);
+        cl_lock_mutex_put(env, lock);
+        /* then the tracked reference goes, outside of the mutex */
+        lu_ref_del(&lock->cll_reference, scope, source);
+        cl_lock_put(env, lock);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_release);
+
+/**
+ * Adds one user to \a lock by calling cl_lock_used_mod() with +1. The caller
+ * must own the lock mutex.
+ */
+void cl_lock_user_add(const struct lu_env *env, struct cl_lock *lock)
+{
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+
+        ENTRY;
+        cl_lock_used_mod(env, lock, +1);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_user_add);
+
+/**
+ * Removes one user from \a lock. The caller must own the lock mutex and the
+ * lock must have at least one user.
+ *
+ * \retval 1 no users are left after the removal
+ * \retval 0 otherwise
+ */
+int cl_lock_user_del(const struct lu_env *env, struct cl_lock *lock)
+{
+        int no_users_left;
+
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+        LASSERT(lock->cll_users > 0);
+
+        ENTRY;
+        cl_lock_used_mod(env, lock, -1);
+        no_users_left = lock->cll_users == 0;
+        RETURN(no_users_left);
+}
+EXPORT_SYMBOL(cl_lock_user_del);
+
+/**
+ * Checks whether the modes of two locks are compatible.
+ *
+ * Returns true iff en-queuing \a lock2 won't cause cancellation of \a lock1,
+ * even when the two locks overlap: either \a lock2 is a phantom lock, or both
+ * locks are read locks.
+ */
+int cl_lock_compatible(const struct cl_lock *lock1, const struct cl_lock *lock2)
+{
+        const struct cl_lock_descr *existing;
+        const struct cl_lock_descr *incoming;
+        int                         compatible;
+
+        ENTRY;
+        existing = &lock1->cll_descr;
+        incoming = &lock2->cll_descr;
+        if (incoming->cld_mode == CLM_PHANTOM)
+                compatible = 1;
+        else
+                compatible = existing->cld_mode == CLM_READ &&
+                             incoming->cld_mode == CLM_READ;
+        RETURN(compatible);
+}
+EXPORT_SYMBOL(cl_lock_compatible);
+
+/**
+ * Maps a lock mode to its printable name; modes outside of the name table
+ * yield "UNKNW".
+ */
+const char *cl_lock_mode_name(const enum cl_lock_mode mode)
+{
+        static const char *names[] = {
+                [CLM_PHANTOM] = "PHANTOM",
+                [CLM_READ]    = "READ",
+                [CLM_WRITE]   = "WRITE"
+        };
+
+        /* reject out-of-table values first */
+        if (mode < 0 || mode >= ARRAY_SIZE(names))
+                return "UNKNW";
+        return names[mode];
+}
+EXPORT_SYMBOL(cl_lock_mode_name);
+
+/**
+ * Prints human readable representation of a lock description.
+ */
+void cl_lock_descr_print(const struct lu_env *env, void *cookie,
+                         lu_printer_t printer,
+                         const struct cl_lock_descr *descr)
+{
+        const struct lu_object *lu  = &descr->cld_obj->co_lu;
+        const struct lu_fid    *fid = lu_object_fid(lu);
+
+        /* the description, followed by the fid of the object it covers */
+        printer(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(fid));
+}
+EXPORT_SYMBOL(cl_lock_descr_print);
+
+/**
+ * Prints human readable representation of \a lock to the \a f.
+ */
+void cl_lock_print(const struct lu_env *env, void *cookie,
+                   lu_printer_t printer, const struct cl_lock *lock)
+{
+        const struct cl_lock_slice *layer;
+
+        /* header: reference count, state, error, holds, users, flags */
+        printer(env, cookie, "lock@%p[%d %d %d %d %d %08lx] ",
+                lock, atomic_read(&lock->cll_ref),
+                lock->cll_state, lock->cll_error, lock->cll_holds,
+                lock->cll_users, lock->cll_flags);
+        cl_lock_descr_print(env, cookie, printer, &lock->cll_descr);
+        printer(env, cookie, " {\n");
+
+        /* one line per layer; layers without clo_print get the prefix only */
+        list_for_each_entry(layer, &lock->cll_layers, cls_linkage) {
+                printer(env, cookie, "    %s@%p: ",
+                        layer->cls_obj->co_lu.lo_dev->ld_type->ldt_name,
+                        layer);
+                if (layer->cls_ops->clo_print != NULL)
+                        layer->cls_ops->clo_print(env, cookie, printer, layer);
+                printer(env, cookie, "\n");
+        }
+        printer(env, cookie, "} lock@%p\n", lock);
+}
+EXPORT_SYMBOL(cl_lock_print);
+
+/**
+ * Sets up the caches described by cl_lock_caches; returns whatever
+ * lu_kmem_init() returns.
+ */
+int cl_lock_init(void)
+{
+        int rc;
+
+        rc = lu_kmem_init(cl_lock_caches);
+        return rc;
+}
+
+/**
+ * Tears down the caches described by cl_lock_caches. Dual to cl_lock_init().
+ */
+void cl_lock_fini(void)
+{
+        lu_kmem_fini(cl_lock_caches);
+}
diff --git a/lustre/obdclass/cl_object.c b/lustre/obdclass/cl_object.c

new file mode 100644 (file)

index 0000000..98f7c09
--- /dev/null
+++ b/lustre/obdclass/cl_object.c
@@ -0,0 +1,1077 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Lustre Object.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+/*
+ * Locking.
+ *
+ *  i_mutex
+ *      PG_locked
+ *          ->coh_page_guard
+ *          ->coh_lock_guard
+ *          ->coh_attr_guard
+ *          ->ls_guard
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include <libcfs/libcfs.h>
+/* class_put_type() */
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <libcfs/list.h>
+/* lu_time_global_{init,fini}() */
+#include <lu_time.h>
+
+#include <cl_object.h>
+#include "cl_internal.h"
+
+/** Slab cache from which struct cl_env instances are allocated and freed. */
+static cfs_mem_cache_t *cl_env_kmem;
+
+/** Lock class of cl_object_header::coh_page_guard */
+static struct lock_class_key cl_page_guard_class;
+/** Lock class of cl_object_header::coh_lock_guard */
+static struct lock_class_key cl_lock_guard_class;
+/** Lock class of cl_object_header::coh_attr_guard */
+static struct lock_class_key cl_attr_guard_class;
+
+/**
+ * Initialize cl_object_header.
+ *
+ * The embedded lu_object_header is initialized first. Only when that succeeds
+ * are the three guard spin-locks, the page counter, the page radix tree and
+ * the list of locks set up.
+ *
+ * \retval 0 on success, otherwise the error from lu_object_header_init()
+ */
+int cl_object_header_init(struct cl_object_header *h)
+{
+        int result;
+
+        ENTRY;
+        result = lu_object_header_init(&h->coh_lu);
+        if (result == 0) {
+                spin_lock_init(&h->coh_page_guard);
+                spin_lock_init(&h->coh_lock_guard);
+                spin_lock_init(&h->coh_attr_guard);
+                /*
+                 * Every guard gets its own lockdep class, so that lockdep
+                 * can tell the three spin-locks apart.
+                 */
+                lockdep_set_class(&h->coh_page_guard, &cl_page_guard_class);
+                lockdep_set_class(&h->coh_lock_guard, &cl_lock_guard_class);
+                lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class);
+                h->coh_pages = 0;
+                /* XXX hard coded GFP_* mask. */
+                INIT_RADIX_TREE(&h->coh_tree, GFP_ATOMIC);
+                CFS_INIT_LIST_HEAD(&h->coh_locks);
+        }
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_header_init);
+
+/**
+ * Finalize cl_object_header.
+ */
+void cl_object_header_fini(struct cl_object_header *h)
+{
+        /* no lock may still be linked to the object at this point */
+        LASSERT(list_empty(&h->coh_locks));
+        lu_object_header_fini(&h->coh_lu);
+}
+EXPORT_SYMBOL(cl_object_header_fini);
+
+/**
+ * Returns a cl_object with a given \a fid.
+ *
+ * Returns either cached or newly created object. Additional reference on the
+ * returned object is acquired.
+ *
+ * \see lu_object_find(), cl_page_find(), cl_lock_find()
+ */
+struct cl_object *cl_object_find(const struct lu_env *env,
+                                 struct cl_device *cd, const struct lu_fid *fid,
+                                 const struct cl_object_conf *c)
+{
+        struct lu_object *slice;
+
+        might_sleep();
+        /* the generic lookup does the work; its result is converted back */
+        slice = lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu);
+        return lu2cl(slice);
+}
+EXPORT_SYMBOL(cl_object_find);
+
+/**
+ * Releases a reference on \a o.
+ *
+ * When last reference is released object is returned to the cache, unless
+ * lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in its header.
+ *
+ * \see cl_page_put(), cl_lock_put().
+ */
+void cl_object_put(const struct lu_env *env, struct cl_object *o)
+{
+        /* thin wrapper: the reference lives in the embedded lu_object */
+        lu_object_put(env, &o->co_lu);
+}
+EXPORT_SYMBOL(cl_object_put);
+
+/**
+ * Acquire an additional reference to the object \a o.
+ *
+ * This can only be used to acquire _additional_ reference, i.e., caller
+ * already has to possess at least one reference to \a o before calling this.
+ *
+ * \see cl_page_get(), cl_lock_get().
+ */
+void cl_object_get(struct cl_object *o)
+{
+        /* thin wrapper: the reference lives in the embedded lu_object */
+        lu_object_get(&o->co_lu);
+}
+EXPORT_SYMBOL(cl_object_get);
+
+/**
+ * Returns the top-object for a given \a o.
+ *
+ * \see cl_page_top(), cl_io_top()
+ */
+struct cl_object *cl_object_top(struct cl_object *o)
+{
+        struct cl_object_header *hdr;
+        struct cl_object        *top;
+
+        /* follow coh_parent links up to the header that has no parent */
+        for (hdr = cl_object_header(o); hdr->coh_parent != NULL;
+             hdr = hdr->coh_parent)
+                ; /* nothing */
+
+        top = lu2cl(lu_object_top(&hdr->coh_lu));
+        CDEBUG(D_TRACE, "%p -> %p\n", o, top);
+        return top;
+}
+EXPORT_SYMBOL(cl_object_top);
+
+/**
+ * Returns pointer to the lock protecting data-attributes for the given object
+ * \a o.
+ *
+ * Data-attributes are protected by the cl_object_header::coh_attr_guard
+ * spin-lock in the top-object.
+ *
+ * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get().
+ */
+static spinlock_t *cl_object_attr_guard(struct cl_object *o)
+{
+        struct cl_object        *top = cl_object_top(o);
+        struct cl_object_header *hdr = cl_object_header(top);
+
+        /* the guard is always the one in the top-object's header */
+        return &hdr->coh_attr_guard;
+}
+
+/**
+ * Locks data-attributes.
+ *
+ * Prevents data-attributes from changing, until lock is released by
+ * cl_object_attr_unlock(). This has to be called before calls to
+ * cl_object_attr_get(), cl_object_attr_set().
+ */
+void cl_object_attr_lock(struct cl_object *o)
+{
+        /* takes the spin-lock returned by cl_object_attr_guard() */
+        spin_lock(cl_object_attr_guard(o));
+}
+EXPORT_SYMBOL(cl_object_attr_lock);
+
+/**
+ * Releases data-attributes lock, acquired by cl_object_attr_lock().
+ */
+void cl_object_attr_unlock(struct cl_object *o)
+{
+        /* releases the spin-lock returned by cl_object_attr_guard() */
+        spin_unlock(cl_object_attr_guard(o));
+}
+EXPORT_SYMBOL(cl_object_attr_unlock);
+
+/**
+ * Returns data-attributes of an object \a obj.
+ *
+ * Every layer is asked (by calling cl_object_operations::coo_attr_get())
+ * top-to-bottom to fill in parts of \a attr that this layer is responsible
+ * for.
+ */
+int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_attr *attr)
+{
+        struct lu_object_header *top;
+        struct cl_object        *layer;
+        int                      rc;
+
+        LASSERT(spin_is_locked(cl_object_attr_guard(obj)));
+        ENTRY;
+
+        top = obj->co_lu.lo_header;
+        rc = 0;
+        list_for_each_entry(layer, &top->loh_layers, co_lu.lo_linkage) {
+                if (layer->co_ops->coo_attr_get == NULL)
+                        continue;
+                rc = layer->co_ops->coo_attr_get(env, layer, attr);
+                if (rc != 0)
+                        break;
+        }
+        /*
+         * A positive value from a layer only stops the walk; it is reported
+         * to the caller as success.
+         */
+        if (rc > 0)
+                rc = 0;
+        RETURN(rc);
+}
+EXPORT_SYMBOL(cl_object_attr_get);
+
+/**
+ * Updates data-attributes of an object \a obj.
+ *
+ * Only attributes, mentioned in a validness bit-mask \a v are
+ * updated. Calls cl_object_operations::coo_attr_set() on every layer, bottom
+ * to top.
+ */
+int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj,
+                       const struct cl_attr *attr, unsigned v)
+{
+        struct lu_object_header *top;
+        struct cl_object        *layer;
+        int                      rc;
+
+        LASSERT(spin_is_locked(cl_object_attr_guard(obj)));
+        ENTRY;
+
+        top = obj->co_lu.lo_header;
+        rc = 0;
+        list_for_each_entry_reverse(layer, &top->loh_layers,
+                                    co_lu.lo_linkage) {
+                if (layer->co_ops->coo_attr_set == NULL)
+                        continue;
+                rc = layer->co_ops->coo_attr_set(env, layer, attr, v);
+                if (rc != 0)
+                        break;
+        }
+        /*
+         * A positive value from a layer only stops the walk; it is reported
+         * to the caller as success.
+         */
+        if (rc > 0)
+                rc = 0;
+        RETURN(rc);
+}
+EXPORT_SYMBOL(cl_object_attr_set);
+
+/**
+ * Notifies layers (bottom-to-top) that glimpse AST was received.
+ *
+ * Layers have to fill \a lvb fields with information that will be shipped
+ * back to glimpse issuer.
+ *
+ * \see cl_lock_operations::clo_glimpse()
+ */
+int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj,
+                      struct ost_lvb *lvb)
+{
+        struct lu_object_header *top;
+        struct cl_object        *layer;
+        int result;
+
+        ENTRY;
+        top = obj->co_lu.lo_header;
+        result = 0;
+        /*
+         * A separate cursor is used for the walk: when the loop runs to
+         * completion the cursor is computed from the list head itself and
+         * is not a valid object, so it must not be used afterwards.
+         */
+        list_for_each_entry_reverse(layer, &top->loh_layers,
+                                    co_lu.lo_linkage) {
+                if (layer->co_ops->coo_glimpse != NULL) {
+                        result = layer->co_ops->coo_glimpse(env, layer, lvb);
+                        /* first non-zero result stops the walk */
+                        if (result != 0)
+                                break;
+                }
+        }
+        /* log against the object the caller passed in */
+        LU_OBJECT_HEADER(D_DLMTRACE, env, &obj->co_lu,
+                         "size: "LPU64" mtime: "LPU64" atime: "LPU64" "
+                         "ctime: "LPU64" blocks: "LPU64"\n",
+                         lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime,
+                         lvb->lvb_ctime, lvb->lvb_blocks);
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_glimpse);
+
+/**
+ * Updates a configuration of an object \a obj.
+ */
+int cl_conf_set(const struct lu_env *env, struct cl_object *obj,
+                const struct cl_object_conf *conf)
+{
+        struct lu_object_header *top;
+        struct cl_object        *layer;
+        int                      rc;
+
+        ENTRY;
+        top = obj->co_lu.lo_header;
+        rc = 0;
+        /* hand the configuration to every layer implementing coo_conf_set */
+        list_for_each_entry(layer, &top->loh_layers, co_lu.lo_linkage) {
+                if (layer->co_ops->coo_conf_set == NULL)
+                        continue;
+                rc = layer->co_ops->coo_conf_set(env, layer, conf);
+                if (rc != 0)
+                        break; /* first non-zero result ends the walk */
+        }
+        RETURN(rc);
+}
+EXPORT_SYMBOL(cl_conf_set);
+
+/**
+ * Helper function removing all object locks, and marking object for
+ * deletion. All object pages must have been deleted at this point.
+ *
+ * This is called by cl_inode_fini() and lov_object_delete() to destroy top-
+ * and sub- objects respectively.
+ */
+void cl_object_kill(const struct lu_env *env, struct cl_object *obj)
+{
+        struct cl_object_header *hdr;
+
+        hdr = cl_object_header(obj);
+        /* all pages must be gone: empty radix tree and zero page count */
+        LASSERT(hdr->coh_tree.rnode == NULL);
+        LASSERT(hdr->coh_pages == 0);
+
+        /* mark the object in its lu_object_header flags */
+        set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags);
+        /*
+         * Destroy all locks. Object destruction (including cl_inode_fini())
+         * cannot cancel the locks, because in the case of a local client,
+         * where client and server share the same thread running
+         * prune_icache(), this can dead-lock with ldlm_cancel_handler()
+         * waiting on __wait_on_freeing_inode().
+         */
+        cl_locks_prune(env, obj, 0);
+}
+EXPORT_SYMBOL(cl_object_kill);
+
+/**
+ * Prunes caches of pages and locks for this object.
+ */
+void cl_object_prune(const struct lu_env *env, struct cl_object *obj)
+{
+        ENTRY;
+        /* pages go first, then the locks, which are cancelled (cancel == 1) */
+        cl_pages_prune(env, obj);
+        cl_locks_prune(env, obj, 1);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_object_prune);
+
+/**
+ * Initializes cache statistics \a cs: stores \a name (the pointer itself, not
+ * a copy) and resets every counter to zero.
+ */
+void cache_stats_init(struct cache_stats *cs, const char *name)
+{
+        cs->cs_name = name;
+        atomic_set(&cs->cs_lookup,  0);
+        atomic_set(&cs->cs_hit,     0);
+        atomic_set(&cs->cs_total,   0);
+        atomic_set(&cs->cs_busy,    0);
+        /* cache_stats_print() reports this counter too, so it is reset */
+        atomic_set(&cs->cs_created, 0);
+}
+
+/**
+ * Formats the counters of \a cs into \a page, which has room for \a count
+ * bytes. When \a h is non-zero, a header line with the column names is
+ * emitted first. No trailing newline follows the counters.
+ *
+ * \return sum of the values returned by the snprintf() calls made
+ */
+int cache_stats_print(const struct cache_stats *cs,
+                      char *page, int count, int h)
+{
+        int used;
+
+        /*
+         * Layout of the output:
+         *
+         *        lookup    hit  total   busy create
+         *   env: ...... ...... ...... ...... ......
+         */
+        if (h)
+                used = snprintf(page, count,
+                                "       lookup    hit  total   busy create\n");
+        else
+                used = 0;
+
+        used += snprintf(page + used, count - used,
+                         "%5.5s: %6u %6u %6u %6u %6u",
+                         cs->cs_name,
+                         atomic_read(&cs->cs_lookup),
+                         atomic_read(&cs->cs_hit),
+                         atomic_read(&cs->cs_total),
+                         atomic_read(&cs->cs_busy),
+                         atomic_read(&cs->cs_created));
+        return used;
+}
+
+/**
+ * Initialize client site.
+ *
+ * Perform common initialization (lu_site_init()), and initialize statistical
+ * counters. Also perform global initializations on the first call.
+ */
+int cl_site_init(struct cl_site *s, struct cl_device *d)
+{
+        int i;
+        int result;
+
+        result = lu_site_init(&s->cs_lu, &d->cd_lu_dev);
+        if (result == 0) {
+                cache_stats_init(&s->cs_pages, "pages");
+                cache_stats_init(&s->cs_locks, "locks");
+                /* reset every per-state counter, pages first, then locks */
+                for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i)
+                        atomic_set(&s->cs_pages_state[i], 0);
+                for (i = 0; i < ARRAY_SIZE(s->cs_locks_state); ++i)
+                        atomic_set(&s->cs_locks_state[i], 0);
+        }
+        /* 0 on success, otherwise the error from lu_site_init() */
+        return result;
+}
+EXPORT_SYMBOL(cl_site_init);
+
+/**
+ * Finalize client site. Dual to cl_site_init().
+ */
+void cl_site_fini(struct cl_site *s)
+{
+        /* only the embedded lu_site needs finalization here */
+        lu_site_fini(&s->cs_lu);
+}
+EXPORT_SYMBOL(cl_site_fini);
+
+/**
+ * Global counters for client environments. Updated through CL_ENV_INC() and
+ * CL_ENV_DEC(), and printed by cl_site_stats_print().
+ */
+static struct cache_stats cl_env_stats = {
+        .cs_name    = "envs",
+        .cs_created = ATOMIC_INIT(0),
+        .cs_lookup  = ATOMIC_INIT(0),
+        .cs_hit     = ATOMIC_INIT(0),
+        .cs_total   = ATOMIC_INIT(0),
+        .cs_busy    = ATOMIC_INIT(0)
+};
+
+/**
+ * Outputs client site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int cl_site_stats_print(const struct cl_site *site, char *page, int count)
+{
+        int nob;
+        int i;
+        /* one-letter tags printed in front of each per-state page counter */
+        static const char *pstate[] = {
+                [CPS_CACHED]  = "c",
+                [CPS_OWNED]   = "o",
+                [CPS_PAGEOUT] = "w",
+                [CPS_PAGEIN]  = "r",
+                [CPS_FREEING] = "f"
+        };
+        /* one-letter tags printed in front of each per-state lock counter */
+        static const char *lstate[] = {
+                [CLS_NEW]       = "n",
+                [CLS_QUEUING]   = "q",
+                [CLS_ENQUEUED]  = "e",
+                [CLS_HELD]      = "h",
+                [CLS_UNLOCKING] = "u",
+                [CLS_CACHED]    = "c",
+                [CLS_FREEING]   = "f"
+        };
+/*
+       lookup    hit  total   busy create
+pages: ...... ...... ...... ...... ...... [...... ...... ...... ......]
+locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......]
+  env: ...... ...... ...... ...... ......
+ */
+        /*
+         * nob accumulates what each formatting call returns and is used as
+         * the write offset for the next one.
+         * NOTE(review): the tag tables are indexed up to the size of the
+         * counter arrays -- this assumes both have the same number of
+         * entries; confirm against the cl_site definition.
+         */
+        nob = lu_site_stats_print(&site->cs_lu, page, count);
+        /* pages: only this row asks for the column header (last arg is 1) */
+        nob += cache_stats_print(&site->cs_pages, page + nob, count - nob, 1);
+        nob += snprintf(page + nob, count - nob, " [");
+        for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i)
+                nob += snprintf(page + nob, count - nob, "%s: %u ",
+                                pstate[i],
+                                atomic_read(&site->cs_pages_state[i]));
+        nob += snprintf(page + nob, count - nob, "]\n");
+        /* locks */
+        nob += cache_stats_print(&site->cs_locks, page + nob, count - nob, 0);
+        nob += snprintf(page + nob, count - nob, " [");
+        for (i = 0; i < ARRAY_SIZE(site->cs_locks_state); ++i)
+                nob += snprintf(page + nob, count - nob, "%s: %u ",
+                                lstate[i],
+                                atomic_read(&site->cs_locks_state[i]));
+        nob += snprintf(page + nob, count - nob, "]\n");
+        /* environments: global counters, not specific to this site */
+        nob += cache_stats_print(&cl_env_stats, page + nob, count - nob, 0);
+        nob += snprintf(page + nob, count - nob, "\n");
+        return nob;
+}
+EXPORT_SYMBOL(cl_site_stats_print);
+
+/*****************************************************************************
+ *
+ * lu_env handling on client.
+ *
+ */
+
+/*
+ * TBD: Description.
+ *
+ * XXX: this assumes that re-entrant file system calls (e.g., ->writepage())
+ * do not modify already existing current->journal_info.
+ */
+
+/**
+ * List of cached environments, linked through cl_env::ce_linkage.
+ * cl_env_obtain() and cl_env_cache_purge() take entries off its head.
+ */
+static CFS_LIST_HEAD(cl_envs);
+/** Number of entries on cl_envs; it is zero exactly when the list is empty. */
+static unsigned cl_envs_cached_nr  = 0;
+static unsigned cl_envs_cached_max = 128; /* XXX: prototype: arbitrary limit
+                                           * for now. */
+/** Protects cl_envs and cl_envs_cached_nr. */
+static spinlock_t cl_envs_guard = SPIN_LOCK_UNLOCKED;
+
+struct cl_env {
+        /*
+         * Set to &cl_env_init0 on allocation. cl_env_peek() compares it
+         * against that address to recognize current->journal_info as a
+         * cl_env, which is why this must be the first field.
+         */
+        void             *ce_magic;
+        /* the environment handed out to callers */
+        struct lu_env     ce_lu;
+        /* session context; ce_lu.le_ses points here */
+        struct lu_context ce_ses;
+        /*
+         * Linkage into global list of all client environments. Used for
+         * garbage collection.
+         */
+        struct list_head  ce_linkage;
+        /*
+         * Usage counter: set to 1 by cl_env_init0() and incremented by
+         * every successful cl_env_peek().
+         */
+        int               ce_ref;
+        /*
+         * Value current->journal_info had when cl_env_init0() installed
+         * this environment there.
+         */
+        void             *ce_prev;
+        /*
+         * Debugging field: address of the caller who made original
+         * allocation.
+         */
+        void             *ce_debug;
+        /* task (current) that ran cl_env_init0() on this environment */
+        void             *ce_owner;
+};
+
+/** Increments the named counter field of the global cl_env_stats. */
+#define CL_ENV_INC(counter) atomic_inc(&cl_env_stats.counter)
+
+/**
+ * Decrements the named counter field of the global cl_env_stats, asserting
+ * first that the counter is positive.
+ */
+#define CL_ENV_DEC(counter)                                             \
+        do {                                                            \
+                LASSERT(atomic_read(&cl_env_stats.counter) > 0);        \
+                atomic_dec(&cl_env_stats.counter);                      \
+        } while (0)
+
+/*
+ * Puts an unused environment into use by the current thread: takes the first
+ * reference, records \a debug and the owner, and installs the environment in
+ * current->journal_info, remembering the previous value in ce_prev.
+ *
+ * The address of this function doubles as the cl_env::ce_magic value.
+ */
+static void cl_env_init0(struct cl_env *cle, void *debug)
+{
+        /* the environment must be unused and already carry the magic */
+        LASSERT(cle->ce_ref == 0);
+        LASSERT(cle->ce_magic == &cl_env_init0);
+        LASSERT(cle->ce_debug == NULL && cle->ce_owner == NULL);
+
+        cle->ce_ref = 1;
+        cle->ce_prev = current->journal_info;
+        cle->ce_debug = debug;
+        cle->ce_owner = current;
+        current->journal_info = cle;
+        CL_ENV_INC(cs_busy);
+}
+
+/*
+ * Allocates and initializes a brand new environment with the given extra
+ * context \a tags, and puts it into use by the current thread.
+ *
+ * Returns the new lu_env, ERR_PTR(-ENOMEM) when the slab allocation fails, or
+ * ERR_PTR() of the error from lu_env_init() or lu_context_init().
+ */
+static struct lu_env *cl_env_new(__u32 tags, void *debug)
+{
+        struct lu_env *env;
+        struct cl_env *cle;
+        int            rc;
+
+        OBD_SLAB_ALLOC_PTR(cle, cl_env_kmem);
+        if (cle == NULL)
+                return ERR_PTR(-ENOMEM);
+
+        CFS_INIT_LIST_HEAD(&cle->ce_linkage);
+        cle->ce_magic = &cl_env_init0;
+        env = &cle->ce_lu;
+
+        rc = lu_env_init(env, LCT_CL_THREAD|tags);
+        if (rc == 0) {
+                rc = lu_context_init(&cle->ce_ses, LCT_SESSION|tags);
+                if (rc != 0)
+                        /* session context failed: undo lu_env_init() */
+                        lu_env_fini(env);
+        }
+        if (rc != 0) {
+                OBD_SLAB_FREE_PTR(cle, cl_env_kmem);
+                return ERR_PTR(rc);
+        }
+
+        /* both contexts are ready: enter the session and go busy */
+        lu_context_enter(&cle->ce_ses);
+        env->le_ses = &cle->ce_ses;
+        cl_env_init0(cle, debug);
+
+        CL_ENV_INC(cs_created);
+        CL_ENV_INC(cs_total);
+        return env;
+}
+
+/*
+ * Destroys an environment: drops it from the cs_total count, finalizes both
+ * of its contexts and returns the memory to the cl_env slab.
+ */
+static void cl_env_fini(struct cl_env *cle)
+{
+        CL_ENV_DEC(cs_total);
+        lu_context_fini(&cle->ce_lu.le_ctx);
+        lu_context_fini(&cle->ce_ses);
+        OBD_SLAB_FREE_PTR(cle, cl_env_kmem);
+}
+
+/*
+ * Returns an environment for the current thread, reusing a cached one when
+ * the cache is not empty and allocating a new one otherwise.
+ *
+ * A cached environment that cannot be refilled is destroyed and the refill
+ * error is returned as ERR_PTR().
+ */
+static struct lu_env *cl_env_obtain(void *debug)
+{
+        struct cl_env *cle;
+        struct lu_env *env;
+        int            rc;
+
+        ENTRY;
+        spin_lock(&cl_envs_guard);
+        LASSERT(equi(cl_envs_cached_nr == 0, list_empty(&cl_envs)));
+        if (cl_envs_cached_nr == 0) {
+                /* nothing cached: fall back to a fresh allocation */
+                spin_unlock(&cl_envs_guard);
+                env = cl_env_new(0, debug);
+                RETURN(env);
+        }
+
+        /* detach the first cached environment while holding the guard */
+        cle = container_of(cl_envs.next, struct cl_env, ce_linkage);
+        list_del_init(&cle->ce_linkage);
+        cl_envs_cached_nr--;
+        spin_unlock(&cl_envs_guard);
+
+        env = &cle->ce_lu;
+        rc = lu_env_refill(env);
+        if (rc != 0) {
+                cl_env_fini(cle);
+                env = ERR_PTR(rc);
+                RETURN(env);
+        }
+
+        cl_env_init0(cle, debug);
+        lu_context_enter(&env->le_ctx);
+        lu_context_enter(&cle->ce_ses);
+        RETURN(env);
+}
+
+/* Maps a lu_env embedded in a cl_env (as cl_env::ce_lu) back to the cl_env. */
+static inline struct cl_env *cl_env_container(struct lu_env *env)
+{
+        return container_of(env, struct cl_env, ce_lu);
+}
+
+/**
+ * Returns the environment already associated with the current thread, if
+ * there is one, taking an additional reference on it.
+ *
+ * The association is kept in current->journal_info. That pointer is only
+ * trusted when it starts with the cl_env magic, as it may belong to somebody
+ * else.
+ *
+ * \param refcheck set to the new reference count when an environment is
+ * found; left untouched otherwise
+ *
+ * \retval NULL no environment is associated with the current thread
+ */
+struct lu_env *cl_env_peek(int *refcheck)
+{
+        struct lu_env *env;
+        struct cl_env *cle;
+        int            ref;
+
+        CL_ENV_INC(cs_lookup);
+
+        /* check that we don't go far from untrusted pointer */
+        CLASSERT(offsetof(struct cl_env, ce_magic) == 0);
+
+        env = NULL;
+        ref = 0;
+        cle = current->journal_info;
+        if (cle != NULL && cle->ce_magic == &cl_env_init0) {
+                CL_ENV_INC(cs_hit);
+                env = &cle->ce_lu;
+                ref = ++cle->ce_ref;
+                *refcheck = ref;
+        }
+        /*
+         * Fields other than ce_magic are read only after the magic matched,
+         * so for a foreign journal_info the logged count is 0.
+         */
+        CDEBUG(D_OTHER, "%i@%p\n", ref, cle);
+        return env;
+}
+EXPORT_SYMBOL(cl_env_peek);
+
+/**
+ * Returns lu_env: if there already is an environment associated with the
+ * current thread, it is returned, otherwise, new environment is allocated.
+ *
+ * Allocations are amortized through the global cache of environments.
+ *
+ * \param refcheck pointer to a counter used to detect environment leaks. In
+ * the usual case cl_env_get() and cl_env_put() are called in the same lexical
+ * scope and pointer to the same integer is passed as \a refcheck. This is
+ * used to detect missed cl_env_put().
+ *
+ * \see cl_env_put()
+ */
+struct lu_env *cl_env_get(int *refcheck)
+{
+        struct lu_env *env;
+        struct cl_env *cle;
+
+        /* fast path: this thread already has an environment */
+        env = cl_env_peek(refcheck);
+        if (env != NULL)
+                return env;
+
+        /* slow path: take one from the cache or allocate it */
+        env = cl_env_obtain(__builtin_return_address(0));
+        if (IS_ERR(env))
+                return env;
+
+        cle = cl_env_container(env);
+        *refcheck = cle->ce_ref;
+        CDEBUG(D_OTHER, "%i@%p\n", cle->ce_ref, cle);
+        return env;
+}
+EXPORT_SYMBOL(cl_env_get);
+
+/**
+ * Forces an allocation of a fresh environment with the given \a tags by
+ * calling cl_env_new() directly. The calling thread must not have an
+ * environment attached already (asserted through cl_env_peek()).
+ *
+ * \param refcheck receives the reference count of the new environment; not
+ * written when an error pointer is returned.
+ *
+ * \see cl_env_get()
+ */
+struct lu_env *cl_env_alloc(int *refcheck, __u32 tags)
+{
+        struct lu_env *env;
+        struct cl_env *cle;
+
+        LASSERT(cl_env_peek(refcheck) == NULL);
+        env = cl_env_new(tags, __builtin_return_address(0));
+        if (IS_ERR(env))
+                return env;
+
+        cle = cl_env_container(env);
+        *refcheck = cle->ce_ref;
+        CDEBUG(D_OTHER, "%i@%p\n", cle->ce_ref, cle);
+        return env;
+}
+EXPORT_SYMBOL(cl_env_alloc);
+
+/**
+ * Leaves both contexts of \a cle by calling lu_context_exit() on the context
+ * embedded in its lu_env (ce_lu.le_ctx) and on the session context (ce_ses).
+ */
+static void cl_env_exit(struct cl_env *cle)
+{
+        lu_context_exit(&cle->ce_lu.le_ctx);
+        lu_context_exit(&cle->ce_ses);
+}
+
+/**
+ * Finalizes and frees up to \a nr cached environments. This is done to
+ * (1) free some memory (not currently hooked into VM), or (2) release
+ * references to modules.
+ *
+ * cl_envs_guard is dropped around every cl_env_fini() call and re-taken
+ * afterwards.
+ *
+ * \retval the part of \a nr that was left over because the cache ran empty
+ */
+unsigned cl_env_cache_purge(unsigned nr)
+{
+        struct cl_env *old;
+
+        ENTRY;
+        spin_lock(&cl_envs_guard);
+        while (nr > 0 && !list_empty(&cl_envs)) {
+                old = container_of(cl_envs.next, struct cl_env, ce_linkage);
+                list_del_init(&old->ce_linkage);
+                LASSERT(cl_envs_cached_nr > 0);
+                cl_envs_cached_nr--;
+                spin_unlock(&cl_envs_guard);
+
+                cl_env_fini(old);
+                --nr;
+                spin_lock(&cl_envs_guard);
+        }
+        LASSERT(equi(cl_envs_cached_nr == 0, list_empty(&cl_envs)));
+        spin_unlock(&cl_envs_guard);
+        RETURN(nr);
+}
+EXPORT_SYMBOL(cl_env_cache_purge);
+
+/**
+ * Release an environment.
+ *
+ * Drops one reference on \a env. Once the last reference is gone nothing in
+ * this thread uses the environment any longer: current->journal_info is
+ * restored to the value saved in ce_prev, both contexts are exited, and the
+ * environment is either parked in the allocation cache or, when the cache
+ * is large enough already or the environment carries non-standard tags,
+ * finalized straight away.
+ *
+ * \param refcheck when not NULL, must point to a value equal to the current
+ * reference count (asserted); see cl_env_get().
+ */
+void cl_env_put(struct lu_env *env, int *refcheck)
+{
+        struct cl_env *cle = cl_env_container(env);
+
+        LASSERT(cle->ce_ref > 0);
+        LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck));
+
+        CDEBUG(D_OTHER, "%i@%p\n", cle->ce_ref, cle);
+        if (--cle->ce_ref != 0)
+                return;
+
+        CL_ENV_DEC(cs_busy);
+        current->journal_info = cle->ce_prev;
+        LASSERT(cle->ce_prev == NULL ||
+                cl_env_container(cle->ce_prev)->ce_magic != &cl_env_init0);
+        cle->ce_debug = NULL;
+        cle->ce_owner = NULL;
+        cl_env_exit(cle);
+        /*
+         * The cache size is deliberately sampled without cl_envs_guard.
+         *
+         * Only environments that were allocated with the standard tags go
+         * back to the cache; everything else is finalized here.
+         */
+        if (cl_envs_cached_nr >= cl_envs_cached_max ||
+            (env->le_ctx.lc_tags & ~LCT_HAS_EXIT) != LCT_CL_THREAD ||
+            (env->le_ses->lc_tags & ~LCT_HAS_EXIT) != LCT_SESSION) {
+                cl_env_fini(cle);
+                return;
+        }
+        spin_lock(&cl_envs_guard);
+        list_add(&cle->ce_linkage, &cl_envs);
+        cl_envs_cached_nr++;
+        spin_unlock(&cl_envs_guard);
+}
+EXPORT_SYMBOL(cl_env_put);
+
+/**
+ * Declares a point of re-entrancy.
+ *
+ * In Linux kernel environments are attached to the thread through the
+ * current->journal_info pointer, which other sub-systems use as well. When
+ * lustre code is invoked in a situation where current->journal_info may be
+ * set already, cl_env_reenter() saves its value and clears the field, so
+ * that it can be used to store a pointer to the environment.
+ *
+ * \retval the previous value of current->journal_info, to be handed back to
+ * cl_env_reexit()
+ *
+ * \see cl_env_reexit()
+ */
+void *cl_env_reenter(void)
+{
+        void *saved = current->journal_info;
+
+        current->journal_info = NULL;
+        CDEBUG(D_OTHER, "cookie: %p\n", saved);
+        return saved;
+}
+EXPORT_SYMBOL(cl_env_reenter);
+
+/**
+ * Exits re-entrancy.
+ *
+ * This restores old value of current->journal_info that was saved by
+ * cl_env_reenter().
+ *
+ * \param cookie value previously returned by cl_env_reenter(); it is stored
+ * into current->journal_info unconditionally.
+ */
+void cl_env_reexit(void *cookie)
+{
+        current->journal_info = cookie;
+        CDEBUG(D_OTHER, "cookie: %p\n", cookie);
+}
+EXPORT_SYMBOL(cl_env_reexit);
+
+/**
+ * Setup user-supplied \a env as a current environment. This is to be used to
+ * guarantee that environment exists even when cl_env_get() fails. It is up
+ * to user to ensure proper concurrency control.
+ *
+ * The caller must already hold a reference on \a env, and
+ * current->journal_info must be NULL on entry (both asserted).
+ *
+ * \param refcheck passed on to cl_env_get(); hand the same pointer to
+ * cl_env_unplant().
+ *
+ * \see cl_env_unplant()
+ */
+void cl_env_implant(struct lu_env *env, int *refcheck)
+{
+        struct cl_env *cle = cl_env_container(env);
+
+        LASSERT(current->journal_info == NULL);
+        LASSERT(cle->ce_ref > 0);
+
+        current->journal_info = cle;
+        /*
+         * With ->journal_info pointing at cle, this is expected to find the
+         * environment through cl_env_peek() and take one more reference.
+         */
+        cl_env_get(refcheck);
+        CDEBUG(D_OTHER, "%i@%p\n", cle->ce_ref, cle);
+}
+EXPORT_SYMBOL(cl_env_implant);
+
+/**
+ * Detach environment installed earlier by cl_env_implant().
+ *
+ * \a env must be the environment recorded in current->journal_info and must
+ * hold more than one reference (both asserted), so the cl_env_put() below
+ * never drops the last one.
+ */
+void cl_env_unplant(struct lu_env *env, int *refcheck)
+{
+        struct cl_env *cle = cl_env_container(env);
+
+        LASSERT(cle == current->journal_info);
+        LASSERT(cle->ce_ref > 1);
+
+        CDEBUG(D_OTHER, "%i@%p\n", cle->ce_ref, cle);
+
+        cl_env_put(env, refcheck);
+        /*
+         * cl_env_put() only rewrites ->journal_info when the last reference
+         * goes away, which cannot happen here; detach explicitly.
+         */
+        current->journal_info = NULL;
+}
+EXPORT_SYMBOL(cl_env_unplant);
+
+/**
+ * Returns an environment in which no IO is going on (cl_io_is_going()).
+ *
+ * The environment attached to the thread is re-used when it is idle. If it
+ * is in the middle of an IO, the reference just taken is dropped, the
+ * current->journal_info value is stashed in \a nest through cl_env_reenter()
+ * and another environment is obtained with cl_env_get().
+ *
+ * \param nest receives the reference check value and the saved cookie; pass
+ * it to cl_env_nested_put().
+ *
+ * \retval environment, or the error pointer returned by cl_env_get()
+ */
+struct lu_env *cl_env_nested_get(struct cl_env_nest *nest)
+{
+        int           *refcheck = &nest->cen_refcheck;
+        struct lu_env *env;
+
+        nest->cen_cookie = NULL;
+        env = cl_env_peek(refcheck);
+        if (env != NULL) {
+                if (!cl_io_is_going(env))
+                        return env;
+
+                /* attached environment is busy: step aside */
+                cl_env_put(env, refcheck);
+                nest->cen_cookie = cl_env_reenter();
+        }
+        env = cl_env_get(refcheck);
+        LASSERT(ergo(!IS_ERR(env), !cl_io_is_going(env)));
+        return env;
+}
+EXPORT_SYMBOL(cl_env_nested_get);
+
+/**
+ * Dual to cl_env_nested_get(): releases \a env and then stores the cookie
+ * recorded in \a nest into current->journal_info through cl_env_reexit().
+ *
+ * NOTE(review): cen_cookie is NULL when cl_env_nested_get() did not have to
+ * re-enter, so in that case current->journal_info is set to NULL here even
+ * if \a env is still referenced -- confirm this is intended.
+ */
+void cl_env_nested_put(struct cl_env_nest *nest, struct lu_env *env)
+{
+        cl_env_put(env, &nest->cen_refcheck);
+        cl_env_reexit(nest->cen_cookie);
+}
+EXPORT_SYMBOL(cl_env_nested_put);
+
+/**
+ * Fills \a lvb from \a attr: size, block count and the three time-stamps
+ * (mtime, atime, ctime) are copied field by field.
+ *
+ * \see cl_lvb2attr
+ */
+void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr)
+{
+        ENTRY;
+        /* space usage */
+        lvb->lvb_size   = attr->cat_size;
+        lvb->lvb_blocks = attr->cat_blocks;
+        /* time-stamps */
+        lvb->lvb_atime  = attr->cat_atime;
+        lvb->lvb_mtime  = attr->cat_mtime;
+        lvb->lvb_ctime  = attr->cat_ctime;
+        EXIT;
+}
+EXPORT_SYMBOL(cl_attr2lvb);
+
+/**
+ * Fills \a attr from \a lvb: size, block count and the three time-stamps
+ * (mtime, atime, ctime) are copied field by field.
+ *
+ * \see cl_attr2lvb
+ */
+void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb)
+{
+        ENTRY;
+        /* space usage */
+        attr->cat_size   = lvb->lvb_size;
+        attr->cat_blocks = lvb->lvb_blocks;
+        /* time-stamps */
+        attr->cat_atime  = lvb->lvb_atime;
+        attr->cat_mtime  = lvb->lvb_mtime;
+        attr->cat_ctime  = lvb->lvb_ctime;
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lvb2attr);
+
+
+/*****************************************************************************
+ *
+ * Temporary prototype thing: mirror obd-devices into cl devices.
+ *
+ */
+
+/**
+ * Allocates and initializes a device of type \a ldt on top of \a next.
+ *
+ * The device is allocated with ldto_device_alloc(), attached to \a site when
+ * one is given, and initialized with ldto_device_init(). On success a device
+ * reference is taken and recorded in the device's lu_ref under "lu-stack".
+ * When initialization fails the device is released with ldto_device_free().
+ *
+ * \retval the new device converted with lu2cl_dev(); on failure the value
+ * converted is an error pointer.
+ */
+struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
+                                struct lu_device_type *ldt,
+                                struct lu_device *next)
+{
+        const char       *name;
+        struct lu_device *dev;
+        int               rc;
+
+        LASSERT(ldt != NULL);
+
+        name = ldt->ldt_name;
+        dev = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL);
+        if (IS_ERR(dev)) {
+                CERROR("Cannot allocate device: '%s'\n", name);
+                return lu2cl_dev(dev);
+        }
+
+        if (site != NULL)
+                dev->ld_site = site;
+        rc = ldt->ldt_ops->ldto_device_init(env, dev, name, next);
+        if (rc != 0) {
+                ldt->ldt_ops->ldto_device_free(env, dev);
+                CERROR("can't init device '%s', %d\n", name, rc);
+                dev = ERR_PTR(rc);
+        } else {
+                lu_device_get(dev);
+                lu_ref_add(&dev->ld_reference, "lu-stack", &lu_site_init);
+        }
+        return lu2cl_dev(dev);
+}
+EXPORT_SYMBOL(cl_type_setup);
+
+/**
+ * Finalize device stack by calling lu_stack_fini() on the lu_device that
+ * corresponds to \a cl (see cl2lu_dev()).
+ */
+void cl_stack_fini(const struct lu_env *env, struct cl_device *cl)
+{
+        lu_stack_fini(env, cl2lu_dev(cl));
+}
+EXPORT_SYMBOL(cl_stack_fini);
+
+/*
+ * Init/fini entry points of the lock and page parts of the cl layer; they
+ * are called from cl_global_init() and cl_global_fini().
+ */
+int  cl_lock_init(void);
+void cl_lock_fini(void);
+
+int  cl_page_init(void);
+void cl_page_fini(void);
+
+/* forward declaration; the key is defined after its methods */
+static struct lu_context_key cl_key;
+
+/**
+ * Returns the cl_thread_info stored under cl_key in the context of \a env.
+ */
+struct cl_thread_info *cl_env_info(const struct lu_env *env)
+{
+        struct cl_thread_info *info;
+
+        info = lu_context_key_get(&env->le_ctx, &cl_key);
+        return info;
+}
+
+/* defines cl0_key_{init,fini}() */
+LU_KEY_INIT_FINI(cl0, struct cl_thread_info);
+
+/**
+ * lu_context_key::lct_init method of cl_key: obtains the per-context
+ * cl_thread_info from cl0_key_init() and, unless that failed, initializes
+ * its clt_locks_locked reference tracker.
+ */
+static void *cl_key_init(const struct lu_context *ctx,
+                         struct lu_context_key *key)
+{
+        struct cl_thread_info *info;
+
+        info = cl0_key_init(ctx, key);
+        if (IS_ERR(info))
+                return info;
+
+        lu_ref_init(&info->clt_locks_locked);
+        return info;
+}
+
+/**
+ * lu_context_key::lct_fini method of cl_key: finalizes the clt_locks_locked
+ * reference tracker and hands \a data over to cl0_key_fini().
+ */
+static void cl_key_fini(const struct lu_context *ctx,
+                        struct lu_context_key *key, void *data)
+{
+        struct cl_thread_info *info = data;
+
+        lu_ref_fini(&info->clt_locks_locked);
+        cl0_key_fini(ctx, key, data);
+}
+
+/**
+ * lu_context_key::lct_exit method of cl_key. Asserts that the four clt_nr_*
+ * counters of the cl_thread_info in \a data are all back to zero, then
+ * resets the clt_locks_locked reference tracker by finalizing and
+ * re-initializing it.
+ */
+static void cl_key_exit(const struct lu_context *ctx,
+                        struct lu_context_key *key, void *data)
+{
+        struct cl_thread_info *info = data;
+
+        LASSERT(info->clt_nr_locks_locked == 0);
+        LASSERT(info->clt_nr_held == 0);
+        LASSERT(info->clt_nr_used == 0);
+        LASSERT(info->clt_nr_locks_acquired == 0);
+
+        lu_ref_fini(&info->clt_locks_locked);
+        lu_ref_init(&info->clt_locks_locked);
+}
+
+/**
+ * Context key under which cl_thread_info is kept (see cl_env_info()). It is
+ * tagged LCT_CL_THREAD and registered in cl_global_init().
+ */
+static struct lu_context_key cl_key = {
+        .lct_tags = LCT_CL_THREAD,
+        .lct_init = cl_key_init,
+        .lct_fini = cl_key_fini,
+        .lct_exit = cl_key_exit
+};
+
+/**
+ * Descriptors of the kmem caches set up by cl_global_init() through
+ * lu_kmem_init() and torn down by cl_global_fini(): currently only the cache
+ * of struct cl_env. The last entry, with a NULL ckd_cache, presumably marks
+ * the end of the array.
+ */
+static struct lu_kmem_descr cl_object_caches[] = {
+        {
+                .ckd_cache = &cl_env_kmem,
+                .ckd_name  = "cl_env_kmem",
+                .ckd_size  = sizeof (struct cl_env)
+        },
+        {
+                .ckd_cache = NULL
+        }
+};
+
+/**
+ * Global initialization of cl-data: creates the kmem caches listed in
+ * cl_object_caches, registers cl_key, then initializes the lock and the
+ * page parts, stopping at the first step that fails.
+ *
+ * NOTE(review): steps that succeeded before a failure are not undone here;
+ * presumably the caller is expected to clean up -- confirm.
+ *
+ * \retval 0 on success, otherwise the result of the step that failed
+ *
+ * \see cl_global_fini()
+ */
+int cl_global_init(void)
+{
+        int result;
+
+        result = lu_kmem_init(cl_object_caches);
+        if (result != 0)
+                return result;
+
+        LU_CONTEXT_KEY_INIT(&cl_key);
+        result = lu_context_key_register(&cl_key);
+        if (result != 0)
+                return result;
+
+        result = cl_lock_init();
+        if (result != 0)
+                return result;
+
+        return cl_page_init();
+}
+
+/**
+ * Finalization of global cl-data. Dual to cl_global_init(): finalizes the
+ * lock and page parts, de-registers cl_key and destroys the kmem caches
+ * listed in cl_object_caches.
+ */
+void cl_global_fini(void)
+{
+        cl_lock_fini();
+        cl_page_fini();
+        lu_context_key_degister(&cl_key);
+        lu_kmem_fini(cl_object_caches);
+}
diff --git a/lustre/obdclass/cl_page.c b/lustre/obdclass/cl_page.c

new file mode 100644 (file)

index 0000000..6371a73
--- /dev/null
+++ b/lustre/obdclass/cl_page.c
@@ -0,0 +1,1516 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Lustre Page.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include <libcfs/libcfs.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <libcfs/list.h>
+
+#include <cl_object.h>
+#include "cl_internal.h"
+
+/* defined later in this file; needed by cl_page_find() */
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg,
+                            int radix);
+
+/* slab cache from which struct cl_page is allocated, see cl_page_alloc() */
+static cfs_mem_cache_t      *cl_page_kmem = NULL;
+
+/*
+ * Descriptor array for the caches of this file: one entry for cl_page_kmem
+ * followed by an entry with a NULL ckd_cache (presumably the terminator).
+ */
+static struct lu_kmem_descr cl_page_caches[] = {
+        {
+                .ckd_cache = &cl_page_kmem,
+                .ckd_name  = "cl_page_kmem",
+                .ckd_size  = sizeof (struct cl_page)
+        },
+        {
+                .ckd_cache = NULL
+        }
+};
+
+/*
+ * Page assertion: when \a expr is false, \a page is dumped through
+ * CL_PAGE_DEBUG() at D_ERROR level, prefixed with the text of the
+ * expression, and LASSERT(0) is hit. Without LIBCFS_DEBUG the arguments
+ * only appear under sizeof, so they are type-checked but not evaluated.
+ */
+#ifdef LIBCFS_DEBUG
+# define PASSERT(env, page, expr)                                       \
+  do {                                                                    \
+          if (unlikely(!(expr))) {                                      \
+                  CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n");    \
+                  LASSERT(0);                                           \
+          }                                                             \
+  } while (0)
+#else /* !LIBCFS_DEBUG */
+# define PASSERT(env, page, exp) \
+        ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp))
+#endif /* !LIBCFS_DEBUG */
+
+/*
+ * Page invariant check: same shape as PASSERT(), but active only under
+ * INVARIANT_CHECK and failing through LINVRNT(0). Without INVARIANT_CHECK
+ * the arguments only appear under sizeof and are not evaluated.
+ */
+#ifdef INVARIANT_CHECK
+# define PINVRNT(env, page, expr)                                       \
+  do {                                                                    \
+          if (unlikely(!(expr))) {                                      \
+                  CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n");    \
+                  LINVRNT(0);                                           \
+          }                                                             \
+  } while (0)
+#else /* !INVARIANT_CHECK */
+# define PINVRNT(env, page, exp) \
+        ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp))
+#endif /* !INVARIANT_CHECK */
+
+/**
+ * Internal version of cl_page_top: follows cp_parent links from \a page up
+ * to the page that has no parent. It should be called with page referenced,
+ * or coh_page_guard held.
+ */
+static struct cl_page *cl_page_top_trusted(struct cl_page *page)
+{
+        struct cl_page *top;
+
+        LASSERT(cl_is_page(page));
+        for (top = page; top->cp_parent != NULL; top = top->cp_parent)
+                ;
+        return top;
+}
+
+/**
+ * Internal version of cl_page_get().
+ *
+ * This function can be used to obtain initial reference to previously
+ * unreferenced cached object. It can be called only if concurrent page
+ * reclamation is somehow prevented, e.g., by locking page radix-tree
+ * (cl_object_header::hdr->coh_page_guard), or by keeping a lock on a VM page,
+ * associated with \a page.
+ *
+ * Use with care! Not exported.
+ */
+static void cl_page_get_trust(struct cl_page *page)
+{
+        LASSERT(cl_is_page(page));
+        /*
+         * Checkless version for trusted users.
+         */
+        /* 0 -> 1 transition: count the page in the site's cs_busy counter */
+        if (atomic_inc_return(&page->cp_ref) == 1)
+                atomic_inc(&cl_object_site(page->cp_obj)->cs_pages.cs_busy);
+}
+
+/**
+ * Returns a slice within a page, corresponding to the given layer in the
+ * device stack.
+ *
+ * The search starts at the top page of \a page (cl_page_top_trusted()) and
+ * walks down the cp_child chain, looking through cp_layers of every page
+ * for a slice whose object belongs to a device of type \a dtype.
+ *
+ * The caller must either hold a reference on \a page or hold
+ * coh_page_guard of its object (checked by LINVRNT).
+ *
+ * \retval the matching slice, or NULL when no page in the chain has one
+ *
+ * \see cl_lock_at()
+ */
+static const struct cl_page_slice *
+cl_page_at_trusted(const struct cl_page *page,
+                   const struct lu_device_type *dtype)
+{
+        const struct cl_page_slice *slice;
+        struct cl_object_header *ch = cl_object_header(page->cp_obj);
+
+        ENTRY;
+        LINVRNT(ergo(!atomic_read(&page->cp_ref),
+                spin_is_locked(&ch->coh_page_guard)));
+
+        page = cl_page_top_trusted((struct cl_page *)page);
+        do {
+                list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+                        if (slice->cpl_obj->co_lu.lo_dev->ld_type == dtype)
+                                RETURN(slice);
+                }
+                page = page->cp_child;
+        } while (page != NULL);
+        RETURN(NULL);
+}
+
+/**
+ * Looks up the page with index \a index in the radix tree of \a hdr.
+ *
+ * \retval the page, with a reference acquired through cl_page_get_trust(),
+ * or NULL if no page is found.
+ *
+ * Locking: called under cl_object_header::coh_page_guard spin-lock.
+ */
+struct cl_page *cl_page_lookup(struct cl_object_header *hdr, pgoff_t index)
+{
+        struct cl_page *found;
+
+        LASSERT(spin_is_locked(&hdr->coh_page_guard));
+
+        found = radix_tree_lookup(&hdr->coh_tree, index);
+        if (found == NULL)
+                return NULL;
+
+        LASSERT(cl_is_page(found));
+        cl_page_get_trust(found);
+        return found;
+}
+EXPORT_SYMBOL(cl_page_lookup);
+
+/**
+ * Collects pages of \a obj with indices in [\a start, \a end] and adds those
+ * that \a io managed to own to \a queue.
+ *
+ * The radix tree of \a obj is scanned in batches of up to CLT_PVEC_SIZE
+ * pages, using the per-thread vector cl_thread_info::clt_pvec. Pages in
+ * CPS_FREEING state and CPT_TRANSIENT pages are skipped. For every other
+ * page, the page of the slice that belongs to the device type of the top
+ * object is referenced, owned with cl_page_own() and queued.
+ *
+ * The scan stops as soon as a page beyond \a end is seen, or when a batch
+ * comes back short.
+ *
+ * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely
+ * crucial in the face of [offset, EOF] locks.
+ */
+void cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
+                         struct cl_io *io, pgoff_t start, pgoff_t end,
+                         struct cl_page_list *queue)
+{
+        struct cl_object_header *hdr;
+        struct cl_page          *page;
+        struct cl_page         **pvec;
+        const struct cl_page_slice  *slice;
+        const struct lu_device_type *dtype;
+        pgoff_t                  idx;
+        unsigned int             nr;
+        unsigned int             i;
+        unsigned int             j;
+        int                      end_of_region = 0;
+        ENTRY;
+
+        idx = start;
+        hdr = cl_object_header(obj);
+        pvec = cl_env_info(env)->clt_pvec;
+        dtype = cl_object_top(obj)->co_lu.lo_dev->ld_type;
+        spin_lock(&hdr->coh_page_guard);
+        while ((nr = radix_tree_gang_lookup(&hdr->coh_tree, (void **)pvec,
+                                            idx, CLT_PVEC_SIZE)) > 0) {
+                /* next batch starts right after the last page returned */
+                idx = pvec[nr - 1]->cp_index + 1;
+                for (i = 0, j = 0; i < nr; ++i) {
+                        page = pvec[i];
+                        PASSERT(env, page, cl_is_page(page));
+                        pvec[i] = NULL;
+                        if (page->cp_index > end) {
+                                /*
+                                 * Everything from here on, in this batch
+                                 * and in the rest of the tree, is out of
+                                 * range: remember to stop scanning.
+                                 */
+                                end_of_region = 1;
+                                break;
+                        }
+                        if (page->cp_state == CPS_FREEING)
+                                continue;
+                        if (page->cp_type == CPT_TRANSIENT) {
+                                /* God, we found a transient page!*/
+                                continue;
+                        }
+
+                        slice = cl_page_at_trusted(page, dtype);
+                        /*
+                         * Pages for lsm-less file has no underneath sub-page
+                         * for osc, in case of ...
+                         */
+                        PASSERT(env, page, slice != NULL);
+                        page = slice->cpl_page;
+                        /*
+                         * Can safely call cl_page_get_trust() under
+                         * radix-tree spin-lock.
+                         *
+                         * XXX not true, because @page is from object another
+                         * than @hdr and protected by different tree lock.
+                         */
+                        cl_page_get_trust(page);
+                        lu_ref_add_atomic(&page->cp_reference,
+                                          "page_list", cfs_current());
+                        /* compact the pages to be owned to the front */
+                        pvec[j++] = page;
+                }
+
+                /*
+                 * Here a delicate locking dance is performed. Current thread
+                 * holds a reference to a page, but has to own it before it
+                 * can be placed into queue. Owning implies waiting, so
+                 * radix-tree lock is to be released. After a wait one has to
+                 * check that pages weren't truncated (cl_page_own() returns
+                 * error in the latter case).
+                 */
+                spin_unlock(&hdr->coh_page_guard);
+                for (i = 0; i < j; ++i) {
+                        page = pvec[i];
+                        if (cl_page_own(env, io, page) == 0)
+                                cl_page_list_add(queue, page);
+                        lu_ref_del(&page->cp_reference,
+                                   "page_list", cfs_current());
+                        cl_page_put(env, page);
+                }
+                spin_lock(&hdr->coh_page_guard);
+                if (nr < CLT_PVEC_SIZE || end_of_region)
+                        break;
+        }
+        spin_unlock(&hdr->coh_page_guard);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_gang_lookup);
+
+/**
+ * Destroys \a page and returns its memory to cl_page_kmem.
+ *
+ * The page must be in CPS_FREEING state, have no owner, no request, no
+ * parent, and must not be on a batch list (all asserted). Every slice is
+ * unlinked from cp_layers and finalized with its cpo_fini() method, the
+ * site's page counters are decremented, and the object reference taken in
+ * cl_page_alloc() is released.
+ *
+ * May sleep.
+ */
+static void cl_page_free(const struct lu_env *env, struct cl_page *page)
+{
+        struct cl_object *obj  = page->cp_obj;
+        struct cl_site   *site = cl_object_site(obj);
+
+        PASSERT(env, page, cl_is_page(page));
+        PASSERT(env, page, list_empty(&page->cp_batch));
+        PASSERT(env, page, page->cp_owner == NULL);
+        PASSERT(env, page, page->cp_req == NULL);
+        PASSERT(env, page, page->cp_parent == NULL);
+        PASSERT(env, page, page->cp_state == CPS_FREEING);
+
+        ENTRY;
+        might_sleep();
+        /* tear the slices down one by one, head of the list first */
+        while (!list_empty(&page->cp_layers)) {
+                struct cl_page_slice *slice;
+
+                slice = list_entry(page->cp_layers.next, struct cl_page_slice,
+                                   cpl_linkage);
+                list_del_init(page->cp_layers.next);
+                slice->cpl_ops->cpo_fini(env, slice);
+        }
+        atomic_dec(&site->cs_pages.cs_total);
+        atomic_dec(&site->cs_pages_state[page->cp_state]);
+        lu_object_ref_del_at(&obj->co_lu, page->cp_obj_ref, "cl_page", page);
+        cl_object_put(env, obj);
+        lu_ref_fini(&page->cp_reference);
+        OBD_SLAB_FREE_PTR(page, cl_page_kmem);
+        EXIT;
+}
+
+/**
+ * Helper function updating page state. This is the only place in the code
+ * where cl_page::cp_state field is mutated.
+ *
+ * No checks are made on the transition; \a state is stored as is.
+ */
+static inline void cl_page_state_set_trust(struct cl_page *page,
+                                           enum cl_page_state state)
+{
+        /* bypass const. */
+        *(enum cl_page_state *)&page->cp_state = state;
+}
+
+/**
+ * Allocates a cl_page for index \a ind of object \a o and lets every layer
+ * of the object that has a coo_page_init() method initialize it.
+ *
+ * The new page starts in CPS_CACHED state with one reference, and holds a
+ * reference on \a o. If a layer's coo_page_init() returns non-NULL, the
+ * page is freed and that value is what ends up in \a out.
+ *
+ * \param out receives the new page on success; otherwise ERR_PTR(-ENOMEM)
+ * or the value returned by the failing coo_page_init().
+ *
+ * \retval 0 on success, +1 on any failure (see \a out for details)
+ *
+ * NOTE(review): on the coo_page_init() failure path cl_page_free()
+ * decrements cs_total and cs_pages_state[CPS_FREEING], which were not
+ * incremented for this page -- confirm whether the statistics are meant to
+ * balance.
+ */
+static int cl_page_alloc(const struct lu_env *env, struct cl_object *o,
+                         pgoff_t ind, struct page *vmpage,
+                         enum cl_page_type type, struct cl_page **out)
+{
+        struct cl_page          *page;
+        struct cl_page          *err  = NULL;
+        struct lu_object_header *head;
+        struct cl_site          *site = cl_object_site(o);
+        int                      result;
+
+        ENTRY;
+        /* pessimistic default, cleared only on full success */
+        result = +1;
+        OBD_SLAB_ALLOC_PTR(page, cl_page_kmem);
+        if (page != NULL) {
+                atomic_set(&page->cp_ref, 1);
+                page->cp_obj = o;
+                cl_object_get(o);
+                page->cp_obj_ref = lu_object_ref_add(&o->co_lu,
+                                                     "cl_page", page);
+                page->cp_index = ind;
+                cl_page_state_set_trust(page, CPS_CACHED);
+                page->cp_type = type;
+                CFS_INIT_LIST_HEAD(&page->cp_layers);
+                CFS_INIT_LIST_HEAD(&page->cp_batch);
+                CFS_INIT_LIST_HEAD(&page->cp_flight);
+                mutex_init(&page->cp_mutex);
+                lu_ref_init(&page->cp_reference);
+                head = o->co_lu.lo_header;
+                /* from here on "o" is re-used as the cursor over layers */
+                list_for_each_entry(o, &head->loh_layers, co_lu.lo_linkage) {
+                        if (o->co_ops->coo_page_init != NULL) {
+                                err = o->co_ops->coo_page_init(env, o,
+                                                               page, vmpage);
+                                if (err != NULL) {
+                                        /* cl_page_free() insists on FREEING */
+                                        cl_page_state_set_trust(page,
+                                                                CPS_FREEING);
+                                        cl_page_free(env, page);
+                                        page = err;
+                                        break;
+                                }
+                        }
+                }
+                if (err == NULL) {
+                        atomic_inc(&site->cs_pages.cs_busy);
+                        atomic_inc(&site->cs_pages.cs_total);
+                        atomic_inc(&site->cs_pages_state[CPS_CACHED]);
+                        atomic_inc(&site->cs_pages.cs_created);
+                        result = 0;
+                }
+        } else
+                page = ERR_PTR(-ENOMEM);
+        *out = page;
+        RETURN(result);
+}
+
+/**
+ * Returns a cl_page with index \a idx at the object \a o, and associated with
+ * the VM page \a vmpage.
+ *
+ * This is the main entry point into the cl_page caching interface. First, a
+ * cache (implemented as a per-object radix tree) is consulted. If page is
+ * found there, it is returned immediately. Otherwise new page is allocated
+ * and returned. In any case, additional reference to page is acquired.
+ *
+ * For CPT_CACHEABLE lookups the cached page is found through \a vmpage
+ * (cl_vmpage_page()); for CPT_TRANSIENT ones the radix tree is searched
+ * under coh_page_guard.
+ *
+ * \retval the page on success. On failure an error pointer is returned:
+ * whatever cl_page_alloc() stored in its \a out argument, the error of
+ * radix_tree_insert(), or -EBUSY when a CPT_CACHEABLE page was requested
+ * but a CPT_TRANSIENT page already occupies \a idx.
+ *
+ * \see cl_object_find(), cl_lock_find()
+ */
+struct cl_page *cl_page_find(const struct lu_env *env, struct cl_object *o,
+                             pgoff_t idx, struct page *vmpage,
+                             enum cl_page_type type)
+{
+        struct cl_page          *page;
+        struct cl_page          *ghost = NULL;
+        struct cl_object_header *hdr;
+        struct cl_site          *site = cl_object_site(o);
+        int err;
+
+        LINVRNT(type == CPT_CACHEABLE || type == CPT_TRANSIENT);
+        might_sleep();
+
+        ENTRY;
+
+        hdr = cl_object_header(o);
+        atomic_inc(&site->cs_pages.cs_lookup);
+
+        CDEBUG(D_PAGE, "%lu@"DFID" %p %lu %i\n",
+               idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type);
+        /* fast path. */
+        if (type == CPT_CACHEABLE) {
+                /*
+                 * cl_vmpage_page() can be called here without any locks as
+                 *
+                 *     - "vmpage" is locked (which prevents ->private from
+                 *       concurrent updates), and
+                 *
+                 *     - "o" cannot be destroyed while current thread holds a
+                 *       reference on it.
+                 */
+                page = cl_vmpage_page(vmpage, o);
+                PINVRNT(env, page,
+                        ergo(page != NULL,
+                             cl_page_vmpage(env, page) == vmpage &&
+                             (void *)radix_tree_lookup(&hdr->coh_tree,
+                                                       idx) == page));
+        } else {
+                spin_lock(&hdr->coh_page_guard);
+                page = cl_page_lookup(hdr, idx);
+                spin_unlock(&hdr->coh_page_guard);
+        }
+        if (page != NULL) {
+                atomic_inc(&site->cs_pages.cs_hit);
+                RETURN(page);
+        }
+
+        /* allocate and initialize cl_page */
+        err = cl_page_alloc(env, o, idx, vmpage, type, &page);
+        if (err != 0)
+                RETURN(page);
+        /*
+         * XXX optimization: use radix_tree_preload() here, and change tree
+         * gfp mask to GFP_KERNEL in cl_object_header_init().
+         */
+        spin_lock(&hdr->coh_page_guard);
+        err = radix_tree_insert(&hdr->coh_tree, idx, page);
+        if (err != 0) {
+                /* the page just allocated is not going to be used */
+                ghost = page;
+                /*
+                 * Noted by Jay: a lock on \a vmpage protects cl_page_find()
+                 * from this race, but
+                 *
+                 *     0. it's better to have cl_page interface "locally
+                 *     consistent" so that its correctness can be reasoned
+                 *     about without appealing to the (obscure world of) VM
+                 *     locking.
+                 *
+                 *     1. handling this race allows ->coh_tree to remain
+                 *     consistent even when VM locking is somehow busted,
+                 *     which is very useful during diagnosing and debugging.
+                 */
+                if (err == -EEXIST) {
+                        /*
+                         * XXX in case of a lookup for CPT_TRANSIENT page,
+                         * nothing protects a CPT_CACHEABLE page from being
+                         * concurrently moved into CPS_FREEING state.
+                         */
+                        page = cl_page_lookup(hdr, idx);
+                        PASSERT(env, page, page != NULL);
+                        if (page->cp_type == CPT_TRANSIENT &&
+                            type == CPT_CACHEABLE) {
+                                /* XXX: We should make sure that inode sem
+                                 * keeps being held in the lifetime of
+                                 * transient pages, so it is impossible to
+                                 * have conflicting transient pages.
+                                 */
+                                /* the put is done with the spin-lock dropped */
+                                spin_unlock(&hdr->coh_page_guard);
+                                cl_page_put(env, page);
+                                spin_lock(&hdr->coh_page_guard);
+                                page = ERR_PTR(-EBUSY);
+                        }
+                } else
+                        page = ERR_PTR(err);
+        } else
+                hdr->coh_pages++;
+        spin_unlock(&hdr->coh_page_guard);
+
+        if (unlikely(ghost != NULL)) {
+                /* undo the cs_busy increment made by cl_page_alloc() */
+                atomic_dec(&site->cs_pages.cs_busy);
+                cl_page_delete0(env, ghost, 0);
+                cl_page_free(env, ghost);
+        }
+        RETURN(page);
+}
+EXPORT_SYMBOL(cl_page_find);
+
+/**
+ * Checks structural consistency of \a pg: it is referenced, its parent and
+ * child links are mutual and point to pages of other objects, page owners
+ * agree with the parent/child relation of the owning IOs, and, unless its
+ * state is CPS_FREEING or later, the page is either in the radix tree of
+ * its object or has neither parent nor child yet.
+ *
+ * \retval non-zero iff all of the above hold
+ */
+static inline int cl_page_invariant(const struct cl_page *pg)
+{
+        struct cl_object_header *header;
+        struct cl_page          *parent;
+        struct cl_page          *child;
+        struct cl_io            *owner;
+
+        LASSERT(cl_is_page(pg));
+        /*
+         * Page invariant is protected by a VM lock.
+         */
+        LINVRNT(cl_page_is_vmlocked(NULL, pg));
+
+        header = cl_object_header(pg->cp_obj);
+        parent = pg->cp_parent;
+        child  = pg->cp_child;
+        owner  = pg->cp_owner;
+
+        return atomic_read(&pg->cp_ref) > 0 &&
+                ergo(parent != NULL, parent->cp_child == pg) &&
+                ergo(child != NULL, child->cp_parent == pg) &&
+                ergo(child != NULL, pg->cp_obj != child->cp_obj) &&
+                ergo(parent != NULL, pg->cp_obj != parent->cp_obj) &&
+                ergo(owner != NULL && parent != NULL,
+                     parent->cp_owner == pg->cp_owner->ci_parent) &&
+                ergo(owner != NULL && child != NULL,
+                     child->cp_owner->ci_parent == owner) &&
+                /*
+                 * Either page is early in initialization (has neither child
+                 * nor parent yet), or it is in the object radix tree.
+                 */
+                ergo(pg->cp_state < CPS_FREEING,
+                     (void *)radix_tree_lookup(&header->coh_tree,
+                                               pg->cp_index) == pg ||
+                     (child == NULL && parent == NULL));
+}
+
+/*
+ * Switches \a page, and every page below it in the cp_child chain, into
+ * \a state without checking the page invariant.
+ *
+ * The transition from the current state of \a page to \a state is validated
+ * against a static table of allowed transitions. For every page in the chain
+ * the per-state page counters of the site are adjusted before the new state
+ * is stored through cl_page_state_set_trust().
+ */
+static void cl_page_state_set0(const struct lu_env *env,
+                               struct cl_page *page, enum cl_page_state state)
+{
+        enum cl_page_state old;
+        struct cl_site *site = cl_object_site(page->cp_obj);
+
+        /*
+         * Matrix of allowed state transitions [old][new], for sanity
+         * checking.
+         */
+        static const int allowed_transitions[CPS_NR][CPS_NR] = {
+                [CPS_CACHED] = {
+                        [CPS_CACHED]  = 0,
+                        [CPS_OWNED]   = 1, /* io finds existing cached page */
+                        [CPS_PAGEIN]  = 0,
+                        [CPS_PAGEOUT] = 1, /* write-out from the cache */
+                        [CPS_FREEING] = 1, /* eviction on the memory pressure */
+                },
+                [CPS_OWNED] = {
+                        [CPS_CACHED]  = 1, /* release to the cache */
+                        [CPS_OWNED]   = 0,
+                        [CPS_PAGEIN]  = 1, /* start read immediately */
+                        [CPS_PAGEOUT] = 1, /* start write immediately */
+                        [CPS_FREEING] = 1, /* lock invalidation or truncate */
+                },
+                [CPS_PAGEIN] = {
+                        [CPS_CACHED]  = 1, /* io completion */
+                        [CPS_OWNED]   = 0,
+                        [CPS_PAGEIN]  = 0,
+                        [CPS_PAGEOUT] = 0,
+                        [CPS_FREEING] = 0,
+                },
+                [CPS_PAGEOUT] = {
+                        [CPS_CACHED]  = 1, /* io completion */
+                        [CPS_OWNED]   = 0,
+                        [CPS_PAGEIN]  = 0,
+                        [CPS_PAGEOUT] = 0,
+                        [CPS_FREEING] = 0,
+                },
+                [CPS_FREEING] = {
+                        [CPS_CACHED]  = 0,
+                        [CPS_OWNED]   = 0,
+                        [CPS_PAGEIN]  = 0,
+                        [CPS_PAGEOUT] = 0,
+                        [CPS_FREEING] = 0,
+                }
+        };
+
+        ENTRY;
+        old = page->cp_state;
+        PASSERT(env, page, allowed_transitions[old][state]);
+        CL_PAGE_HEADER(D_TRACE, env, page, "%i -> %i\n", old, state);
+        for (; page != NULL; page = page->cp_child) {
+                /* All pages of the stack are expected to share one state. */
+                PASSERT(env, page, page->cp_state == old);
+                /* A page has an owner exactly when it enters CPS_OWNED. */
+                PASSERT(env, page,
+                        equi(state == CPS_OWNED, page->cp_owner != NULL));
+
+                atomic_dec(&site->cs_pages_state[page->cp_state]);
+                atomic_inc(&site->cs_pages_state[state]);
+                cl_page_state_set_trust(page, state);
+        }
+        EXIT;
+}
+
+/*
+ * Switches \a page into \a state.
+ *
+ * Checks cl_page_invariant() through PINVRNT() first and then delegates the
+ * actual transition to cl_page_state_set0().
+ */
+static void cl_page_state_set(const struct lu_env *env,
+                              struct cl_page *page, enum cl_page_state state)
+{
+        PINVRNT(env, page, cl_page_invariant(page));
+        cl_page_state_set0(env, page, state);
+}
+
+/**
+ * Acquires an additional reference to a page.
+ *
+ * This can be called only by caller already possessing a reference to \a
+ * page. The page must not be in cl_page_state::CPS_FREEING state (asserted);
+ * the reference itself is taken by cl_page_get_trust().
+ *
+ * \see cl_object_get(), cl_lock_get().
+ */
+void cl_page_get(struct cl_page *page)
+{
+        ENTRY;
+        LASSERT(page->cp_state != CPS_FREEING);
+        cl_page_get_trust(page);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_get);
+
+/**
+ * Releases a reference to a page.
+ *
+ * When last reference is released, page is returned to the cache, unless it
+ * is in cl_page_state::CPS_FREEING state, in which case it is immediately
+ * destroyed.
+ *
+ * The caller must hold a reference: the reference count has to be positive,
+ * and greater than one when the page has a parent (asserted on entry).
+ * Dropping the last reference also decrements the busy-page counter of the
+ * site.
+ *
+ * \see cl_object_put(), cl_lock_put().
+ */
+void cl_page_put(const struct lu_env *env, struct cl_page *page)
+{
+        struct cl_site *site = cl_object_site(page->cp_obj);
+
+        PASSERT(env, page, atomic_read(&page->cp_ref) > !!page->cp_parent);
+
+        ENTRY;
+        CL_PAGE_HEADER(D_TRACE, env, page, "%i\n", atomic_read(&page->cp_ref));
+        if (atomic_dec_and_test(&page->cp_ref)) {
+                atomic_dec(&site->cs_pages.cs_busy);
+                if (page->cp_state == CPS_FREEING) {
+                        /* A dying page has no owner and is on no list. */
+                        PASSERT(env, page, page->cp_owner == NULL);
+                        PASSERT(env, page, list_empty(&page->cp_batch));
+                        /*
+                         * Page is no longer reachable by other threads. Tear
+                         * it down.
+                         */
+                        cl_page_free(env, page);
+                }
+        }
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_put);
+
+/**
+ * Returns a VM page associated with a given cl_page.
+ *
+ * Starting from the top page of \a page, layers are scanned top-to-bottom
+ * and the result of the first cl_page_operations::cpo_vmpage() method found
+ * is returned. It is a fatal error (LBUG()) if no layer defines the method.
+ */
+cfs_page_t *cl_page_vmpage(const struct lu_env *env, struct cl_page *page)
+{
+        const struct cl_page_slice *slice;
+
+        /* Pairs with the RETURN() below to keep entry/exit tracing balanced. */
+        ENTRY;
+        /*
+         * Find uppermost layer with ->cpo_vmpage() method, and return its
+         * result.
+         */
+        page = cl_page_top(page);
+        do {
+                list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+                        if (slice->cpl_ops->cpo_vmpage != NULL)
+                                RETURN(slice->cpl_ops->cpo_vmpage(env, slice));
+                }
+                page = page->cp_child;
+        } while (page != NULL);
+        LBUG(); /* ->cpo_vmpage() has to be defined somewhere in the stack */
+}
+EXPORT_SYMBOL(cl_page_vmpage);
+
+/**
+ * Looks up the cl_page that is attached to \a vmpage and belongs to \a obj.
+ *
+ * On success a reference is taken on the returned page. NULL is returned
+ * when no page in the stack hanging off \a vmpage matches \a obj.
+ *
+ * \pre \a vmpage is locked
+ */
+struct cl_page *cl_vmpage_page(cfs_page_t *vmpage, struct cl_object *obj)
+{
+        struct cl_page *scan;
+
+        ENTRY;
+        KLASSERT(PageLocked(vmpage));
+
+        /*
+         * NOTE: the page lock on "vmpage" is what keeps the data alive and
+         *       excludes races here. This relies on object destruction
+         *       making a bottom-to-top pass.
+         */
+
+        /*
+         * The walk starts from ->private, which is assumed to point to the
+         * top-most page. This assumption can be lifted easily.
+         */
+        scan = (void *)vmpage->private;
+        while (scan != NULL && !cl_object_same(scan->cp_obj, obj))
+                scan = scan->cp_child;
+        if (scan != NULL)
+                cl_page_get_trust(scan);
+        LASSERT(ergo(scan, cl_is_page(scan) && scan->cp_type == CPT_CACHEABLE));
+        RETURN(scan);
+}
+EXPORT_SYMBOL(cl_vmpage_page);
+
+/**
+ * Returns the top-page for a given page.
+ *
+ * Exported wrapper that simply forwards to cl_page_top_trusted().
+ *
+ * \see cl_object_top(), cl_io_top()
+ */
+struct cl_page *cl_page_top(struct cl_page *page)
+{
+        return cl_page_top_trusted(page);
+}
+EXPORT_SYMBOL(cl_page_top);
+
+/**
+ * Returns true if \a addr is an address of an allocated cl_page. Used in
+ * assertions. This check is optimistically imprecise, i.e., it occasionally
+ * returns true for the incorrect addresses, but if it returns false, then the
+ * address is guaranteed to be incorrect. (Should be named cl_pagep().)
+ *
+ * Implemented by asking cfs_mem_is_in_cache() whether \a addr lies in the
+ * cl_page_kmem cache.
+ *
+ * \see cl_is_lock()
+ */
+int cl_is_page(const void *addr)
+{
+        return cfs_mem_is_in_cache(addr, cl_page_kmem);
+}
+EXPORT_SYMBOL(cl_is_page);
+
+/**
+ * Exported wrapper around cl_page_at_trusted().
+ *
+ * NOTE(review): presumably returns the slice of \a page that belongs to the
+ * layer of device type \a dtype; confirm against cl_page_at_trusted().
+ */
+const struct cl_page_slice *cl_page_at(const struct cl_page *page,
+                                       const struct lu_device_type *dtype)
+{
+        return cl_page_at_trusted(page, dtype);
+}
+EXPORT_SYMBOL(cl_page_at);
+
+/*
+ * Byte offset of method \a opname within struct cl_page_operations. The
+ * CL_PAGE_INVOKE() family of macros uses such an offset to locate the method
+ * pointer in every layer's operation vector.
+ */
+#define CL_PAGE_OP(opname) offsetof(struct cl_page_operations, opname)
+
+/*
+ * Calls the method located at byte offset \a _op of cl_page_operations in
+ * every layer, top-to-bottom: starting from the top page of \a _page and
+ * following cp_child, scanning each page's cp_layers list in order.
+ *
+ * \a _proto is the parenthesized parameter list of the method; additional
+ * arguments are passed after (env, slice). Layers whose method pointer is
+ * NULL are skipped. Iteration stops at the first non-zero result. A positive
+ * result is converted to 0, so the macro evaluates to 0 or a negative value.
+ */
+#define CL_PAGE_INVOKE(_env, _page, _op, _proto, ...)                   \
+({                                                                      \
+        const struct lu_env        *__env  = (_env);                    \
+        struct cl_page             *__page = (_page);                   \
+        const struct cl_page_slice *__scan;                             \
+        int                         __result;                           \
+        ptrdiff_t                   __op   = (_op);                     \
+        int                       (*__method)_proto;                    \
+                                                                        \
+        __result = 0;                                                   \
+        __page = cl_page_top(__page);                                   \
+        do {                                                            \
+                list_for_each_entry(__scan, &__page->cp_layers,         \
+                                    cpl_linkage) {                      \
+                        __method = *(void **)((char *)__scan->cpl_ops + \
+                                              __op);                    \
+                        if (__method != NULL) {                         \
+                                __result = (*__method)(__env, __scan,   \
+                                                       ## __VA_ARGS__); \
+                                if (__result != 0)                      \
+                                        break;                          \
+                        }                                               \
+                }                                                       \
+                __page = __page->cp_child;                              \
+        } while (__page != NULL && __result == 0);                      \
+        if (__result > 0)                                               \
+                __result = 0;                                           \
+        __result;                                                       \
+})
+
+/*
+ * Variant of CL_PAGE_INVOKE() for methods returning void: the method at byte
+ * offset \a _op is called in every layer that defines it, top-to-bottom
+ * (from the top page of \a _page down the cp_child chain), with no early
+ * termination.
+ */
+#define CL_PAGE_INVOID(_env, _page, _op, _proto, ...)                   \
+do {                                                                    \
+        const struct lu_env        *__env  = (_env);                    \
+        struct cl_page             *__page = (_page);                   \
+        const struct cl_page_slice *__scan;                             \
+        ptrdiff_t                   __op   = (_op);                     \
+        void                      (*__method)_proto;                    \
+                                                                        \
+        __page = cl_page_top(__page);                                   \
+        do {                                                            \
+                list_for_each_entry(__scan, &__page->cp_layers,         \
+                                    cpl_linkage) {                      \
+                        __method = *(void **)((char *)__scan->cpl_ops + \
+                                              __op);                    \
+                        if (__method != NULL)                           \
+                                (*__method)(__env, __scan,              \
+                                            ## __VA_ARGS__);            \
+                }                                                       \
+                __page = __page->cp_child;                              \
+        } while (__page != NULL);                                       \
+} while (0)
+
+/*
+ * Bottom-to-top counterpart of CL_PAGE_INVOID(): descends from \a _page to
+ * the bottom-most page through cp_child, then walks back up through
+ * cp_parent, scanning each page's cp_layers list in reverse and calling the
+ * void method at byte offset \a _op wherever it is defined.
+ */
+#define CL_PAGE_INVOID_REVERSE(_env, _page, _op, _proto, ...)           \
+do {                                                                    \
+        const struct lu_env        *__env  = (_env);                    \
+        struct cl_page             *__page = (_page);                   \
+        const struct cl_page_slice *__scan;                             \
+        ptrdiff_t                   __op   = (_op);                     \
+        void                      (*__method)_proto;                    \
+                                                                        \
+        /* get to the bottom page. */                                   \
+        while (__page->cp_child != NULL)                                \
+                __page = __page->cp_child;                              \
+        do {                                                            \
+                list_for_each_entry_reverse(__scan, &__page->cp_layers, \
+                                            cpl_linkage) {              \
+                        __method = *(void **)((char *)__scan->cpl_ops + \
+                                              __op);                    \
+                        if (__method != NULL)                           \
+                                (*__method)(__env, __scan,              \
+                                            ## __VA_ARGS__);            \
+                }                                                       \
+                __page = __page->cp_parent;                             \
+        } while (__page != NULL);                                       \
+} while (0)
+
+/*
+ * Calls, top-to-bottom through CL_PAGE_INVOKE(), the int-returning page
+ * method at offset \a op that takes a struct cl_io argument.
+ *
+ * \pre \a page and \a io refer to the same object
+ *
+ * \return 0 or a negative value, as produced by CL_PAGE_INVOKE()
+ */
+static int cl_page_invoke(const struct lu_env *env,
+                          struct cl_io *io, struct cl_page *page, ptrdiff_t op)
+
+{
+        PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj));
+        ENTRY;
+        RETURN(CL_PAGE_INVOKE(env, page, op,
+                              (const struct lu_env *,
+                               const struct cl_page_slice *, struct cl_io *),
+                              io));
+}
+
+/*
+ * Calls, top-to-bottom through CL_PAGE_INVOID(), the void page method at
+ * offset \a op that takes a struct cl_io argument.
+ *
+ * \pre \a page and \a io refer to the same object
+ */
+static void cl_page_invoid(const struct lu_env *env,
+                           struct cl_io *io, struct cl_page *page, ptrdiff_t op)
+
+{
+        PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj));
+        ENTRY;
+        CL_PAGE_INVOID(env, page, op,
+                       (const struct lu_env *,
+                        const struct cl_page_slice *, struct cl_io *), io);
+        EXIT;
+}
+
+/*
+ * Drops ownership from every page of the stack \a page belongs to: walking
+ * from the top page down, each owned page decrements the owned-page counter
+ * of its io and forgets the owner.
+ */
+static void cl_page_owner_clear(struct cl_page *page)
+{
+        struct cl_page *scan;
+        struct cl_io   *owner;
+
+        ENTRY;
+        scan = cl_page_top(page);
+        while (scan != NULL) {
+                owner = scan->cp_owner;
+                if (owner != NULL) {
+                        LASSERT(owner->ci_owned_nr > 0);
+                        owner->ci_owned_nr--;
+                        scan->cp_owner = NULL;
+                }
+                scan = scan->cp_child;
+        }
+        EXIT;
+}
+
+/*
+ * Accounts every page of the stack \a page belongs to as owned: walking from
+ * the top page down, the owned-page counter of each page's io is bumped.
+ * Every page in the stack must already have its owner assigned.
+ */
+static void cl_page_owner_set(struct cl_page *page)
+{
+        struct cl_page *scan;
+
+        ENTRY;
+        scan = cl_page_top(page);
+        while (scan != NULL) {
+                LASSERT(scan->cp_owner != NULL);
+                scan->cp_owner->ci_owned_nr++;
+                scan = scan->cp_child;
+        }
+        EXIT;
+}
+
+/*
+ * Common part of releasing page ownership.
+ *
+ * Clears the owner of every page in the stack, moves the page to
+ * cl_page_state::CPS_CACHED if it was in cl_page_state::CPS_OWNED (a page in
+ * cl_page_state::CPS_FREEING keeps its state), and then calls
+ * cl_page_operations::cpo_disown() bottom-to-top.
+ *
+ * \pre pg->cp_state == CPS_OWNED || pg->cp_state == CPS_FREEING
+ */
+void cl_page_disown0(const struct lu_env *env,
+                     struct cl_io *io, struct cl_page *pg)
+{
+        enum cl_page_state state;
+
+        ENTRY;
+        state = pg->cp_state;
+        PINVRNT(env, pg, state == CPS_OWNED || state == CPS_FREEING);
+        PINVRNT(env, pg, cl_page_invariant(pg));
+        cl_page_owner_clear(pg);
+
+        if (state == CPS_OWNED)
+                cl_page_state_set(env, pg, CPS_CACHED);
+        /*
+         * Completion call-backs are executed in the bottom-up order, so that
+         * uppermost layer (llite), responsible for VFS/VM interaction runs
+         * last and can release locks safely.
+         */
+        CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_disown),
+                               (const struct lu_env *,
+                                const struct cl_page_slice *, struct cl_io *),
+                               io);
+        EXIT;
+}
+
+/**
+ * Returns true, iff page is owned by the given io, i.e., the page is in
+ * cl_page_state::CPS_OWNED state and its owner is \a io.
+ *
+ * \pre \a pg and \a io refer to the same object
+ */
+int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io)
+{
+        LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj));
+        ENTRY;
+        RETURN(pg->cp_state == CPS_OWNED && pg->cp_owner == io);
+}
+EXPORT_SYMBOL(cl_page_is_owned);
+
+/**
+ * Owns a page by IO.
+ *
+ * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it
+ * into cl_page_state::CPS_OWNED state.
+ *
+ * Operates on the top page and the top io. If the page turns out to be in
+ * cl_page_state::CPS_FREEING after cl_page_operations::cpo_own() has run,
+ * ownership is released again and -EAGAIN is returned.
+ *
+ * \pre  !cl_page_is_owned(pg, io)
+ * \post result == 0 iff cl_page_is_owned(pg, io)
+ *
+ * \retval 0   success
+ *
+ * \retval -ve failure, e.g., page was destroyed (and landed in
+ *             cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED).
+ *
+ * \see cl_page_disown()
+ * \see cl_page_operations::cpo_own()
+ */
+int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg)
+{
+        int result;
+
+        PINVRNT(env, pg, !cl_page_is_owned(pg, io));
+
+        ENTRY;
+        pg = cl_page_top(pg);
+        io = cl_io_top(io);
+
+        /* NOTE(review): presumably the layers wait/lock in ->cpo_own(). */
+        cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_own));
+        PASSERT(env, pg, pg->cp_owner == NULL);
+        PASSERT(env, pg, pg->cp_req == NULL);
+        pg->cp_owner = io;
+        cl_page_owner_set(pg);
+        if (pg->cp_state != CPS_FREEING) {
+                cl_page_state_set(env, pg, CPS_OWNED);
+                result = 0;
+        } else {
+                /* Page is being destroyed: undo the ownership just taken. */
+                cl_page_disown0(env, io, pg);
+                result = -EAGAIN;
+        }
+        PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg)));
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_own);
+
+/**
+ * Assume page ownership.
+ *
+ * Called when page is already locked by the hosting VM.
+ *
+ * Operates on the top page and the top io: calls
+ * cl_page_operations::cpo_assume() top-to-bottom, records the io as the
+ * owner and switches the page into cl_page_state::CPS_OWNED.
+ *
+ * \pre !cl_page_is_owned(pg, io)
+ * \post cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_assume()
+ */
+void cl_page_assume(const struct lu_env *env,
+                    struct cl_io *io, struct cl_page *pg)
+{
+        PASSERT(env, pg, pg->cp_state < CPS_OWNED);
+        PASSERT(env, pg, pg->cp_owner == NULL);
+        PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj));
+        PINVRNT(env, pg, cl_page_invariant(pg));
+
+        ENTRY;
+        pg = cl_page_top(pg);
+        io = cl_io_top(io);
+
+        cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_assume));
+        pg->cp_owner = io;
+        cl_page_owner_set(pg);
+        cl_page_state_set(env, pg, CPS_OWNED);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_assume);
+
+/**
+ * Releases page ownership without unlocking the page.
+ *
+ * Moves page into cl_page_state::CPS_CACHED without releasing a lock on the
+ * underlying VM page (as VM is supposed to do this itself).
+ *
+ * Operates on the top page and the top io; after the state change
+ * cl_page_operations::cpo_unassume() is called bottom-to-top.
+ *
+ * \pre   cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_assume()
+ */
+void cl_page_unassume(const struct lu_env *env,
+                      struct cl_io *io, struct cl_page *pg)
+{
+        PINVRNT(env, pg, cl_page_is_owned(pg, io));
+        PINVRNT(env, pg, cl_page_invariant(pg));
+
+        ENTRY;
+        pg = cl_page_top(pg);
+        io = cl_io_top(io);
+        cl_page_owner_clear(pg);
+        cl_page_state_set(env, pg, CPS_CACHED);
+        CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_unassume),
+                               (const struct lu_env *,
+                                const struct cl_page_slice *, struct cl_io *),
+                               io);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_unassume);
+
+/**
+ * Releases page ownership.
+ *
+ * Moves page into cl_page_state::CPS_CACHED.
+ *
+ * Resolves the top page and the top io and hands them to cl_page_disown0(),
+ * which does the actual work.
+ *
+ * \pre   cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_own()
+ * \see cl_page_operations::cpo_disown()
+ */
+void cl_page_disown(const struct lu_env *env,
+                    struct cl_io *io, struct cl_page *pg)
+{
+        PINVRNT(env, pg, cl_page_is_owned(pg, io));
+
+        ENTRY;
+        pg = cl_page_top(pg);
+        io = cl_io_top(io);
+        cl_page_disown0(env, io, pg);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_disown);
+
+/**
+ * Called when page is to be removed from the object, e.g., as a result of
+ * truncate.
+ *
+ * Calls cl_page_operations::cpo_discard() top-to-bottom. The page state is
+ * not changed here.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_discard()
+ */
+void cl_page_discard(const struct lu_env *env,
+                     struct cl_io *io, struct cl_page *pg)
+{
+        PINVRNT(env, pg, cl_page_is_owned(pg, io));
+        PINVRNT(env, pg, cl_page_invariant(pg));
+
+        cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_discard));
+}
+EXPORT_SYMBOL(cl_page_discard);
+
+/**
+ * Version of cl_page_delete() that can be called for not fully constructed
+ * pages, e.g., in an error handling cl_page_find()->cl_page_delete0()
+ * path. Doesn't check page invariant.
+ *
+ * \a radix tells whether the top page \a pg itself has been inserted into
+ * its object's radix tree: when it is zero only the pages below \a pg are
+ * removed from their trees.
+ *
+ * \pre pg == cl_page_top(pg)
+ * \pre pg->cp_state != CPS_FREEING
+ */
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg,
+                            int radix)
+{
+        PASSERT(env, pg, pg == cl_page_top(pg));
+        PASSERT(env, pg, pg->cp_state != CPS_FREEING);
+
+        ENTRY;
+        /*
+         * Sever all ways to obtain new pointers to @pg.
+         */
+        cl_page_owner_clear(pg);
+        cl_page_state_set0(env, pg, CPS_FREEING);
+        CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_delete),
+                       (const struct lu_env *, const struct cl_page_slice *));
+        if (!radix)
+                /*
+                 * !radix means that @pg is not yet in the radix tree, skip
+                 * removing it.
+                 */
+                pg = pg->cp_child;
+        for (; pg != NULL; pg = pg->cp_child) {
+                void                    *value;
+                struct cl_object_header *hdr;
+
+                /* Tree and page counter are updated under coh_page_guard. */
+                hdr = cl_object_header(pg->cp_obj);
+                spin_lock(&hdr->coh_page_guard);
+                value = radix_tree_delete(&hdr->coh_tree, pg->cp_index);
+                PASSERT(env, pg, value == pg);
+                PASSERT(env, pg, hdr->coh_pages > 0);
+                hdr->coh_pages--;
+                spin_unlock(&hdr->coh_page_guard);
+        }
+        EXIT;
+}
+
+/**
+ * Called when a decision is made to throw page out of memory.
+ *
+ * Notifies all layers about page destruction by calling
+ * cl_page_operations::cpo_delete() method top-to-bottom.
+ *
+ * Moves page into cl_page_state::CPS_FREEING state. The transition itself is
+ * performed by cl_page_delete0(), which this function calls after checking
+ * the page invariant.
+ *
+ * Eliminates all avenues through which new references to the page can be
+ * obtained:
+ *
+ *     - removes page from the radix trees,
+ *
+ *     - breaks linkage from VM page to cl_page.
+ *
+ * Once page reaches cl_page_state::CPS_FREEING, all remaining references will
+ * drain after some time, at which point page will be recycled.
+ *
+ * \pre  pg == cl_page_top(pg)
+ * \pre  VM page is locked
+ * \post pg->cp_state == CPS_FREEING
+ *
+ * \see cl_page_operations::cpo_delete()
+ */
+void cl_page_delete(const struct lu_env *env, struct cl_page *pg)
+{
+        PINVRNT(env, pg, cl_page_invariant(pg));
+        ENTRY;
+        cl_page_delete0(env, pg, 1);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_delete);
+
+/**
+ * Unmaps page from user virtual memory.
+ *
+ * Calls cl_page_operations::cpo_unmap() through all layers top-to-bottom. The
+ * layer responsible for VM interaction has to unmap page from user space
+ * virtual memory.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ *
+ * \return 0 or a negative value, as produced by cl_page_invoke()
+ *
+ * \see cl_page_operations::cpo_unmap()
+ */
+int cl_page_unmap(const struct lu_env *env,
+                  struct cl_io *io, struct cl_page *pg)
+{
+        PINVRNT(env, pg, cl_page_is_owned(pg, io));
+        PINVRNT(env, pg, cl_page_invariant(pg));
+
+        return cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_unmap));
+}
+EXPORT_SYMBOL(cl_page_unmap);
+
+/**
+ * Marks page up-to-date.
+ *
+ * Call cl_page_operations::cpo_export() through all layers top-to-bottom. The
+ * layer responsible for VM interaction has to mark page as up-to-date. From
+ * this moment on, page can be shown to the user space without Lustre being
+ * notified, hence the name.
+ *
+ * The page invariant is checked on entry; the page state is not changed.
+ *
+ * \see cl_page_operations::cpo_export()
+ */
+void cl_page_export(const struct lu_env *env, struct cl_page *pg)
+{
+        PINVRNT(env, pg, cl_page_invariant(pg));
+        CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_export),
+                       (const struct lu_env *, const struct cl_page_slice *));
+}
+EXPORT_SYMBOL(cl_page_export);
+
+/**
+ * Returns true, iff \a pg is VM locked in a suitable sense by the calling
+ * thread.
+ *
+ * The question is answered by cl_page_operations::cpo_is_vmlocked() of the
+ * first layer of the top page, which must define the method. The method has
+ * to return either -EBUSY (reported here as true) or -ENODATA (reported as
+ * false); anything else trips an assertion.
+ */
+int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg)
+{
+        int result;
+        const struct cl_page_slice *slice;
+
+        ENTRY;
+        pg = cl_page_top_trusted((struct cl_page *)pg);
+        slice = container_of(pg->cp_layers.next,
+                             const struct cl_page_slice, cpl_linkage);
+        PASSERT(env, pg, slice->cpl_ops->cpo_is_vmlocked != NULL);
+        /*
+         * Call ->cpo_is_vmlocked() directly instead of going through
+         * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by
+         * cl_page_invariant().
+         */
+        result = slice->cpl_ops->cpo_is_vmlocked(env, slice);
+        PASSERT(env, pg, result == -EBUSY || result == -ENODATA);
+        RETURN(result == -EBUSY);
+}
+EXPORT_SYMBOL(cl_page_is_vmlocked);
+
+/*
+ * Maps a transfer type onto the page state used while such a transfer is in
+ * progress: CPS_PAGEOUT for CRT_WRITE, CPS_PAGEIN for anything else.
+ */
+static enum cl_page_state cl_req_type_state(enum cl_req_type crt)
+{
+        enum cl_page_state state;
+
+        ENTRY;
+        if (crt == CRT_WRITE)
+                state = CPS_PAGEOUT;
+        else
+                state = CPS_PAGEIN;
+        RETURN(state);
+}
+
+/*
+ * Marks \a pg as being under transfer of type \a crt: ownership is cleared
+ * and the page is switched into the state given by cl_req_type_state().
+ */
+static void cl_page_io_start(const struct lu_env *env,
+                             struct cl_page *pg, enum cl_req_type crt)
+{
+        /*
+         * Page is queued for IO, change its state.
+         */
+        ENTRY;
+        cl_page_owner_clear(pg);
+        cl_page_state_set(env, pg, cl_req_type_state(crt));
+        EXIT;
+}
+
+/**
+ * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is
+ * called top-to-bottom. Every layer either agrees to submit this page (by
+ * returning 0), or requests to omit this page (by returning -EALREADY). Layer
+ * handling interactions with the VM also has to inform VM that page is under
+ * transfer now.
+ *
+ * When every layer agrees, ownership is dropped and the page is moved into
+ * the transfer state matching \a crt (see cl_page_io_start()).
+ *
+ * \pre cl_page_is_owned(pg, io)
+ * \pre crt < CRT_NR
+ *
+ * \return 0 when the page was switched into the transfer state, negative
+ *         value otherwise
+ */
+int cl_page_prep(const struct lu_env *env, struct cl_io *io,
+                 struct cl_page *pg, enum cl_req_type crt)
+{
+        int result;
+
+        PINVRNT(env, pg, cl_page_is_owned(pg, io));
+        PINVRNT(env, pg, cl_page_invariant(pg));
+        PINVRNT(env, pg, crt < CRT_NR);
+
+        /*
+         * XXX this has to be called bottom-to-top, so that llite can set up
+         * PG_writeback without risking other layers deciding to skip this
+         * page.
+         */
+        result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_prep));
+        if (result == 0)
+                cl_page_io_start(env, pg, crt);
+
+        /*
+         * For a cacheable page being written, writeback has to be set on
+         * the VM page exactly when preparation succeeded.
+         */
+        KLASSERT(ergo(crt == CRT_WRITE && pg->cp_type == CPT_CACHEABLE,
+                      equi(result == 0,
+                           PageWriteback(cl_page_vmpage(env, pg)))));
+        CL_PAGE_HEADER(D_TRACE, env, pg, "%i %i\n", crt, result);
+        return result;
+}
+EXPORT_SYMBOL(cl_page_prep);
+
+/**
+ * Notify layers about transfer completion.
+ *
+ * Invoked by transfer sub-system (which is a part of osc) to notify layers
+ * that a transfer, of which this page is a part of has completed.
+ *
+ * Completion call-backs are executed in the bottom-up order, so that
+ * uppermost layer (llite), responsible for the VFS/VM interaction runs last
+ * and can release locks safely.
+ *
+ * For a read transfer, CPF_READ_COMPLETED is set in the page flags before
+ * the call-backs run. \a ioret is passed to every call-back unchanged.
+ *
+ * \pre  pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ * \post pg->cp_state == CPS_CACHED
+ *
+ * \see cl_page_operations::cpo_completion()
+ */
+void cl_page_completion(const struct lu_env *env,
+                        struct cl_page *pg, enum cl_req_type crt, int ioret)
+{
+        PASSERT(env, pg, crt < CRT_NR);
+        /* cl_page::cp_req already cleared by the caller (osc_completion()) */
+        PASSERT(env, pg, pg->cp_req == NULL);
+        PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
+        PINVRNT(env, pg, cl_page_invariant(pg));
+
+        ENTRY;
+        CL_PAGE_HEADER(D_TRACE, env, pg, "%i %i\n", crt, ioret);
+        if (crt == CRT_READ) {
+                PASSERT(env, pg, !(pg->cp_flags & CPF_READ_COMPLETED));
+                pg->cp_flags |= CPF_READ_COMPLETED;
+        }
+
+        cl_page_state_set(env, pg, CPS_CACHED);
+        CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(io[crt].cpo_completion),
+                               (const struct lu_env *,
+                                const struct cl_page_slice *, int), ioret);
+
+        /*
+         * Only cacheable pages are checked, mirroring the writeback
+         * assertion in cl_page_prep(): presumably the writeback bit of the
+         * VM page behind a non-cacheable page is not driven by this stack,
+         * so nothing can be asserted about it here.
+         */
+        KLASSERT(ergo(pg->cp_type == CPT_CACHEABLE,
+                      !PageWriteback(cl_page_vmpage(env, pg))));
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_completion);
+
+/**
+ * Notify layers that transfer formation engine decided to yank this page from
+ * the cache and to make it a part of a transfer.
+ *
+ * cl_page_operations::cpo_make_ready() is called top-to-bottom; when every
+ * layer agrees the page is moved into the transfer state matching \a crt.
+ *
+ * \pre  pg->cp_state == CPS_CACHED
+ * \post ergo(result == 0,
+ *            pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT)
+ *
+ * \see cl_page_operations::cpo_make_ready()
+ */
+int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,
+                       enum cl_req_type crt)
+{
+        int result;
+
+        PINVRNT(env, pg, crt < CRT_NR);
+
+        ENTRY;
+        result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(io[crt].cpo_make_ready),
+                                (const struct lu_env *,
+                                 const struct cl_page_slice *));
+        if (result == 0) {
+                PASSERT(env, pg, pg->cp_state == CPS_CACHED);
+                cl_page_io_start(env, pg, crt);
+        }
+        CL_PAGE_HEADER(D_TRACE, env, pg, "%i %i\n", crt, result);
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_make_ready);
+
+/**
+ * Notify layers that high level io decided to place this page into a cache
+ * for future transfer.
+ *
+ * The layer implementing transfer engine (osc) has to register this page in
+ * its queues.
+ *
+ * On success ownership of the page is dropped and the page is returned to
+ * cl_page_state::CPS_CACHED; on failure the page is left owned.
+ *
+ * \pre  cl_page_is_owned(pg, io)
+ * \post ergo(result == 0, pg->cp_state == CPS_CACHED)
+ *
+ * \see cl_page_operations::cpo_cache_add()
+ */
+int cl_page_cache_add(const struct lu_env *env, struct cl_io *io,
+                      struct cl_page *pg, enum cl_req_type crt)
+{
+        int result;
+
+        PINVRNT(env, pg, crt < CRT_NR);
+        PINVRNT(env, pg, cl_page_is_owned(pg, io));
+        PINVRNT(env, pg, cl_page_invariant(pg));
+
+        ENTRY;
+        result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_cache_add));
+        if (result == 0) {
+                cl_page_owner_clear(pg);
+                cl_page_state_set(env, pg, CPS_CACHED);
+        }
+        CL_PAGE_HEADER(D_TRACE, env, pg, "%i %i\n", crt, result);
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_cache_add);
+
+/**
+ * Checks whether page is protected by any extent lock in at least required
+ * mode.
+ *
+ * cl_page_operations::cpo_is_under_lock() is called top-to-bottom until some
+ * layer returns a non-zero value; it is asserted that the overall result is
+ * not 0.
+ *
+ * \return the same as in cl_page_operations::cpo_is_under_lock() method.
+ * \see cl_page_operations::cpo_is_under_lock()
+ */
+int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
+                          struct cl_page *page)
+{
+        int rc;
+
+        PINVRNT(env, page, cl_page_invariant(page));
+
+        ENTRY;
+        rc = CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_is_under_lock),
+                            (const struct lu_env *,
+                             const struct cl_page_slice *, struct cl_io *),
+                            io);
+        PASSERT(env, page, rc != 0);
+        RETURN(rc);
+}
+EXPORT_SYMBOL(cl_page_is_under_lock);
+
+/**
+ * Purges all cached pages belonging to the object \a obj.
+ */
+int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj)
+{
+        struct cl_thread_info   *info;
+        struct cl_object        *obj = cl_object_top(clobj);
+        struct cl_io            *io;
+        struct cl_page_list     *plist;
+        int                      result;
+
+        ENTRY;
+        info  = cl_env_info(env);
+        plist = &info->clt_list;
+        io    = &info->clt_io;
+
+        /*
+         * initialize the io. This is ugly since we never do IO in this
+         * function, we just make cl_page_list functions happy. -jay
+         */
+        io->ci_obj = obj;
+        result = cl_io_init(env, io, CIT_MISC, obj);
+        if (result != 0) {
+                cl_io_fini(env, io);
+                RETURN(io->ci_result);
+        }
+
+        cl_page_list_init(plist);
+        cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF, plist);
+        /*
+         * Since we're purging the pages of an object, we don't care
+         * about the possible outcomes of the following functions.
+         */
+        cl_page_list_unmap(env, io, plist);
+        cl_page_list_discard(env, io, plist);
+        cl_page_list_disown(env, io, plist);
+        cl_page_list_fini(env, plist);
+
+        cl_io_fini(env, io);
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_pages_prune);
+
+/**
+ * Tells transfer engine that only part of a page is to be transmitted.
+ *
+ * \see cl_page_operations::cpo_clip()
+ */
+void cl_page_clip(const struct lu_env *env, struct cl_page *pg,
+                  int from, int to)
+{
+        PINVRNT(env, pg, cl_page_invariant(pg));
+
+        CL_PAGE_HEADER(D_TRACE, env, pg, "%i %i\n", from, to);
+        CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_clip),
+                       (const struct lu_env *,
+                        const struct cl_page_slice *,int, int),
+                       from, to);
+}
+EXPORT_SYMBOL(cl_page_clip);
+
+/**
+ * Prints human readable representation of the header of \a pg via \a printer.
+ */
+void cl_page_header_print(const struct lu_env *env, void *cookie,
+                          lu_printer_t printer, const struct cl_page *pg)
+{
+        (*printer)(env, cookie,
+                   "page@%p[%d %p:%lu ^%p_%p %d %d %d %p %p %#x]\n",
+                   pg, atomic_read(&pg->cp_ref), pg->cp_obj,
+                   pg->cp_index, pg->cp_parent, pg->cp_child,
+                   pg->cp_state, pg->cp_error, pg->cp_type,
+                   pg->cp_owner, pg->cp_req, pg->cp_flags);
+}
+EXPORT_SYMBOL(cl_page_header_print);
+
+/**
+ * Prints human readable representation of \a pg via \a printer.
+ */
+void cl_page_print(const struct lu_env *env, void *cookie,
+                   lu_printer_t printer, const struct cl_page *pg)
+{
+        struct cl_page *scan;
+
+        for (scan = cl_page_top((struct cl_page *)pg);
+             scan != NULL; scan = scan->cp_child)
+                cl_page_header_print(env, cookie, printer, scan);
+        CL_PAGE_INVOKE(env, (struct cl_page *)pg, CL_PAGE_OP(cpo_print),
+                       (const struct lu_env *env,
+                        const struct cl_page_slice *slice,
+                        void *cookie, lu_printer_t p), cookie, printer);
+        (*printer)(env, cookie, "end page@%p\n", pg);
+}
+EXPORT_SYMBOL(cl_page_print);
+
+/**
+ * Cancel a page which is still in a transfer.
+ */
+int cl_page_cancel(const struct lu_env *env, struct cl_page *page)
+{
+        return CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_cancel),
+                              (const struct lu_env *,
+                               const struct cl_page_slice *));
+}
+EXPORT_SYMBOL(cl_page_cancel);
+
+/**
+ * Converts a page index \a idx within object \a obj into a byte offset.
+ */
+loff_t cl_offset(const struct cl_object *obj, pgoff_t idx)
+{
+        /*
+         * XXX for now.
+         */
+        return (loff_t)idx << CFS_PAGE_SHIFT;
+}
+EXPORT_SYMBOL(cl_offset);
+
+/**
+ * Converts a byte offset \a offset within object \a obj into a page index.
+ */
+pgoff_t cl_index(const struct cl_object *obj, loff_t offset)
+{
+        /*
+         * XXX for now.
+         */
+        return offset >> CFS_PAGE_SHIFT;
+}
+EXPORT_SYMBOL(cl_index);
+
+int cl_page_size(const struct cl_object *obj)
+{
+        return 1 << CFS_PAGE_SHIFT;
+}
+EXPORT_SYMBOL(cl_page_size);
+
+/**
+ * Adds page slice to the compound page.
+ *
+ * This is called by cl_object_operations::coo_page_init() methods to add a
+ * per-layer state to the page. New state is added at the end of
+ * cl_page::cp_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add()
+ */
+void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
+                       struct cl_object *obj,
+                       const struct cl_page_operations *ops)
+{
+        ENTRY;
+        list_add_tail(&slice->cpl_linkage, &page->cp_layers);
+        slice->cpl_obj  = obj;
+        slice->cpl_ops  = ops;
+        slice->cpl_page = page;
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_slice_add);
+
+int  cl_page_init(void)
+{
+        return lu_kmem_init(cl_page_caches);
+}
+
+void cl_page_fini(void)
+{
+        lu_kmem_fini(cl_page_caches);
+}
diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c

index 2911586..0803a15 100644 (file)
--- a/lustre/obdclass/class_obd.c
+++ b/lustre/obdclass/class_obd.c
@@ -76,6 +76,7 @@ unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT;   /* seconds */
  unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
  unsigned int obd_max_dirty_pages = 256;
  atomic_t obd_dirty_pages;
+atomic_t obd_dirty_transit_pages;
  
  cfs_waitq_t obd_race_waitq;
  int obd_race_state;
@@ -390,6 +391,7 @@ EXPORT_SYMBOL(obd_timeout);
  EXPORT_SYMBOL(ldlm_timeout);
  EXPORT_SYMBOL(obd_max_dirty_pages);
  EXPORT_SYMBOL(obd_dirty_pages);
+EXPORT_SYMBOL(obd_dirty_transit_pages);
  EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
  
  EXPORT_SYMBOL(proc_lustre_root);
@@ -590,10 +592,10 @@ int init_obdclass(void)
          err = obd_init_caches();
          if (err)
                  return err;
-#ifdef __KERNEL__
          err = lu_global_init();
          if (err)
                  return err;
+#ifdef __KERNEL__
          err = class_procfs_init();
          if (err)
                  return err;
diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c

index 5eeae54..f317b95 100644 (file)
--- a/lustre/obdclass/genops.c
+++ b/lustre/obdclass/genops.c
@@ -1165,146 +1165,6 @@ int class_disconnect_stale_exports(struct obd_device *obd,
  }
  EXPORT_SYMBOL(class_disconnect_stale_exports);
  
-int oig_init(struct obd_io_group **oig_out)
-{
-        struct obd_io_group *oig;
-        ENTRY;
-
-        OBD_ALLOC(oig, sizeof(*oig));
-        if (oig == NULL)
-                RETURN(-ENOMEM);
-
-        spin_lock_init(&oig->oig_lock);
-        oig->oig_rc = 0;
-        oig->oig_pending = 0;
-        atomic_set(&oig->oig_refcount, 1);
-        cfs_waitq_init(&oig->oig_waitq);
-        CFS_INIT_LIST_HEAD(&oig->oig_occ_list);
-
-        *oig_out = oig;
-        RETURN(0);
-};
-EXPORT_SYMBOL(oig_init);
-
-static inline void oig_grab(struct obd_io_group *oig)
-{
-        atomic_inc(&oig->oig_refcount);
-}
-
-void oig_release(struct obd_io_group *oig)
-{
-        if (atomic_dec_and_test(&oig->oig_refcount))
-                OBD_FREE(oig, sizeof(*oig));
-}
-EXPORT_SYMBOL(oig_release);
-
-int oig_add_one(struct obd_io_group *oig, struct oig_callback_context *occ)
-{
-        int rc = 0;
-        CDEBUG(D_CACHE, "oig %p ready to roll\n", oig);
-        spin_lock(&oig->oig_lock);
-        if (oig->oig_rc) {
-                rc = oig->oig_rc;
-        } else {
-                oig->oig_pending++;
-                if (occ != NULL)
-                        list_add_tail(&occ->occ_oig_item, &oig->oig_occ_list);
-        }
-        spin_unlock(&oig->oig_lock);
-        oig_grab(oig);
-
-        return rc;
-}
-EXPORT_SYMBOL(oig_add_one);
-
-void oig_complete_one(struct obd_io_group *oig,
-                      struct oig_callback_context *occ, int rc)
-{
-        cfs_waitq_t *wake = NULL;
-        int old_rc;
-
-        spin_lock(&oig->oig_lock);
-
-        if (occ != NULL)
-                list_del_init(&occ->occ_oig_item);
-
-        old_rc = oig->oig_rc;
-        if (oig->oig_rc == 0 && rc != 0)
-                oig->oig_rc = rc;
-
-        if (--oig->oig_pending <= 0)
-                wake = &oig->oig_waitq;
-
-        spin_unlock(&oig->oig_lock);
-
-        CDEBUG(D_CACHE, "oig %p completed, rc %d -> %d via %d, %d now "
-                        "pending (racey)\n", oig, old_rc, oig->oig_rc, rc,
-                        oig->oig_pending);
-        if (wake)
-                cfs_waitq_signal(wake);
-        oig_release(oig);
-}
-EXPORT_SYMBOL(oig_complete_one);
-
-static int oig_done(struct obd_io_group *oig)
-{
-        int rc = 0;
-        spin_lock(&oig->oig_lock);
-        if (oig->oig_pending <= 0)
-                rc = 1;
-        spin_unlock(&oig->oig_lock);
-        return rc;
-}
-
-static void interrupted_oig(void *data)
-{
-        struct obd_io_group *oig = data;
-        struct oig_callback_context *occ;
-
-        spin_lock(&oig->oig_lock);
-        /* We need to restart the processing each time we drop the lock, as
-         * it is possible other threads called oig_complete_one() to remove
-         * an entry elsewhere in the list while we dropped lock.  We need to
-         * drop the lock because osc_ap_completion() calls oig_complete_one()
-         * which re-gets this lock ;-) as well as a lock ordering issue. */
-restart:
-        list_for_each_entry(occ, &oig->oig_occ_list, occ_oig_item) {
-                if (occ->interrupted)
-                        continue;
-                occ->interrupted = 1;
-                spin_unlock(&oig->oig_lock);
-                occ->occ_interrupted(occ);
-                spin_lock(&oig->oig_lock);
-                goto restart;
-        }
-        spin_unlock(&oig->oig_lock);
-}
-
-int oig_wait(struct obd_io_group *oig)
-{
-        struct l_wait_info lwi = LWI_INTR(interrupted_oig, oig);
-        int rc;
-
-        CDEBUG(D_CACHE, "waiting for oig %p\n", oig);
-
-        do {
-                rc = l_wait_event(oig->oig_waitq, oig_done(oig), &lwi);
-                LASSERTF(rc == 0 || rc == -EINTR, "rc: %d\n", rc);
-                /* we can't continue until the oig has emptied and stopped
-                 * referencing state that the caller will free upon return */
-                if (rc == -EINTR)
-                        lwi = (struct l_wait_info){ 0, };
-        } while (rc == -EINTR);
-
-        LASSERTF(oig->oig_pending == 0,
-                 "exiting oig_wait(oig = %p) with %d pending\n", oig,
-                 oig->oig_pending);
-
-        CDEBUG(D_CACHE, "done waiting on oig %p rc %d\n", oig, oig->oig_rc);
-        return oig->oig_rc;
-}
-EXPORT_SYMBOL(oig_wait);
-
  void class_fail_export(struct obd_export *exp)
  {
          int rc, already_failed;
@@ -1462,7 +1322,7 @@ enum {
  /**
   * check for work for kill zombie import/export thread.
   */
-int obd_zombie_impexp_check(void *arg)
+static int obd_zombie_impexp_check(void *arg)
  {
          int rc;
  
@@ -1484,6 +1344,32 @@ static void obd_zombie_impexp_notify(void)
          cfs_waitq_signal(&obd_zombie_waitq);
  }
  
+/**
+ * check whether obd_zombie is idle
+ */
+static int obd_zombie_is_idle(void)
+{
+        int rc;
+
+        LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags));
+        spin_lock(&obd_zombie_impexp_lock);
+        rc = list_empty(&obd_zombie_imports) &&
+             list_empty(&obd_zombie_exports);
+        spin_unlock(&obd_zombie_impexp_lock);
+        return rc;
+}
+
+/**
+ * wait until obd_zombie import/export queues become empty
+ */
+void obd_zombie_barrier(void)
+{
+        struct l_wait_info lwi = { 0 };
+
+        l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi);
+}
+EXPORT_SYMBOL(obd_zombie_barrier);
+
  #ifdef __KERNEL__
  
  /**
@@ -1506,6 +1392,8 @@ static int obd_zombie_impexp_thread(void *unused)
                  l_wait_event(obd_zombie_waitq, !obd_zombie_impexp_check(NULL), &lwi);
  
                  obd_zombie_impexp_cull();
+                /* Notify obd_zombie_barrier callers that queues may be empty */
+                cfs_waitq_signal(&obd_zombie_waitq);
          }
  
          complete(&obd_zombie_stop);
diff --git a/lustre/obdclass/linux/linux-module.c b/lustre/obdclass/linux/linux-module.c

index a7feee4..a801349 100644 (file)
--- a/lustre/obdclass/linux/linux-module.c
+++ b/lustre/obdclass/linux/linux-module.c
@@ -345,7 +345,7 @@ static void obd_device_list_seq_stop(struct seq_file *p, void *v)
  }
  
  static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos)
-{ 
+{
          ++*pos;
          if (*pos >= class_devno_max())
                  return NULL;
diff --git a/lustre/obdclass/linux/linux-obdo.c b/lustre/obdclass/linux/linux-obdo.c

index 775f6c0..e85b5ab 100644 (file)
--- a/lustre/obdclass/linux/linux-obdo.c
+++ b/lustre/obdclass/linux/linux-obdo.c
@@ -65,7 +65,7 @@ void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid)
  
          if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
                  CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n",
-                       valid, LTIME_S(src->i_mtime), 
+                       valid, LTIME_S(src->i_mtime),
                         LTIME_S(src->i_ctime));
  
          if (valid & OBD_MD_FLATIME) {
@@ -185,7 +185,7 @@ void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid)
  
          /* mtime is always updated with ctime, but can be set in past.
             As write and utime(2) may happen within 1 second, and utime's
-           mtime has a priority over write's one, leave mtime from mds 
+           mtime has a priority over write's one, leave mtime from mds
             for the same ctimes. */
          if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime)) {
                  LTIME_S(dst->i_ctime) = src->o_ctime;
@@ -211,6 +211,10 @@ void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid)
  
          /* allocation of space */
          if (valid & OBD_MD_FLBLOCKS && src->o_blocks > dst->i_blocks)
+                /*
+                 * XXX shouldn't overflow be checked here like in
+                 * obdo_to_inode()?
+                 */
                  dst->i_blocks = src->o_blocks;
  }
  EXPORT_SYMBOL(obdo_refresh_inode);
diff --git a/lustre/obdclass/llog_cat.c b/lustre/obdclass/llog_cat.c

index 73ee3c5..00151a6 100644 (file)
--- a/lustre/obdclass/llog_cat.c
+++ b/lustre/obdclass/llog_cat.c
@@ -181,7 +181,7 @@ int llog_cat_id2handle(struct llog_handle *cathandle, struct llog_handle **res,
          if (!rc) {
                  loghandle->u.phd.phd_cat_handle = cathandle;
                  loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id;
-                loghandle->u.phd.phd_cookie.lgc_index = 
+                loghandle->u.phd.phd_cookie.lgc_index =
                          loghandle->lgh_hdr->llh_cat_idx;
          }
  
@@ -452,8 +452,8 @@ int llog_cat_process_thread(void *data)
                  CWARN("No callback function for recovery\n");
          }
  
-        /* 
-         * Make sure that all cached data is sent. 
+        /*
+         * Make sure that all cached data is sent.
           */
          llog_sync(ctxt, NULL);
          GOTO(release_llh, rc);
diff --git a/lustre/obdclass/llog_lvfs.c b/lustre/obdclass/llog_lvfs.c

index 47c1e53..abaa9df 100644 (file)
--- a/lustre/obdclass/llog_lvfs.c
+++ b/lustre/obdclass/llog_lvfs.c
@@ -108,7 +108,7 @@ static int llog_lvfs_write_blob(struct obd_device *obd, struct l_file *file,
  
          file->f_pos = off;
  
-        if (buflen == 0) 
+        if (buflen == 0)
                  CWARN("0-length record\n");
  
          if (!buf) {
@@ -244,8 +244,8 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle,
                  RETURN(rc);
  
          if (buf)
-                /* write_blob adds header and tail to lrh_len. */ 
-                reclen = sizeof(*rec) + rec->lrh_len + 
+                /* write_blob adds header and tail to lrh_len. */
+                reclen = sizeof(*rec) + rec->lrh_len +
                           sizeof(struct llog_rec_tail);
  
          if (idx != -1) {
@@ -260,7 +260,7 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle,
                  if (idx && llh->llh_size && llh->llh_size != rec->lrh_len)
                          RETURN(-EINVAL);
  
-                if (!ext2_test_bit(idx, llh->llh_bitmap)) 
+                if (!ext2_test_bit(idx, llh->llh_bitmap))
                          CERROR("Modify unset record %u\n", idx);
                  if (idx != rec->lrh_index)
                          CERROR("Index mismatch %d %u\n", idx, rec->lrh_index);
@@ -290,13 +290,13 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle,
                                  RETURN(-EFAULT);
                          }
  #if 1  /* FIXME remove this safety check at some point */
-                        /* Verify that the record we're modifying is the 
+                        /* Verify that the record we're modifying is the
                             right one. */
                          rc = llog_lvfs_read_blob(obd, file, &check,
                                                   sizeof(check), saved_offset);
                          if (check.lrh_index != idx || check.lrh_len != reclen) {
                                  CERROR("Bad modify idx %u/%u size %u/%u (%d)\n",
-                                       idx, check.lrh_index, reclen, 
+                                       idx, check.lrh_index, reclen,
                                         check.lrh_len, rc);
                                  RETURN(-EFAULT);
                          }
@@ -366,7 +366,7 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle,
          if (rc == 0 && reccookie) {
                  reccookie->lgc_lgl = loghandle->lgh_id;
                  reccookie->lgc_index = index;
-                if ((rec->lrh_type == MDS_UNLINK_REC) || 
+                if ((rec->lrh_type == MDS_UNLINK_REC) ||
                                  (rec->lrh_type == MDS_SETATTR_REC))
                          reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT;
                  else if (rec->lrh_type == OST_SZ_REC)
@@ -639,12 +639,12 @@ static int llog_lvfs_create(struct llog_ctxt *ctxt, struct llog_handle **res,
          } else if (name) {
                  /* COMPAT_146 */
                  if (strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME) == 0) {
-                        handle->lgh_file = llog_filp_open(MDT_LOGS_DIR, name, 
+                        handle->lgh_file = llog_filp_open(MDT_LOGS_DIR, name,
                                                            open_flags, 0644);
                  } else {
                          /* end COMPAT_146 */
                          handle->lgh_file = llog_filp_open(MOUNT_CONFIGS_DIR,
-                                                          name, open_flags, 
+                                                          name, open_flags,
                                                            0644);
                  }
                  if (IS_ERR(handle->lgh_file))
@@ -777,7 +777,7 @@ int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd,
          loff_t off = idx *  sizeof(*idarray);
          ENTRY;
  
-        if (!count) 
+        if (!count)
                  RETURN(0);
  
          push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
diff --git a/lustre/obdclass/llog_obd.c b/lustre/obdclass/llog_obd.c

index 6d24d60..804049a 100644 (file)
--- a/lustre/obdclass/llog_obd.c
+++ b/lustre/obdclass/llog_obd.c
@@ -134,7 +134,7 @@ int llog_cleanup(struct llog_ctxt *ctxt)
          /* try to free the ctxt */
          rc = __llog_ctxt_put(ctxt);
          if (rc)
-                CERROR("Error %d while cleaning up ctxt %p\n", 
+                CERROR("Error %d while cleaning up ctxt %p\n",
                         rc, ctxt);
  
          l_wait_event(olg->olg_waitq,
@@ -227,7 +227,7 @@ int llog_add(struct llog_ctxt *ctxt, struct llog_rec_hdr *rec,
                  CERROR("No ctxt\n");
                  RETURN(-ENODEV);
          }
-        
+
          CTXT_CHECK_OP(ctxt, add, -EOPNOTSUPP);
          raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
          if (!raised)
@@ -249,7 +249,7 @@ int llog_cancel(struct llog_ctxt *ctxt, struct lov_stripe_md *lsm,
                  CERROR("No ctxt\n");
                  RETURN(-ENODEV);
          }
-        
+
          CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP);
          rc = CTXTP(ctxt, cancel)(ctxt, lsm, count, cookies, flags);
          RETURN(rc);
diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c

index e13ed69..b9de3be 100644 (file)
--- a/lustre/obdclass/lprocfs_status.c
+++ b/lustre/obdclass/lprocfs_status.c
@@ -1248,15 +1248,6 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr_async);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, brw);
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats, brw_async);
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats, prep_async_page);
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats, reget_short_lock);
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats, release_short_lock);
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats, queue_async_io);
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats, queue_group_io);
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats, trigger_group_io);
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_async_flags);
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats, teardown_async_page);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, merge_lvb);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, adjust_kms);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, punch);
@@ -1267,7 +1258,6 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, enqueue);
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats, match);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, change_cbdata);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel_unused);
@@ -1286,10 +1276,6 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, ping);
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats, register_page_removal_cb);
-        LPROCFS_OBD_OP_INIT(num_private_stats,stats,unregister_page_removal_cb);
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats, register_lock_cancel_cb);
-        LPROCFS_OBD_OP_INIT(num_private_stats, stats,unregister_lock_cancel_cb);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem);
          LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add);
diff --git a/lustre/obdclass/lu_object.c b/lustre/obdclass/lu_object.c

index c4e3f2b..42d8cf1 100644 (file)
--- a/lustre/obdclass/lu_object.c
+++ b/lustre/obdclass/lu_object.c
@@ -194,10 +194,10 @@ static struct lu_object *lu_object_alloc(const struct lu_env *env,
   */
  static void lu_object_free(const struct lu_env *env, struct lu_object *o)
  {
-        struct list_head splice;
+        struct list_head  splice;
          struct lu_object *scan;
-        struct lu_site          *site;
-        struct list_head        *layers;
+        struct lu_site   *site;
+        struct list_head *layers;
  
          site   = o->lo_dev->ld_site;
          layers = &o->lo_header->loh_layers;
@@ -336,7 +336,7 @@ int lu_cdebug_printer(const struct lu_env *env,
          struct lu_cdebug_data       *key;
          int used;
          int complete;
-       va_list args;
+        va_list args;
  
          va_start(args, format);
  
@@ -352,9 +352,9 @@ int lu_cdebug_printer(const struct lu_env *env,
                    ARRAY_SIZE(key->lck_area) - used, format, args);
          if (complete) {
                  if (cdebug_show(info->lpi_mask, info->lpi_subsys))
-                libcfs_debug_msg(NULL, info->lpi_subsys, info->lpi_mask,
-                                 (char *)info->lpi_file, info->lpi_fn,
-                                 info->lpi_line, "%s", key->lck_area);
+                        libcfs_debug_msg(NULL, info->lpi_subsys, info->lpi_mask,
+                                         (char *)info->lpi_file, info->lpi_fn,
+                                         info->lpi_line, "%s", key->lck_area);
                  key->lck_area[0] = 0;
          }
          va_end(args);
@@ -367,7 +367,7 @@ EXPORT_SYMBOL(lu_cdebug_printer);
   */
  void lu_object_header_print(const struct lu_env *env, void *cookie,
                              lu_printer_t printer,
-                                   const struct lu_object_header *hdr)
+                            const struct lu_object_header *hdr)
  {
          (*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]",
                     hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref),
@@ -400,7 +400,7 @@ void lu_object_print(const struct lu_env *env, void *cookie,
                  (*printer)(env, cookie, "%*.*s%s@%p", depth, depth, ruler,
                             o->lo_dev->ld_type->ldt_name, o);
                  if (o->lo_ops->loo_object_print != NULL)
-                o->lo_ops->loo_object_print(env, cookie, printer, o);
+                        o->lo_ops->loo_object_print(env, cookie, printer, o);
                  (*printer)(env, cookie, "\n");
          }
          (*printer)(env, cookie, "} header@%p\n", top);
@@ -496,8 +496,8 @@ static struct lu_object *lu_object_find_try(const struct lu_env *env,
                                              cfs_waitlink_t *waiter)
  {
          struct lu_site    *s;
-        struct lu_object     *o;
-        struct lu_object     *shadow;
+        struct lu_object  *o;
+        struct lu_object  *shadow;
          struct hlist_head *bucket;
  
          /*
@@ -844,9 +844,12 @@ void lu_device_fini(struct lu_device *d)
          struct lu_device_type *t;
  
          t = d->ld_type;
-        if (d->ld_obd != NULL)
+        if (d->ld_obd != NULL) {
                  /* finish lprocfs */
                  lprocfs_obd_cleanup(d->ld_obd);
+                d->ld_obd->obd_lu_dev = NULL;
+                d->ld_obd = NULL;
+        }
  
          lu_ref_fini(&d->ld_reference);
          LASSERTF(atomic_read(&d->ld_ref) == 0,
@@ -1001,9 +1004,9 @@ void lu_stack_fini(const struct lu_env *env, struct lu_device *top)
                  next = ldt->ldt_ops->ldto_device_free(env, scan);
                  type = ldt->ldt_obd_type;
                  if (type != NULL) {
-                type->typ_refcnt--;
-                class_put_type(type);
-        }
+                        type->typ_refcnt--;
+                        class_put_type(type);
+                }
          }
  }
  EXPORT_SYMBOL(lu_stack_fini);
@@ -1088,6 +1091,8 @@ void lu_context_key_degister(struct lu_context_key *key)
          LASSERT(atomic_read(&key->lct_used) >= 1);
          LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
  
+        lu_context_key_quiesce(key);
+
          ++key_set_version;
          key_fini(&lu_shrink_env.le_ctx, key->lct_index);
  
@@ -1205,8 +1210,13 @@ static CFS_LIST_HEAD(lu_context_remembered);
  void lu_context_key_quiesce(struct lu_context_key *key)
  {
          struct lu_context *ctx;
+        extern unsigned cl_env_cache_purge(unsigned nr);
  
          if (!(key->lct_tags & LCT_QUIESCENT)) {
+                /*
+                 * XXX layering violation.
+                 */
+                cl_env_cache_purge(~0);
                  key->lct_tags |= LCT_QUIESCENT;
                  /*
                   * XXX memory barrier has to go here.
@@ -1263,6 +1273,7 @@ static int keys_fill(struct lu_context *ctx)
                          value = key->lct_init(ctx, key);
                          if (unlikely(IS_ERR(value)))
                                  return PTR_ERR(value);
+
                          LASSERT(key->lct_owner != NULL);
                          if (!(ctx->lc_tags & LCT_NOREF))
                                  try_module_get(key->lct_owner);
@@ -1375,30 +1386,16 @@ int lu_context_refill(struct lu_context *ctx)
  }
  EXPORT_SYMBOL(lu_context_refill);
  
-static int lu_env_setup(struct lu_env *env, struct lu_context *ses,
-                        __u32 tags, int noref)
+int lu_env_init(struct lu_env *env, __u32 tags)
  {
          int result;
  
-        LINVRNT(ergo(!noref, !(tags & LCT_NOREF)));
-
-        env->le_ses = ses;
+        env->le_ses = NULL;
          result = lu_context_init(&env->le_ctx, tags);
          if (likely(result == 0))
                  lu_context_enter(&env->le_ctx);
          return result;
  }
-
-static int lu_env_init_noref(struct lu_env *env, struct lu_context *ses,
-                             __u32 tags)
-{
-        return lu_env_setup(env, ses, tags, 1);
-}
-
-int lu_env_init(struct lu_env *env, struct lu_context *ses, __u32 tags)
-{
-        return lu_env_setup(env, ses, tags, 0);
-}
  EXPORT_SYMBOL(lu_env_init);
  
  void lu_env_fini(struct lu_env *env)
@@ -1455,6 +1452,54 @@ static int lu_cache_shrink(int nr, unsigned int gfp_mask)
          return cached;
  }
  
+/*
+ * Debugging stuff.
+ */
+
+/**
+ * Environment to be used in debugger, contains all tags.
+ */
+struct lu_env lu_debugging_env;
+
+/**
+ * Debugging printer function using printk().
+ */
+int lu_printk_printer(const struct lu_env *env,
+                      void *_, const char *format, ...)
+{
+        va_list args;
+
+        va_start(args, format);
+        vprintk(format, args);
+        va_end(args);
+        return 0;
+}
+
+void lu_debugging_setup(void)
+{
+        lu_env_init(&lu_debugging_env, ~0);
+}
+
+void lu_context_keys_dump(void)
+{
+        int i;
+
+        for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
+                struct lu_context_key *key;
+
+                key = lu_keys[i];
+                if (key != NULL) {
+                        CERROR("[%i]: %p %x (%p,%p,%p) %i %i \"%s\"@%p\n",
+                               i, key, key->lct_tags,
+                               key->lct_init, key->lct_fini, key->lct_exit,
+                               key->lct_index, atomic_read(&key->lct_used),
+                               key->lct_owner ? key->lct_owner->name : "",
+                               key->lct_owner);
+                        lu_ref_print(&key->lct_reference);
+                }
+        }
+}
+EXPORT_SYMBOL(lu_context_keys_dump);
  #else  /* !__KERNEL__ */
  static int lu_cache_shrink(int nr, unsigned int gfp_mask)
  {
@@ -1462,6 +1507,8 @@ static int lu_cache_shrink(int nr, unsigned int gfp_mask)
  }
  #endif /* __KERNEL__ */
  
+int  cl_global_init(void);
+void cl_global_fini(void);
  int  lu_ref_global_init(void);
  void lu_ref_global_fini(void);
  
@@ -1478,21 +1525,21 @@ int lu_global_init(void)
          result = lu_context_key_register(&lu_global_key);
          if (result != 0)
                  return result;
-                /*
+        /*
           * At this level, we don't know what tags are needed, so allocate them
           * conservatively. This should not be too bad, because this
           * environment is global.
-                 */
-                down(&lu_sites_guard);
-                result = lu_env_init_noref(&lu_shrink_env, NULL, LCT_SHRINKER);
-                up(&lu_sites_guard);
+         */
+        down(&lu_sites_guard);
+        result = lu_env_init(&lu_shrink_env, LCT_SHRINKER);
+        up(&lu_sites_guard);
          if (result != 0)
                  return result;
  
          result = lu_ref_global_init();
          if (result != 0)
                  return result;
-                        /*
+        /*
           * seeks estimation: 3 seeks to read a record from oi, one to read
           * inode, one for ea. Unfortunately setting this high value results in
           * lu_object/inode cache consuming all the memory.
@@ -1501,8 +1548,11 @@ int lu_global_init(void)
          if (lu_site_shrinker == NULL)
                  return -ENOMEM;
  
-                                result = lu_time_global_init();
-        return result;
+        result = lu_time_global_init();
+        if (result != 0)
+                return result;
+
+        return cl_global_init();
  }
  
  /**
@@ -1510,6 +1560,7 @@ int lu_global_init(void)
   */
  void lu_global_fini(void)
  {
+        cl_global_fini();
          lu_time_global_fini();
          if (lu_site_shrinker != NULL) {
                  remove_shrinker(lu_site_shrinker);
@@ -1566,6 +1617,7 @@ int lu_site_stats_print(const struct lu_site *s, char *page, int count)
  }
  EXPORT_SYMBOL(lu_site_stats_print);
  
+#ifdef __KERNEL__
  /*
   * XXX: Functions below logically belong to the fid module, but they are used
   * by dt_store_open(). Put them here until better place is found.
@@ -1640,6 +1692,7 @@ int fid_unpack(const struct lu_fid_pack *pack, struct lu_fid *fid)
          return result;
  }
  EXPORT_SYMBOL(fid_unpack);
+#endif  /* #ifdef __KERNEL__ */
  
  const char *lu_time_names[LU_TIME_NR] = {
          [LU_TIME_FIND_LOOKUP] = "find_lookup",
diff --git a/lustre/obdclass/lu_time.c b/lustre/obdclass/lu_time.c

index 66a8687..26513cf 100644 (file)
--- a/lustre/obdclass/lu_time.c
+++ b/lustre/obdclass/lu_time.c
@@ -161,7 +161,7 @@ unsigned long long lu_time_stamp_get(void)
          /*
           * Return timestamp with microsecond precision. This has to be cheap.
           */
-//#ifdef CONFIG_X86 
+//#ifdef CONFIG_X86
  #if defined(CONFIG_X86) && !defined(CONFIG_X86_64)
         /*
          * do_gettimeofday() goes backwards sometimes :(.  Usethe TSC
diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c

index f55604b..ea3473e 100644 (file)
--- a/lustre/obdclass/obd_mount.c
+++ b/lustre/obdclass/obd_mount.c
@@ -1164,15 +1164,15 @@ out_mgc:
  
  struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
  {
-        struct lustre_sb_info *lsi = NULL;
+        struct lustre_sb_info *lsi;
          ENTRY;
  
-        OBD_ALLOC(lsi, sizeof(*lsi));
+        OBD_ALLOC_PTR(lsi);
          if (!lsi)
                  RETURN(NULL);
-        OBD_ALLOC(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
+        OBD_ALLOC_PTR(lsi->lsi_lmd);
          if (!lsi->lsi_lmd) {
-                OBD_FREE(lsi, sizeof(*lsi));
+                OBD_FREE_PTR(lsi);
                  RETURN(NULL);
          }
  
@@ -1696,6 +1696,7 @@ int lustre_common_put_super(struct super_block *sb)
          }
          /* Drop a ref to the mounted disk */
          lustre_put_lsi(sb);
+        lu_types_stop();
          RETURN(rc);
  }
  
diff --git a/lustre/obdecho/autoMakefile.am b/lustre/obdecho/autoMakefile.am

index bd83a99..c8b7df3 100644 (file)
--- a/lustre/obdecho/autoMakefile.am
+++ b/lustre/obdecho/autoMakefile.am
@@ -68,4 +68,4 @@ endif # MODULES
  install-data-hook: $(install_data_hook)
  
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES = $(obdecho-objs:%.o=%.c)
+DIST_SOURCES = $(obdecho-objs:%.o=%.c) echo_internal.h
diff --git a/lustre/obdecho/echo.c b/lustre/obdecho/echo.c

index 663ac58..52609b0 100644 (file)
--- a/lustre/obdecho/echo.c
+++ b/lustre/obdecho/echo.c
@@ -47,11 +47,12 @@
  
  #include <obd_support.h>
  #include <obd_class.h>
-#include <obd_echo.h>
  #include <lustre_debug.h>
  #include <lustre_dlm.h>
  #include <lprocfs_status.h>
  
+#include "echo_internal.h"
+
  #define ECHO_INIT_OBJID      0x1000000000000000ULL
  #define ECHO_HANDLE_MAGIC    0xabcd0123fedc9876ULL
  
diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c

index 1be3ce0..225ec5a 100644 (file)
--- a/lustre/obdecho/echo_client.c
+++ b/lustre/obdecho/echo_client.c
@@ -44,47 +44,1129 @@
  #include <obd.h>
  #include <obd_support.h>
  #include <obd_class.h>
-#include <obd_echo.h>
  #include <lustre_debug.h>
  #include <lprocfs_status.h>
+#include <cl_object.h>
  
-static obd_id last_object_id;
+#include "echo_internal.h"
+
+/**
+ * Echo client device: the cl_device that the echo client stacks on top of
+ * its target.
+ */
+struct echo_device {
+        /** embedded generic device; cl2echo_dev() maps it back to this struct */
+        struct cl_device        ed_cl;
+        /** echo client state of the owning obd (&obd->u.echo_client) */
+        struct echo_client_obd *ed_ec;
+
+        /** storage for the site that echo_site_init() initializes */
+        struct cl_site          ed_site_myself;
+        /**
+         * set to &ed_site_myself by echo_site_init() on success, reset to
+         * NULL by echo_site_fini()
+         */
+        struct cl_site         *ed_site;
+        /** device below this one, or NULL when the target is not a cl device */
+        struct lu_device       *ed_next;
+        /** set to 1 by echo_device_alloc() when the target obd type is LOV */
+        int                     ed_next_islov;
+};
+
+/**
+ * Echo client object: top layer of the object stack built by
+ * echo_object_alloc() and echo_object_init().
+ */
+struct echo_object {
+        /** embedded generic object; cl2echo_obj() maps it back */
+        struct cl_object        eo_cl;
+        /** object header, initialized in echo_object_alloc() */
+        struct cl_object_header eo_hdr;
+
+        /** owning echo device, set in echo_object_init() */
+        struct echo_device     *eo_dev;
+        /** linkage into echo_client_obd::ec_objects, under ec_lock */
+        struct list_head        eo_obj_chain;
+        /**
+         * stripe md taken over from the caller in echo_object_init() and
+         * released in echo_object_free()
+         */
+        struct lov_stripe_md   *eo_lsm;
+        /** bumped in echo_page_init(), dropped in echo_page_fini() */
+        atomic_t                eo_npages;
+        /** non-zero once the object has been marked for destruction */
+        int                     eo_deleted;
+};
+
+/**
+ * Object configuration handed to cl_object_find() by cl_echo_object_find().
+ */
+struct echo_object_conf {
+        /** embedded generic configuration; cl2echo_conf() maps it back */
+        struct cl_object_conf  eoc_cl;
+        /**
+         * points at the caller's stripe md pointer; echo_object_init() takes
+         * the md and stores NULL through this pointer
+         */
+        struct lov_stripe_md **eoc_md;
+};
+
+/** Echo layer's slice of a cl_page. */
+struct echo_page {
+        /** embedded generic slice; cl2echo_page() maps it back */
+        struct cl_page_slice   ep_cl;
+        /** anchor that echo_page_completion() notifies and then clears */
+        struct cl_sync_io     *ep_sync_io;
+        /** VM page; referenced in echo_page_init(), released in echo_page_fini() */
+        cfs_page_t            *ep_vmpage;
+};
+
+/** Echo layer's slice of a cl_lock. */
+struct echo_lock {
+        /** embedded generic slice; cl2echo_lock() maps it back */
+        struct cl_lock_slice   el_cl;
+        /** linkage into echo_client_obd::ec_locks, under ec_lock */
+        struct list_head       el_chain;
+        /** echo object the lock was created for, set in echo_lock_init() */
+        struct echo_object    *el_object;
+        /** cookie handed back to the caller by cl_echo_enqueue() */
+        __u64                  el_cookie;
+};
+
+/** Echo layer's slice of a cl_io; carries no echo-specific state. */
+struct echo_io {
+        struct cl_io_slice     ei_cl;
+};
  
  #if 0
-static void
-echo_printk_object (char *msg, struct ec_object *eco)
+struct echo_req {
+        struct cl_req_slice er_cl;
+};
+#endif
+
+static int echo_client_setup(struct obd_device *obddev,
+                             struct lustre_cfg *lcfg);
+static int echo_client_cleanup(struct obd_device *obddev);
+
+
+/** \defgroup echo_helpers
+ * @{
+ */
+static inline struct echo_device *cl2echo_dev(const struct cl_device *dev)
  {
-        struct lov_stripe_md *lsm = eco->eco_lsm;
-        int                   i;
+        return container_of0(dev, struct echo_device, ed_cl);
+}
  
-        CDEBUG(D_INFO, "%s: object %p: "LPX64", refs %d%s: "LPX64"=%u!%u\n",
-               msg, eco, eco->eco_id, eco->eco_refcount,
-               eco->eco_deleted ? "(deleted) " : "",
-               lsm->lsm_object_id, lsm->lsm_stripe_size,
-               lsm->lsm_stripe_count);
+/** Returns the generic cl_device embedded in echo device \a d. */
+static inline struct cl_device *echo_dev2cl(struct echo_device *d)
+{
+        struct cl_device *cd = &d->ed_cl;
+
+        return cd;
+}
  
-        for (i = 0; i < lsm->lsm_stripe_count; i++)
-                CDEBUG(D_INFO, "@%2u:"LPX64"\n",
-                       lsm->lsm_oinfo[i].loi_ost_idx,
-                       lsm->lsm_oinfo[i].loi_id);
+static inline struct echo_device *obd2echo_dev(const struct obd_device *obd)
+{
+        return cl2echo_dev(lu2cl_dev(obd->obd_lu_dev));
  }
+
+/** Returns the generic cl_object embedded in echo object \a eco. */
+static inline struct cl_object *echo_obj2cl(struct echo_object *eco)
+{
+        struct cl_object *clob = &eco->eo_cl;
+
+        return clob;
+}
+
+/** Maps a cl_object embedded as echo_object::eo_cl back to its echo_object. */
+static inline struct echo_object *cl2echo_obj(const struct cl_object *o)
+{
+        struct echo_object *eco = container_of(o, struct echo_object, eo_cl);
+
+        return eco;
+}
+
+/** Maps a page slice embedded as echo_page::ep_cl back to its echo_page. */
+static inline struct echo_page *cl2echo_page(const struct cl_page_slice *s)
+{
+        struct echo_page *ep = container_of(s, struct echo_page, ep_cl);
+
+        return ep;
+}
+
+/** Maps a lock slice embedded as echo_lock::el_cl back to its echo_lock. */
+static inline struct echo_lock *cl2echo_lock(const struct cl_lock_slice *s)
+{
+        struct echo_lock *ecl = container_of(s, struct echo_lock, el_cl);
+
+        return ecl;
+}
+
+/** Returns the cl_lock that the slice of echo lock \a ecl is attached to. */
+static inline struct cl_lock *echo_lock2cl(const struct echo_lock *ecl)
+{
+        return ecl->el_cl.cls_lock;
+}
+
+static struct lu_context_key echo_thread_key;
+/**
+ * Looks up the echo_thread_info registered under echo_thread_key in the
+ * context of \a env. Asserts that a value is present.
+ */
+static inline struct echo_thread_info *echo_env_info(const struct lu_env *env)
+{
+        struct echo_thread_info *info = lu_context_key_get(&env->le_ctx,
+                                                           &echo_thread_key);
+
+        LASSERT(info != NULL);
+        return info;
+}
+
+/**
+ * Maps a configuration embedded as echo_object_conf::eoc_cl back to its
+ * echo_object_conf.
+ */
+static inline
+struct echo_object_conf *cl2echo_conf(const struct cl_object_conf *c)
+{
+        struct echo_object_conf *econf;
+
+        econf = container_of(c, struct echo_object_conf, eoc_cl);
+        return econf;
+}
+
+/**
+ * Builds a fid from the object id and group stored in \a lsm.
+ *
+ * The sequence is the group shifted left by 16 bits, or-ed with the bits of
+ * the object id above bit 31; the oid field receives the object id itself
+ * (presumably truncated to the width of f_oid -- confirm against lu_fid).
+ */
+static inline void lsm2fid(struct lov_stripe_md *lsm, struct lu_fid *fid)
+{
+        fid_zero(fid);
+        fid->f_oid = lsm->lsm_object_id;
+        fid->f_seq = (lsm->lsm_object_gr << 16) | (lsm->lsm_object_id >> 32);
+}
+/** @} echo_helpers */
+
+static struct echo_object *cl_echo_object_find(struct echo_device *d,
+                                               struct lov_stripe_md **lsm);
+static int cl_echo_object_put(struct echo_object *eco);
+static int cl_echo_enqueue   (struct echo_object *eco, obd_off start,
+                              obd_off end, int mode, __u64 *cookie);
+static int cl_echo_cancel    (struct echo_device *d, __u64 cookie);
+static int cl_echo_object_brw(struct echo_object *eco, int rw, obd_off offset,
+                              cfs_page_t **pages, int npages, int async);
+
+static struct echo_thread_info *echo_env_info(const struct lu_env *env);
+
+/**
+ * Per-thread scratch area, allocated by echo_thread_key_init() and fetched
+ * with echo_env_info(), so that these structures need not live on the stack.
+ */
+struct echo_thread_info {
+        /** object configuration filled in by cl_echo_object_find() */
+        struct echo_object_conf eti_conf;
+        /** md wrapper used by cl_echo_object_find() when the target is LOV */
+        struct lustre_md        eti_md;
+
+        struct cl_2queue        eti_queue;
+        /** io used by cl_echo_enqueue() */
+        struct cl_io            eti_io;
+        struct cl_sync_io       eti_anchor;
+        /** lock description filled in by cl_echo_enqueue() */
+        struct cl_lock_descr    eti_descr;
+        /** fid computed by lsm2fid() in cl_echo_object_find() */
+        struct lu_fid           eti_fid;
+};
+
+/* No session used right now */
+struct echo_session_info {
+        /* placeholder member; no session state is kept yet */
+        unsigned long dummy;
+};
+
+static cfs_mem_cache_t *echo_page_kmem;
+static cfs_mem_cache_t *echo_lock_kmem;
+static cfs_mem_cache_t *echo_object_kmem;
+static cfs_mem_cache_t *echo_thread_kmem;
+static cfs_mem_cache_t *echo_session_kmem;
+//static cfs_mem_cache_t *echo_req_kmem;
+
+static struct lu_kmem_descr echo_caches[] = {
+        {
+                .ckd_cache = &echo_page_kmem,
+                .ckd_name  = "echo_page_kmem",
+                .ckd_size  = sizeof (struct echo_page)
+        },
+        {
+                .ckd_cache = &echo_lock_kmem,
+                .ckd_name  = "echo_lock_kmem",
+                .ckd_size  = sizeof (struct echo_lock)
+        },
+        {
+                .ckd_cache = &echo_object_kmem,
+                .ckd_name  = "echo_object_kmem",
+                .ckd_size  = sizeof (struct echo_object)
+        },
+        {
+                .ckd_cache = &echo_thread_kmem,
+                .ckd_name  = "echo_thread_kmem",
+                .ckd_size  = sizeof (struct echo_thread_info)
+        },
+        {
+                .ckd_cache = &echo_session_kmem,
+                .ckd_name  = "echo_session_kmem",
+                .ckd_size  = sizeof (struct echo_session_info)
+        },
+#if 0
+        {
+                .ckd_cache = &echo_req_kmem,
+                .ckd_name  = "echo_req_kmem",
+                .ckd_size  = sizeof (struct echo_req)
+        },
  #endif
+        {
+                .ckd_cache = NULL
+        }
+};
+
+/** \defgroup echo_page echo_page
+ *
+ * Echo page operations.
+ *
+ * @{
+ */
+/**
+ * cl_page_operations::cpo_vmpage() method: returns the VM page recorded in
+ * the echo slice.
+ *
+ * NOTE(review): unlike the other page methods this one is not static --
+ * confirm whether external linkage is intended.
+ */
+cfs_page_t *echo_page_vmpage(const struct lu_env *env,
+                             const struct cl_page_slice *slice)
+{
+        return cl2echo_page(slice)->ep_vmpage;
+}
+
+/**
+ * cl_page_operations::cpo_discard() method: deletes the cl_page that the
+ * slice belongs to. The io argument is unused.
+ */
+static void echo_page_discard(const struct lu_env *env,
+                              const struct cl_page_slice *slice,
+                              struct cl_io *_)
+{
+        cl_page_delete(env, slice->cpl_page);
+}
+
+/**
+ * cl_page_operations::cpo_is_vmlocked() method: unconditionally returns 1
+ * for echo pages.
+ */
+static int echo_page_is_vmlocked(const struct lu_env *env,
+                                 const struct cl_page_slice *slice)
+{
+        return 1;
+}
+
+/**
+ * cl_page_operations::cpo_completion() method, shared by reads and writes.
+ *
+ * Detaches the sync-io anchor from the page and reports \a ioret to it. An
+ * anchor must be attached when this is called.
+ */
+static void echo_page_completion(const struct lu_env *env,
+                                 const struct cl_page_slice *slice,
+                                 int ioret)
+{
+        struct echo_page  *ecp = cl2echo_page(slice);
+        struct cl_sync_io *anchor;
+        ENTRY;
+
+        anchor = ecp->ep_sync_io;
+        LASSERT(anchor != NULL);
+
+        /* detach first, then signal the waiter */
+        ecp->ep_sync_io = NULL;
+        cl_sync_io_note(anchor, ioret);
+        EXIT;
+}
+
+/**
+ * cl_page_operations::cpo_fini() method: undoes echo_page_init().
+ *
+ * Drops the object's page count, releases the VM page reference and frees
+ * the slice.
+ */
+static void echo_page_fini(const struct lu_env *env,
+                           struct cl_page_slice *slice)
+{
+        struct echo_object *eco = cl2echo_obj(slice->cpl_obj);
+        struct echo_page   *ep  = cl2echo_page(slice);
+        ENTRY;
+
+        atomic_dec(&eco->eo_npages);
+        page_cache_release(ep->ep_vmpage);
+        OBD_SLAB_FREE_PTR(ep, echo_page_kmem);
+        EXIT;
+}
+
+/**
+ * cl_page_operations::cpo_prep() method, shared by reads and writes: nothing
+ * to prepare, always returns 0.
+ */
+static int echo_page_prep(const struct lu_env *env,
+                          const struct cl_page_slice *slice,
+                          struct cl_io *_)
+{
+        return 0;
+}
+
+/**
+ * cl_page_operations::cpo_print() method: prints the addresses of the echo
+ * slice and of its VM page through \a printer. Always returns 0.
+ */
+static int echo_page_print(const struct lu_env *env,
+                           const struct cl_page_slice *slice,
+                           void *cookie, lu_printer_t printer)
+{
+        struct echo_page *ep = cl2echo_page(slice);
+
+        printer(env, cookie, LUSTRE_ECHO_CLIENT_NAME"-page@%p vm@%p\n",
+                ep, ep->ep_vmpage);
+
+        return 0;
+}
+
+/**
+ * Page method table installed on every echo page slice by echo_page_init().
+ * Reads and writes share the same prep and completion handlers.
+ */
+static const struct cl_page_operations echo_page_ops = {
+        .cpo_discard       = echo_page_discard,
+        .cpo_vmpage        = echo_page_vmpage,
+        .cpo_fini          = echo_page_fini,
+        .cpo_print         = echo_page_print,
+        .cpo_is_vmlocked   = echo_page_is_vmlocked,
+        .io = {
+                [CRT_READ] = {
+                        .cpo_prep        = echo_page_prep,
+                        .cpo_completion  = echo_page_completion,
+                },
+                [CRT_WRITE] = {
+                        .cpo_prep        = echo_page_prep,
+                        .cpo_completion  = echo_page_completion,
+                }
+        }
+};
+/** @} echo_page */
+
+/** \defgroup echo_lock echo_lock
+ *
+ * echo lock operations
+ *
+ * @{
+ */
+/**
+ * cl_lock_operations::clo_fini() method: frees the echo lock slice. The lock
+ * must already be off the echo client's lock list.
+ */
+static void echo_lock_fini(const struct lu_env *env,
+                           struct cl_lock_slice *slice)
+{
+        struct echo_lock *ecl = cl2echo_lock(slice);
+
+        LASSERT(list_empty(&ecl->el_chain));
+        OBD_SLAB_FREE_PTR(ecl, echo_lock_kmem);
+}
+
+/**
+ * cl_lock_operations::clo_delete() method: only checks that the lock is no
+ * longer on the echo client's lock list.
+ */
+static void echo_lock_delete(const struct lu_env *env,
+                             const struct cl_lock_slice *slice)
+{
+        struct echo_lock *ecl      = cl2echo_lock(slice);
+
+        LASSERT(list_empty(&ecl->el_chain));
+}
+
+/**
+ * cl_lock_operations::clo_fits_into() method: the echo layer places no
+ * restriction of its own, so it always returns 1.
+ */
+static int echo_lock_fits_into(const struct lu_env *env,
+                               const struct cl_lock_slice *slice,
+                               const struct cl_lock_descr *need,
+                               const struct cl_io *_)
+{
+        return 1;
+}
+
+/**
+ * Lock method table installed on every echo lock slice by echo_lock_init().
+ *
+ * NOTE(review): the sibling tables echo_page_ops and echo_cl_obj_ops are
+ * const; this one could presumably be const too -- confirm against the
+ * prototype of cl_lock_slice_add().
+ */
+static struct cl_lock_operations echo_lock_ops = {
+        .clo_fini      = echo_lock_fini,
+        .clo_delete    = echo_lock_delete,
+        .clo_fits_into = echo_lock_fits_into
+};
+
+/** @} echo_lock */
+
+/** \defgroup echo_cl_ops echo_cl_ops
+ *
+ * operations for cl_object
+ *
+ * @{
+ */
+/**
+ * cl_object_operations::coo_page_init() method.
+ *
+ * Allocates the echo slice for \a page, takes a reference on \a vmpage,
+ * attaches the slice and bumps the object's page count.
+ *
+ * \retval NULL             on success
+ * \retval ERR_PTR(-ENOMEM) if the slice cannot be allocated
+ */
+static struct cl_page *echo_page_init(const struct lu_env *env,
+                                      struct cl_object *obj,
+                                      struct cl_page *page, cfs_page_t *vmpage)
+{
+        struct echo_object *eco;
+        struct echo_page   *ep;
+        ENTRY;
+
+        OBD_SLAB_ALLOC_PTR(ep, echo_page_kmem);
+        if (ep == NULL)
+                RETURN(ERR_PTR(-ENOMEM));
+
+        eco = cl2echo_obj(obj);
+        ep->ep_vmpage = vmpage;
+        page_cache_get(vmpage);
+        cl_page_slice_add(page, &ep->ep_cl, obj, &echo_page_ops);
+        atomic_inc(&eco->eo_npages);
+
+        /* ERR_PTR(0) is the NULL pointer, which signals success here */
+        RETURN(ERR_PTR(0));
+}
+
+/**
+ * cl_object_operations::coo_io_init() method: the echo layer adds nothing to
+ * the io, always returns 0.
+ */
+static int echo_io_init(const struct lu_env *env, struct cl_object *obj,
+                        struct cl_io *io)
+{
+        return 0;
+}
+
+/**
+ * cl_object_operations::coo_lock_init() method.
+ *
+ * Allocates the echo slice for \a lock, attaches it and initializes its list
+ * linkage.
+ *
+ * \retval 0       on success
+ * \retval -ENOMEM if the slice cannot be allocated
+ */
+static int echo_lock_init(const struct lu_env *env,
+                          struct cl_object *obj, struct cl_lock *lock,
+                          const struct cl_io *_)
+{
+        struct echo_lock *el;
+        ENTRY;
+
+        OBD_SLAB_ALLOC_PTR(el, echo_lock_kmem);
+        if (el == NULL)
+                RETURN(-ENOMEM);
+
+        cl_lock_slice_add(lock, &el->el_cl, obj, &echo_lock_ops);
+        el->el_object = cl2echo_obj(obj);
+        CFS_INIT_LIST_HEAD(&el->el_chain);
+        RETURN(0);
+}
+
+/**
+ * cl_object_operations::coo_conf_set() method: configuration changes are
+ * ignored, always returns 0.
+ */
+static int echo_conf_set(const struct lu_env *env, struct cl_object *obj,
+                         const struct cl_object_conf *conf)
+{
+        return 0;
+}
+
+/** cl_object method table installed on echo objects by echo_object_alloc(). */
+static const struct cl_object_operations echo_cl_obj_ops = {
+        .coo_page_init = echo_page_init,
+        .coo_lock_init = echo_lock_init,
+        .coo_io_init   = echo_io_init,
+        .coo_conf_set  = echo_conf_set
+};
+/** @} echo_cl_ops */
+
+/** \defgroup echo_lu_ops echo_lu_ops
+ *
+ * operations for echo lu object.
+ *
+ * @{
+ */
+/**
+ * lu_object_operations::loo_object_init() method.
+ *
+ * If a device is stacked below, allocates its layer of the object and adds
+ * it under \a obj. Then takes ownership of the stripe md passed through the
+ * echo_object_conf and links the object into the echo client's object list.
+ *
+ * \retval 0       on success
+ * \retval -ENOMEM if the lower layer cannot be allocated
+ */
+static int echo_object_init(const struct lu_env *env, struct lu_object *obj,
+                            const struct lu_object_conf *conf)
+{
+        const struct cl_object_conf *cconf = lu2cl_conf(conf);
+        struct echo_object_conf *econf = cl2echo_conf(cconf);
+        struct echo_device *ed         = cl2echo_dev(lu2cl_dev(obj->lo_dev));
+        struct echo_client_obd *ec     = ed->ed_ec;
+        struct echo_object *eco        = cl2echo_obj(lu2cl(obj));
+        ENTRY;
+
+        if (ed->ed_next) {
+                struct lu_object  *below;
+                struct lu_device  *under;
+
+                under = ed->ed_next;
+                below = under->ld_ops->ldo_object_alloc(env, obj->lo_header,
+                                                        under);
+                /*
+                 * NOTE(review): this return happens before eo_dev is set and
+                 * before eo_obj_chain is linked, while echo_object_free()
+                 * dereferences eo_dev and unlinks eo_obj_chain. Confirm that
+                 * the caller's failure path cannot reach echo_object_free().
+                 */
+                if (below == NULL)
+                        RETURN(-ENOMEM);
+                lu_object_add(obj, below);
+        }
+
+        LASSERT(econf->eoc_md);
+        eco->eo_lsm = *econf->eoc_md;
+        eco->eo_dev = ed;
+        atomic_set(&eco->eo_npages, 0);
+
+        /* clear the lsm pointer so that it won't get freed. */
+        *econf->eoc_md = NULL;
+
+        /* make the object visible to echo_device_free() */
+        spin_lock(&ec->ec_lock);
+        list_add_tail(&eco->eo_obj_chain, &ec->ec_objects);
+        spin_unlock(&ec->ec_lock);
+
+        RETURN(0);
+}
+
+/**
+ * lu_object_operations::loo_object_free() method.
+ *
+ * Unlinks the object from the echo client's object list, finalizes the
+ * lu_object and its header, releases the stripe md (if any) and frees the
+ * object. All pages must be gone by the time this is called.
+ */
+static void echo_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+        struct echo_object *eco    = cl2echo_obj(lu2cl(obj));
+        struct echo_client_obd *ec = eco->eo_dev->ed_ec;
+        struct lov_stripe_md *lsm  = eco->eo_lsm;
+        ENTRY;
+
+        LASSERT(atomic_read(&eco->eo_npages) == 0);
+
+        spin_lock(&ec->ec_lock);
+        list_del_init(&eco->eo_obj_chain);
+        spin_unlock(&ec->ec_lock);
+
+        lu_object_fini(obj);
+        lu_object_header_fini(obj->lo_header);
+
+        /* lsm was read into a local above, before the object is torn down */
+        if (lsm)
+                obd_free_memmd(ec->ec_exp, &lsm);
+        OBD_SLAB_FREE_PTR(eco, echo_object_kmem);
+        EXIT;
+}
+
+/**
+ * lu_object_operations::loo_object_print() method: prints the address of the
+ * echo object through \a p and returns whatever the printer returns.
+ */
+static int echo_object_print(const struct lu_env *env, void *cookie,
+                            lu_printer_t p, const struct lu_object *o)
+{
+        struct echo_object *obj = cl2echo_obj(lu2cl(o));
+        int printed;
+
+        printed = p(env, cookie, "echoclient-object@%p", obj);
+        return printed;
+}
+
+
+/**
+ * lu_object method table installed on echo objects by echo_object_alloc().
+ * Only init, free and print are provided; the remaining methods are left
+ * NULL.
+ */
+static const struct lu_object_operations echo_lu_obj_ops = {
+        .loo_object_init      = echo_object_init,
+        .loo_object_delete    = NULL,
+        .loo_object_release   = NULL,
+        .loo_object_free      = echo_object_free,
+        .loo_object_print     = echo_object_print,
+        .loo_object_invariant = NULL
+};
+/** @} echo_lu_ops */
+
+/** \defgroup echo_lu_dev_ops
+ *
+ * Operations for echo lu device.
+ *
+ * @{
+ */
+/**
+ * lu_device_operations::ldo_object_alloc() method.
+ *
+ * Allocates an echo object together with its embedded header, makes it the
+ * top layer of that header and installs the echo method tables. The echo
+ * device is always the top device, so \a hdr must be NULL.
+ *
+ * \return the new lu_object, or NULL if the allocation fails
+ */
+static struct lu_object *echo_object_alloc(const struct lu_env *env,
+                                         const struct lu_object_header *hdr,
+                                         struct lu_device *dev)
+{
+        struct cl_object_header *coh;
+        struct echo_object      *eco;
+        struct lu_object        *obj;
+        ENTRY;
+
+        /* we're the top dev. */
+        LASSERT(hdr == NULL);
+        OBD_SLAB_ALLOC_PTR(eco, echo_object_kmem);
+        if (eco == NULL)
+                RETURN(NULL);
+
+        coh = &eco->eo_hdr;
+        obj = &echo_obj2cl(eco)->co_lu;
+        cl_object_header_init(coh);
+        lu_object_init(obj, &coh->coh_lu, dev);
+        lu_object_add_top(&coh->coh_lu, obj);
+
+        eco->eo_cl.co_ops = &echo_cl_obj_ops;
+        obj->lo_ops       = &echo_lu_obj_ops;
+        RETURN(obj);
+}
+
+/** lu_device method table installed by echo_device_alloc(). */
+static struct lu_device_operations echo_device_lu_ops = {
+        .ldo_object_alloc   = echo_object_alloc,
+};
+/** @} echo_lu_dev_ops */
+
+/** cl_device method table installed by echo_device_alloc(); no methods set. */
+static struct cl_device_operations echo_device_cl_ops = {
+};
+
+/** \defgroup echo_init echo_init
+ *
+ * Init and fini functions for echo client.
+ *
+ * @{
+ */
+/**
+ * Initializes the site embedded in \a ed and publishes it as ed->ed_site.
+ *
+ * On failure the site is left finalized and ed->ed_site is not set, so the
+ * caller must not call cl_site_fini() or rely on echo_site_fini() for it.
+ *
+ * \retval 0   on success
+ * \retval -ve error from cl_site_init() or lu_site_init_finish()
+ */
+static int echo_site_init(const struct lu_env *env, struct echo_device *ed)
+{
+        struct cl_site *site = &ed->ed_site_myself;
+        int rc;
+
+        /* initialize site */
+        rc = cl_site_init(site, &ed->ed_cl);
+        if (rc) {
+                CERROR("Cannot initilize site for echo client(%d)\n", rc);
+                return rc;
+        }
+
+        rc = lu_site_init_finish(&site->cs_lu);
+        if (rc) {
+                /*
+                 * ed->ed_site is still unset, so echo_site_fini() would skip
+                 * this site; undo cl_site_init() here to avoid leaking it.
+                 */
+                cl_site_fini(site);
+                return rc;
+        }
+
+        ed->ed_site = site;
+        return 0;
+}
  
-static struct ec_object *
-echo_find_object_locked (struct obd_device *obd, obd_id id)
+static void echo_site_fini(const struct lu_env *env, struct echo_device *ed)
  {
-        struct echo_client_obd *ec = &obd->u.echo_client;
-        struct ec_object       *eco = NULL;
+        if (ed->ed_site) {
+                cl_site_fini(ed->ed_site);
+                ed->ed_site = NULL;
+        }
+}
+
+/**
+ * lu_context_key::lct_init() method for echo_thread_key: allocates the
+ * per-thread echo_thread_info.
+ *
+ * \return the new value, or ERR_PTR(-ENOMEM) if the allocation fails
+ */
+static void *echo_thread_key_init(const struct lu_context *ctx,
+                          struct lu_context_key *key)
+{
+        struct echo_thread_info *info;
+
+        OBD_SLAB_ALLOC_PTR(info, echo_thread_kmem);
+        if (info != NULL)
+                return info;
+        return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * lu_context_key::lct_fini() method for echo_thread_key: frees the value
+ * allocated by echo_thread_key_init().
+ */
+static void echo_thread_key_fini(const struct lu_context *ctx,
+                         struct lu_context_key *key, void *data)
+{
+        struct echo_thread_info *info = data;
+        OBD_SLAB_FREE_PTR(info, echo_thread_kmem);
+}
+
+/** lu_context_key::lct_exit() method for echo_thread_key: nothing to do. */
+static void echo_thread_key_exit(const struct lu_context *ctx,
+                         struct lu_context_key *key, void *data)
+{
+}
+
+/**
+ * Context key under which echo_env_info() finds the per-thread
+ * echo_thread_info; tagged LCT_CL_THREAD.
+ */
+static struct lu_context_key echo_thread_key = {
+        .lct_tags = LCT_CL_THREAD,
+        .lct_init = echo_thread_key_init,
+        .lct_fini = echo_thread_key_fini,
+        .lct_exit = echo_thread_key_exit
+};
+
+/**
+ * lu_context_key::lct_init() method for echo_session_key: allocates the
+ * per-session echo_session_info.
+ *
+ * \return the new value, or ERR_PTR(-ENOMEM) if the allocation fails
+ */
+static void *echo_session_key_init(const struct lu_context *ctx,
+                                  struct lu_context_key *key)
+{
+        struct echo_session_info *session;
+
+        OBD_SLAB_ALLOC_PTR(session, echo_session_kmem);
+        if (session != NULL)
+                return session;
+        return ERR_PTR(-ENOMEM);
+}
+
+/**
+ * lu_context_key::lct_fini() method for echo_session_key: frees the value
+ * allocated by echo_session_key_init().
+ */
+static void echo_session_key_fini(const struct lu_context *ctx,
+                                 struct lu_context_key *key, void *data)
+{
+        struct echo_session_info *session = data;
+        OBD_SLAB_FREE_PTR(session, echo_session_kmem);
+}
+
+/** lu_context_key::lct_exit() method for echo_session_key: nothing to do. */
+static void echo_session_key_exit(const struct lu_context *ctx,
+                                 struct lu_context_key *key, void *data)
+{
+}
+
+/** Context key for the per-session echo_session_info; tagged LCT_SESSION. */
+static struct lu_context_key echo_session_key = {
+        .lct_tags = LCT_SESSION,
+        .lct_init = echo_session_key_init,
+        .lct_fini = echo_session_key_fini,
+        .lct_exit = echo_session_key_exit
+};
+
+LU_TYPE_INIT_FINI(echo, &echo_thread_key, &echo_session_key);
+
+/**
+ * lu_device_type_operations::ldto_device_alloc() method.
+ *
+ * Allocates an echo device, initializes its cl_device and site, sets up the
+ * echo client obd named by string 0 of \a cfg and, if the target obd named by
+ * string 1 carries a cl device, attaches that device to the echo site and
+ * initializes it as the next layer.
+ *
+ * The \a cleanup counter records how far setup got; the switch at \a out
+ * unwinds from that stage downwards by deliberate fall-through.
+ *
+ * \return the lu_device embedded in the new echo device on success, or
+ *         ERR_PTR(rc): -ENOMEM if the device cannot be allocated, -EBUSY if
+ *         the target device already belongs to a site, or the error of the
+ *         step that failed
+ */
+static struct lu_device *echo_device_alloc(const struct lu_env *env,
+                                           struct lu_device_type *t,
+                                           struct lustre_cfg *cfg)
+{
+        struct lu_device   *next;
+        struct echo_device *ed;
+        struct cl_device   *cd;
+        struct obd_device  *obd = NULL; /* to keep compiler happy */
+        struct obd_device  *tgt;
+        const char *tgt_type_name;
+        int rc;
+        int cleanup = 0;
+        ENTRY;
+
+        OBD_ALLOC_PTR(ed);
+        if (ed == NULL)
+                GOTO(out, rc = -ENOMEM);
+
+        cleanup = 1;
+        cd = &ed->ed_cl;
+        rc = cl_device_init(cd, t);
+        if (rc)
+                GOTO(out, rc);
+
+        cd->cd_lu_dev.ld_ops = &echo_device_lu_ops;
+        cd->cd_ops = &echo_device_cl_ops;
+
+        cleanup = 2;
+        rc = echo_site_init(env, ed);
+        if (rc)
+                GOTO(out, rc);
+
+        cleanup = 3;
+        obd = class_name2obd(lustre_cfg_string(cfg, 0));
+        LASSERT(obd != NULL);
+        rc = echo_client_setup(obd, cfg);
+        if (rc)
+                GOTO(out, rc);
+        ed->ed_ec = &obd->u.echo_client;
+
+        cleanup = 4;
+        tgt = class_name2obd(lustre_cfg_string(cfg, 1));
+        LASSERT(tgt != NULL);
+        next = tgt->obd_lu_dev;
+        if (!lu_device_is_cl(next))
+                next = NULL;
+
+        /*
+         * if echo client is to be stacked upon ost device, the next is NULL
+         * since ost is not a clio device so far
+         */
+        tgt_type_name = tgt->obd_type->typ_name;
+        if (next != NULL) {
+                if (next->ld_site != NULL)
+                        GOTO(out, rc = -EBUSY);
+
+                next->ld_site = &ed->ed_site->cs_lu;
+                rc = next->ld_type->ldt_ops->ldto_device_init(env, next,
+                                             next->ld_type->ldt_name, NULL);
+                if (rc) {
+                        /*
+                         * The site is embedded in ed, which is freed below;
+                         * detach the lower device so that it is not left
+                         * pointing at freed memory (and reporting -EBUSY on
+                         * the next attempt).
+                         */
+                        next->ld_site = NULL;
+                        GOTO(out, rc);
+                }
+
+                /* Tricky case, I have to determine the obd type since clio
+                 * uses the different parameters to initialize objects for
+                 * lov & osc.
+                 */
+                if (strcmp(tgt_type_name, LUSTRE_LOV_NAME) == 0)
+                        ed->ed_next_islov = 1;
+                else
+                        LASSERT(strcmp(tgt_type_name, LUSTRE_OSC_NAME) == 0);
+        } else
+                LASSERT(strcmp(tgt_type_name, LUSTRE_OST_NAME) == 0);
+
+        ed->ed_next = next;
+        RETURN(&cd->cd_lu_dev);
+
+out:
+        switch(cleanup) {
+        case 4: {
+                int rc2;
+                rc2 = echo_client_cleanup(obd);
+                if (rc2)
+                        CERROR("Cleanup obd device %s error(%d)\n",
+                               obd->obd_name, rc2);
+        }
+                /* fall through */
+        case 3:
+                echo_site_fini(env, ed);
+                /* fall through */
+        case 2:
+                cl_device_fini(&ed->ed_cl);
+                /* fall through */
+        case 1:
+                OBD_FREE_PTR(ed);
+                /* fall through */
+        case 0:
+        default:
+                break;
+        }
+        RETURN(ERR_PTR(rc));
+}
+
+/**
+ * lu_device_type_operations::ldto_device_init() method.
+ *
+ * Hits LBUG() when called; the return statement only satisfies the
+ * signature. NOTE(review): presumably this is never expected to run because
+ * the echo device is the top of its stack -- confirm.
+ */
+static int echo_device_init(const struct lu_env *env, struct lu_device *d,
+                          const char *name, struct lu_device *next)
+{
+        LBUG();
+        return 0;
+}
+
+/**
+ * lu_device_type_operations::ldto_device_fini() method.
+ *
+ * Walks the devices below the echo device, calling each one's
+ * ldto_device_fini() and continuing with whatever device it returns.
+ *
+ * \return always NULL
+ */
+static struct lu_device *echo_device_fini(const struct lu_env *env,
+                                          struct lu_device *d)
+{
+        struct echo_device *ed = cl2echo_dev(lu2cl_dev(d));
+        struct lu_device   *next;
+
+        for (next = ed->ed_next; next != NULL; )
+                next = next->ld_type->ldt_ops->ldto_device_fini(env, next);
+
+        return NULL;
+}
+
+/**
+ * lu_device_type_operations::ldto_device_free() method.
+ *
+ * Tears the echo device down in this order: cancels and deletes locks still
+ * on the echo client's lock list, purges the site, marks every remaining
+ * object deleted and drops a reference on it, cleans up the echo client obd,
+ * frees the devices below, and finally finalizes and frees the echo device.
+ *
+ * \return always NULL
+ */
+static struct lu_device *echo_device_free(const struct lu_env *env,
+                                          struct lu_device *d)
+{
+        struct echo_device     *ed   = cl2echo_dev(lu2cl_dev(d));
+        struct echo_client_obd *ec   = ed->ed_ec;
+        struct lu_device       *next = ed->ed_next;
+
+        /* NOTE(review): bare printk() without a log level; looks like a
+         * debugging leftover -- consider CDEBUG(). */
+        printk("ed = %p, ec = %p, next = %p\n", ed, ec, next);
+
+        /* destroy locks */
+        spin_lock(&ec->ec_lock);
+        while (!list_empty(&ec->ec_locks)) {
+                struct echo_lock *ecl = list_entry(ec->ec_locks.next,
+                                                   struct echo_lock, el_chain);
+                struct cl_lock *lock  = echo_lock2cl(ecl);
+
+                /* unlink under the spinlock, then drop it for the lock calls
+                 * below */
+                list_del_init(&ecl->el_chain);
+                spin_unlock(&ec->ec_lock);
+
+                CERROR("echo client: pending lock %p\n", ecl);
+
+                /* extra reference held across cancel and delete; put below */
+                cl_lock_get(lock);
+                cl_unuse(env, lock);
+                cl_lock_release(env, lock, "ec enqueue", ecl->el_object);
+
+                cl_lock_mutex_get(env, lock);
+                cl_lock_cancel(env, lock);
+                cl_lock_delete(env, lock);
+                cl_lock_mutex_put(env, lock);
+                cl_lock_put(env, lock);
+
+                spin_lock(&ec->ec_lock);
+        }
+        spin_unlock(&ec->ec_lock);
+
+        LASSERT(ed->ed_site);
+        lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+
+        /* check if there are objects still alive, assume only one reference */
+        spin_lock(&ec->ec_lock);
+        while (!list_empty(&ec->ec_objects)) {
+                struct echo_object *eco;
+                eco = list_entry(ec->ec_objects.next, struct echo_object,
+                                 eo_obj_chain);
+                spin_unlock(&ec->ec_lock);
+
+                /*
+                 * The object leaves ec_objects only in echo_object_free().
+                 * NOTE(review): the return value of cl_echo_object_put() is
+                 * ignored; if the put fails, or more than one reference is
+                 * held, the same object is found again on the next pass --
+                 * confirm this cannot loop forever.
+                 */
+                eco->eo_deleted = 1;
+                cl_echo_object_put(eco);
+
+                spin_lock(&ec->ec_lock);
+        }
+        spin_unlock(&ec->ec_lock);
+
+        echo_client_cleanup(d->ld_obd);
+
+        /* each ldto_device_free() returns the next device to free, if any */
+        while (next)
+                next = next->ld_type->ldt_ops->ldto_device_free(env, next);
+
+        LASSERT(ed->ed_site == lu2cl_site(d->ld_site));
+        echo_site_fini(env, ed);
+        cl_device_fini(&ed->ed_cl);
+        OBD_FREE_PTR(ed);
+
+        return NULL;
+}
+
+/**
+ * Device type method table. The echo_type_*() entries are presumably
+ * generated by the LU_TYPE_INIT_FINI(echo, ...) invocation in this file --
+ * confirm against the macro definition.
+ */
+static const struct lu_device_type_operations echo_device_type_ops = {
+        .ldto_init = echo_type_init,
+        .ldto_fini = echo_type_fini,
+
+        .ldto_start = echo_type_start,
+        .ldto_stop  = echo_type_stop,
+
+        .ldto_device_alloc = echo_device_alloc,
+        .ldto_device_free  = echo_device_free,
+        .ldto_device_init  = echo_device_init,
+        .ldto_device_fini  = echo_device_fini
+};
+
+/**
+ * The echo client device type: a cl device named LUSTRE_ECHO_CLIENT_NAME.
+ * Also used by cl_echo_enqueue() to pick the echo slice out of a lock.
+ */
+static struct lu_device_type echo_device_type = {
+        .ldt_tags     = LU_DEVICE_CL,
+        .ldt_name     = LUSTRE_ECHO_CLIENT_NAME,
+        .ldt_ops      = &echo_device_type_ops,
+        .ldt_ctx_tags = LCT_CL_THREAD
+};
+/** @} echo_init */
+
+/** \defgroup echo_exports
+ *
+ * exporting functions to echo client
+ *
+ * @{
+ */
+
+/* Interfaces to echo client obd device */
+/**
+ * Finds the echo object described by the stripe md at \a *lsmp on device
+ * \a d.
+ *
+ * \a lsmp is handed to the object layer through echo_object_conf::eoc_md;
+ * echo_object_init() takes the md over and stores NULL through \a lsmp when
+ * it runs.
+ *
+ * \return the echo object on success; ERR_PTR() of the cl_env_get() or
+ *         cl_object_find() error; ERR_PTR(-EAGAIN) if the object found is
+ *         already marked deleted (its reference is dropped again)
+ */
+static struct echo_object *cl_echo_object_find(struct echo_device *d,
+                                               struct lov_stripe_md **lsmp)
+{
+        struct lu_env *env;
+        struct echo_thread_info *info;
+        struct echo_object_conf *conf;
+        struct lov_stripe_md    *lsm;
+        struct echo_object *eco;
+        struct cl_object   *obj;
+        struct lu_fid *fid;
+        int refcheck;
+        ENTRY;
+
+        LASSERT(lsmp);
+        lsm = *lsmp;
+        LASSERT(lsm);
+        LASSERT(lsm->lsm_object_id);
+
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN((void *)env);
+
+        /* conf, md and fid all live in the per-thread scratch area */
+        info = echo_env_info(env);
+        conf = &info->eti_conf;
+        if (d->ed_next) {
+                /* the layer below takes different parameters for osc and lov */
+                if (!d->ed_next_islov) {
+                        struct lov_oinfo *oinfo = lsm->lsm_oinfo[0];
+                        LASSERT(oinfo != NULL);
+                        oinfo->loi_id = lsm->lsm_object_id;
+                        oinfo->loi_gr = lsm->lsm_object_gr;
+                        conf->eoc_cl.u.coc_oinfo = oinfo;
+                } else {
+                        struct lustre_md *md;
+                        md = &info->eti_md;
+                        memset(md, 0, sizeof *md);
+                        md->lsm = lsm;
+                        conf->eoc_cl.u.coc_md = md;
+                }
+        }
+        conf->eoc_md = lsmp;
+
+        fid  = &info->eti_fid;
+        lsm2fid(lsm, fid);
+
+        obj = cl_object_find(env, echo_dev2cl(d), fid, &conf->eoc_cl);
+        if (IS_ERR(obj))
+                GOTO(out, eco = (void*)obj);
+
+        eco = cl2echo_obj(obj);
+        if (eco->eo_deleted) {
+                cl_object_put(env, obj);
+                eco = ERR_PTR(-EAGAIN);
+        }
+
+out:
+        cl_env_put(env, &refcheck);
+        RETURN(eco);
+}
+
+/**
+ * Drops a reference on echo object \a eco.
+ *
+ * If the object is marked deleted, its header is first flagged
+ * LU_OBJECT_HEARD_BANSHEE and the object is pruned.
+ *
+ * \retval 0   on success
+ * \retval -ve error from cl_env_get(); in that case no reference is dropped
+ */
+static int cl_echo_object_put(struct echo_object *eco)
+{
+        struct lu_env *env;
+        struct cl_object *obj = echo_obj2cl(eco);
+        int refcheck;
+        ENTRY;
+
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
+
+        /* an external function to kill an object? */
+        if (eco->eo_deleted) {
+                struct lu_object_header *loh = obj->co_lu.lo_header;
+                LASSERT(&eco->eo_hdr == luh2coh(loh));
+                set_bit(LU_OBJECT_HEARD_BANSHEE, &loh->loh_flags);
+                cl_object_prune(env, obj);
+        }
+
+        cl_object_put(env, obj);
+        cl_env_put(env, &refcheck);
+        RETURN(0);
+}
+
+/**
+ * Request a lock on the extent of an echo object described by start and end,
+ * and hand a cookie identifying that lock back through *cookie.
+ *
+ * start and end are translated with cl_index().  A mode of LCK_PW asks for
+ * CLM_WRITE; any other mode asks for CLM_READ.  After cl_wait() succeeds the
+ * echo lock slice is linked onto ec_locks and stamped with the next
+ * ec_unique value; cl_echo_cancel() looks locks up by that value.
+ *
+ * Returns 0 on success.  Otherwise returns the error from cl_env_get(),
+ * cl_io_init() or cl_wait(), or -ENOMEM when cl_lock_request() hands back
+ * no lock.  *cookie is written only on success.
+ */
+static int cl_echo_enqueue(struct echo_object *eco, obd_off start, obd_off end,
+                           int mode, __u64 *cookie)
+{
+        struct cl_object        *obj = echo_obj2cl(eco);
+        struct echo_thread_info *info;
+        struct cl_lock_descr    *descr;
+        struct cl_lock          *lck;
+        struct cl_io            *io;
+        struct lu_env           *env;
+        int                      refcheck;
+        int                      result;
+        ENTRY;
+
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
+
+        info = echo_env_info(env);
+
+        /* describe the extent to be locked */
+        descr = &info->eti_descr;
+        descr->cld_obj   = obj;
+        descr->cld_start = cl_index(obj, start);
+        descr->cld_end   = cl_index(obj, end);
+        if (mode == LCK_PW)
+                descr->cld_mode = CLM_WRITE;
+        else
+                descr->cld_mode = CLM_READ;
+
+        io = &info->eti_io;
+        io->ci_obj = obj;
+        result = cl_io_init(env, io, CIT_MISC, obj);
+        if (result < 0)
+                GOTO(out, result);
+        LASSERT(result == 0);
+
+        lck = cl_lock_request(env, io, descr, CEF_ASYNC, "ec enqueue", eco);
+        if (lck == NULL) {
+                result = -ENOMEM;
+        } else {
+                struct echo_client_obd *ec = eco->eo_dev->ed_ec;
+                struct echo_lock       *el;
+
+                result = cl_wait(env, lck);
+                if (result != 0) {
+                        cl_lock_release(env, lck, "ec enqueue", cfs_current());
+                } else {
+                        /* remember the lock so that it can be found by
+                         * cookie later on */
+                        el = cl2echo_lock(cl_lock_at(lck, &echo_device_type));
+                        spin_lock(&ec->ec_lock);
+                        list_add(&el->el_chain, &ec->ec_locks);
+                        el->el_cookie = ++ec->ec_unique;
+                        *cookie = el->el_cookie;
+                        spin_unlock(&ec->ec_lock);
+                }
+        }
+        cl_io_fini(env, io);
+
+        EXIT;
+out:
+        cl_env_put(env, &refcheck);
+        return result;
+}
+
+/**
+ * Cancel the echo lock that was registered under the given cookie.
+ *
+ * The lock is searched for on ec_locks by el_cookie and unlinked while
+ * ec_lock is held.  If one is found it is unused, released, cancelled and
+ * deleted.
+ *
+ * Returns 0 when a matching lock was found and torn down, -ENOENT when no
+ * lock carries this cookie, or the error from cl_env_get().
+ */
+static int cl_echo_cancel(struct echo_device *ed, __u64 cookie)
+{
+        struct echo_client_obd *ec = ed->ed_ec;
+        struct echo_lock       *ecl = NULL;
          struct list_head       *el;
+        int found = 0;
+        int result;
+
+        struct lu_env *env;
+        int refcheck;
+        ENTRY;
+
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
+
+        /* look the cookie up and unlink the lock while holding ec_lock */
+        spin_lock (&ec->ec_lock);
+        list_for_each (el, &ec->ec_locks) {
+                ecl = list_entry (el, struct echo_lock, el_chain);
+                CDEBUG(D_INFO, "ecl: %p, cookie: %llx\n", ecl, ecl->el_cookie);
+                found = (ecl->el_cookie == cookie);
+                if (found) {
+                        list_del_init(&ecl->el_chain);
+                        break;
+                }
+        }
+        spin_unlock (&ec->ec_lock);
+
+        /* ecl is only meaningful when found is set: after an unsuccessful
+         * scan it points at the last entry visited (or is still NULL) */
+        result = -ENOENT;
+        if (found) {
+                struct cl_lock *clk = echo_lock2cl(ecl);
+
+                /* NOTE(review): the cl_lock_get()/cl_lock_put() pair
+                 * presumably keeps clk valid after cl_lock_release() below
+                 * drops the enqueue reference - confirm. */
+                cl_lock_get(clk);
+                cl_unuse(env, clk);
+                /* same "ec enqueue" scope string as used by
+                 * cl_echo_enqueue() when requesting the lock */
+                cl_lock_release(env, clk, "ec enqueue", ecl->el_object);
+
+                cl_lock_mutex_get(env, clk);
+                cl_lock_cancel(env, clk);
+                cl_lock_delete(env, clk);
+                cl_lock_mutex_put(env, clk);
+                cl_lock_put(env, clk);
+                result = 0;
+        }
+        cl_env_put(env, &refcheck);
+        RETURN(result);
+}
+
+/**
+ * Queue every page of the input list for write through the cache.
+ *
+ * All pages are first spliced from c2_qin to c2_qout; each one is then passed
+ * to cl_page_cache_add() with CRT_WRITE.  A page that cannot be added is
+ * moved back to c2_qin.  The request type argument is not used.
+ *
+ * Returns 0 if at least one page is left on c2_qout, otherwise the first
+ * error seen (0 when there were no pages at all).
+ */
+static int cl_echo_async_brw(const struct lu_env *env, struct cl_io *io,
+                             enum cl_req_type _, struct cl_2queue *queue)
+{
+        struct cl_page *pg;
+        struct cl_page *nxt;
+        int first_err = 0;
+        ENTRY;
+
+        cl_page_list_splice(&queue->c2_qin, &queue->c2_qout);
+        cl_page_list_for_each_safe(pg, nxt, &queue->c2_qout) {
+                int rc = cl_page_cache_add(env, io, pg, CRT_WRITE);
+
+                if (rc != 0) {
+                        /* hand the page back and keep the earliest error */
+                        cl_page_list_move(&queue->c2_qin, &queue->c2_qout,
+                                          pg);
+                        if (first_err == 0)
+                                first_err = rc;
+                }
+        }
+
+        /* an error is reported only when nothing at all was queued */
+        if (!list_empty(&queue->c2_qout.pl_pages))
+                RETURN(0);
+        RETURN(first_err);
+}
+
+/**
+ * Read or write caller-supplied pages of an echo object through the cl_io
+ * stack, starting at byte offset `offset`.
+ *
+ * Every page is wrapped in a CPT_TRANSIENT cl_page, owned by the io, tagged
+ * with the per-thread sync anchor and put on the queue.  The queue is then
+ * either fed to cl_echo_async_brw() (writes with `async` set) or submitted
+ * with cl_io_submit_rw(), and the function waits on the anchor.
+ *
+ * eco:    echo object to do I/O against; its device must have ed_next set
+ * rw:     READ selects CRT_READ, any other value selects CRT_WRITE
+ * offset: byte offset of the first page; advanced by cl_page_size() per page
+ * pages:  array of npages pages, none of which may be NULL
+ * npages: number of entries in pages
+ * async:  non-zero requests the cache-add path; forced off for reads
+ *
+ * Returns 0 on success, otherwise the error of the step that failed.
+ */
+static int cl_echo_object_brw(struct echo_object *eco, int rw, obd_off offset,
+                              cfs_page_t **pages, int npages, int async)
+{
+        struct lu_env           *env;
+        struct echo_thread_info *info;
+        struct cl_object        *obj = echo_obj2cl(eco);
+        struct echo_device      *ed  = eco->eo_dev;
+        struct cl_sync_io       *anchor;
+        struct cl_2queue        *queue;
+        struct cl_io            *io;
+        struct cl_page          *clp;
+        struct echo_page        *ep;
+
+        int page_size = cl_page_size(obj);
+        int refcheck;
+        int rc;
+        int i;
+        ENTRY;
+
+        LASSERT(ed->ed_next != NULL);
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env))
+                RETURN(PTR_ERR(env));
+
+        info    = echo_env_info(env);
+        io      = &info->eti_io;
+        anchor  = &info->eti_anchor;
+        queue   = &info->eti_queue;
+
+        cl_sync_io_init(anchor, npages);
+        cl_2queue_init(queue);
+        rc = cl_io_init(env, io, CIT_MISC, obj);
+        if (rc < 0)
+                GOTO(out, rc);
+        LASSERT(rc == 0);
+
+        for (i = 0; i < npages; i++) {
+                LASSERT(pages[i]);
+                clp = cl_page_find(env, obj, cl_index(obj, offset),
+                                   pages[i], CPT_TRANSIENT);
+                if (IS_ERR(clp)) {
+                        rc = PTR_ERR(clp);
+                        break;
+                }
+                LASSERT(clp->cp_type == CPT_TRANSIENT);
+
+                rc = cl_page_own(env, io, clp);
+                if (rc) {
+                        LASSERT(clp->cp_state == CPS_FREEING);
+                        cl_page_put(env, clp);
+                        break;
+                }
+
+                ep = cl2echo_page(cl_page_at(clp, &echo_device_type));
+                ep->ep_sync_io = anchor;
+                cl_2queue_add(queue, clp);
  
-        list_for_each (el, &ec->ec_objects) {
-                eco = list_entry (el, struct ec_object, eco_obj_chain);
+                /* drop the reference count for cl_page_find, so that the page
+                 * will be freed in cl_2queue_fini. */
+                cl_page_put(env, clp);
+                offset += page_size;
+        }
  
-                if (eco->eco_id == id)
-                        return (eco);
+        if (rc == 0) {
+                enum cl_req_type typ = rw == READ ? CRT_READ : CRT_WRITE;
+
+                /* only writes can take the asynchronous cache-add path */
+                async = async && (typ == CRT_WRITE);
+                rc = (async ? cl_echo_async_brw : cl_io_submit_rw)(env, io,
+                                                                   typ, queue);
+                /* report the real direction: this path serves reads too */
+                CDEBUG(D_INFO, "echo_client %s %s returns %d\n",
+                       async ? "async" : "sync",
+                       typ == CRT_READ ? "read" : "write", rc);
+                if (rc == 0) {
+                        /*
+                         * If some pages weren't sent for any reason (e.g.,
+                         * direct-io read found up-to-date pages in the
+                         * cache), count them as completed to avoid infinite
+                         * wait.
+                         */
+                        cl_page_list_for_each(clp, &queue->c2_qin)
+                                cl_sync_io_note(anchor, +1);
+                        /* wait for the IO to be finished. */
+                        rc = cl_sync_io_wait(env, io, &queue->c2_qout, anchor);
+                }
          }
-        return (NULL);
+
+        /* common teardown, also reached when the page loop stopped early */
+        cl_2queue_discard(env, io, queue);
+        cl_2queue_disown(env, io, queue);
+        cl_2queue_fini(env, queue);
+        cl_io_fini(env, io);
+
+        EXIT;
+out:
+        cl_env_put(env, &refcheck);
+        return rc;
  }
+/** @} echo_exports */
+
+
+/* Last object id handed out locally: echo_create_object() pre-increments it
+ * whenever the lsm_object_id of a new object is still zero.
+ * NOTE(review): no locking is visible around that increment - confirm that
+ * callers are serialized. */
+static obd_id last_object_id;
  
  static int
  echo_copyout_lsm (struct lov_stripe_md *lsm, void *_ulsm, int ulsm_nob)
@@ -108,10 +1190,10 @@ echo_copyout_lsm (struct lov_stripe_md *lsm, void *_ulsm, int ulsm_nob)
  }
  
  static int
-echo_copyin_lsm (struct obd_device *obd, struct lov_stripe_md *lsm,
+echo_copyin_lsm (struct echo_device *ed, struct lov_stripe_md *lsm,
                   void *ulsm, int ulsm_nob)
  {
-        struct echo_client_obd *ec = &obd->u.echo_client;
+        struct echo_client_obd *ec = ed->ed_ec;
          int                     i;
  
          if (ulsm_nob < sizeof (*lsm))
@@ -121,93 +1203,51 @@ echo_copyin_lsm (struct obd_device *obd, struct lov_stripe_md *lsm,
                  return (-EFAULT);
  
          if (lsm->lsm_stripe_count > ec->ec_nstripes ||
-            lsm->lsm_magic != LOV_MAGIC ||
-            (lsm->lsm_stripe_size & (~CFS_PAGE_MASK)) != 0 ||
-            ((__u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL))
-                return (-EINVAL);
-
-
-        for (i = 0; i < lsm->lsm_stripe_count; i++) {
-                if (copy_from_user(lsm->lsm_oinfo[i],
-                                   ((struct lov_stripe_md *)ulsm)->lsm_oinfo[i],
-                                   sizeof(lsm->lsm_oinfo[0])))
-                        return (-EFAULT);
-        }
-        return (0);
-}
-
-static struct ec_object *
-echo_allocate_object (struct obd_device *obd)
-{
-        struct echo_client_obd *ec = &obd->u.echo_client;
-        struct ec_object       *eco;
-        int rc;
-
-        OBD_ALLOC(eco, sizeof (*eco));
-        if (eco == NULL)
-                return NULL;
-
-        rc = obd_alloc_memmd(ec->ec_exp, &eco->eco_lsm);
-        if (rc < 0) {
-                OBD_FREE(eco, sizeof (*eco));
-                return NULL;
-        }
-
-        eco->eco_device = obd;
-        eco->eco_deleted = 0;
-        eco->eco_refcount = 0;
-        eco->eco_lsm->lsm_magic = LOV_MAGIC;
-        /* leave stripe count 0 by default */
-
-        return (eco);
-}
+            lsm->lsm_magic != LOV_MAGIC ||
+            (lsm->lsm_stripe_size & (~CFS_PAGE_MASK)) != 0 ||
+            ((__u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL))
+                return (-EINVAL);
  
-static void
-echo_free_object (struct ec_object *eco)
-{
-        struct obd_device      *obd = eco->eco_device;
-        struct echo_client_obd *ec = &obd->u.echo_client;
  
-        LASSERT (eco->eco_refcount == 0);
-        if (!eco->eco_lsm)
-                CERROR("No object %s\n", obd->obd_name);
-        else
-                obd_free_memmd(ec->ec_exp, &eco->eco_lsm);
-        OBD_FREE (eco, sizeof (*eco));
+        for (i = 0; i < lsm->lsm_stripe_count; i++) {
+                if (copy_from_user(lsm->lsm_oinfo[i],
+                                   ((struct lov_stripe_md *)ulsm)->lsm_oinfo[i],
+                                   sizeof(lsm->lsm_oinfo[0])))
+                        return (-EFAULT);
+        }
+        return (0);
  }
  
-static int echo_create_object(struct obd_device *obd, int on_target,
+static int echo_create_object(struct echo_device *ed, int on_target,
                                struct obdo *oa, void *ulsm, int ulsm_nob,
                                struct obd_trans_info *oti)
  {
-        struct echo_client_obd *ec = &obd->u.echo_client;
-        struct ec_object       *eco2;
-        struct ec_object       *eco;
-        struct lov_stripe_md   *lsm;
+        struct echo_object     *eco;
+        struct echo_client_obd *ec = ed->ed_ec;
+        struct lov_stripe_md   *lsm = NULL;
          int                     rc;
-        int                     i, idx;
+        int                     created = 0;
+        ENTRY;
  
          if ((oa->o_valid & OBD_MD_FLID) == 0 && /* no obj id */
              (on_target ||                       /* set_stripe */
               ec->ec_nstripes != 0)) {           /* LOV */
                  CERROR ("No valid oid\n");
-                return (-EINVAL);
+                RETURN(-EINVAL);
          }
  
-        if (ulsm != NULL) {
-                eco = echo_allocate_object (obd);
-                if (eco == NULL)
-                        return (-ENOMEM);
+        rc = obd_alloc_memmd(ec->ec_exp, &lsm);
+        if (rc < 0) {
+                CERROR("Cannot allocate md, rc = %d\n", rc);
+                GOTO(failed, rc);
+        }
  
-                lsm = eco->eco_lsm;
+        if (ulsm != NULL) {
+                int i, idx;
  
-                rc = echo_copyin_lsm (obd, lsm, ulsm, ulsm_nob);
+                rc = echo_copyin_lsm (ed, lsm, ulsm, ulsm_nob);
                  if (rc != 0)
-                        goto failed;
-
-                /* setup object ID here for !on_target and LOV hint */
-                if ((oa->o_valid & OBD_MD_FLID) != 0)
-                        eco->eco_id = lsm->lsm_object_id = oa->o_id;
+                        GOTO(failed, rc);
  
                  if (lsm->lsm_stripe_count == 0)
                          lsm->lsm_stripe_count = ec->ec_nstripes;
@@ -225,197 +1265,91 @@ static int echo_create_object(struct obd_device *obd, int on_target,
                          lsm->lsm_oinfo[i]->loi_ost_idx =
                                  (idx + i) % ec->ec_nstripes;
                  }
-        } else {
-                OBD_ALLOC(eco, sizeof(*eco));
-                if (!eco) 
-                        return (-ENOMEM);
-                eco->eco_device = obd;
-                lsm = NULL;
          }
  
-        if (oa->o_id == 0)
-                oa->o_id = ++last_object_id;
+        /* setup object ID here for !on_target and LOV hint */
+        if (oa->o_valid & OBD_MD_FLID)
+                lsm->lsm_object_id = oa->o_id;
  
+        if (lsm->lsm_object_id == 0)
+                lsm->lsm_object_id = ++last_object_id;
+
+        rc = 0;
          if (on_target) {
                  oa->o_gr = FILTER_GROUP_ECHO;
                  oa->o_valid |= OBD_MD_FLGROUP;
  
                  rc = obd_create(ec->ec_exp, oa, &lsm, oti);
-                if (rc != 0)
-                        goto failed;
-
-                /* See what object ID we were given */
-                eco->eco_id = oa->o_id = lsm->lsm_object_id;
-                oa->o_valid |= OBD_MD_FLID;
-
-                LASSERT(eco->eco_lsm == NULL || eco->eco_lsm == lsm);
-                eco->eco_lsm = lsm;
+                if (rc != 0) {
+                        CERROR("Cannot create objects, rc = %d\n", rc);
+                        GOTO(failed, rc);
+                }
+                created = 1;
          }
  
-        spin_lock (&ec->ec_lock);
-
-        eco2 = echo_find_object_locked (obd, oa->o_id);
-        if (eco2 != NULL) {                     /* conflict */
-                spin_unlock (&ec->ec_lock);
-
-                CERROR ("Can't create object id "LPX64": id already exists%s\n",
-                        oa->o_id, on_target ? " (undoing create)" : "");
+        /* See what object ID we were given */
+        oa->o_id = lsm->lsm_object_id;
+        oa->o_valid |= OBD_MD_FLID;
  
-                if (on_target)
-                        obd_destroy(ec->ec_exp, oa, lsm, oti, NULL);
-
-                rc = -EEXIST;
-                goto failed;
-        }
+        eco = cl_echo_object_find(ed, &lsm);
+        if (IS_ERR(eco))
+                GOTO(failed, rc = PTR_ERR(eco));
+        cl_echo_object_put(eco);
  
-        list_add (&eco->eco_obj_chain, &ec->ec_objects);
-        spin_unlock (&ec->ec_lock);
-        CDEBUG (D_INFO,
-                "created %p: "LPX64"=%u#%u@%u refs %d del %d\n",
-                eco, eco->eco_id,
-                eco->eco_lsm->lsm_stripe_size,
-                eco->eco_lsm->lsm_stripe_count,
-                eco->eco_lsm->lsm_oinfo[0]->loi_ost_idx,
-                eco->eco_refcount, eco->eco_deleted);
-        return (0);
+        CDEBUG(D_INFO, "oa->o_id = %lx\n", (long)oa->o_id);
+        EXIT;
  
   failed:
-        echo_free_object (eco);
-        if (rc) 
-                CERROR("%s: err %d on create\n", obd->obd_name, rc);
+        if (created && rc)
+                obd_destroy(ec->ec_exp, oa, lsm, oti, NULL);
+        if (lsm)
+                obd_free_memmd(ec->ec_exp, &lsm);
+        if (rc)
+                CERROR("create object failed with rc = %d\n", rc);
          return (rc);
  }
  
-static int
-echo_get_object (struct ec_object **ecop, struct obd_device *obd,
-                 struct obdo *oa)
+/**
+ * Obtain the echo object named by oa through cl_echo_object_find().
+ *
+ * oa must carry OBD_MD_FLID and a non-zero o_id.  The object group is taken
+ * from oa->o_gr when OBD_MD_FLGROUP is set and defaults to FILTER_GROUP_ECHO
+ * otherwise.
+ *
+ * Returns 0 and stores the object in *ecop on success.  Returns -EINVAL for
+ * a missing or zero object id, or the error from obd_alloc_memmd() or
+ * cl_echo_object_find(); *ecop is left untouched on failure.
+ */
+static int echo_get_object(struct echo_object **ecop, struct echo_device *ed,
+                           struct obdo *oa)
  {
-        struct echo_client_obd *ec = &obd->u.echo_client;
-        struct ec_object       *eco;
-        struct ec_object       *eco2;
+        struct echo_client_obd *ec  = ed->ed_ec;
+        struct lov_stripe_md   *lsm = NULL;
+        struct echo_object     *eco;
          int                     rc;
+        ENTRY;
  
          if ((oa->o_valid & OBD_MD_FLID) == 0 ||
-            oa->o_id == 0)                      /* disallow use of object id 0 */
+            oa->o_id == 0)  /* disallow use of object id 0 */
          {
                  CERROR ("No valid oid\n");
-                return (-EINVAL);
-        }
-
-        spin_lock (&ec->ec_lock);
-        eco = echo_find_object_locked (obd, oa->o_id);
-        if (eco != NULL) {
-                if (eco->eco_deleted) {           /* being deleted */
-                        spin_unlock(&ec->ec_lock);/* (see comment in cleanup) */
-                        return (-EAGAIN);
-                }
-
-                eco->eco_refcount++;
-                spin_unlock (&ec->ec_lock);
-                *ecop = eco;
-                CDEBUG (D_INFO,
-                        "found %p: "LPX64"=%u#%u@%u refs %d del %d\n",
-                        eco, eco->eco_id,
-                        eco->eco_lsm->lsm_stripe_size,
-                        eco->eco_lsm->lsm_stripe_count,
-                        eco->eco_lsm->lsm_oinfo[0]->loi_ost_idx,
-                        eco->eco_refcount, eco->eco_deleted);
-                return (0);
+                RETURN(-EINVAL);
          }
-        spin_unlock (&ec->ec_lock);
  
-        if (ec->ec_nstripes != 0)               /* striping required */
-                return (-ENOENT);
-
-        eco = echo_allocate_object (obd);
-        if (eco == NULL)
-                return (-ENOMEM);
-
-        eco->eco_id = eco->eco_lsm->lsm_object_id = oa->o_id;
+        /* temporary md that only carries the object id and group */
+        rc = obd_alloc_memmd(ec->ec_exp, &lsm);
+        if (rc < 0)
+                RETURN(rc);
  
-        spin_lock (&ec->ec_lock);
+        lsm->lsm_object_id = oa->o_id;
+        if (oa->o_valid & OBD_MD_FLGROUP)
+                lsm->lsm_object_gr = oa->o_gr;
+        else
+                lsm->lsm_object_gr = FILTER_GROUP_ECHO;
  
-        eco2 = echo_find_object_locked (obd, oa->o_id);
-        if (eco2 == NULL) {                     /* didn't race */
-                list_add (&eco->eco_obj_chain, &ec->ec_objects);
-                spin_unlock (&ec->ec_lock);
-                eco->eco_refcount = 1;
+        rc = 0;
+        eco = cl_echo_object_find(ed, &lsm);
+        if (!IS_ERR(eco))
                  *ecop = eco;
-                CDEBUG (D_INFO,
-                        "created %p: "LPX64"=%u#%u@%d refs %d del %d\n",
-                        eco, eco->eco_id,
-                        eco->eco_lsm->lsm_stripe_size,
-                        eco->eco_lsm->lsm_stripe_count,
-                        eco->eco_lsm->lsm_oinfo[0]->loi_ost_idx,
-                        eco->eco_refcount, eco->eco_deleted);
-                return (0);
-        }
-
-        if (eco2->eco_deleted)
-                rc = -EAGAIN;                   /* lose race */
-        else {
-                eco2->eco_refcount++;           /* take existing */
-                *ecop = eco2;
-                rc = 0;
-                LASSERT (eco2->eco_id == eco2->eco_lsm->lsm_object_id);
-                CDEBUG (D_INFO,
-                        "found(2) %p: "LPX64"=%u#%u@%d refs %d del %d\n",
-                        eco2, eco2->eco_id,
-                        eco2->eco_lsm->lsm_stripe_size,
-                        eco2->eco_lsm->lsm_stripe_count,
-                        eco2->eco_lsm->lsm_oinfo[0]->loi_ost_idx,
-                        eco2->eco_refcount, eco2->eco_deleted);
-        }
-
-        spin_unlock (&ec->ec_lock);
-
-        echo_free_object (eco);
-        return (rc);
+        else
+                rc = PTR_ERR(eco);
+        /* lsm was passed by reference; free it only if it is still set.
+         * NOTE(review): presumably cl_echo_object_find() clears it when the
+         * object keeps the md - confirm. */
+        if (lsm)
+                obd_free_memmd(ec->ec_exp, &lsm);
+        RETURN(rc);
  }
  
-static void
-echo_put_object (struct ec_object *eco)
+/**
+ * Drop a reference on an echo object by calling cl_echo_object_put().
+ *
+ * The function returns nothing, so a failure of cl_echo_object_put() can
+ * only be logged.
+ */
+static void echo_put_object(struct echo_object *eco)
  {
-        struct obd_device      *obd = eco->eco_device;
-        struct echo_client_obd *ec = &obd->u.echo_client;
-
-        /* Release caller's ref on the object.
-         * delete => mark for deletion when last ref goes
-         */
-
-        spin_lock (&ec->ec_lock);
-
-        eco->eco_refcount--;
-        LASSERT (eco->eco_refcount >= 0);
-
-        CDEBUG(D_INFO, "put %p: "LPX64"=%u#%u@%d refs %d del %d\n",
-               eco, eco->eco_id,
-               eco->eco_lsm->lsm_stripe_size,
-               eco->eco_lsm->lsm_stripe_count,
-               eco->eco_lsm->lsm_oinfo[0]->loi_ost_idx,
-               eco->eco_refcount, eco->eco_deleted);
-
-        if (eco->eco_refcount != 0 || !eco->eco_deleted) {
-                spin_unlock (&ec->ec_lock);
-                return;
-        }
-
-        spin_unlock (&ec->ec_lock);
-
-        /* NB leave obj in the object list.  We must prevent anyone from
-         * attempting to enqueue on this object number until we can be
-         * sure there will be no more lock callbacks.
-         */
-        obd_cancel_unused(ec->ec_exp, eco->eco_lsm, 0, NULL);
-
-        /* now we can let it go */
-        spin_lock (&ec->ec_lock);
-        list_del (&eco->eco_obj_chain);
-        spin_unlock (&ec->ec_lock);
-
-        LASSERT (eco->eco_refcount == 0);
-
-        echo_free_object (eco);
+        /* terminate the message with a newline like every other CERROR in
+         * this file, so the log line is not merged with the next one */
+        if (cl_echo_object_put(eco))
+                CERROR("echo client: drop an object failed\n");
  }
  
  static void
@@ -512,20 +1446,23 @@ static int echo_client_page_debug_check(struct lov_stripe_md *lsm,
          return rc;
  }
  
-static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa,
-                            struct lov_stripe_md *lsm, obd_off offset,
-                            obd_size count, struct obd_trans_info *oti)
+static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa,
+                            struct echo_object *eco, obd_off offset,
+                            obd_size count, int async,
+                            struct obd_trans_info *oti)
  {
-        struct echo_client_obd *ec = &obd->u.echo_client;
-        struct obd_info         oinfo = { { { 0 } } };
+        struct echo_client_obd *ec  = ed->ed_ec;
+        struct lov_stripe_md   *lsm = eco->eo_lsm;
          obd_count               npages;
          struct brw_page        *pga;
          struct brw_page        *pgp;
+        cfs_page_t            **pages;
          obd_off                 off;
          int                     i;
          int                     rc;
          int                     verify;
          int                     gfp_mask;
+        ENTRY;
  
          verify = ((oa->o_id) != ECHO_PERSISTENT_OBJID &&
                    (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
@@ -539,14 +1476,20 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa,
  
          if (count <= 0 ||
              (count & (~CFS_PAGE_MASK)) != 0)
-                return (-EINVAL);
+                RETURN(-EINVAL);
  
          /* XXX think again with misaligned I/O */
          npages = count >> CFS_PAGE_SHIFT;
  
          OBD_ALLOC(pga, npages * sizeof(*pga));
          if (pga == NULL)
-                return (-ENOMEM);
+                RETURN(-ENOMEM);
+
+        OBD_ALLOC(pages, npages * sizeof(*pages));
+        if (pages == NULL) {
+                OBD_FREE(pga, npages * sizeof(*pga));
+                RETURN(-ENOMEM);
+        }
  
          for (i = 0, pgp = pga, off = offset;
               i < npages;
@@ -559,6 +1502,7 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa,
                  if (pgp->pg == NULL)
                          goto out;
  
+                pages[i] = pgp->pg;
                  pgp->count = CFS_PAGE_SIZE;
                  pgp->off = off;
                  pgp->flag = 0;
@@ -568,9 +1512,13 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa,
                                                       oa->o_id, off, pgp->count);
          }
  
-        oinfo.oi_oa = oa;
-        oinfo.oi_md = lsm;
-        rc = obd_brw(rw, ec->ec_exp, &oinfo, npages, pga, oti);
+        if (ed->ed_next == NULL) {
+                struct obd_info oinfo = { { { 0 } } };
+                oinfo.oi_oa = oa;
+                oinfo.oi_md = lsm;
+                rc = obd_brw(rw, ec->ec_exp, &oinfo, npages, pga, oti);
+        } else
+                rc = cl_echo_object_brw(eco, rw, offset, pages, npages, async);
  
   out:
          if (rc != 0 || rw != OBD_BRW_READ)
@@ -590,268 +1538,16 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa,
                  OBD_PAGE_FREE(pgp->pg);
          }
          OBD_FREE(pga, npages * sizeof(*pga));
-        return (rc);
-}
-
-struct echo_async_state;
-
-#define EAP_MAGIC 79277927
-struct echo_async_page {
-        int                     eap_magic;
-        cfs_page_t             *eap_page;
-        void                    *eap_cookie;
-        obd_off                 eap_off;
-        struct echo_async_state *eap_eas;
-        struct list_head        eap_item;
-};
-
-static inline struct echo_async_page *eap_from_cookie(void *ptr)
-{
-        struct echo_async_page *ap = ptr;
-        LASSERT(ap->eap_magic == EAP_MAGIC);
-        return ap;
-}
-
-struct echo_async_state {
-        spinlock_t              eas_lock;
-        obd_off                 eas_next_offset;
-        obd_off                 eas_end_offset;
-        int                     eas_in_flight;
-        int                     eas_rc;
-        cfs_waitq_t             eas_waitq;
-        struct list_head        eas_avail;
-        struct obdo             eas_oa;
-        struct lov_stripe_md    *eas_lsm;
-};
-
-static int eas_should_wake(struct echo_async_state *eas)
-{
-        int rc = 0;
-
-        spin_lock(&eas->eas_lock);
-        if (eas->eas_rc == 0 && !list_empty(&eas->eas_avail))
-            rc = 1;
-        spin_unlock(&eas->eas_lock);
-        return rc;
-};
-
-static int ec_ap_make_ready(void *data, int cmd)
-{
-        /* our pages are issued ready */
-        LBUG();
-        return 0;
-}
-static int ec_ap_refresh_count(void *data, int cmd)
-{
-        /* our pages are issued with a stable count */
-        LBUG();
-        return CFS_PAGE_SIZE;
-}
-static void ec_ap_fill_obdo(void *data, int cmd, struct obdo *oa)
-{
-        struct echo_async_page *eap = eap_from_cookie(data);
-
-        memcpy(oa, &eap->eap_eas->eas_oa, sizeof(*oa));
-}
-
-static int ec_ap_completion(void *data, int cmd, struct obdo *oa, int rc)
-{
-        struct echo_async_page *eap = eap_from_cookie(data);
-        struct echo_async_state *eas;
-
-        eas = eap->eap_eas;
-
-        if (cmd == OBD_BRW_READ &&
-            eas->eas_oa.o_id != ECHO_PERSISTENT_OBJID &&
-            (eas->eas_oa.o_valid & OBD_MD_FLFLAGS) != 0 &&
-            (eas->eas_oa.o_flags & OBD_FL_DEBUG_CHECK) != 0)
-                echo_client_page_debug_check(eas->eas_lsm, eap->eap_page,
-                                             eas->eas_oa.o_id, eap->eap_off,
-                                             CFS_PAGE_SIZE);
-
-        spin_lock(&eas->eas_lock);
-        if (rc && !eas->eas_rc)
-                eas->eas_rc = rc;
-        eas->eas_in_flight--;
-        list_add(&eap->eap_item, &eas->eas_avail);
-        cfs_waitq_signal(&eas->eas_waitq);
-        spin_unlock(&eas->eas_lock);
-        return 0;
-}
-
-static struct obd_async_page_ops ec_async_page_ops = {
-        .ap_make_ready =        ec_ap_make_ready,
-        .ap_refresh_count =     ec_ap_refresh_count,
-        .ap_fill_obdo =         ec_ap_fill_obdo,
-        .ap_completion =        ec_ap_completion,
-};
-
-static int echo_client_async_page(struct obd_export *exp, int rw,
-                                   struct obdo *oa, struct lov_stripe_md *lsm,
-                                   obd_off offset, obd_size count,
-                                   obd_size batching)
-{
-        obd_count npages, i;
-        struct echo_async_page *eap;
-        struct echo_async_state eas;
-        int rc = 0;
-        struct echo_async_page **aps = NULL;
-
-        ENTRY;
-#if 0
-        int                     verify;
-        int                     gfp_mask;
-
-        verify = ((oa->o_id) != ECHO_PERSISTENT_OBJID &&
-                  (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
-                  (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0);
-
-        gfp_mask = ((oa->o_id & 2) == 0) ? GFP_KERNEL : GFP_HIGHUSER;
-#endif
-
-        LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ);
-
-        if (count <= 0 ||
-            (count & (~CFS_PAGE_MASK)) != 0 ||
-            (lsm != NULL &&
-             lsm->lsm_object_id != oa->o_id))
-                return (-EINVAL);
-
-        /* XXX think again with misaligned I/O */
-        npages = batching >> CFS_PAGE_SHIFT;
-
-        memcpy(&eas.eas_oa, oa, sizeof(*oa));
-        eas.eas_next_offset = offset;
-        eas.eas_end_offset = offset + count;
-        spin_lock_init(&eas.eas_lock);
-        cfs_waitq_init(&eas.eas_waitq);
-        eas.eas_in_flight = 0;
-        eas.eas_rc = 0;
-        eas.eas_lsm = lsm;
-        CFS_INIT_LIST_HEAD(&eas.eas_avail);
-
-        OBD_ALLOC(aps, npages * sizeof aps[0]);
-        if (aps == NULL)
-                return (-ENOMEM);
-
-        /* prepare the group of pages that we're going to be keeping
-         * in flight */
-        for (i = 0; i < npages; i++) {
-                cfs_page_t *page;
-                OBD_PAGE_ALLOC(page, CFS_ALLOC_STD);
-                if (page == NULL)
-                        GOTO(out, rc = -ENOMEM);
-
-                OBD_ALLOC(eap, sizeof(*eap));
-                if (eap == NULL) {
-                        OBD_PAGE_FREE(page);
-                        GOTO(out, rc = -ENOMEM);
-                }
-
-                eap->eap_magic = EAP_MAGIC;
-                eap->eap_page = page;
-                eap->eap_eas = &eas;
-                list_add_tail(&eap->eap_item, &eas.eas_avail);
-                aps[i] = eap;
-        }
-
-        /* first we spin queueing io and being woken by its completion */
-        spin_lock(&eas.eas_lock);
-        for(;;) {
-                int rc;
-
-                /* sleep until we have a page to send */
-                spin_unlock(&eas.eas_lock);
-                rc = wait_event_interruptible(eas.eas_waitq,
-                                              eas_should_wake(&eas));
-                spin_lock(&eas.eas_lock);
-                if (rc && !eas.eas_rc)
-                        eas.eas_rc = rc;
-                if (eas.eas_rc)
-                        break;
-                if (list_empty(&eas.eas_avail))
-                        continue;
-                eap = list_entry(eas.eas_avail.next, struct echo_async_page,
-                                 eap_item);
-                list_del(&eap->eap_item);
-                spin_unlock(&eas.eas_lock);
-
-                /* unbind the eap from its old page offset */
-                if (eap->eap_cookie != NULL) {
-                        obd_teardown_async_page(exp, lsm, NULL,
-                                                eap->eap_cookie);
-                        eap->eap_cookie = NULL;
-                }
-
-                eas.eas_next_offset += CFS_PAGE_SIZE;
-                eap->eap_off = eas.eas_next_offset;
-
-                rc = obd_prep_async_page(exp, lsm, NULL, eap->eap_page,
-                                         eap->eap_off, &ec_async_page_ops,
-                                         eap, &eap->eap_cookie, 1, NULL);
-                if (rc) {
-                        spin_lock(&eas.eas_lock);
-                        eas.eas_rc = rc;
-                        break;
-                }
-
-                if (oa->o_id != ECHO_PERSISTENT_OBJID &&
-                    (oa->o_valid & OBD_MD_FLFLAGS) != 0 &&
-                    (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0)
-                        echo_client_page_debug_setup(lsm, eap->eap_page, rw,
-                                                     oa->o_id,
-                                                     eap->eap_off, CFS_PAGE_SIZE);
-
-                /* always asserts urgent, which isn't quite right */
-                rc = obd_queue_async_io(exp, lsm, NULL, eap->eap_cookie,
-                                        rw, 0, CFS_PAGE_SIZE, 0,
-                                        ASYNC_READY | ASYNC_URGENT |
-                                        ASYNC_COUNT_STABLE);
-                spin_lock(&eas.eas_lock);
-                if (rc && !eas.eas_rc) {
-                        eas.eas_rc = rc;
-                        break;
-                }
-                eas.eas_in_flight++;
-                if (eas.eas_next_offset == eas.eas_end_offset)
-                        break;
-        }
-
-        /* still hold the eas_lock here.. */
-
-        /* now we just spin waiting for all the rpcs to complete */
-        while(eas.eas_in_flight) {
-                spin_unlock(&eas.eas_lock);
-                wait_event_interruptible(eas.eas_waitq,
-                                         eas.eas_in_flight == 0);
-                spin_lock(&eas.eas_lock);
-        }
-        spin_unlock(&eas.eas_lock);
-
-out:
-        if (aps != NULL) {
-                for (i = 0; i < npages; ++ i) {
-                        cfs_page_t *page;
-
-                        eap = aps[i];
-                        page = eap->eap_page;
-                        if (eap->eap_cookie != NULL)
-                                obd_teardown_async_page(exp, lsm, NULL,
-                                                        eap->eap_cookie);
-                        OBD_FREE(eap, sizeof(*eap));
-                        OBD_PAGE_FREE(page);
-                }
-                OBD_FREE(aps, npages * sizeof aps[0]);
-        }
-
+        OBD_FREE(pages, npages * sizeof(*pages));
          RETURN(rc);
  }
  
  static int echo_client_prep_commit(struct obd_export *exp, int rw,
-                                   struct obdo *oa, struct lov_stripe_md *lsm,
+                                   struct obdo *oa, struct echo_object *eco,
                                     obd_off offset, obd_size count,
                                     obd_size batch, struct obd_trans_info *oti)
  {
+        struct lov_stripe_md *lsm = eco->eo_lsm;
          struct obd_ioobj ioo;
          struct niobuf_local *lnb;
          struct niobuf_remote *rnb;
@@ -935,17 +1631,19 @@ out:
          RETURN(ret);
  }
  
-int echo_client_brw_ioctl(int rw, struct obd_export *exp,
-                          struct obd_ioctl_data *data)
+static int echo_client_brw_ioctl(int rw, struct obd_export *exp,
+                                 struct obd_ioctl_data *data)
  {
          struct obd_device *obd = class_exp2obd(exp);
-        struct echo_client_obd *ec = &obd->u.echo_client;
+        struct echo_device *ed = obd2echo_dev(obd);
+        struct echo_client_obd *ec = ed->ed_ec;
          struct obd_trans_info dummy_oti = { .oti_thread_id = -1 };
-        struct ec_object *eco;
+        struct echo_object *eco;
          int rc;
+        int async = 1;
          ENTRY;
  
-        rc = echo_get_object(&eco, obd, &data->ioc_obdo1);
+        rc = echo_get_object(&eco, ed, &data->ioc_obdo1);
          if (rc)
                  RETURN(rc);
  
@@ -955,18 +1653,16 @@ int echo_client_brw_ioctl(int rw, struct obd_export *exp,
  
          switch((long)data->ioc_pbuf1) {
          case 1:
-                rc = echo_client_kbrw(obd, rw, &data->ioc_obdo1,
-                                              eco->eco_lsm, data->ioc_offset,
-                                              data->ioc_count, &dummy_oti);
-                break;
+                async = 0;
+                /* fall through */
          case 2:
-                rc = echo_client_async_page(ec->ec_exp, rw, &data->ioc_obdo1,
-                                           eco->eco_lsm, data->ioc_offset,
-                                           data->ioc_count, data->ioc_plen1);
+                rc = echo_client_kbrw(ed, rw, &data->ioc_obdo1,
+                                      eco, data->ioc_offset,
+                                      data->ioc_count, async, &dummy_oti);
                  break;
          case 3:
                  rc = echo_client_prep_commit(ec->ec_exp, rw, &data->ioc_obdo1,
-                                            eco->eco_lsm, data->ioc_offset,
+                                            eco, data->ioc_offset,
                                              data->ioc_count, data->ioc_plen1,
                                              &dummy_oti);
                  break;
@@ -978,165 +1674,61 @@ int echo_client_brw_ioctl(int rw, struct obd_export *exp,
  }
  
  static int
-echo_ldlm_callback (struct ldlm_lock *lock, struct ldlm_lock_desc *new,
-                    void *data, int flag)
-{
-        struct ec_object       *eco = (struct ec_object *)data;
-        struct echo_client_obd *ec = &(eco->eco_device->u.echo_client);
-        struct lustre_handle    lockh;
-        struct list_head       *el;
-        int                     found = 0;
-        int                     rc;
-
-        ldlm_lock2handle (lock, &lockh);
-
-        /* #ifdef this out if we're not feeling paranoid */
-        spin_lock (&ec->ec_lock);
-        list_for_each (el, &ec->ec_objects) {
-                found = (eco == list_entry(el, struct ec_object,
-                                           eco_obj_chain));
-                if (found)
-                        break;
-        }
-        spin_unlock (&ec->ec_lock);
-        LASSERT (found);
-
-        switch (flag) {
-        case LDLM_CB_BLOCKING:
-                CDEBUG(D_INFO, "blocking callback on "LPX64", handle "LPX64"\n",
-                       eco->eco_id, lockh.cookie);
-                rc = ldlm_cli_cancel (&lockh);
-                if (rc != ELDLM_OK)
-                        CERROR ("ldlm_cli_cancel failed: %d\n", rc);
-                break;
-
-        case LDLM_CB_CANCELING:
-                CDEBUG(D_INFO, "cancel callback on "LPX64", handle "LPX64"\n",
-                       eco->eco_id, lockh.cookie);
-                break;
-
-        default:
-                LBUG ();
-        }
-
-        return (0);
-}
-
-static int
  echo_client_enqueue(struct obd_export *exp, struct obdo *oa,
                      int mode, obd_off offset, obd_size nob)
  {
-        struct obd_device      *obd = exp->exp_obd;
-        struct echo_client_obd *ec = &obd->u.echo_client;
+        struct echo_device     *ed = obd2echo_dev(exp->exp_obd);
          struct lustre_handle   *ulh = &oa->o_handle;
-        struct ldlm_enqueue_info einfo = { 0 };
-        struct obd_info oinfo = { { { 0 } } };
-        struct ec_object       *eco;
-        struct ec_lock         *ecl;
+        struct echo_object     *eco;
+        obd_off                 end;
          int                     rc;
+        ENTRY;
+
+        if (ed->ed_next == NULL)
+                RETURN(-EOPNOTSUPP);
  
          if (!(mode == LCK_PR || mode == LCK_PW))
-                return -EINVAL;
+                RETURN(-EINVAL);
  
          if ((offset & (~CFS_PAGE_MASK)) != 0 ||
              (nob & (~CFS_PAGE_MASK)) != 0)
-                return -EINVAL;
-
-        rc = echo_get_object (&eco, obd, oa);
-        if (rc != 0)
-                return rc;
+                RETURN(-EINVAL);
  
-        rc = -ENOMEM;
-        OBD_ALLOC (ecl, sizeof (*ecl));
-        if (ecl == NULL)
-                goto failed_0;
-
-        ecl->ecl_mode = mode;
-        ecl->ecl_object = eco;
-        ecl->ecl_policy.l_extent.start = offset;
-        ecl->ecl_policy.l_extent.end =
-                (nob == 0) ? ((obd_off) -1) : (offset + nob - 1);
-
-        einfo.ei_type = LDLM_EXTENT;
-        einfo.ei_mode = mode;
-        einfo.ei_cb_bl = echo_ldlm_callback;
-        einfo.ei_cb_cp = ldlm_completion_ast;
-        einfo.ei_cb_gl = NULL;
-        einfo.ei_cbdata = eco;
-
-        oinfo.oi_policy = ecl->ecl_policy;
-        oinfo.oi_lockh = &ecl->ecl_lock_handle;
-        oinfo.oi_md = eco->eco_lsm;
-        rc = obd_enqueue(ec->ec_exp, &oinfo, &einfo, NULL);
+        rc = echo_get_object (&eco, ed, oa);
          if (rc != 0)
-                goto failed_1;
-
-        CDEBUG(D_INFO, "enqueue handle "LPX64"\n", ecl->ecl_lock_handle.cookie);
-
-        /* NB ecl takes object ref from echo_get_object() above */
-        spin_lock(&ec->ec_lock);
-
-        list_add(&ecl->ecl_exp_chain, &exp->exp_ec_data.eced_locks);
-        ulh->cookie = ecl->ecl_cookie = ec->ec_unique++;
-
-        spin_unlock(&ec->ec_lock);
-
-        oa->o_valid |= OBD_MD_FLHANDLE;
-        return 0;
+                RETURN(rc);
  
- failed_1:
-        OBD_FREE (ecl, sizeof (*ecl));
- failed_0:
-        echo_put_object (eco);
-        return (rc);
+        end = (nob == 0) ? ((obd_off) -1) : (offset + nob - 1);
+        rc = cl_echo_enqueue(eco, offset, end, mode, &ulh->cookie);
+        if (rc == 0) {
+                oa->o_valid |= OBD_MD_FLHANDLE;
+                CDEBUG(D_INFO, "Cookie is %llx\n", ulh->cookie);
+        }
+        echo_put_object(eco);
+        RETURN(rc);
  }
  
  static int
  echo_client_cancel(struct obd_export *exp, struct obdo *oa)
  {
-        struct obd_device      *obd = exp->exp_obd;
-        struct echo_client_obd *ec = &obd->u.echo_client;
-        struct lustre_handle   *ulh = &oa->o_handle;
-        struct ec_lock         *ecl = NULL;
-        int                     found = 0;
-        struct list_head       *el;
-        int                     rc;
+        struct echo_device *ed     = obd2echo_dev(exp->exp_obd);
+        __u64               cookie = oa->o_handle.cookie;
  
          if ((oa->o_valid & OBD_MD_FLHANDLE) == 0)
                  return -EINVAL;
  
-        spin_lock (&ec->ec_lock);
-
-        list_for_each (el, &exp->exp_ec_data.eced_locks) {
-                ecl = list_entry (el, struct ec_lock, ecl_exp_chain);
-                found = (ecl->ecl_cookie == ulh->cookie);
-                if (found) {
-                        list_del (&ecl->ecl_exp_chain);
-                        break;
-                }
-        }
-
-        spin_unlock (&ec->ec_lock);
-
-        if (!found)
-                return (-ENOENT);
-
-        rc = obd_cancel(ec->ec_exp, ecl->ecl_object->eco_lsm, ecl->ecl_mode,
-                        &ecl->ecl_lock_handle);
-
-        echo_put_object (ecl->ecl_object);
-        OBD_FREE (ecl, sizeof (*ecl));
-
-        return rc;
+        CDEBUG(D_INFO, "Cookie is %llx\n", cookie);
+        return cl_echo_cancel(ed, cookie);
  }
  
  static int
  echo_client_iocontrol(unsigned int cmd, struct obd_export *exp,
                        int len, void *karg, void *uarg)
  {
-        struct obd_device      *obd;
-        struct echo_client_obd *ec;
-        struct ec_object       *eco;
+        struct obd_device      *obd = exp->exp_obd;
+        struct echo_device     *ed = obd2echo_dev(obd);
+        struct echo_client_obd *ec = ed->ed_ec;
+        struct echo_object     *eco;
          struct obd_ioctl_data  *data = karg;
          struct obd_trans_info   dummy_oti;
          struct oti_req_ack_lock *ack_lock;
@@ -1150,15 +1742,12 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp,
  
          memset(&dummy_oti, 0, sizeof(dummy_oti));
  
-        obd = exp->exp_obd;
-        ec = &obd->u.echo_client;
-
          switch (cmd) {
          case OBD_IOC_CREATE:                    /* may create echo object */
                  if (!cfs_capable(CFS_CAP_SYS_ADMIN))
                          GOTO (out, rc = -EPERM);
  
-                rc = echo_create_object (obd, 1, &data->ioc_obdo1,
+                rc = echo_create_object (ed, 1, &data->ioc_obdo1,
                                           data->ioc_pbuf1, data->ioc_plen1,
                                           &dummy_oti);
                  GOTO(out, rc);
@@ -1167,24 +1756,24 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp,
                  if (!cfs_capable(CFS_CAP_SYS_ADMIN))
                          GOTO (out, rc = -EPERM);
  
-                rc = echo_get_object (&eco, obd, &data->ioc_obdo1);
+                rc = echo_get_object (&eco, ed, &data->ioc_obdo1);
                  if (rc == 0) {
                          oa = &data->ioc_obdo1;
                          oa->o_gr = FILTER_GROUP_ECHO;
                          oa->o_valid |= OBD_MD_FLGROUP;
-                        rc = obd_destroy(ec->ec_exp, oa, eco->eco_lsm,
+                        rc = obd_destroy(ec->ec_exp, oa, eco->eo_lsm,
                                           &dummy_oti, NULL);
                          if (rc == 0)
-                                eco->eco_deleted = 1;
+                                eco->eo_deleted = 1;
                          echo_put_object(eco);
                  }
                  GOTO(out, rc);
  
          case OBD_IOC_GETATTR:
-                rc = echo_get_object (&eco, obd, &data->ioc_obdo1);
+                rc = echo_get_object (&eco, ed, &data->ioc_obdo1);
                  if (rc == 0) {
                          struct obd_info oinfo = { { { 0 } } };
-                        oinfo.oi_md = eco->eco_lsm;
+                        oinfo.oi_md = eco->eo_lsm;
                          oinfo.oi_oa = &data->ioc_obdo1;
                          rc = obd_getattr(ec->ec_exp, &oinfo);
                          echo_put_object(eco);
@@ -1195,11 +1784,11 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp,
                  if (!cfs_capable(CFS_CAP_SYS_ADMIN))
                          GOTO (out, rc = -EPERM);
  
-                rc = echo_get_object (&eco, obd, &data->ioc_obdo1);
+                rc = echo_get_object (&eco, ed, &data->ioc_obdo1);
                  if (rc == 0) {
                          struct obd_info oinfo = { { { 0 } } };
                          oinfo.oi_oa = &data->ioc_obdo1;
-                        oinfo.oi_md = eco->eco_lsm;
+                        oinfo.oi_md = eco->eo_lsm;
  
                          rc = obd_setattr(ec->ec_exp, &oinfo, NULL);
                          echo_put_object(eco);
@@ -1217,9 +1806,9 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp,
                  GOTO(out, rc);
  
          case ECHO_IOC_GET_STRIPE:
-                rc = echo_get_object(&eco, obd, &data->ioc_obdo1);
+                rc = echo_get_object(&eco, ed, &data->ioc_obdo1);
                  if (rc == 0) {
-                        rc = echo_copyout_lsm(eco->eco_lsm, data->ioc_pbuf1,
+                        rc = echo_copyout_lsm(eco->eo_lsm, data->ioc_pbuf1,
                                                data->ioc_plen1);
                          echo_put_object(eco);
                  }
@@ -1230,13 +1819,13 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp,
                          GOTO (out, rc = -EPERM);
  
                  if (data->ioc_pbuf1 == NULL) {  /* unset */
-                        rc = echo_get_object(&eco, obd, &data->ioc_obdo1);
+                        rc = echo_get_object(&eco, ed, &data->ioc_obdo1);
                          if (rc == 0) {
-                                eco->eco_deleted = 1;
+                                eco->eo_deleted = 1;
                                  echo_put_object(eco);
                          }
                  } else {
-                        rc = echo_create_object(obd, 0, &data->ioc_obdo1,
+                        rc = echo_create_object(ed, 0, &data->ioc_obdo1,
                                                  data->ioc_pbuf1,
                                                  data->ioc_plen1, &dummy_oti);
                  }
@@ -1248,7 +1837,8 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp,
  
                  rc = echo_client_enqueue(exp, &data->ioc_obdo1,
                                           data->ioc_conn1, /* lock mode */
-                                   data->ioc_offset, data->ioc_count);/*extent*/
+                                         data->ioc_offset,
+                                         data->ioc_count);/*extent*/
                  GOTO (out, rc);
  
          case ECHO_IOC_CANCEL:
@@ -1300,7 +1890,9 @@ static int echo_client_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
  
          spin_lock_init (&ec->ec_lock);
          CFS_INIT_LIST_HEAD (&ec->ec_objects);
+        CFS_INIT_LIST_HEAD (&ec->ec_locks);
          ec->ec_unique = 0;
+        ec->ec_nstripes = 0;
  
          OBD_ALLOC(ocd, sizeof(*ocd));
          if (ocd == NULL) {
@@ -1309,7 +1901,8 @@ static int echo_client_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
                  return -ENOMEM;
          }
  
-        ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL;
+        ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL |
+                                 OBD_CONNECT_GRANT;
          ocd->ocd_version = LUSTRE_VERSION_CODE;
          ocd->ocd_group = FILTER_GROUP_ECHO;
  
@@ -1329,8 +1922,6 @@ static int echo_client_setup(struct obd_device *obddev, struct lustre_cfg *lcfg)
  
  static int echo_client_cleanup(struct obd_device *obddev)
  {
-        struct list_head       *el;
-        struct ec_object       *eco;
          struct echo_client_obd *ec = &obddev->u.echo_client;
          int rc;
          ENTRY;
@@ -1340,17 +1931,6 @@ static int echo_client_cleanup(struct obd_device *obddev)
                  RETURN(-EBUSY);
          }
  
-        /* XXX assuming sole access */
-        while (!list_empty(&ec->ec_objects)) {
-                el = ec->ec_objects.next;
-                eco = list_entry(el, struct ec_object, eco_obj_chain);
-
-                LASSERT(eco->eco_refcount == 0);
-                eco->eco_refcount = 1;
-                eco->eco_deleted = 1;
-                echo_put_object(eco);
-        }
-
          rc = obd_disconnect(ec->ec_exp);
          if (rc != 0)
                  CERROR("fail to disconnect device: %d\n", rc);
@@ -1370,7 +1950,6 @@ static int echo_client_connect(const struct lu_env *env,
          rc = class_connect(conn, src, cluuid);
          if (rc == 0) {
                  exp = class_conn2export(conn);
-                CFS_INIT_LIST_HEAD(&exp->exp_ec_data.eced_locks);
                  class_export_put(exp);
          }
  
@@ -1379,15 +1958,18 @@ static int echo_client_connect(const struct lu_env *env,
  
  static int echo_client_disconnect(struct obd_export *exp)
  {
+#if 0
          struct obd_device      *obd;
          struct echo_client_obd *ec;
          struct ec_lock         *ecl;
+#endif
          int                     rc;
          ENTRY;
  
          if (exp == NULL)
                  GOTO(out, rc = -EINVAL);
  
+#if 0
          obd = exp->exp_obd;
          ec = &obd->u.echo_client;
  
@@ -1406,6 +1988,7 @@ static int echo_client_disconnect(struct obd_export *exp)
                  echo_put_object (ecl->ecl_object);
                  OBD_FREE (ecl, sizeof (*ecl));
          }
+#endif
  
          rc = class_disconnect(exp);
          GOTO(out, rc);
@@ -1415,8 +1998,12 @@ static int echo_client_disconnect(struct obd_export *exp)
  
  static struct obd_ops echo_obd_ops = {
          .o_owner       = THIS_MODULE,
+
+#if 0
          .o_setup       = echo_client_setup,
          .o_cleanup     = echo_client_cleanup,
+#endif
+
          .o_iocontrol   = echo_client_iocontrol,
          .o_connect     = echo_client_connect,
          .o_disconnect  = echo_client_disconnect
@@ -1425,13 +2012,26 @@ static struct obd_ops echo_obd_ops = {
  int echo_client_init(void)
  {
          struct lprocfs_static_vars lvars = { 0 };
+        int rc;
  
          lprocfs_echo_init_vars(&lvars);
-        return class_register_type(&echo_obd_ops, NULL, lvars.module_vars,
-                                   LUSTRE_ECHO_CLIENT_NAME, NULL);
+
+        /* Create the kmem caches before the type is registered, so that a
+         * cache setup failure is reported to the caller instead of ignored. */
+        rc = lu_kmem_init(echo_caches);
+        if (rc != 0)
+                return rc;
+
+        rc = class_register_type(&echo_obd_ops, NULL, lvars.module_vars,
+                                 LUSTRE_ECHO_CLIENT_NAME, &echo_device_type);
+        if (rc != 0)
+                lu_kmem_fini(echo_caches); /* undo on registration failure */
+        return rc;
  }
  
  void echo_client_exit(void)
  {
          class_unregister_type(LUSTRE_ECHO_CLIENT_NAME);
+        lu_kmem_fini(echo_caches);
  }
+
diff --git a/lustre/obdecho/echo_internal.h b/lustre/obdecho/echo_internal.h

new file mode 100644 (file)

index 0000000..c45f0c6
--- /dev/null
+++ b/lustre/obdecho/echo_internal.h
@@ -0,0 +1,30 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ */
+
+#ifndef _ECHO_INTERNAL_H
+#define _ECHO_INTERNAL_H
+
+/* The persistent object (i.e. actually stores stuff!) */
+#define ECHO_PERSISTENT_OBJID    1ULL
+#define ECHO_PERSISTENT_SIZE     ((__u64)(1<<20))
+
+/* block size to use for data verification (4<<10 == 4096) */
+#define OBD_ECHO_BLOCK_SIZE    (4<<10)
+
+#ifndef __KERNEL__
+/* Kludge: no-op stubs and macros for non-kernel (liblustre) builds -jay */
+static inline void page_cache_get(struct page *page)
+{
+}
+
+static inline void page_cache_release(struct page *page)
+{
+}
+
+#define READ    0
+#define WRITE   1
+
+#endif /* ifndef __KERNEL__ */
+
+#endif /* _ECHO_INTERNAL_H */
diff --git a/lustre/obdfilter/filter_io.c b/lustre/obdfilter/filter_io.c

index 526828c..5bab0b6 100644 (file)
--- a/lustre/obdfilter/filter_io.c
+++ b/lustre/obdfilter/filter_io.c
@@ -842,7 +842,9 @@ static int filter_commitrw_read(struct obd_export *exp, struct obdo *oa,
                  resource = ldlm_resource_get(ns, NULL, &res_id, LDLM_EXTENT, 0);
  
                  if (resource != NULL) {
+                        LDLM_RESOURCE_ADDREF(resource);
                          ns->ns_lvbo->lvbo_update(resource, NULL, 0, 1);
+                        LDLM_RESOURCE_DELREF(resource);
                          ldlm_resource_putref(resource);
                  }
          }
diff --git a/lustre/osc/Makefile.in b/lustre/osc/Makefile.in

index 2eb2eea..438ce4c 100644 (file)
--- a/lustre/osc/Makefile.in
+++ b/lustre/osc/Makefile.in
@@ -1,4 +1,4 @@
  MODULES := osc
-osc-objs := osc_request.o lproc_osc.o osc_create.o cache.o
+osc-objs := osc_request.o lproc_osc.o osc_create.o osc_dev.o osc_object.o osc_page.o osc_lock.o osc_io.o
  
  @INCLUDE_RULES@
diff --git a/lustre/osc/autoMakefile.am b/lustre/osc/autoMakefile.am

index 65c588b..cf370ba 100644 (file)
--- a/lustre/osc/autoMakefile.am
+++ b/lustre/osc/autoMakefile.am
@@ -36,7 +36,8 @@
  
  if LIBLUSTRE
  noinst_LIBRARIES = libosc.a
-libosc_a_SOURCES = osc_request.c osc_create.c osc_internal.h cache.c
+libosc_a_SOURCES = osc_request.c osc_create.c osc_internal.h osc_cl_internal.h osc_dev.c osc_object.c osc_page.c osc_lock.c osc_io.c
+
  libosc_a_CPPFLAGS = $(LLCPPFLAGS)
  libosc_a_CFLAGS = $(LLCFLAGS)
  endif
@@ -52,6 +53,11 @@ macos_PROGRAMS := osc
  
  osc_SOURCES := \
          osc_create.c \
+        osc_dev.c    \
+        osc_object.c \
+        osc_page.c   \
+        osc_lock.c   \
+        osc_io.c     \
          osc_request.c
  
  osc_CFLAGS := $(EXTRA_KCFLAGS)
@@ -69,4 +75,4 @@ endif
  install-data-hook: $(install_data_hook)
  
  MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ 
-DIST_SOURCES = $(osc-objs:%.o=%.c) osc_internal.h
+DIST_SOURCES = $(osc-objs:%.o=%.c) osc_internal.h osc_cl_internal.h
diff --git a/lustre/osc/cache.c b/lustre/osc/cache.c

deleted file mode 100644 (file)

index 90fb60a..0000000
--- a/lustre/osc/cache.c
+++ /dev/null
@@ -1,445 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * GPL HEADER START
- *
- * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 only,
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License version 2 for more details (a copy is included
- * in the LICENSE file that accompanied this code).
- *
- * You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
- *
- * GPL HEADER END
- */
-/*
- * Copyright  2008 Sun Microsystems, Inc. All rights reserved
- * Use is subject to license terms.
- */
-/*
- * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
- *
- * lustre/osc/cache.c
- *
- * Cache of triples - object, lock, extent
- */
-
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
-#endif
-#define DEBUG_SUBSYSTEM S_OSC
-
-#ifdef __KERNEL__
-# include <linux/version.h>
-# include <linux/module.h>
-# include <linux/list.h>
-#else                           /* __KERNEL__ */
-# include <liblustre.h>
-#endif
-
-#include <lustre_dlm.h>
-#include <lustre_cache.h>
-#include <obd.h>
-#include <lustre_debug.h>
-
-#include "osc_internal.h"
-
-/* Adding @lock to the @cache */
-int cache_add_lock(struct lustre_cache *cache, struct lustre_handle *lockh)
-{
-        struct ldlm_lock *lock = ldlm_handle2lock(lockh);
-
-        if (!lock)      // Lock disappeared under us.
-                return 0;
-
-        spin_lock(&cache->lc_locks_list_lock);
-        list_add_tail(&lock->l_cache_locks_list, &cache->lc_locks_list);
-        spin_unlock(&cache->lc_locks_list_lock);
-
-        LDLM_LOCK_PUT(lock);
-
-        return 0;
-}
-
-/* Tries to add @extent to lock represented by @lockh if non-NULL, otherwise
-   just tries to match some suitable lock by resource and data contained in
-   @extent */
-/* Should be called with oap->lock held (except on initial addition, see
-   comment in osc_request.c*/
-int cache_add_extent(struct lustre_cache *cache, struct ldlm_res_id *res,
-                     struct osc_async_page *extent, struct lustre_handle *lockh)
-{
-        struct lustre_handle tmplockh;
-        ldlm_policy_data_t tmpex;
-        struct ldlm_lock *lock = NULL;
-        ENTRY;
-
-        /* Don't add anything second time */
-        if (!list_empty(&extent->oap_page_list)) {
-                LBUG();
-                RETURN(0);
-        }
-
-        if (lockh && lustre_handle_is_used(lockh)) {
-                lock = ldlm_handle2lock(lockh);
-                if (!lock)
-                        RETURN(-ENOLCK);
-
-                LASSERTF(lock->l_policy_data.l_extent.start <=
-                         extent->oap_obj_off &&
-                         extent->oap_obj_off + CFS_PAGE_SIZE - 1 <=
-                         lock->l_policy_data.l_extent.end,
-                         "Got wrong lock [" LPU64 "," LPU64 "] for page with "
-                         "offset " LPU64 "\n",
-                         lock->l_policy_data.l_extent.start,
-                         lock->l_policy_data.l_extent.end, extent->oap_obj_off);
-        } else {
-                int mode;
-                /* Real extent width calculation here once we have real
-                 * extents
-                 */
-                tmpex.l_extent.start = extent->oap_obj_off;
-                tmpex.l_extent.end = tmpex.l_extent.start + CFS_PAGE_SIZE - 1;
-
-                /* XXX find lock from extent or something like that */
-                /* The lock mode does not matter. If this is dirty page - then
-                 * there could be only one PW lock. If the page is clean,
-                 * any PR lock is good
-                 */
-
-                mode = ldlm_lock_match(cache->lc_obd->obd_namespace,
-                                       LDLM_FL_BLOCK_GRANTED |
-                                       LDLM_FL_CBPENDING, res, LDLM_EXTENT,
-                                       &tmpex, LCK_PW | LCK_PR, &tmplockh);
-
-                if (mode <= 0) {
-                        CDEBUG(D_CACHE, "No lock to attach " LPU64 "->" LPU64
-                               " extent to!\n", tmpex.l_extent.start,
-                               tmpex.l_extent.end);
-                        RETURN((mode < 0) ? mode : -ENOLCK);
-                }
-
-                lock = ldlm_handle2lock(&tmplockh);
-                if (!lock) {    // Race - lock disappeared under us (eviction?)
-                        CDEBUG(D_CACHE, "Newly matched lock just disappeared "
-                               "under us\n");
-                        RETURN(-ENOLCK);
-                }
-                ldlm_lock_decref(&tmplockh, mode);
-        }
-
-        spin_lock(&lock->l_extents_list_lock);
-        list_add_tail(&extent->oap_page_list, &lock->l_extents_list);
-        spin_unlock(&lock->l_extents_list_lock);
-        extent->oap_ldlm_lock = lock;
-        LDLM_LOCK_PUT(lock);
-
-        RETURN(0);
-}
-
-static void cache_extent_removal_get(struct page_removal_cb_element *element)
-{
-        atomic_inc(&element->prce_refcnt);
-}
-
-static void cache_extent_removal_put(struct page_removal_cb_element *element)
-{
-        if(atomic_dec_and_test(&element->prce_refcnt))
-                OBD_FREE_PTR(element);
-}
-
-static int cache_extent_removal_event(struct lustre_cache *cache,
-                                      void *data, int discard)
-{
-        struct page *page = data;
-        struct list_head *iter;
-        struct page_removal_cb_element *element;
-
-        read_lock(&cache->lc_page_removal_cb_lock);
-        iter = cache->lc_page_removal_callback_list.next;
-        while(iter != &cache->lc_page_removal_callback_list) {
-                element = list_entry(iter, struct page_removal_cb_element, prce_list);
-                cache_extent_removal_get(element);
-                read_unlock(&cache->lc_page_removal_cb_lock);
-
-                element->prce_callback(page, discard);
-
-                read_lock(&cache->lc_page_removal_cb_lock);
-                iter = iter->next;
-                cache_extent_removal_put(element);
-        }
-        read_unlock(&cache->lc_page_removal_cb_lock);
-
-        return 0;
-}
-
-/* Registers set of pin/remove callbacks for extents. Current limitation is
-   there could be only one pin_cb per cache.
-   @pin_cb is called when we have the page locked to pin it in memory so that
-   it does not disappear after we release page lock (which we need to do
-   to avoid deadlocks).
-   @func_cb is removal callback that is called after page and all spinlocks are
-   released, and is supposed to clean the page and remove it from all
-   (vfs) caches it might be in */
-int cache_add_extent_removal_cb(struct lustre_cache *cache,
-                                obd_page_removal_cb_t func_cb,
-                                obd_pin_extent_cb pin_cb)
-{
-        struct page_removal_cb_element *element;
-
-        if (!func_cb)
-                return 0;
-
-        OBD_ALLOC_PTR(element);
-        if (!element)
-                return -ENOMEM;
-        element->prce_callback = func_cb;
-        atomic_set(&element->prce_refcnt, 1);
-
-        write_lock(&cache->lc_page_removal_cb_lock);
-        list_add_tail(&element->prce_list,
-                      &cache->lc_page_removal_callback_list);
-        write_unlock(&cache->lc_page_removal_cb_lock);
-
-        cache->lc_pin_extent_cb = pin_cb;
-        return 0;
-}
-EXPORT_SYMBOL(cache_add_extent_removal_cb);
-
-/* Unregister exntent removal callback registered earlier. If the list of
-   registered removal callbacks becomes empty, we also clear pin callback
-   since it could only be one */
-int cache_del_extent_removal_cb(struct lustre_cache *cache,
-                                obd_page_removal_cb_t func_cb)
-{
-        int found = 0;
-        struct page_removal_cb_element *element, *t;
-
-        write_lock(&cache->lc_page_removal_cb_lock);
-        list_for_each_entry_safe(element, t,
-                                 &cache->lc_page_removal_callback_list,
-                                 prce_list) {
-                if (element->prce_callback == func_cb) {
-                        list_del(&element->prce_list);
-                        write_unlock(&cache->lc_page_removal_cb_lock);
-                        found = 1;
-                        cache_extent_removal_put(element);
-                        write_lock(&cache->lc_page_removal_cb_lock);
-                        /* We continue iterating the list in case this function
-                           was registered more than once */
-                }
-        }
-        write_unlock(&cache->lc_page_removal_cb_lock);
-
-        if (list_empty(&cache->lc_page_removal_callback_list))
-                cache->lc_pin_extent_cb = NULL;
-
-        return !found;
-}
-EXPORT_SYMBOL(cache_del_extent_removal_cb);
-
-static int cache_remove_extent_nolock(struct lustre_cache *cache,
-                                      struct osc_async_page *extent)
-{
-        int have_lock = !!extent->oap_ldlm_lock;
-        /* We used to check oap_ldlm_lock for non NULL here, but it might be
-           NULL, in fact, due to parallel page eviction clearing it and waiting
-           on a lock's page list lock */
-        extent->oap_ldlm_lock = NULL;
-
-        if (!list_empty(&extent->oap_page_list))
-                list_del_init(&extent->oap_page_list);
-
-        return have_lock;
-}
-
-/* Request the @extent to be removed from cache and locks it belongs to. */
-void cache_remove_extent(struct lustre_cache *cache,
-                         struct osc_async_page *extent)
-{
-        struct ldlm_lock *lock;
-
-        spin_lock(&extent->oap_lock);
-        lock = extent->oap_ldlm_lock;
-
-        extent->oap_ldlm_lock = NULL;
-        spin_unlock(&extent->oap_lock);
-
-        /* No lock - means this extent is not in any list */
-        if (!lock)
-                return;
-
-        spin_lock(&lock->l_extents_list_lock);
-        if (!list_empty(&extent->oap_page_list))
-                list_del_init(&extent->oap_page_list);
-        spin_unlock(&lock->l_extents_list_lock);
-}
-
-/* iterate through list of extents in given lock identified by @lockh,
-   calling @cb_func for every such extent. also passed @data to every call.
-   stops iterating prematurely if @cb_func returns nonzero. */
-int cache_iterate_extents(struct lustre_cache *cache,
-                          struct lustre_handle *lockh,
-                          cache_iterate_extents_cb_t cb_func, void *data)
-{
-        struct ldlm_lock *lock = ldlm_handle2lock(lockh);
-        struct osc_async_page *extent, *t;
-
-        if (!lock)      // Lock disappeared
-                return 0;
-        /* Parallel page removal from mem pressure can race with us */
-        spin_lock(&lock->l_extents_list_lock);
-        list_for_each_entry_safe(extent, t, &lock->l_extents_list,
-                                 oap_page_list) {
-                if (cb_func(cache, lockh, extent, data))
-                        break;
-        }
-        spin_unlock(&lock->l_extents_list_lock);
-        LDLM_LOCK_PUT(lock);
-
-        return 0;
-}
-
-static int cache_remove_extents_from_lock(struct lustre_cache *cache,
-                                          struct ldlm_lock *lock, void *data)
-{
-        struct osc_async_page *extent;
-        void *ext_data;
-
-        LASSERT(lock);
-
-        spin_lock(&lock->l_extents_list_lock);
-        while (!list_empty(&lock->l_extents_list)) {
-                extent = list_entry(lock->l_extents_list.next,
-                                    struct osc_async_page, oap_page_list);
-
-                spin_lock(&extent->oap_lock);
-                /* If there is no lock referenced from this oap, it means
-                   there is parallel page-removal process waiting to free that
-                   page on l_extents_list_lock and it holds page lock.
-                   We need this page to completely go away and for that to
-                   happen we will just try to truncate it here too.
-                   Serialisation on page lock will achieve that goal for us. */
-                /* Try to add extent back to the cache first, but only if we
-                 * cancel read lock, write locks cannot have other overlapping
-                 * locks. If adding is not possible (or canceling pw lock),
-                 * then remove extent from cache */
-                if (!cache_remove_extent_nolock(cache, extent) ||
-                    (lock->l_granted_mode == LCK_PW) ||
-                    cache_add_extent(cache, &lock->l_resource->lr_name, extent,
-                                     NULL)) {
-                        /* We need to remember this oap_page value now,
-                           once we release spinlocks, extent struct
-                           might be freed and we endup requesting
-                           page with address 0x5a5a5a5a in
-                           cache_extent_removal_event */
-                        ext_data = extent->oap_page;
-                        cache->lc_pin_extent_cb(extent->oap_page);
-                        spin_unlock(&extent->oap_lock);
-                        spin_unlock(&lock->l_extents_list_lock);
-                        cache_extent_removal_event(cache, ext_data,
-                                                   lock->
-                                                   l_flags &
-                                                   LDLM_FL_DISCARD_DATA);
-                        spin_lock(&lock->l_extents_list_lock);
-                } else {
-                        spin_unlock(&extent->oap_lock);
-                }
-        }
-        spin_unlock(&lock->l_extents_list_lock);
-
-        return 0;
-}
-
-/* Remoes @lock from cache after necessary checks. */
-int cache_remove_lock(struct lustre_cache *cache, struct lustre_handle *lockh)
-{
-        struct ldlm_lock *lock = ldlm_handle2lock(lockh);
-
-        if (!lock)  // The lock was removed by somebody just now, nothing to do
-                return 0;
-
-        cache_remove_extents_from_lock(cache, lock, NULL /*data */ );
-
-        spin_lock(&cache->lc_locks_list_lock);
-        list_del_init(&lock->l_cache_locks_list);
-        spin_unlock(&cache->lc_locks_list_lock);
-
-        LDLM_LOCK_PUT(lock);
-
-        return 0;
-}
-
-/* Supposed to iterate through all locks in the cache for given resource.
-   Not implemented atthe moment. */
-int cache_iterate_locks(struct lustre_cache *cache, struct ldlm_res_id *res,
-                        cache_iterate_locks_cb_t cb_fun, void *data)
-{
-        return -ENOTSUPP;
-}
-
-/* Create lustre cache and attach it to @obd */
-struct lustre_cache *cache_create(struct obd_device *obd)
-{
-        struct lustre_cache *cache;
-
-        OBD_ALLOC(cache, sizeof(*cache));
-        if (!cache)
-                GOTO(out, NULL);
-        spin_lock_init(&cache->lc_locks_list_lock);
-        CFS_INIT_LIST_HEAD(&cache->lc_locks_list);
-        CFS_INIT_LIST_HEAD(&cache->lc_page_removal_callback_list);
-        rwlock_init(&cache->lc_page_removal_cb_lock);
-        cache->lc_obd = obd;
-
-      out:
-        return cache;
-}
-
-/* Destroy @cache and free its memory */
-int cache_destroy(struct lustre_cache *cache)
-{
-        if (cache) {
-                spin_lock(&cache->lc_locks_list_lock);
-                if (!list_empty(&cache->lc_locks_list)) {
-                        struct ldlm_lock *lock, *tmp;
-                        CERROR("still have locks in the list on cleanup:\n");
-
-                        list_for_each_entry_safe(lock, tmp,
-                                                 &cache->lc_locks_list,
-                                                 l_cache_locks_list) {
-                                list_del_init(&lock->l_cache_locks_list);
-                                /* XXX: Of course natural idea would be to print
-                                   offending locks here, but if we use
-                                   e.g. LDLM_ERROR, we will likely crash here,
-                                   as LDLM error tries to access e.g.
-                                   nonexisting namespace. Normally this kind of
-                                   case could only happen when somebody did not
-                                   release lock reference and we have other ways
-                                   to detect this. */
-                                /* Make sure there are no pages left under the
-                                   lock */
-                                LASSERT(list_empty(&lock->l_extents_list));
-                        }
-                }
-                spin_unlock(&cache->lc_locks_list_lock);
-                LASSERT(list_empty(&cache->lc_page_removal_callback_list));
-                OBD_FREE(cache, sizeof(*cache));
-        }
-
-        return 0;
-}
diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c

index e846e17..d17dbf8 100644 (file)
--- a/lustre/osc/lproc_osc.c
+++ b/lustre/osc/lproc_osc.c
@@ -479,6 +479,44 @@ static int osc_wr_resend_count(struct file *file, const char *buffer,
          return count;
  }
  
+static int osc_rd_contention_seconds(char *page, char **start, off_t off,
+                                     int count, int *eof, void *data)
+{
+        struct obd_device *obd = data;
+        struct osc_device *od  = obd2osc_dev(obd);
+
+        return snprintf(page, count, "%u\n", od->od_contention_time);
+}
+
+static int osc_wr_contention_seconds(struct file *file, const char *buffer,
+                                     unsigned long count, void *data)
+{
+        struct obd_device *obd = data;
+        struct osc_device *od  = obd2osc_dev(obd);
+
+        return lprocfs_write_helper(buffer, count, &od->od_contention_time) ?:
+                count;
+}
+
+static int osc_rd_lockless_truncate(char *page, char **start, off_t off,
+                                    int count, int *eof, void *data)
+{
+        struct obd_device *obd = data;
+        struct osc_device *od  = obd2osc_dev(obd);
+
+        return snprintf(page, count, "%u\n", od->od_lockless_truncate);
+}
+
+static int osc_wr_lockless_truncate(struct file *file, const char *buffer,
+                                    unsigned long count, void *data)
+{
+        struct obd_device *obd = data;
+        struct osc_device *od  = obd2osc_dev(obd);
+
+        return lprocfs_write_helper(buffer, count, &od->od_lockless_truncate) ?:
+                count;
+}
+
  static struct lprocfs_vars lprocfs_osc_obd_vars[] = {
          { "uuid",            lprocfs_rd_uuid,        0, 0 },
          { "ping",            0, lprocfs_wr_ping,     0, 0, 0222 },
@@ -510,6 +548,10 @@ static struct lprocfs_vars lprocfs_osc_obd_vars[] = {
          { "checksum_type",   osc_rd_checksum_type, osc_wd_checksum_type, 0 },
          { "resend_count",    osc_rd_resend_count, osc_wr_resend_count, 0},
          { "timeouts",        lprocfs_rd_timeouts,      0, 0 },
+        { "contention_seconds", osc_rd_contention_seconds,
+                                osc_wr_contention_seconds, 0 },
+        { "lockless_truncate",  osc_rd_lockless_truncate,
+                                osc_wr_lockless_truncate, 0 },
          { "import",          lprocfs_rd_import,    0, 0 },
          { 0 }
  };
@@ -637,10 +679,48 @@ static ssize_t osc_rpc_stats_seq_write(struct file *file, const char *buf,
  
  LPROC_SEQ_FOPS(osc_rpc_stats);
  
+static int osc_stats_seq_show(struct seq_file *seq, void *v)
+{
+        struct timeval now;
+        struct obd_device *dev = seq->private;
+        struct osc_stats *stats = &obd2osc_dev(dev)->od_stats;
+
+        do_gettimeofday(&now);
+
+        seq_printf(seq, "snapshot_time:         %lu.%lu (secs.usecs)\n",
+                   now.tv_sec, now.tv_usec);
+        seq_printf(seq, "lockless_write_bytes\t\t"LPU64"\n",
+                   stats->os_lockless_writes);
+        seq_printf(seq, "lockless_read_bytes\t\t"LPU64"\n",
+                   stats->os_lockless_reads);
+        seq_printf(seq, "lockless_truncate\t\t"LPU64"\n",
+                   stats->os_lockless_truncates);
+        return 0;
+}
+
+static ssize_t osc_stats_seq_write(struct file *file, const char *buf,
+                                   size_t len, loff_t *off)
+{
+        struct seq_file *seq = file->private_data;
+        struct obd_device *dev = seq->private;
+        struct osc_stats *stats = &obd2osc_dev(dev)->od_stats;
+
+        memset(stats, 0, sizeof(*stats));
+        return len;
+}
+
+LPROC_SEQ_FOPS(osc_stats);
+
  int lproc_osc_attach_seqstat(struct obd_device *dev)
  {
-        return lprocfs_obd_seq_create(dev, "rpc_stats", 0444,
-                                      &osc_rpc_stats_fops, dev);
+        int rc;
+
+        rc = lprocfs_seq_create(dev->obd_proc_entry, "osc_stats", 0444,
+                                &osc_stats_fops, dev);
+        if (rc == 0)
+                rc = lprocfs_obd_seq_create(dev, "rpc_stats", 0444,
+                                            &osc_rpc_stats_fops, dev);
+        return rc;
  }
  
  void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars)
diff --git a/lustre/osc/osc_cl_internal.h b/lustre/osc/osc_cl_internal.h

new file mode 100644 (file)

index 0000000..be6badb
--- /dev/null
+++ b/lustre/osc/osc_cl_internal.h
@@ -0,0 +1,424 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal interfaces of OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#ifndef OSC_CL_INTERNAL_H
+#define OSC_CL_INTERNAL_H
+
+#ifdef __KERNEL__
+# include <libcfs/libcfs.h>
+#else
+# include <liblustre.h>
+#endif
+
+#include <obd.h>
+/* osc_build_res_name() */
+#include <obd_ost.h>
+#include <cl_object.h>
+#include "osc_internal.h"
+
+/** \addtogroup osc osc @{ */
+
+/**
+ * State maintained by osc layer for each IO context.
+ */
+struct osc_io {
+        /** super class */
+        struct cl_io_slice oi_cl;
+        /** true if this io is lockless. */
+        int                oi_lockless;
+
+        struct obdo        oi_oa;
+        struct osc_punch_cbargs {
+                int               opc_rc;
+                struct completion opc_sync;
+        } oi_punch_cbarg;
+};
+
+/**
+ * State of transfer for osc.
+ */
+struct osc_req {
+        struct cl_req_slice    or_cl;
+};
+
+/**
+ * State maintained by osc layer for the duration of a system call.
+ */
+struct osc_session {
+        struct osc_io       os_io;
+};
+
+struct osc_thread_info {
+        struct ldlm_res_id      oti_resname;
+        ldlm_policy_data_t      oti_policy;
+        struct cl_lock_descr    oti_descr;
+        struct cl_attr          oti_attr;
+        struct lustre_handle    oti_handle;
+        struct cl_lock_closure  oti_closure;
+        struct cl_page_list     oti_plist;
+};
+
+struct osc_object {
+        struct cl_object   oo_cl;
+        struct lov_oinfo  *oo_oinfo;
+        /**
+         * True if locking against this stripe got -EUSERS.
+         */
+        int                oo_contended;
+        cfs_time_t         oo_contention_time;
+#ifdef INVARIANT_CHECK
+        /**
+         * IO context used for invariant checks in osc_lock_has_pages().
+         */
+        struct cl_io       oo_debug_io;
+        /** Serialization object for osc_object::oo_debug_io. */
+        struct mutex       oo_debug_mutex;
+#endif
+        /**
+         * List of pages in transfer.
+         */
+        struct list_head   oo_inflight[CRT_NR];
+        /**
+         * Lock, protecting osc_object::oo_inflight, because a seat-belt is
+         * locked during take-off and landing.
+         */
+        spinlock_t         oo_seatbelt;
+};
+
+/*
+ * Lock "micro-states" for osc layer.
+ */
+enum osc_lock_state {
+        OLS_NEW,
+        OLS_ENQUEUED,
+        OLS_UPCALL_RECEIVED,
+        OLS_GRANTED,
+        OLS_RELEASED,
+        OLS_BLOCKED,
+        OLS_CANCELLED
+};
+
+/**
+ * osc-private state of cl_lock.
+ *
+ * Interaction with DLM.
+ *
+ * CLIO enqueues all DLM locks through ptlrpcd (that is, in "async" mode).
+ *
+ * Once receive upcall is invoked, osc_lock remembers a handle of DLM lock in
+ * osc_lock::ols_handle and a pointer to that lock in osc_lock::ols_lock.
+ *
+ * This pointer is protected through a reference, acquired by
+ * osc_lock_upcall0(). Also, an additional reference is acquired by
+ * ldlm_lock_addref() call protecting the lock from cancellation, until
+ * osc_lock_unuse() releases it.
+ *
+ * Below is a description of how lock references are acquired and released
+ * inside of DLM.
+ *
+ * - When new lock is created and enqueued to the server (ldlm_cli_enqueue())
+ *      - ldlm_lock_create()
+ *          - ldlm_lock_new(): initializes a lock with 2 references. One for
+ *            the caller (released when reply from the server is received, or on
+ *            error), and another for the hash table.
+ *      - ldlm_lock_addref_internal(): protects the lock from cancellation.
+ *
+ * - When reply is received from the server (osc_enqueue_interpret())
+ *      - ldlm_cli_enqueue_fini()
+ *          - LDLM_LOCK_PUT(): releases caller reference acquired by
+ *            ldlm_lock_new().
+ *          - if (rc != 0)
+ *                ldlm_lock_decref(): error case: matches ldlm_cli_enqueue().
+ *      - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue().
+ *
+ * - When lock is being cancelled (ldlm_lock_cancel())
+ *      - ldlm_lock_destroy()
+ *          - LDLM_LOCK_PUT(): releases hash-table reference acquired by
+ *            ldlm_lock_new().
+ *
+ * osc_lock is detached from ldlm_lock by osc_lock_detach() that is called
+ * either when lock is cancelled (osc_lock_blocking()), or when lock is
+ * deleted without cancellation (e.g., from cl_locks_prune()). In the latter
+ * case ldlm lock remains in memory, and can be re-attached to osc_lock in the
+ * future.
+ */
+struct osc_lock {
+        struct cl_lock_slice     ols_cl;
+        /** underlying DLM lock */
+        struct ldlm_lock        *ols_lock;
+        /** lock value block */
+        struct ost_lvb           ols_lvb;
+        /** DLM flags with which osc_lock::ols_lock was enqueued */
+        int                      ols_flags;
+        /** osc_lock::ols_lock handle */
+        struct lustre_handle     ols_handle;
+        struct ldlm_enqueue_info ols_einfo;
+        enum osc_lock_state      ols_state;
+        /**
+         * true, if ldlm_lock_addref() was called against
+         * osc_lock::ols_lock. This is used for sanity checking.
+         *
+         * \see osc_lock::ols_has_ref
+         */
+        unsigned                  ols_hold :1,
+        /**
+         * this is much like osc_lock::ols_hold, except that this bit is
+         * cleared _after_ reference is released in osc_lock_unuse(). This
+         * fine distinction is needed because:
+         *
+         *     - if ldlm lock still has a reference, osc_ast_data_get() needs
+         *       to return associated cl_lock (so that a flag is needed that is
+         *       cleared after ldlm_lock_decref() returned), and
+         *
+         *     - ldlm_lock_decref() can invoke blocking ast (for a
+         *       LDLM_FL_CBPENDING lock), and osc_lock functions like
+         *       osc_lock_cancel() called from there need to know whether to
+         *       release lock reference (so that a flag is needed that is
+         *       cleared before ldlm_lock_decref() is called).
+         */
+                                 ols_has_ref:1,
+        /**
+         * inherit the lockless attribute from top level cl_io.
+         * If true, osc_lock_enqueue is able to tolerate the -EUSERS error.
+         */
+                                 ols_locklessable:1,
+        /**
+         * set by osc_lock_use() to wait until blocking AST enters into
+         * osc_ldlm_blocking_ast0(), so that cl_lock mutex can be used for
+         * further synchronization.
+         */
+                                 ols_ast_wait:1,
+        /**
+         * If the data of this lock has been flushed to server side.
+         */
+                                 ols_flush:1,
+        /**
+         * if set, the osc_lock is a glimpse lock. For glimpse locks, we treat
+         * the ENAVAIL error as tolerable; this lets the upper layers wait
+         * for the glimpse locks on all OSTs to complete.
+         * Glimpse lock converts to normal lock if the server lock is
+         * granted.
+         * Glimpse lock should be destroyed immediately after use.
+         */
+                                 ols_glimpse:1;
+        /**
+         * IO that owns this lock. This field is used for a dead-lock
+         * avoidance by osc_lock_enqueue().
+         *
+         * \see osc_deadlock_is_possible()
+         */
+        struct osc_io           *ols_owner;
+};
+
+
+/**
+ * Page state private for osc layer.
+ */
+struct osc_page {
+        struct cl_page_slice  ops_cl;
+        /**
+         * Page queues used by osc to detect when RPC can be formed.
+         */
+        struct osc_async_page ops_oap;
+        /**
+         * An offset within page from which next transfer starts. This is used
+         * by cl_page_clip() to submit partial page transfers.
+         */
+        int                   ops_from;
+        /**
+         * An offset within page at which next transfer ends.
+         *
+         * \see osc_page::ops_from.
+         */
+        int                   ops_to;
+        /**
+         * Boolean, true iff page is under transfer. Used for sanity checking.
+         */
+        unsigned              ops_transfer_pinned:1,
+        /**
+         * True for a `temporary page' created by read-ahead code, probably
+         * outside of any DLM lock.
+         */
+                              ops_temp:1,
+        /**
+         * True iff page was created by a user with `appropriate privileges'.
+         */
+                              ops_ignore_quota:1;
+        /**
+         * Linkage into a per-osc_object list of pages in flight. For
+         * debugging.
+         */
+        struct list_head      ops_inflight;
+        /**
+         * Thread that submitted this page for transfer. For debugging.
+         */
+        cfs_task_t           *ops_submitter;
+};
+
+extern cfs_mem_cache_t *osc_page_kmem;
+extern cfs_mem_cache_t *osc_lock_kmem;
+extern cfs_mem_cache_t *osc_object_kmem;
+extern cfs_mem_cache_t *osc_thread_kmem;
+extern cfs_mem_cache_t *osc_session_kmem;
+extern cfs_mem_cache_t *osc_req_kmem;
+
+extern struct lu_device_type osc_device_type;
+extern struct lu_context_key osc_key;
+extern struct lu_context_key osc_session_key;
+
+#define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY)
+
+int osc_lock_init(const struct lu_env *env,
+                  struct cl_object *obj, struct cl_lock *lock,
+                  const struct cl_io *io);
+int osc_io_init  (const struct lu_env *env,
+                  struct cl_object *obj, struct cl_io *io);
+int osc_req_init (const struct lu_env *env, struct cl_device *dev,
+                  struct cl_req *req);
+struct lu_object *osc_object_alloc(const struct lu_env *env,
+                                   const struct lu_object_header *hdr,
+                                   struct lu_device *dev);
+struct cl_page   *osc_page_init   (const struct lu_env *env,
+                                   struct cl_object *obj,
+                                   struct cl_page *page, cfs_page_t *vmpage);
+
+void osc_lock_build_res(const struct lu_env *env, const struct osc_object *obj,
+                        struct ldlm_res_id *resname);
+void osc_index2policy  (ldlm_policy_data_t *policy, const struct cl_object *obj,
+                        pgoff_t start, pgoff_t end);
+int  osc_lvb_print     (const struct lu_env *env, void *cookie,
+                        lu_printer_t p, const struct ost_lvb *lvb);
+void osc_io_submit_page(const struct lu_env *env,
+                        struct osc_io *oio, struct osc_page *opg,
+                        enum cl_req_type crt);
+
+void osc_object_set_contended  (struct osc_object *obj);
+void osc_object_clear_contended(struct osc_object *obj);
+int  osc_object_is_contended   (struct osc_object *obj);
+
+int  osc_lock_is_lockless      (const struct osc_lock *olck);
+
+/*****************************************************************************
+ *
+ * Accessors.
+ *
+ */
+
+static inline struct osc_thread_info *osc_env_info(const struct lu_env *env)
+{
+        struct osc_thread_info *info;
+
+        info = lu_context_key_get(&env->le_ctx, &osc_key);
+        LASSERT(info != NULL);
+        return info;
+}
+
+static inline struct osc_session *osc_env_session(const struct lu_env *env)
+{
+        struct osc_session *ses;
+
+        ses = lu_context_key_get(env->le_ses, &osc_session_key);
+        LASSERT(ses != NULL);
+        return ses;
+}
+
+static inline struct osc_io *osc_env_io(const struct lu_env *env)
+{
+        return &osc_env_session(env)->os_io;
+}
+
+static inline int osc_is_object(const struct lu_object *obj)
+{
+        return obj->lo_dev->ld_type == &osc_device_type;
+}
+
+static inline struct osc_device *lu2osc_dev(const struct lu_device *d)
+{
+        LINVRNT(d->ld_type == &osc_device_type);
+        return container_of0(d, struct osc_device, od_cl.cd_lu_dev);
+}
+
+static inline struct obd_export *osc_export(const struct osc_object *obj)
+{
+        return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp;
+}
+
+static inline struct osc_object *cl2osc(const struct cl_object *obj)
+{
+        LINVRNT(osc_is_object(&obj->co_lu));
+        return container_of0(obj, struct osc_object, oo_cl);
+}
+
+static inline ldlm_mode_t osc_cl_lock2ldlm(enum cl_lock_mode mode)
+{
+        LASSERT(mode == CLM_READ || mode == CLM_WRITE);
+        return mode == CLM_READ ? LCK_PR : LCK_PW;
+}
+
+static inline enum cl_lock_mode osc_ldlm2cl_lock(ldlm_mode_t mode)
+{
+        LASSERT(mode == LCK_PR || mode == LCK_PW);
+        return mode == LCK_PR ? CLM_READ : CLM_WRITE;
+}
+
+static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice)
+{
+        LINVRNT(osc_is_object(&slice->cpl_obj->co_lu));
+        return container_of0(slice, struct osc_page, ops_cl);
+}
+
+static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice)
+{
+        LINVRNT(osc_is_object(&slice->cls_obj->co_lu));
+        return container_of0(slice, struct osc_lock, ols_cl);
+}
+
+static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock)
+{
+        return cl2osc_lock(cl_lock_at(lock, &osc_device_type));
+}
+
+/** @} osc */
+
+#endif /* OSC_CL_INTERNAL_H */
diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c

index f6b669e..b022304 100644 (file)
--- a/lustre/osc/osc_create.c
+++ b/lustre/osc/osc_create.c
@@ -77,7 +77,7 @@ static int osc_interpret_create(const struct lu_env *env,
  
          oscc = req->rq_async_args.pointer_arg[0];
          LASSERT(oscc && (oscc->oscc_obd != LP_POISON));
-        
+
          spin_lock(&oscc->oscc_lock);
          oscc->oscc_flags &= ~OSCC_FLAG_CREATING;
          switch (rc) {
@@ -101,7 +101,7 @@ static int osc_interpret_create(const struct lu_env *env,
                  DEBUG_REQ(D_INODE, req, "Got EAGAIN - resend \n");
                  break;
          case -ENOSPC:
-        case -EROFS: 
+        case -EROFS:
          case -EFBIG: {
                  oscc->oscc_flags |= OSCC_FLAG_NOSPC;
                  if (body && rc == -ENOSPC) {
@@ -113,7 +113,7 @@ static int osc_interpret_create(const struct lu_env *env,
                  break;
          }
          case -EIO: {
-                /* filter always set body->oa.o_id as the last_id 
+                /* filter always set body->oa.o_id as the last_id
                   * of filter (see filter_handle_precreate for detail)*/
                  if (body && body->oa.o_id > oscc->oscc_last_id)
                          oscc->oscc_last_id = body->oa.o_id;
@@ -194,7 +194,7 @@ static int oscc_internal_create(struct osc_creator *oscc)
  
          request->rq_async_args.pointer_arg[0] = oscc;
          request->rq_interpret_reply = osc_interpret_create;
-        ptlrpcd_add_req(request);
+        ptlrpcd_add_req(request, PSCOPE_OTHER);
  
          RETURN(0);
  }
diff --git a/lustre/osc/osc_dev.c b/lustre/osc/osc_dev.c

new file mode 100644 (file)

index 0000000..228b157
--- /dev/null
+++ b/lustre/osc/osc_dev.c
@@ -0,0 +1,253 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_device, cl_req for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+/** \addtogroup osc osc @{ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+/* class_name2obd() */
+#include <obd_class.h>
+
+#include "osc_cl_internal.h"
+
+cfs_mem_cache_t *osc_page_kmem;
+cfs_mem_cache_t *osc_lock_kmem;
+cfs_mem_cache_t *osc_object_kmem;
+cfs_mem_cache_t *osc_thread_kmem;
+cfs_mem_cache_t *osc_session_kmem;
+cfs_mem_cache_t *osc_req_kmem;
+
+struct lu_kmem_descr osc_caches[] = {
+        {
+                .ckd_cache = &osc_page_kmem,
+                .ckd_name  = "osc_page_kmem",
+                .ckd_size  = sizeof (struct osc_page)
+        },
+        {
+                .ckd_cache = &osc_lock_kmem,
+                .ckd_name  = "osc_lock_kmem",
+                .ckd_size  = sizeof (struct osc_lock)
+        },
+        {
+                .ckd_cache = &osc_object_kmem,
+                .ckd_name  = "osc_object_kmem",
+                .ckd_size  = sizeof (struct osc_object)
+        },
+        {
+                .ckd_cache = &osc_thread_kmem,
+                .ckd_name  = "osc_thread_kmem",
+                .ckd_size  = sizeof (struct osc_thread_info)
+        },
+        {
+                .ckd_cache = &osc_session_kmem,
+                .ckd_name  = "osc_session_kmem",
+                .ckd_size  = sizeof (struct osc_session)
+        },
+        {
+                .ckd_cache = &osc_req_kmem,
+                .ckd_name  = "osc_req_kmem",
+                .ckd_size  = sizeof (struct osc_req)
+        },
+        {
+                .ckd_cache = NULL
+        }
+};
+
+struct lock_class_key osc_ast_guard_class;
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static struct lu_device *osc2lu_dev(struct osc_device *osc)
+{
+        return &osc->od_cl.cd_lu_dev;
+}
+
+/*****************************************************************************
+ *
+ * Osc device and device type functions.
+ *
+ */
+
+static void *osc_key_init(const struct lu_context *ctx,
+                         struct lu_context_key *key)
+{
+        struct osc_thread_info *info;
+
+        OBD_SLAB_ALLOC_PTR(info, osc_thread_kmem);
+        if (info == NULL)
+                info = ERR_PTR(-ENOMEM);
+        return info;
+}
+
+static void osc_key_fini(const struct lu_context *ctx,
+                         struct lu_context_key *key, void *data)
+{
+        struct osc_thread_info *info = data;
+        OBD_SLAB_FREE_PTR(info, osc_thread_kmem);
+}
+
+struct lu_context_key osc_key = {
+        .lct_tags = LCT_CL_THREAD,
+        .lct_init = osc_key_init,
+        .lct_fini = osc_key_fini
+};
+
+static void *osc_session_init(const struct lu_context *ctx,
+                              struct lu_context_key *key)
+{
+        struct osc_session *info;
+
+        OBD_SLAB_ALLOC_PTR(info, osc_session_kmem);
+        if (info == NULL)
+                info = ERR_PTR(-ENOMEM);
+        return info;
+}
+
+static void osc_session_fini(const struct lu_context *ctx,
+                             struct lu_context_key *key, void *data)
+{
+        struct osc_session *info = data;
+        OBD_SLAB_FREE_PTR(info, osc_session_kmem);
+}
+
+struct lu_context_key osc_session_key = {
+        .lct_tags = LCT_SESSION,
+        .lct_init = osc_session_init,
+        .lct_fini = osc_session_fini
+};
+
+/* type constructor/destructor: osc_type_{init,fini,start,stop}(). */
+LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key);
+
+static int osc_cl_process_config(const struct lu_env *env,
+                                 struct lu_device *d, struct lustre_cfg *cfg)
+{
+        ENTRY;
+        RETURN(osc_process_config_base(d->ld_obd, cfg));
+}
+
+static const struct lu_device_operations osc_lu_ops = {
+        .ldo_object_alloc      = osc_object_alloc,
+        .ldo_process_config    = osc_cl_process_config,
+        .ldo_recovery_complete = NULL
+};
+
+static const struct cl_device_operations osc_cl_ops = {
+        .cdo_req_init = osc_req_init
+};
+
+static int osc_device_init(const struct lu_env *env, struct lu_device *d,
+                           const char *name, struct lu_device *next)
+{
+        RETURN(0);
+}
+
+static struct lu_device *osc_device_fini(const struct lu_env *env,
+                                         struct lu_device *d)
+{
+        return 0;
+}
+
+static struct lu_device *osc_device_free(const struct lu_env *env,
+                                         struct lu_device *d)
+{
+        struct osc_device *od = lu2osc_dev(d);
+
+        cl_device_fini(lu2cl_dev(d));
+        OBD_FREE_PTR(od);
+        return NULL;
+}
+
+static struct lu_device *osc_device_alloc(const struct lu_env *env,
+                                          struct lu_device_type *t,
+                                          struct lustre_cfg *cfg)
+{
+        struct lu_device *d;
+        struct osc_device *od;
+        struct obd_device *obd;
+        int rc;
+
+        OBD_ALLOC_PTR(od);
+        if (od == NULL)
+                RETURN(ERR_PTR(-ENOMEM));
+
+        cl_device_init(&od->od_cl, t);
+        d = osc2lu_dev(od);
+        d->ld_ops = &osc_lu_ops;
+        od->od_cl.cd_ops = &osc_cl_ops;
+
+        /* Setup OSC OBD */
+        obd = class_name2obd(lustre_cfg_string(cfg, 0));
+        LASSERT(obd != NULL);
+        rc = osc_setup(obd, cfg);
+        if (rc) {
+                osc_device_free(env, d);
+                RETURN(ERR_PTR(rc));
+        }
+        od->od_exp = obd->obd_self_export;
+        RETURN(d);
+}
+
+static const struct lu_device_type_operations osc_device_type_ops = {
+        .ldto_init = osc_type_init,
+        .ldto_fini = osc_type_fini,
+
+        .ldto_start = osc_type_start,
+        .ldto_stop  = osc_type_stop,
+
+        .ldto_device_alloc = osc_device_alloc,
+        .ldto_device_free  = osc_device_free,
+
+        .ldto_device_init    = osc_device_init,
+        .ldto_device_fini    = osc_device_fini
+};
+
+struct lu_device_type osc_device_type = {
+        .ldt_tags     = LU_DEVICE_CL,
+        .ldt_name     = LUSTRE_OSC_NAME,
+        .ldt_ops      = &osc_device_type_ops,
+        .ldt_ctx_tags = LCT_CL_THREAD
+};
+
+/** @} osc */
diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h

index d6c979b..b7a5143 100644 (file)
--- a/lustre/osc/osc_internal.h
+++ b/lustre/osc/osc_internal.h
@@ -39,6 +39,24 @@
  
  #define OAP_MAGIC 8675309
  
+struct lu_env;
+
+/*
+ * Per-page flags for asynchronous io; this is the type of the async_flags
+ * argument of osc_queue_async_io(). The values are distinct bits.
+ */
+enum async_flags {
+        ASYNC_READY = 0x1, /* ap_make_ready will not be called before this
+                              page is added to an rpc */
+        ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */
+        ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called
+                                     to give the caller a chance to update
+                                     or cancel the size of the io */
+};
+
+/*
+ * Callbacks supplied by the owner of an async page (the ops argument of
+ * osc_prep_async_page()) and kept in osc_async_page::oap_caller_ops.
+ * NOTE(review): the data argument is presumably the matching
+ * oap_caller_data cookie -- confirm against the callers.
+ */
+struct obd_async_page_ops {
+        int  (*ap_make_ready)(const struct lu_env *env, void *data, int cmd);
+        int  (*ap_refresh_count)(const struct lu_env *env, void *data, int cmd);
+        int  (*ap_completion)(const struct lu_env *env,
+                              void *data, int cmd, struct obdo *oa, int rc);
+};
+
  struct osc_async_page {
          int                     oap_magic;
          unsigned short          oap_cmd;
@@ -54,13 +72,11 @@ struct osc_async_page {
  
          struct brw_page         oap_brw_page;
  
-        struct oig_callback_context oap_occ;
-        struct obd_io_group     *oap_oig;
          struct ptlrpc_request   *oap_request;
          struct client_obd       *oap_cli;
          struct lov_oinfo        *oap_loi;
  
-       struct obd_async_page_ops *oap_caller_ops;
+        const struct obd_async_page_ops *oap_caller_ops;
          void                    *oap_caller_data;
          struct list_head         oap_page_list;
          struct ldlm_lock        *oap_ldlm_lock;
@@ -93,6 +109,64 @@ int osc_real_create(struct obd_export *exp, struct obdo *oa,
  void oscc_init(struct obd_device *obd);
  void osc_wake_cache_waiters(struct client_obd *cli);
  
+/*
+ * cl integration.
+ */
+#include <cl_object.h>
+
+extern struct ptlrpc_request_set *PTLRPCD_SET;
+
+int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+                     int *flags, ldlm_policy_data_t *policy,
+                     struct ost_lvb *lvb, int kms_valid,
+                     obd_enqueue_update_f upcall,
+                     void *cookie, struct ldlm_enqueue_info *einfo,
+                     struct lustre_handle *lockh,
+                     struct ptlrpc_request_set *rqset, int async);
+int osc_cancel_base(struct lustre_handle *lockh, __u32 mode);
+
+int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+                   __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+                   int *flags, void *data, struct lustre_handle *lockh,
+                   int unref);
+
+int osc_punch_base(struct obd_export *exp, struct obdo *oa,
+                   struct obd_capa *capa,
+                   obd_enqueue_update_f upcall, void *cookie,
+                   struct ptlrpc_request_set *rqset);
+
+int osc_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm,
+                        struct lov_oinfo *loi, cfs_page_t *page,
+                        obd_off offset, const struct obd_async_page_ops *ops,
+                        void *data, void **res, int nocache,
+                        struct lustre_handle *lockh);
+void osc_oap_to_pending(struct osc_async_page *oap);
+int  osc_oap_interrupted(const struct lu_env *env, struct osc_async_page *oap);
+void loi_list_maint(struct client_obd *cli, struct lov_oinfo *loi);
+void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli);
+
+int osc_queue_async_io(const struct lu_env *env,
+                       struct obd_export *exp, struct lov_stripe_md *lsm,
+                       struct lov_oinfo *loi, void *cookie,
+                       int cmd, obd_off off, int count,
+                       obd_flag brw_flags, enum async_flags async_flags);
+int osc_teardown_async_page(struct obd_export *exp,
+                            struct lov_stripe_md *lsm,
+                            struct lov_oinfo *loi, void *cookie);
+int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg);
+int osc_set_async_flags_base(struct client_obd *cli,
+                             struct lov_oinfo *loi, struct osc_async_page *oap,
+                             obd_flag async_flags);
+int osc_enter_cache_try(const struct lu_env *env,
+                        struct client_obd *cli, struct lov_oinfo *loi,
+                        struct osc_async_page *oap, int transient);
+
+struct cl_page *osc_oap2cl_page(struct osc_async_page *oap);
+extern spinlock_t osc_ast_guard;
+
+int osc_cleanup(struct obd_device *obd);
+int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
+
  #ifdef LPROCFS
  int lproc_osc_attach_seqstat(struct obd_device *dev);
  void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars);
@@ -104,6 +178,8 @@ static inline void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars)
  }
  #endif
  
+extern struct lu_device_type osc_device_type;
+
  static inline int osc_recoverable_error(int rc)
  {
          return (rc == -EIO || rc == -EROFS || rc == -ENOMEM || rc == -EAGAIN);
@@ -112,8 +188,8 @@ static inline int osc_recoverable_error(int rc)
  /* return 1 if osc should be resend request */
  static inline int osc_should_resend(int resend, struct client_obd *cli)
  {
-        return atomic_read(&cli->cl_resends) ? 
-               atomic_read(&cli->cl_resends) > resend : 1; 
+        return atomic_read(&cli->cl_resends) ?
+               atomic_read(&cli->cl_resends) > resend : 1;
  }
  
  #ifndef min_t
@@ -121,4 +197,26 @@ static inline int osc_should_resend(int resend, struct client_obd *cli)
          ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; })
  #endif
  
+/*
+ * osc layer device: a cl_device together with the export used to reach the
+ * underlying OSC obd, lockless-io statistics and tunables.
+ */
+struct osc_device {
+        /* embedded generic client device; osc_device is its container */
+        struct cl_device    od_cl;
+        /* set to obd_self_export of the OSC obd by osc_device_alloc() */
+        struct obd_export  *od_exp;
+
+        /* Write stats is actually protected by client_obd's lock. */
+        struct osc_stats {
+                uint64_t     os_lockless_writes;          /* by bytes */
+                uint64_t     os_lockless_reads;           /* by bytes */
+                uint64_t     os_lockless_truncates;       /* by times */
+        } od_stats;
+
+        /* configuration item(s) */
+        int                 od_contention_time;
+        int                 od_lockless_truncate;
+};
+
+/*
+ * Converts an obd_device into the osc_device whose embedded lu_device
+ * (od_cl.cd_lu_dev) is pointed to by d->obd_lu_dev.
+ */
+static inline struct osc_device *obd2osc_dev(const struct obd_device *d)
+{
+        return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev);
+}
+
+
  #endif /* OSC_INTERNAL_H */
diff --git a/lustre/osc/osc_io.c b/lustre/osc/osc_io.c

new file mode 100644 (file)

index 0000000..86fe589
--- /dev/null
+++ b/lustre/osc/osc_io.c
@@ -0,0 +1,646 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_io for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+/** \addtogroup osc osc @{ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+/**
+ * Converts a generic request slice into the osc_req that embeds it (as
+ * or_cl). The slice must belong to an osc device.
+ */
+static struct osc_req *cl2osc_req(const struct cl_req_slice *slice)
+{
+        struct osc_req *oreq;
+
+        LINVRNT(slice->crs_dev->cd_lu_dev.ld_type == &osc_device_type);
+        oreq = container_of0(slice, struct osc_req, or_cl);
+        return oreq;
+}
+
+/**
+ * Converts a generic io slice into the osc_io that embeds it (as oi_cl).
+ * The result is expected to be the osc_io kept in \a env.
+ */
+static struct osc_io *cl2osc_io(const struct lu_env *env,
+                                const struct cl_io_slice *slice)
+{
+        struct osc_io *oio;
+
+        oio = container_of0(slice, struct osc_io, oi_cl);
+        LINVRNT(oio == osc_env_io(env));
+        return oio;
+}
+
+/**
+ * Returns the osc layer of \a page, i.e. the osc_page behind the slice that
+ * cl_page_at() finds for osc_device_type. Asserts that such a slice exists.
+ */
+static struct osc_page *osc_cl_page_osc(struct cl_page *page)
+{
+        const struct cl_page_slice *slice = cl_page_at(page, &osc_device_type);
+
+        LASSERT(slice != NULL);
+        return cl2osc_page(slice);
+}
+
+
+/*****************************************************************************
+ *
+ * io operations.
+ *
+ */
+
+/**
+ * cl_io_operations::cio_fini() method of the osc layer. Does nothing; it is
+ * installed in osc_io_ops for the READ, WRITE, FAULT and MISC io types.
+ */
+static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io)
+{
+}
+
+/**
+ * Maps an osc_async_page to its cl_page: \a oap is the ops_oap member of an
+ * osc_page, whose slice (ops_cl) points back to the cl_page.
+ */
+struct cl_page *osc_oap2cl_page(struct osc_async_page *oap)
+{
+        struct osc_page *opg;
+
+        opg = container_of(oap, struct osc_page, ops_oap);
+        return opg->ops_cl.cpl_page;
+}
+
+/**
+ * Updates the object's position on the client lists, checks whether RPCs can
+ * be built, and then releases cli->cl_loi_list_lock.
+ *
+ * The lock is only dropped here, never taken: the caller must already hold
+ * cli->cl_loi_list_lock on entry.
+ */
+static void osc_io_unplug(const struct lu_env *env, struct osc_object *osc,
+                          struct client_obd *cli)
+{
+        struct lov_oinfo *oinfo = osc->oo_oinfo;
+
+        loi_list_maint(cli, oinfo);
+        osc_check_rpcs(env, cli);
+        client_obd_list_unlock(&cli->cl_loi_list_lock);
+}
+
+/**
+ * How many pages osc_io_submit() queues before checking whether an RPC is
+ * ready.
+ */
+#define OSC_QUEUE_GRAIN (32)
+
+/**
+ * An implementation of cl_io_operations::cio_io_submit() method for osc
+ * layer. Iterates over pages in the in-queue, prepares each for io by calling
+ * cl_page_prep() and then either submits them through osc_io_submit_page()
+ * or, if page is already submitted, changes osc flags through
+ * osc_set_async_flags_base().
+ *
+ * Successfully prepared pages are moved from queue->c2_qin to
+ * queue->c2_qout. cli->cl_loi_list_lock is taken when the first page of a
+ * batch is handled and dropped by osc_io_unplug() once the batch exceeds
+ * OSC_QUEUE_GRAIN pages, a reschedule is needed, or the loop ends.
+ *
+ * \retval 0 if at least one page ended up in the out-queue, or no error
+ *           occurred
+ * \retval -EBUSY (or another negative error) if nothing was queued and a page
+ *           was already in an RPC / on the urgent list, or preparation failed
+ */
+static int osc_io_submit(const struct lu_env *env,
+                         const struct cl_io_slice *ios,
+                         enum cl_req_type crt, struct cl_2queue *queue)
+{
+        struct cl_page    *page;
+        struct cl_page    *tmp;
+        struct osc_object *osc0 = NULL;
+        struct client_obd *cli  = NULL;
+        struct osc_object *osc  = NULL; /* to keep gcc happy */
+        struct osc_page   *opg;
+        struct cl_io      *io;
+
+        struct cl_page_list *qin      = &queue->c2_qin;
+        struct cl_page_list *qout     = &queue->c2_qout;
+        /* pages handled since the list lock was last taken; 0 == unlocked */
+        int queued = 0;
+        int result = 0;
+
+        LASSERT(qin->pl_nr > 0);
+
+        CDEBUG(D_INFO, "%i %i\n", qin->pl_nr, crt);
+        /*
+         * NOTE: here @page is a top-level page. This is done to avoid
+         *       creation of sub-page-list.
+         */
+        cl_page_list_for_each_safe(page, tmp, qin) {
+                struct osc_async_page *oap;
+                struct obd_export     *exp;
+
+                /* Top level IO. */
+                io = page->cp_owner;
+                LASSERT(io != NULL);
+
+                opg = osc_cl_page_osc(page);
+                oap = &opg->ops_oap;
+                osc = cl2osc(opg->ops_cl.cpl_obj);
+                exp = osc_export(osc);
+
+                /*
+                 * This can be checked without cli->cl_loi_list_lock, because
+                 * ->oap_*_item are always manipulated when the page is owned.
+                 */
+                if (!list_empty(&oap->oap_urgent_item) ||
+                    !list_empty(&oap->oap_rpc_item)) {
+                        result = -EBUSY;
+                        break;
+                }
+
+                if (osc0 == NULL) { /* first iteration */
+                        cli = &exp->exp_obd->u.cli;
+                        osc0 = osc;
+                } else /* check that all pages are against the same object
+                        * (for now) */
+                        LASSERT(osc == osc0);
+                /* first page of a batch: take the list lock */
+                if (queued++ == 0)
+                        client_obd_list_lock(&cli->cl_loi_list_lock);
+                result = cl_page_prep(env, io, page, crt);
+                if (result == 0) {
+                        cl_page_list_move(qout, qin, page);
+                        if (list_empty(&oap->oap_pending_item)) {
+                                osc_io_submit_page(env, cl2osc_io(env, ios),
+                                                   opg, crt);
+                        } else {
+                                result = osc_set_async_flags_base(cli,
+                                                                  osc->oo_oinfo,
+                                                                  oap,
+                                                                  OSC_FLAGS);
+                                if (result != 0)
+                                        break;
+                        }
+                } else {
+                        LASSERT(result < 0);
+                        if (result != -EALREADY)
+                                break;
+                        /*
+                         * Handle -EALREADY error: for read case, the page is
+                         * already in UPTODATE state; for write, the page
+                         * is not dirty.
+                         */
+                        result = 0;
+                }
+                /*
+                 * Don't keep client_obd_list_lock() for too long.
+                 *
+                 * XXX lock_need_resched() should be used here, but it is not
+                 * available in the older supported kernels.
+                 */
+                if (queued > OSC_QUEUE_GRAIN || cfs_need_resched()) {
+                        /* osc_io_unplug() drops the list lock */
+                        queued = 0;
+                        osc_io_unplug(env, osc, cli);
+                        cfs_cond_resched();
+                }
+        }
+
+        LASSERT(ergo(result == 0, cli != NULL));
+        LASSERT(ergo(result == 0, osc == osc0));
+
+        /* the lock is still held for a partially filled batch: release it */
+        if (queued > 0)
+                osc_io_unplug(env, osc, cli);
+        CDEBUG(D_INFO, "%i/%i %i\n", qin->pl_nr, qout->pl_nr, result);
+        return qout->pl_nr > 0 ? 0 : result;
+}
+
+/**
+ * Grows the stripe KMS and/or size of \a obj so that they cover byte \a to
+ * of page \a idx. Attributes that already reach that far are left alone.
+ * The update is done under cl_object_attr_lock().
+ */
+static void osc_page_touch_at(const struct lu_env *env,
+                              struct cl_object *obj, pgoff_t idx, unsigned to)
+{
+        struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+        struct cl_attr   *attr  = &osc_env_info(env)->oti_attr;
+        int               valid = 0;
+        __u64             kms;
+
+        /* offset within stripe */
+        kms = cl_offset(obj, idx) + to;
+
+        cl_object_attr_lock(obj);
+        /*
+         * XXX old code used
+         *
+         *         ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm);
+         *
+         * here
+         */
+        CDEBUG(D_INODE, "stripe KMS %sincreasing "LPU64"->"LPU64" "LPU64"\n",
+               kms > oinfo->loi_kms ? "" : "not ", oinfo->loi_kms, kms,
+               oinfo->loi_lvb.lvb_size);
+
+        /* Only ever extend; collect the fields that actually change. */
+        if (oinfo->loi_kms < kms) {
+                valid |= CAT_KMS;
+                attr->cat_kms = kms;
+        }
+        if (oinfo->loi_lvb.lvb_size < kms) {
+                valid |= CAT_SIZE;
+                attr->cat_size = kms;
+        }
+        cl_object_attr_set(env, obj, attr, valid);
+        cl_object_attr_unlock(obj);
+}
+
+/**
+ * This is called when a page is accessed within file in a way that creates
+ * new page, if one were missing (i.e., if there were a hole at that place in
+ * the file, or accessed page is beyond the current file size). Examples:
+ * ->commit_write() and ->nopage() methods.
+ *
+ * Expand stripe KMS if necessary.
+ */
+static void osc_page_touch(const struct lu_env *env,
+                           struct osc_page *opage, unsigned to)
+{
+        /* Delegate using the object and index recorded in the page slice. */
+        osc_page_touch_at(env, opage->ops_cl.cpl_obj,
+                          opage->ops_cl.cpl_page->cp_index, to);
+}
+
+/**
+ * Implements cl_io_operations::cio_prepare_write() method for osc layer.
+ *
+ * \retval -EIO transfer initiated against this osc will most likely fail
+ * \retval 0    transfer initiated against this osc will most likely succeed.
+ *
+ * The reason for this check is to immediately return an error to the caller
+ * in the case of a deactivated import. Note, that import can be deactivated
+ * later, while pages, dirtied by this IO, are still in the cache, but this is
+ * irrelevant, because that would still return an error to the application (if
+ * it does fsync), but many applications don't do fsync because of performance
+ * issues, and we wanted to return an -EIO at write time to notify the
+ * application.
+ */
+static int osc_io_prepare_write(const struct lu_env *env,
+                                const struct cl_io_slice *ios,
+                                const struct cl_page_slice *slice,
+                                unsigned from, unsigned to)
+{
+        struct osc_device *dev = lu2osc_dev(slice->cpl_obj->co_lu.lo_dev);
+        struct obd_import *imp = class_exp2cliimp(dev->od_exp);
+        int                result = 0;
+
+        ENTRY;
+
+        /*
+         * This implements OBD_BRW_CHECK logic from old client: fail early
+         * when there is no import or the import has been invalidated.
+         */
+        if (imp == NULL || imp->imp_invalid)
+                result = -EIO;
+
+        RETURN(result);
+}
+
+/**
+ * Implements cl_io_operations::cio_commit_write() method for osc layer:
+ * extends the stripe KMS/size to cover the written range ending at \a to.
+ *
+ * \retval 0 always
+ */
+static int osc_io_commit_write(const struct lu_env *env,
+                               const struct cl_io_slice *ios,
+                               const struct cl_page_slice *slice,
+                               unsigned from, unsigned to)
+{
+        struct osc_page *opg;
+
+        LASSERT(to > 0);
+
+        ENTRY;
+        /*
+         * XXX instead of calling osc_page_touch() here and in
+         * osc_io_fault_start() it might be more logical to introduce
+         * cl_page_touch() method, that generic cl_io_commit_write() and page
+         * fault code calls.
+         */
+        opg = cl2osc_page(slice);
+        osc_page_touch(env, opg, to);
+        RETURN(0);
+}
+
+/**
+ * cl_io_operations::cio_start() method for CIT_FAULT io at the osc layer.
+ * For a writable fault the faulted page is "touched" so that the attributes
+ * cover it.
+ *
+ * \retval 0 always
+ */
+static int osc_io_fault_start(const struct lu_env *env,
+                              const struct cl_io_slice *ios)
+{
+        struct cl_fault_io *fio = &ios->cis_io->u.ci_fault;
+
+        ENTRY;
+
+        CDEBUG(D_INFO, "%lu %i %i\n",
+               fio->ft_index, fio->ft_writable, fio->ft_nob);
+        /*
+         * If mapping is writeable, adjust kms to cover this page,
+         * but do not extend kms beyond actual file size.
+         * See bug 10919.
+         */
+        if (fio->ft_writable) {
+                osc_page_touch_at(env, ios->cis_obj,
+                                  fio->ft_index, fio->ft_nob);
+        }
+        RETURN(0);
+}
+
+/**
+ * Upcall handed to osc_punch_base(): stores the punch result \a rc in the
+ * osc_punch_cbargs passed as \a a and signals its completion, which
+ * osc_io_trunc_end() waits on.
+ *
+ * \retval 0 always
+ */
+static int osc_punch_upcall(void *a, int rc)
+{
+        struct osc_punch_cbargs *cbargs = a;
+
+        cbargs->opc_rc = rc;
+        complete(&cbargs->opc_sync);
+        return 0;
+}
+
+#ifdef __KERNEL__
+/**
+ * Checks that there are no pages being written in the extent being truncated.
+ *
+ * This is a diagnostic only: nothing is repaired. Every cached page found at
+ * or beyond the first page lying entirely past \a size is reported at
+ * D_ERROR, and so is every page on the object's CRT_WRITE in-flight list in
+ * that range, together with a stack dump of the task recorded as its
+ * submitter.
+ *
+ * \param size new object size in bytes. Declared as loff_t, the type the
+ *             caller holds it in, so that sizes of 4GB and above are not
+ *             silently truncated on platforms where size_t is 32 bits wide
+ *             (which would make the check scan the wrong page range).
+ */
+static void osc_trunc_check(const struct lu_env *env, struct cl_io *io,
+                            struct osc_io *oio, loff_t size)
+{
+        struct osc_page     *cp;
+        struct osc_object   *obj;
+        struct cl_object    *clob;
+        struct cl_page      *page;
+        struct cl_page_list *list;
+        int                  partial;
+        pgoff_t              start;
+
+        clob    = oio->oi_cl.cis_obj;
+        obj     = cl2osc(clob);
+        start   = cl_index(clob, size);
+        /* 1 when \a size falls inside page @start, so that page is kept */
+        partial = cl_offset(clob, start) < size;
+        list    = &osc_env_info(env)->oti_plist;
+
+        /*
+         * Complain if there are pages in the truncated region.
+         *
+         * XXX this is quite expensive check.
+         */
+        cl_page_list_init(list);
+        cl_page_gang_lookup(env, clob, io, start + partial, CL_PAGE_EOF, list);
+
+        cl_page_list_for_each(page, list)
+                CL_PAGE_DEBUG(D_ERROR, env, page, "exists %lu\n", start);
+
+        cl_page_list_disown(env, io, list);
+        cl_page_list_fini(env, list);
+
+        spin_lock(&obj->oo_seatbelt);
+        list_for_each_entry(cp, &obj->oo_inflight[CRT_WRITE], ops_inflight) {
+                page = cp->ops_cl.cpl_page;
+                if (page->cp_index >= start + partial) {
+                        cfs_task_t *submitter;
+
+                        submitter = cp->ops_submitter;
+                        /*
+                         * XXX Linux specific debugging stuff.
+                         */
+                        CL_PAGE_DEBUG(D_ERROR, env, page, "%s/%i %lu\n",
+                                      submitter->comm, submitter->pid, start);
+                        libcfs_debug_dumpstack(submitter);
+                }
+        }
+        spin_unlock(&obj->oo_seatbelt);
+}
+#else /* __KERNEL__ */
+# define osc_trunc_check(env, io, oio, size) do {;} while (0)
+#endif
+
+/**
+ * cl_io_operations::cio_start() method for CIT_TRUNC io at the osc layer.
+ *
+ * Sets the cached size and KMS of the object to the new size, fills
+ * oio->oi_oa with the object id/group, timestamps, size and (for lockless
+ * io) OBD_FL_TRUNCLOCK, and hands an asynchronous punch to osc_punch_base()
+ * with osc_punch_upcall() as the upcall. The result of the punch is
+ * collected by osc_io_trunc_end().
+ *
+ * \retval 0   punch queued; completion will be signalled by the upcall
+ * \retval -ve attribute update or osc_punch_base() failed; the completion
+ *             has been signalled with that error already
+ */
+static int osc_io_trunc_start(const struct lu_env *env,
+                              const struct cl_io_slice *slice)
+{
+        struct cl_io            *io     = slice->cis_io;
+        struct osc_io           *oio    = cl2osc_io(env, slice);
+        struct cl_object        *obj    = slice->cis_obj;
+        struct lov_oinfo        *loi    = cl2osc(obj)->oo_oinfo;
+        struct cl_attr          *attr   = &osc_env_info(env)->oti_attr;
+        struct obdo             *oa     = &oio->oi_oa;
+        struct osc_punch_cbargs *cbargs = &oio->oi_punch_cbarg;
+        struct obd_capa         *capa;
+        loff_t                   size   = io->u.ci_truncate.tr_size;
+        int                      result;
+
+        memset(oa, 0, sizeof(*oa));
+        /*
+         * Initialise the completion before anything can fail:
+         * osc_io_trunc_end() waits on it unconditionally.
+         */
+        init_completion(&cbargs->opc_sync);
+
+        osc_trunc_check(env, io, oio, size);
+
+        cl_object_attr_lock(obj);
+        result = cl_object_attr_get(env, obj, attr);
+        if (result == 0) {
+                attr->cat_size = attr->cat_kms = size;
+                result = cl_object_attr_set(env, obj, attr, CAT_SIZE|CAT_KMS);
+        }
+        cl_object_attr_unlock(obj);
+
+        if (result == 0) {
+                oa->o_id = loi->loi_id;
+                oa->o_gr = loi->loi_gr;
+                oa->o_mtime = attr->cat_mtime;
+                oa->o_atime = attr->cat_atime;
+                oa->o_ctime = attr->cat_ctime;
+                oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME |
+                        OBD_MD_FLCTIME | OBD_MD_FLMTIME;
+                if (oio->oi_lockless) {
+                        oa->o_flags = OBD_FL_TRUNCLOCK;
+                        oa->o_valid |= OBD_MD_FLFLAGS;
+                }
+                oa->o_size = size;
+                oa->o_blocks = OBD_OBJECT_EOF;
+                oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+
+                capa = io->u.ci_truncate.tr_capa;
+                result = osc_punch_base(osc_export(cl2osc(obj)), oa, capa,
+                                        osc_punch_upcall, cbargs, PTLRPCD_SET);
+        }
+        /*
+         * No punch is in flight on failure, so nobody else will signal
+         * opc_sync. Complete it here with the error so that a subsequent
+         * osc_io_trunc_end() neither blocks forever nor reads a stale
+         * opc_rc.
+         *
+         * NOTE(review): assumes osc_punch_base() does not invoke the upcall
+         * itself when it returns an error, and that the generic cl_io code
+         * may call ->cio_end after a failed ->cio_start -- confirm.
+         */
+        if (result != 0)
+                osc_punch_upcall(cbargs, result);
+        return result;
+}
+
+/**
+ * cl_io_operations::cio_end() method for CIT_TRUNC io at the osc layer.
+ *
+ * Waits for the punch started by osc_io_trunc_start() and stores its result
+ * in io->ci_result. On success, a locked truncate copies the attributes
+ * marked valid in the obdo into the object, while a lockless truncate only
+ * bumps the device's lockless-truncate counter.
+ */
+static void osc_io_trunc_end(const struct lu_env *env,
+                             const struct cl_io_slice *slice)
+{
+        struct cl_io            *io     = slice->cis_io;
+        struct osc_io           *oio    = cl2osc_io(env, slice);
+        struct osc_punch_cbargs *cbargs = &oio->oi_punch_cbarg;
+        struct obdo             *oa     = &oio->oi_oa;
+        struct cl_object        *obj    = slice->cis_obj;
+        struct cl_attr          *attr;
+        int                      valid  = 0;
+        int                      result;
+
+        wait_for_completion(&cbargs->opc_sync);
+
+        result = io->ci_result = cbargs->opc_rc;
+        if (result != 0)
+                return;
+
+        if (oio->oi_lockless != 0) {
+                /* lockless truncate */
+                struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev);
+
+                /* XXX: Need a lock. */
+                osd->od_stats.os_lockless_truncates++;
+                return;
+        }
+
+        attr = &osc_env_info(env)->oti_attr;
+
+        /* Update kms & size */
+        if (oa->o_valid & OBD_MD_FLSIZE) {
+                attr->cat_size = oa->o_size;
+                attr->cat_kms  = oa->o_size;
+                valid |= CAT_KMS|CAT_SIZE;
+        }
+        if (oa->o_valid & OBD_MD_FLBLOCKS) {
+                attr->cat_blocks = oa->o_blocks;
+                valid |= CAT_BLOCKS;
+        }
+        if (oa->o_valid & OBD_MD_FLMTIME) {
+                attr->cat_mtime = oa->o_mtime;
+                valid |= CAT_MTIME;
+        }
+        if (oa->o_valid & OBD_MD_FLCTIME) {
+                attr->cat_ctime = oa->o_ctime;
+                valid |= CAT_CTIME;
+        }
+        if (oa->o_valid & OBD_MD_FLATIME) {
+                attr->cat_atime = oa->o_atime;
+                valid |= CAT_ATIME;
+        }
+
+        cl_object_attr_lock(obj);
+        result = cl_object_attr_set(env, obj, attr, valid);
+        cl_object_attr_unlock(obj);
+
+        /* the method is void: result of the attribute update is dropped */
+}
+
+/**
+ * io method table of the osc layer, installed by osc_io_init().
+ *
+ * .op is indexed by io type: only TRUNC and FAULT have real work to do at
+ * this layer; note that TRUNC has no ->cio_fini. .req_op is indexed by
+ * transfer type: reads and writes share osc_io_submit().
+ */
+static const struct cl_io_operations osc_io_ops = {
+        .op = {
+                [CIT_READ] = {
+                        .cio_fini   = osc_io_fini
+                },
+                [CIT_WRITE] = {
+                        .cio_fini   = osc_io_fini
+                },
+                [CIT_TRUNC] = {
+                        .cio_start  = osc_io_trunc_start,
+                        .cio_end    = osc_io_trunc_end
+                },
+                [CIT_FAULT] = {
+                        .cio_fini   = osc_io_fini,
+                        .cio_start  = osc_io_fault_start
+                },
+                [CIT_MISC] = {
+                        .cio_fini   = osc_io_fini
+                }
+        },
+        .req_op = {
+                 [CRT_READ] = {
+                         .cio_submit    = osc_io_submit
+                 },
+                 [CRT_WRITE] = {
+                         .cio_submit    = osc_io_submit
+                 }
+         },
+        .cio_prepare_write = osc_io_prepare_write,
+        .cio_commit_write  = osc_io_commit_write
+};
+
+/*****************************************************************************
+ *
+ * Transfer operations.
+ *
+ */
+
+/**
+ * cl_req_operations::cro_prep() method of the osc layer. Nothing to prepare:
+ * always returns 0.
+ */
+static int osc_req_prep(const struct lu_env *env,
+                        const struct cl_req_slice *slice)
+{
+        return 0;
+}
+
+/**
+ * cl_req_operations::cro_completion() method of the osc layer: returns the
+ * osc_req allocated by osc_req_init() to the osc_req_kmem slab. \a ioret is
+ * not used.
+ */
+static void osc_req_completion(const struct lu_env *env,
+                               const struct cl_req_slice *slice, int ioret)
+{
+        struct osc_req *or = cl2osc_req(slice);
+
+        OBD_SLAB_FREE_PTR(or, osc_req_kmem);
+}
+
+/**
+ * Implementation of struct cl_req_operations::cro_attr_set() for osc
+ * layer. osc is responsible for struct obdo::o_id and struct obdo::o_gr
+ * fields.
+ */
+static void osc_req_attr_set(const struct lu_env *env,
+                             const struct cl_req_slice *slice,
+                             const struct cl_object *obj,
+                             struct cl_req_attr *attr, obd_valid flags)
+{
+        struct obdo      *oa    = attr->cra_oa;
+        struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+        struct cl_req    *clerq;
+        struct cl_page   *apage; /* _some_ page in @clerq */
+        struct cl_lock   *lock;  /* _some_ lock protecting @apage */
+        struct osc_lock  *olck;
+        struct osc_page  *opg;
+
+        if (flags & OBD_MD_FLID) {
+                oa->o_id = oinfo->loi_id;
+                oa->o_valid |= OBD_MD_FLID;
+        }
+        if (flags & OBD_MD_FLGROUP) {
+                oa->o_gr = oinfo->loi_gr;
+                oa->o_valid |= OBD_MD_FLGROUP;
+        }
+        /* The remainder only deals with the lock handle. */
+        if (!(flags & OBD_MD_FLHANDLE))
+                return;
+
+        clerq = slice->crs_req;
+        LASSERT(!list_empty(&clerq->crq_pages));
+        apage = container_of(clerq->crq_pages.next,
+                             struct cl_page, cp_flight);
+        opg = osc_cl_page_osc(apage);
+        apage = opg->ops_cl.cpl_page; /* now apage is a sub-page */
+        lock = cl_lock_at_page(env, apage->cp_obj, apage, NULL, 1, 1);
+        if (lock == NULL) {
+                /* Should only be possible with liblustre */
+                LASSERT(LIBLUSTRE_CLIENT);
+                return;
+        }
+
+        olck = osc_lock_at(lock);
+        LASSERT(olck != NULL);
+        /* check for lockless io. */
+        if (olck->ols_lock != NULL) {
+                oa->o_handle = olck->ols_lock->l_remote_handle;
+                oa->o_valid |= OBD_MD_FLHANDLE;
+        }
+        cl_lock_put(env, lock);
+}
+
+/**
+ * Transfer-request method table of the osc layer, attached to each osc_req
+ * slice by osc_req_init().
+ */
+static const struct cl_req_operations osc_req_ops = {
+        .cro_prep       = osc_req_prep,
+        .cro_attr_set   = osc_req_attr_set,
+        .cro_completion = osc_req_completion
+};
+
+
+/**
+ * Adds the osc layer to \a io: takes the osc_io kept in \a env, cleans its
+ * slice and registers it for \a obj with osc_io_ops.
+ *
+ * \retval 0 always
+ */
+int osc_io_init(const struct lu_env *env,
+                struct cl_object *obj, struct cl_io *io)
+{
+        struct osc_io *oio;
+
+        oio = osc_env_io(env);
+        CL_IO_SLICE_CLEAN(oio, oi_cl);
+        cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops);
+        return 0;
+}
+
+/**
+ * Adds the osc layer to transfer request \a req: allocates an osc_req from
+ * the osc_req_kmem slab and registers its slice with osc_req_ops. The
+ * osc_req is released by osc_req_completion().
+ *
+ * \retval 0       slice added
+ * \retval -ENOMEM the osc_req could not be allocated
+ */
+int osc_req_init(const struct lu_env *env, struct cl_device *dev,
+                 struct cl_req *req)
+{
+        struct osc_req *or;
+
+        OBD_SLAB_ALLOC_PTR(or, osc_req_kmem);
+        if (or == NULL)
+                return -ENOMEM;
+
+        cl_req_slice_add(req, &or->or_cl, dev, &osc_req_ops);
+        return 0;
+}
+
+/** @} osc */
diff --git a/lustre/osc/osc_lock.c b/lustre/osc/osc_lock.c

new file mode 100644 (file)

index 0000000..8fa1fdf
--- /dev/null
+++ b/lustre/osc/osc_lock.c
@@ -0,0 +1,1621 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_lock for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+/** \addtogroup osc osc @{ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#ifdef __KERNEL__
+# include <libcfs/libcfs.h>
+#else
+# include <liblustre.h>
+#endif
+/* fid_build_reg_res_name() */
+#include <lustre_fid.h>
+
+#include "osc_cl_internal.h"
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+static const struct cl_lock_operations osc_lock_ops;
+static const struct cl_lock_operations osc_lock_lockless_ops;
+
+/**
+ * Tells whether the slice of \a olck uses the lockless method table.
+ *
+ * \retval 1 osc_lock::ols_cl points at osc_lock_lockless_ops
+ * \retval 0 otherwise
+ */
+int osc_lock_is_lockless(const struct osc_lock *olck)
+{
+        const struct cl_lock_operations *ops = olck->ols_cl.cls_ops;
+
+        return ops == &osc_lock_lockless_ops;
+}
+
+/**
+ * Translates \a handle into an ldlm lock pointer without keeping a
+ * reference: the reference obtained by ldlm_handle2lock() is dropped before
+ * returning.
+ *
+ * The result is therefore a weak pointer. It may be compared against other
+ * pointers, but must not be dereferenced, because nothing protects the lock
+ * from concurrent reclaim. Helper for osc_lock_invariant().
+ *
+ * \return the lock address, or NULL if the handle does not resolve
+ */
+static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle)
+{
+        struct ldlm_lock *weak = ldlm_handle2lock(handle);
+
+        if (weak == NULL)
+                return NULL;
+
+        LDLM_LOCK_PUT(weak);
+        return weak;
+}
+
+/**
+ * Invariant that has to be true all of the time.
+ *
+ * Checked through LINVRNT() by the lock methods in this file. The first
+ * disjunct describes lockless locks (they must be locklessable and have no
+ * DLM lock attached); the second one ties together ->ols_lock, ->ols_handle
+ * and ->ols_state.
+ *
+ * NOTE(review): assuming ergo(a, b) is plain implication (!a || b), the
+ * first disjunct is already true for every lock that is not lockless, so the
+ * second group of checks is only evaluated for a lockless lock that fails
+ * the first test. Confirm whether "lockless ? first : second" was intended
+ * before relying on this invariant for DLM-backed locks.
+ *
+ * \retval non-zero invariant holds
+ * \retval 0        invariant is violated
+ */
+static int osc_lock_invariant(struct osc_lock *ols)
+{
+        struct ldlm_lock *lock        = osc_handle_ptr(&ols->ols_handle);
+        struct ldlm_lock *olock       = ols->ols_lock;
+        int               handle_used = lustre_handle_is_used(&ols->ols_handle);
+
+        return
+                ergo(osc_lock_is_lockless(ols),
+                     ols->ols_locklessable && ols->ols_lock == NULL)  ||
+                (ergo(olock != NULL, handle_used) &&
+                 ergo(olock != NULL,
+                      olock->l_handle.h_cookie == ols->ols_handle.cookie) &&
+                 /*
+                  * Check that ->ols_handle and ->ols_lock are consistent, but
+                  * take into account that they are set at different times:
+                  * `lock' is only a weak pointer obtained from the handle and
+                  * is compared, never dereferenced.
+                  */
+                 ergo(handle_used,
+                      ergo(lock != NULL && olock != NULL, lock == olock) &&
+                      ergo(lock == NULL, olock == NULL)) &&
+                 ergo(ols->ols_state == OLS_CANCELLED,
+                      olock == NULL && !handle_used) &&
+                 /*
+                  * DLM lock is destroyed only after we have seen cancellation
+                  * ast.
+                  */
+                 ergo(olock != NULL && ols->ols_state < OLS_CANCELLED,
+                      !olock->l_destroyed) &&
+                 ergo(ols->ols_state == OLS_GRANTED,
+                      olock != NULL &&
+                      olock->l_req_mode == olock->l_granted_mode &&
+                      ols->ols_hold));
+}
+
+/*****************************************************************************
+ *
+ * Lock operations.
+ *
+ */
+
+/**
+ * Breaks a link between osc_lock and dlm_lock.
+ *
+ * Under osc_ast_guard, clears osc_lock::ols_lock, ldlm_lock::l_ast_data and
+ * the cookie of osc_lock::ols_handle; returns early if no dlm lock is
+ * attached. If the dlm lock was granted, the kms attribute of the object is
+ * recomputed with ldlm_extent_shift_kms() and stored through
+ * cl_object_attr_set(). Finally the dlm lock reference owned by the osc_lock
+ * is released.
+ *
+ * \param env  execution environment; supplies the scratch cl_attr
+ * \param olck osc lock to be detached from its dlm lock
+ */
+static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck)
+{
+        struct ldlm_lock *dlmlock;
+
+        spin_lock(&osc_ast_guard);
+        dlmlock = olck->ols_lock;
+        if (dlmlock == NULL) {
+                spin_unlock(&osc_ast_guard);
+                return;
+        }
+
+        olck->ols_lock = NULL;
+        /* wb(); --- for everyone who checks (ols->ols_lock != NULL) before
+         * calling osc_lock_detach() */
+        dlmlock->l_ast_data = NULL;
+        olck->ols_handle.cookie = 0ULL;
+        spin_unlock(&osc_ast_guard);
+
+        lock_res_and_lock(dlmlock);
+        if (dlmlock->l_granted_mode == dlmlock->l_req_mode) {
+                struct cl_object *obj = olck->ols_cl.cls_obj;
+                struct cl_attr *attr  = &osc_env_info(env)->oti_attr;
+                __u64 old_kms = cl2osc(obj)->oo_oinfo->loi_kms;
+
+                /* Update the kms. Need to loop all granted locks.
+                 * Not a problem for the client. The resource lock is dropped
+                 * before the attribute lock is taken below. */
+                attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms);
+                unlock_res_and_lock(dlmlock);
+
+                cl_object_attr_lock(obj);
+                cl_object_attr_set(env, obj, attr, CAT_KMS);
+                cl_object_attr_unlock(obj);
+        } else
+                unlock_res_and_lock(dlmlock);
+
+        /* release a reference taken in osc_lock_upcall0(). */
+        lu_ref_del(&dlmlock->l_reference, "osc_lock", olck);
+        LDLM_LOCK_RELEASE(dlmlock);
+}
+
+/**
+ * Releases the hold this osc_lock has on its dlm lock.
+ *
+ * May only be called in OLS_GRANTED or OLS_UPCALL_RECEIVED state. Glimpse
+ * locks hold nothing and are left alone. For any other lock the state is
+ * switched to OLS_RELEASED, ->ols_hold is cleared and osc_cancel_base() is
+ * called on the handle with the enqueue mode.
+ *
+ * \return 0 for a glimpse lock, otherwise the result of osc_cancel_base()
+ */
+static int osc_lock_unuse(const struct lu_env *env,
+                          const struct cl_lock_slice *slice)
+{
+        struct osc_lock *ols = cl2osc_lock(slice);
+        int rc;
+
+        LASSERT(ols->ols_state == OLS_GRANTED ||
+                ols->ols_state == OLS_UPCALL_RECEIVED);
+        LINVRNT(osc_lock_invariant(ols));
+
+        if (!ols->ols_glimpse) {
+                LASSERT(ols->ols_hold);
+
+                /*
+                 * The state has to become OLS_RELEASED before
+                 * osc_cancel_base() runs: cancellation may happen
+                 * synchronously (it always does, e.g., for liblustre) and
+                 * must already see the lock as released.
+                 */
+                ols->ols_state = OLS_RELEASED;
+                ols->ols_hold = 0;
+                rc = osc_cancel_base(&ols->ols_handle,
+                                     ols->ols_einfo.ei_mode);
+                ols->ols_has_ref = 0;
+        } else {
+                LASSERT(ols->ols_hold == 0);
+                rc = 0;
+        }
+        return rc;
+}
+
+/**
+ * Finalizes the osc slice of a cl_lock and returns it to osc_lock_kmem.
+ *
+ * A hold that is still in place is released with osc_lock_unuse(), and a dlm
+ * lock that is still attached is detached with osc_lock_detach(), before the
+ * memory is freed.
+ */
+static void osc_lock_fini(const struct lu_env *env,
+                          struct cl_lock_slice *slice)
+{
+        struct osc_lock *ols = cl2osc_lock(slice);
+
+        LINVRNT(osc_lock_invariant(ols));
+
+        /*
+         * ->ols_hold may still be set here: for example, the thread that
+         * requested the lock was killed (dropping its reference to the lock)
+         * before the server reply arrived. The lock is then destroyed right
+         * after the upcall.
+         */
+        if (ols->ols_hold) {
+                osc_lock_unuse(env, slice);
+        }
+
+        if (ols->ols_lock != NULL) {
+                osc_lock_detach(env, ols);
+        }
+
+        OBD_SLAB_FREE_PTR(ols, osc_lock_kmem);
+}
+
+/**
+ * Fills \a resname with the ldlm resource name for \a obj.
+ *
+ * The name is currently built from the object id and group stored in the
+ * object's lov_oinfo. The fid-based variant is kept in a disabled branch.
+ */
+void osc_lock_build_res(const struct lu_env *env, const struct osc_object *obj,
+                        struct ldlm_res_id *resname)
+{
+        const struct lu_fid *fid = lu_object_fid(&obj->oo_cl.co_lu);
+        const struct lov_oinfo *oinfo = obj->oo_oinfo;
+
+        if (0) {
+                /*
+                 * Disabled: for the perfect world of the future, where ost
+                 * servers talk idif-fids...
+                 */
+                fid_build_reg_res_name(fid, resname);
+        } else {
+                /*
+                 * Reality: the ost server expects ->lsm_object_id and
+                 * ->lsm_object_gr in resname.
+                 */
+                osc_build_res_name(oinfo->loi_id, oinfo->loi_gr, resname);
+        }
+}
+
+/**
+ * Fills the ldlm \a policy from the descriptor of \a lock: the object and
+ * the [cld_start, cld_end] page index range are handed to
+ * osc_index2policy().
+ */
+static void osc_lock_build_policy(const struct lu_env *env,
+                                  const struct cl_lock *lock,
+                                  ldlm_policy_data_t *policy)
+{
+        osc_index2policy(policy,
+                         lock->cll_descr.cld_obj,
+                         lock->cll_descr.cld_start,
+                         lock->cll_descr.cld_end);
+}
+
+/**
+ * Converts cl_lock enqueue flags (CEF_*) into ldlm flags.
+ *
+ * Only CEF_NONBLOCK, CEF_ASYNC and CEF_DISCARD_DATA may be set in
+ * \a enqflags; anything else trips the assertion.
+ *
+ * \return the corresponding combination of LDLM_FL_BLOCK_NOWAIT,
+ *         LDLM_FL_HAS_INTENT and LDLM_AST_DISCARD_DATA
+ */
+static int osc_enq2ldlm_flags(__u32 enqflags)
+{
+        int ldlm_flags = 0;
+
+        LASSERT((enqflags & ~(CEF_NONBLOCK|CEF_ASYNC|CEF_DISCARD_DATA)) == 0);
+
+        if (enqflags & CEF_NONBLOCK) {
+                ldlm_flags |= LDLM_FL_BLOCK_NOWAIT;
+        }
+        if (enqflags & CEF_ASYNC) {
+                ldlm_flags |= LDLM_FL_HAS_INTENT;
+        }
+        if (enqflags & CEF_DISCARD_DATA) {
+                ldlm_flags |= LDLM_AST_DISCARD_DATA;
+        }
+
+        return ldlm_flags;
+}
+
+/**
+ * Global spin-lock protecting consistency of ldlm_lock::l_ast_data
+ * pointers. Initialized in osc_init().
+ *
+ * osc_lock::ols_lock and the cookie of osc_lock::ols_handle are changed
+ * under this lock as well. Where it is combined with lock_res_and_lock() in
+ * this file, the resource lock is taken first and osc_ast_guard second.
+ */
+spinlock_t osc_ast_guard;
+
+/**
+ * Returns the osc_lock stored in ldlm_lock::l_ast_data of \a dlm_lock, with
+ * a reference on its cl_lock (tagged "ast") acquired on behalf of the
+ * caller; pair with osc_ast_data_put().
+ *
+ * \return the osc_lock, or NULL if none is attached or the attached cl_lock
+ *         is being freed and no longer holds a reference on the dlm lock
+ */
+static struct osc_lock *osc_ast_data_get(struct ldlm_lock *dlm_lock)
+{
+        struct osc_lock *found;
+
+        lock_res_and_lock(dlm_lock);
+        spin_lock(&osc_ast_guard);
+        found = dlm_lock->l_ast_data;
+        if (found != NULL) {
+                struct cl_lock *cll = found->ols_cl.cls_lock;
+
+                /*
+                 * An osc_lock that still holds a reference on the ldlm lock
+                 * is returned even if its cl_lock is in CLS_FREEING state.
+                 * Consequently
+                 *
+                 *         osc_ast_data_get(dlmlock) == NULL
+                 *
+                 * guarantees that every osc reference on dlmlock has been
+                 * released. osc_dlm_blocking_ast0() relies on that.
+                 */
+                if (!(cll->cll_state < CLS_FREEING || found->ols_has_ref)) {
+                        found = NULL;
+                } else {
+                        cl_lock_get_trust(cll);
+                        lu_ref_add_atomic(&cll->cll_reference,
+                                          "ast", cfs_current());
+                }
+        }
+        spin_unlock(&osc_ast_guard);
+        unlock_res_and_lock(dlm_lock);
+
+        return found;
+}
+
+/**
+ * Drops the cl_lock reference (tagged "ast") that osc_ast_data_get()
+ * acquired for \a olck.
+ */
+static void osc_ast_data_put(const struct lu_env *env, struct osc_lock *olck)
+{
+        struct cl_lock *cll = olck->ols_cl.cls_lock;
+
+        lu_ref_del(&cll->cll_reference, "ast", cfs_current());
+        cl_lock_put(env, cll);
+}
+
+/**
+ * Switches the slice of \a olck to the lockless method table
+ * (osc_lock_lockless_ops). The cl_lock mutex must be held by the caller.
+ */
+static void osc_lock_to_lockless(struct osc_lock *olck)
+{
+        struct cl_lock *lock = olck->ols_cl.cls_lock;
+
+        /*
+         * TODO: Discover which locks we need to convert the lock
+         * to ldlmlockless.
+         */
+        LASSERT(cl_lock_is_mutexed(lock));
+        olck->ols_cl.cls_ops = &osc_lock_lockless_ops;
+}
+
+/**
+ * Updates object attributes from a lock value block (lvb) received together
+ * with the DLM lock reply from the server. Copy of osc_update_enqueue()
+ * logic.
+ *
+ * This can be optimized to not update attributes when lock is a result of a
+ * local match.
+ *
+ * Nothing is done unless LDLM_FL_LVB_READY is set in osc_lock::ols_flags.
+ * Otherwise, under the object attribute lock:
+ *
+ *     - \a rc == 0: blocks, times and size are updated; kms is raised too,
+ *       but only if the lvb size, clipped to the end of the lock extent, is
+ *       not below the current kms. The dlm lock is then allowed to match.
+ *
+ *     - \a rc == -ENAVAIL on a glimpse lock: blocks, times and size are
+ *       updated, kms is left alone.
+ *
+ *     - any other \a rc: no attribute is changed.
+ *
+ * \param env  execution environment; supplies the scratch cl_attr
+ * \param olck osc lock whose ->ols_lvb is the source of the attributes
+ * \param rc   result of the enqueue this lvb arrived with
+ */
+static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck,
+                                int rc)
+{
+        struct ost_lvb    *lvb;
+        struct cl_object  *obj;
+        struct lov_oinfo  *oinfo;
+        struct cl_attr    *attr;
+        unsigned           valid;
+
+        ENTRY;
+
+        if (!(olck->ols_flags & LDLM_FL_LVB_READY)) {
+                EXIT;
+                return;
+        }
+
+        lvb   = &olck->ols_lvb;
+        obj   = olck->ols_cl.cls_obj;
+        oinfo = cl2osc(obj)->oo_oinfo;
+        attr  = &osc_env_info(env)->oti_attr;
+        valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE;
+        cl_lvb2attr(attr, lvb);
+
+        cl_object_attr_lock(obj);
+        if (rc == 0) {
+                struct ldlm_lock  *dlmlock;
+                __u64 size;
+
+                dlmlock = olck->ols_lock;
+                LASSERT(dlmlock != NULL);
+
+                size = lvb->lvb_size;
+                /* Extend KMS up to the end of this lock and no further.
+                 * A lock on [x,y] means a KMS of up to y + 1 bytes! */
+                if (size > dlmlock->l_policy_data.l_extent.end)
+                        size = dlmlock->l_policy_data.l_extent.end + 1;
+                if (size >= oinfo->loi_kms) {
+                        LDLM_DEBUG(dlmlock, "lock acquired, setting rss="LPU64
+                                   ", kms="LPU64, lvb->lvb_size, size);
+                        valid |= CAT_KMS;
+                        attr->cat_kms = size;
+                } else {
+                        LDLM_DEBUG(dlmlock, "lock acquired, setting rss="
+                                   LPU64"; leaving kms="LPU64", end="LPU64,
+                                   lvb->lvb_size, oinfo->loi_kms,
+                                   dlmlock->l_policy_data.l_extent.end);
+                }
+                ldlm_lock_allow_match(dlmlock);
+        } else if (rc == -ENAVAIL && olck->ols_glimpse) {
+                CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving"
+                       " kms="LPU64"\n", lvb->lvb_size, oinfo->loi_kms);
+        } else
+                valid = 0;
+
+        if (valid != 0)
+                cl_object_attr_set(env, obj, attr, valid);
+
+        cl_object_attr_unlock(obj);
+
+        EXIT;
+}
+
+/**
+ * Moves \a olck into OLS_GRANTED state once \a dlmlock has been granted.
+ *
+ * The cl_lock descriptor is modified to the mode and extent that were
+ * actually granted, object attributes are updated from the lvb
+ * (osc_lock_lvb_update()), and waiters on the cl_lock are signalled. Does
+ * nothing if the osc_lock is already in OLS_GRANTED state.
+ *
+ * \param dlmlock granted dlm lock (granted mode must equal requested mode)
+ * \param rc      enqueue result, passed on to osc_lock_lvb_update()
+ */
+static void osc_lock_granted(const struct lu_env *env, struct osc_lock *olck,
+                             struct ldlm_lock *dlmlock, int rc)
+{
+        struct cl_lock_descr *granted;
+        struct ldlm_extent   *extent;
+        struct cl_lock       *cll;
+
+        LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode);
+
+        ENTRY;
+        if (olck->ols_state == OLS_GRANTED) {
+                EXIT;
+                return;
+        }
+
+        cll     = olck->ols_cl.cls_lock;
+        extent  = &dlmlock->l_policy_data.l_extent;
+        granted = &osc_env_info(env)->oti_descr;
+
+        granted->cld_obj   = cll->cll_descr.cld_obj;
+        /* XXX check that ->l_granted_mode is valid. */
+        granted->cld_mode  = osc_ldlm2cl_lock(dlmlock->l_granted_mode);
+        granted->cld_start = cl_index(granted->cld_obj, extent->start);
+        granted->cld_end   = cl_index(granted->cld_obj, extent->end);
+
+        /* let upper layers know which extent was actually granted */
+        cl_lock_modify(env, cll, granted);
+        LINVRNT(osc_lock_invariant(olck));
+        olck->ols_state = OLS_GRANTED;
+        osc_lock_lvb_update(env, olck, rc);
+        cl_lock_signal(env, cll);
+        EXIT;
+}
+
+/**
+ * Success path of osc_lock_upcall(): binds \a olck to the dlm lock named by
+ * osc_lock::ols_handle.
+ *
+ * The dlm lock reference obtained from ldlm_handle2lock_long() is kept in
+ * osc_lock::ols_lock and is released by osc_lock_detach(). If the dlm lock
+ * is already granted, the osc_lock is moved to OLS_GRANTED right away;
+ * otherwise that is left to osc_ldlm_completion_ast(). In both cases the dlm
+ * lock is addref-ed and ->ols_hold / ->ols_has_ref are set.
+ *
+ * \param env  execution environment
+ * \param olck osc lock; its handle must resolve to a dlm lock whose
+ *             l_ast_data is \a olck, and ->ols_lock must still be NULL
+ */
+static void osc_lock_upcall0(const struct lu_env *env, struct osc_lock *olck)
+{
+        struct ldlm_lock *dlmlock;
+
+        ENTRY;
+
+        dlmlock = ldlm_handle2lock_long(&olck->ols_handle, 0);
+        LASSERT(dlmlock != NULL);
+
+        lock_res_and_lock(dlmlock);
+        spin_lock(&osc_ast_guard);
+        LASSERT(dlmlock->l_ast_data == olck);
+        LASSERT(olck->ols_lock == NULL);
+        olck->ols_lock = dlmlock;
+        spin_unlock(&osc_ast_guard);
+        unlock_res_and_lock(dlmlock);
+
+        /*
+         * Lock might be not yet granted. In this case, completion ast
+         * (osc_ldlm_completion_ast()) comes later and finishes lock
+         * granting.
+         */
+        if (dlmlock->l_granted_mode == dlmlock->l_req_mode)
+                osc_lock_granted(env, olck, dlmlock, 0);
+        /*
+         * osc_enqueue_interpret() decrefs asynchronous locks, counter
+         * this.
+         */
+        ldlm_lock_addref(&olck->ols_handle, olck->ols_einfo.ei_mode);
+        olck->ols_hold = olck->ols_has_ref = 1;
+
+        /* lock reference taken by ldlm_handle2lock_long() is owned by
+         * osc_lock and released in osc_lock_detach() */
+        lu_ref_add(&dlmlock->l_reference, "osc_lock", olck);
+
+        /* pair the ENTRY above, like the other traced functions here do */
+        EXIT;
+}
+
+/**
+ * Lock upcall function that is executed either when a reply to ENQUEUE rpc is
+ * received from a server, or after osc_enqueue_base() matched a local DLM
+ * lock.
+ *
+ * Runs under the cl_lock mutex. On success the osc_lock is bound to its DLM
+ * lock by osc_lock_upcall0(); on failure the DLM lock's l_ast_data and the
+ * cookie of the osc_lock handle are cleared. Two errors are absorbed:
+ * -EUSERS on a locklessable lock (the lock is turned into a lockless one and
+ * marked granted) and -ENAVAIL on a glimpse lock (attributes are updated
+ * from the lvb and the cl_lock is deleted). Any other error is handed to
+ * cl_lock_error().
+ *
+ * \param cookie  osc_lock this upcall is for; the cl_lock reference tagged
+ *                "upcall" is dropped before returning
+ * \param errcode result of the enqueue
+ *
+ * \return \a errcode, unchanged, even when the error was absorbed
+ */
+static int osc_lock_upcall(void *cookie, int errcode)
+{
+        struct osc_lock      *olck  = cookie;
+        struct cl_lock_slice *slice = &olck->ols_cl;
+        struct cl_lock       *lock  = slice->cls_lock;
+        struct lu_env        *env;
+
+        int refcheck;
+
+        ENTRY;
+        /*
+         * XXX environment should be created in ptlrpcd.
+         */
+        env = cl_env_get(&refcheck);
+        if (!IS_ERR(env)) {
+                int rc;
+
+                cl_lock_mutex_get(env, lock);
+
+                LASSERT(lock->cll_state >= CLS_QUEUING);
+                if (olck->ols_state == OLS_ENQUEUED) {
+                        olck->ols_state = OLS_UPCALL_RECEIVED;
+                        rc = ldlm_error2errno(errcode);
+                } else if (olck->ols_state == OLS_CANCELLED) {
+                        rc = -EIO;
+                } else {
+                        CERROR("Impossible state: %i\n", olck->ols_state);
+                        LBUG();
+                }
+                if (rc) {
+                        struct ldlm_lock *dlmlock;
+
+                        dlmlock = ldlm_handle2lock(&olck->ols_handle);
+                        if (dlmlock != NULL) {
+                                lock_res_and_lock(dlmlock);
+                                spin_lock(&osc_ast_guard);
+                                LASSERT(olck->ols_lock == NULL);
+                                dlmlock->l_ast_data = NULL;
+                                olck->ols_handle.cookie = 0ULL;
+                                spin_unlock(&osc_ast_guard);
+                                unlock_res_and_lock(dlmlock);
+                                LDLM_LOCK_PUT(dlmlock);
+                        }
+                } else {
+                        if (olck->ols_glimpse)
+                                olck->ols_glimpse = 0;
+                        osc_lock_upcall0(env, olck);
+                }
+
+                /* Error handling, some errors are tolerable. */
+                if (olck->ols_locklessable && rc == -EUSERS) {
+                        /* This is a tolerable error, turn this lock into
+                         * lockless lock.
+                         */
+                        osc_object_set_contended(cl2osc(slice->cls_obj));
+                        LASSERT(slice->cls_ops == &osc_lock_ops);
+
+                        /* Change this lock to ldlmlock-less lock. */
+                        osc_lock_to_lockless(olck);
+                        olck->ols_state = OLS_GRANTED;
+                        rc = 0;
+                } else if (olck->ols_glimpse && rc == -ENAVAIL) {
+                        osc_lock_lvb_update(env, olck, rc);
+                        cl_lock_delete(env, lock);
+                        /* Hide the error. */
+                        rc = 0;
+                }
+
+                if (rc == 0)
+                        /* signal explicitly only on success: on error, the
+                         * lock is signaled by cl_lock_error() */
+                        cl_lock_signal(env, lock);
+                else
+                        cl_lock_error(env, lock, rc);
+
+                cl_lock_mutex_put(env, lock);
+
+                /* release cookie reference, acquired by osc_lock_enqueue() */
+                lu_ref_del(&lock->cll_reference, "upcall", lock);
+                cl_lock_put(env, lock);
+                cl_env_put(env, &refcheck);
+        } else
+                /* should never happen, similar to osc_ldlm_blocking_ast(). */
+                LBUG();
+        RETURN(errcode);
+}
+
+/**
+ * Core of osc_ldlm_blocking_ast() logic.
+ *
+ * Releases the hold on the DLM lock if it is still in place, marks the
+ * osc_lock OLS_BLOCKED when entered for a blocking ast on a lock that has
+ * not reached that state yet, and then cancels and deletes the cl_lock.
+ *
+ * \param env      execution environment
+ * \param dlmlock  DLM lock the ast is for; must be the one attached to
+ *                 \a olck
+ * \param olck     osc lock attached to \a dlmlock; must not be lockless
+ * \param blocking non-zero for a blocking ast, zero for a cancellation ast
+ */
+static void osc_lock_blocking(const struct lu_env *env,
+                              struct ldlm_lock *dlmlock,
+                              struct osc_lock *olck, int blocking)
+{
+        struct cl_lock *lock = olck->ols_cl.cls_lock;
+
+        LASSERT(olck->ols_lock == dlmlock);
+        CLASSERT(OLS_BLOCKED < OLS_CANCELLED);
+        LASSERT(!osc_lock_is_lockless(olck));
+
+        if (olck->ols_hold)
+                /*
+                 * Lock might be still addref-ed here, if e.g., blocking ast
+                 * is sent for a failed lock.
+                 */
+                osc_lock_unuse(env, &olck->ols_cl);
+
+        if (blocking && olck->ols_state < OLS_BLOCKED)
+                /*
+                 * Move osc_lock into OLS_BLOCKED before canceling the lock,
+                 * because it recursively re-enters osc_lock_blocking(), with
+                 * the state set to OLS_CANCELLED.
+                 */
+                olck->ols_state = OLS_BLOCKED;
+        /*
+         * cancel and destroy lock at least once no matter how blocking ast is
+         * entered (see comment above osc_ldlm_blocking_ast() for use
+         * cases). cl_lock_cancel() and cl_lock_delete() are idempotent.
+         */
+        cl_lock_cancel(env, lock);
+        cl_lock_delete(env, lock);
+}
+
+/**
+ * Helper for osc_ldlm_blocking_ast() handling discrepancies between cl_lock
+ * and ldlm_lock caches.
+ *
+ * If an osc_lock is still attached to \a dlmlock after the cl_lock mutex has
+ * been acquired, the work is done by osc_lock_blocking(). Otherwise the DLM
+ * lock is cancelled directly with ldlm_cli_cancel(): always when the
+ * osc_lock got detached while this thread waited for the mutex, and only for
+ * LDLM_CB_BLOCKING when no osc_lock was found at all.
+ *
+ * \param env     execution environment
+ * \param dlmlock lock for which ast occurred
+ * \param data    value of dlmlock->l_ast_data as passed to the ast
+ * \param flag    LDLM_CB_BLOCKING or LDLM_CB_CANCELING
+ *
+ * \return 0 if no direct cancel was needed, otherwise the result of
+ *         ldlm_cli_cancel()
+ */
+static int osc_dlm_blocking_ast0(const struct lu_env *env,
+                                 struct ldlm_lock *dlmlock,
+                                 void *data, int flag)
+{
+        struct osc_lock *olck;
+        struct cl_lock  *lock;
+        int result;
+        int cancel;
+
+        LASSERT(flag == LDLM_CB_BLOCKING || flag == LDLM_CB_CANCELING);
+
+        cancel = 0;
+        olck = osc_ast_data_get(dlmlock);
+        if (olck != NULL) {
+                lock = olck->ols_cl.cls_lock;
+                cl_lock_mutex_get(env, lock);
+                LINVRNT(osc_lock_invariant(olck));
+                if (olck->ols_ast_wait) {
+                        /* wake up osc_lock_use() */
+                        cl_lock_signal(env, lock);
+                        olck->ols_ast_wait = 0;
+                }
+                /*
+                 * Lock might have been canceled while this thread was
+                 * sleeping for lock mutex, but olck is pinned in memory.
+                 */
+                if (olck == dlmlock->l_ast_data) {
+                        /*
+                         * NOTE: DLM sends blocking AST's for failed locks
+                         *       (that are still in pre-OLS_GRANTED state)
+                         *       too, and they have to be canceled otherwise
+                         *       DLM lock is never destroyed and stuck in
+                         *       the memory.
+                         *
+                         *       Alternatively, ldlm_cli_cancel() can be
+                         *       called here directly for osc_locks with
+                         *       ols_state < OLS_GRANTED to maintain an
+                         *       invariant that ->clo_cancel() is only called
+                         *       for locks that were granted.
+                         */
+                        LASSERT(data == olck);
+                        osc_lock_blocking(env, dlmlock,
+                                          olck, flag == LDLM_CB_BLOCKING);
+                } else
+                        cancel = 1;
+                cl_lock_mutex_put(env, lock);
+                osc_ast_data_put(env, olck);
+        } else
+                /*
+                 * DLM lock exists, but there is no cl_lock attached to it.
+                 * This is a `normal' race. cl_object and its cl_lock's can be
+                 * removed by memory pressure, together with all pages.
+                 */
+                cancel = (flag == LDLM_CB_BLOCKING);
+
+        if (cancel) {
+                struct lustre_handle *lockh;
+
+                lockh = &osc_env_info(env)->oti_handle;
+                ldlm_lock2handle(dlmlock, lockh);
+                result = ldlm_cli_cancel(lockh);
+        } else
+                result = 0;
+        return result;
+}
+
+/**
+ * Blocking ast invoked by ldlm when dlm lock is either blocking progress of
+ * some other lock, or is canceled. This function is installed as a
+ * ldlm_lock::l_blocking_ast() for client extent locks.
+ *
+ * Control flow is tricky, because ldlm uses the same call-back
+ * (ldlm_lock::l_blocking_ast()) for both blocking and cancellation ast's.
+ *
+ * \param dlmlock lock for which ast occurred.
+ *
+ * \param new description of a conflicting lock in case of blocking ast.
+ *
+ * \param data value of dlmlock->l_ast_data
+ *
+ * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING. Used to distinguish
+ *             cancellation and blocking ast's.
+ *
+ * \return 0 on success, and also when the helper returned -ENODATA; any
+ *         other error is logged with CERROR() and returned as is.
+ *
+ * Possible use cases:
+ *
+ *     - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel
+ *       lock due to lock lru pressure, or explicit user request to purge
+ *       locks.
+ *
+ *     - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to notify
+ *       us that dlmlock conflicts with another lock that some client is
+ *       enqueuing. Lock is canceled.
+ *
+ *           - cl_lock_cancel() is called. osc_lock_cancel() calls
+ *             ldlm_cli_cancel() that calls
+ *
+ *                  dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING)
+ *
+ *             recursively entering osc_ldlm_blocking_ast().
+ *
+ *     - client cancels lock voluntarily (e.g., as a part of early
+ *       cancellation):
+ *
+ *           cl_lock_cancel()->
+ *             osc_lock_cancel()->
+ *               ldlm_cli_cancel()->
+ *                 dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING)
+ *
+ */
+static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock,
+                                 struct ldlm_lock_desc *new, void *data,
+                                 int flag)
+{
+        struct lu_env     *env;
+        struct cl_env_nest nest;
+        int                result;
+
+        /*
+         * This can be called in the context of outer IO, e.g.,
+         *
+         *     cl_enqueue()->...
+         *       ->osc_enqueue_base()->...
+         *         ->ldlm_prep_elc_req()->...
+         *           ->ldlm_cancel_callback()->...
+         *             ->osc_ldlm_blocking_ast()
+         *
+         * new environment has to be created to not corrupt outer context.
+         */
+        env = cl_env_nested_get(&nest);
+        if (!IS_ERR(env))
+                result = osc_dlm_blocking_ast0(env, dlmlock, data, flag);
+        else {
+                result = PTR_ERR(env);
+                /*
+                 * XXX This should never happen, as cl_lock is
+                 * stuck. Pre-allocated environment a la vvp_inode_fini_env
+                 * should be used.
+                 */
+                LBUG();
+        }
+        if (result != 0) {
+                if (result == -ENODATA)
+                        result = 0;
+                else
+                        CERROR("BAST failed: %d\n", result);
+        }
+        cl_env_nested_put(&nest, env);
+        return result;
+}
+
+/**
+ * Completion ast for client extent locks: runs the dlm part
+ * (ldlm_completion_ast_async()) first and then notifies the cl_lock attached
+ * to \a dlmlock, if any.
+ *
+ * The lvb is copied from the dlm lock into the osc_lock. If the osc_lock is
+ * already bound to the dlm lock and the latter has a granted mode, the
+ * osc_lock is moved to the granted state; a dlm error is passed on to
+ * cl_lock_error().
+ *
+ * \return the dlm result if it is non-zero; otherwise 0,
+ *         -ELDLM_NO_LOCK_DATA when no osc_lock is attached, or the error
+ *         from cl_env_get()
+ */
+static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock,
+                                   int flags, void *data)
+{
+        struct lu_env   *env;
+        struct osc_lock *olck;
+        struct cl_lock  *cll;
+        void            *reenter_cookie;
+        int              refcheck;
+        int              clrc;
+        int              dlmrc;
+
+        /* dlm part of the work goes first ... */
+        dlmrc = ldlm_completion_ast_async(dlmlock, flags, data);
+
+        /* ... then cl_lock is notified */
+        reenter_cookie = cl_env_reenter();
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env)) {
+                clrc = PTR_ERR(env);
+        } else {
+                olck = osc_ast_data_get(dlmlock);
+                if (olck == NULL) {
+                        clrc = -ELDLM_NO_LOCK_DATA;
+                } else {
+                        cll = olck->ols_cl.cls_lock;
+                        cl_lock_mutex_get(env, cll);
+                        /*
+                         * ldlm_handle_cp_callback() copied LVB from request
+                         * to lock->l_lvb_data, store it in osc_lock.
+                         */
+                        LASSERT(dlmlock->l_lvb_data != NULL);
+                        olck->ols_lvb = *(struct ost_lvb *)dlmlock->l_lvb_data;
+                        /*
+                         * While ->ols_lock is NULL the upcall
+                         * (osc_lock_upcall()) has not been called yet;
+                         * nothing is done then, the upcall will bind olck to
+                         * dlmlock and signal the waiters.
+                         *
+                         * This maintains an invariant that osc_lock and
+                         * ldlm_lock are always bound when osc_lock is in
+                         * OLS_GRANTED state.
+                         */
+                        if (olck->ols_lock != NULL &&
+                            dlmlock->l_granted_mode != LCK_MINMODE)
+                                osc_lock_granted(env, olck, dlmlock, dlmrc);
+                        if (dlmrc != 0)
+                                cl_lock_error(env, cll, dlmrc);
+                        cl_lock_mutex_put(env, cll);
+                        osc_ast_data_put(env, olck);
+                        clrc = 0;
+                }
+                cl_env_put(env, &refcheck);
+        }
+        cl_env_reexit(reenter_cookie);
+
+        return dlmrc != 0 ? dlmrc : clrc;
+}
+
+/**
+ * Glimpse ast: fills the lvb of the reply to an LDLM_GL_CALLBACK request
+ * with the result of cl_object_glimpse() on the object \a dlmlock protects.
+ *
+ * \param dlmlock lock the glimpse is for
+ * \param data    the incoming ptlrpc_request
+ *
+ * \return 0 on success; -ELDLM_NO_LOCK_DATA if no osc_lock is attached to
+ *         \a dlmlock (an empty reply is packed in that case); otherwise the
+ *         error from cl_env_get(), req_capsule_server_pack() or
+ *         cl_object_glimpse(). The same value is stored in req->rq_status.
+ */
+static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data)
+{
+        struct ptlrpc_request  *req  = data;
+        struct req_capsule     *pill;
+        struct osc_lock        *olck;
+        struct ost_lvb         *lvb;
+        struct lu_env          *env;
+        int                     refcheck;
+        int                     rc;
+
+        LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK);
+
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env)) {
+                rc = PTR_ERR(env);
+        } else {
+                /*
+                 * osc_ast_data_get() has to go after the environment is
+                 * allocated: it acquires a reference to a lock, and that
+                 * reference can only be released in an environment.
+                 */
+                olck = osc_ast_data_get(dlmlock);
+                if (olck == NULL) {
+                        /*
+                         * These errors are normal races, so we don't want to
+                         * fill the console with messages by calling
+                         * ptlrpc_error()
+                         */
+                        lustre_pack_reply(req, 1, NULL, NULL);
+                        rc = -ELDLM_NO_LOCK_DATA;
+                } else {
+                        pill = &req->rq_pill;
+                        req_capsule_extend(pill, &RQF_LDLM_GL_CALLBACK);
+                        req_capsule_set_size(pill, &RMF_DLM_LVB, RCL_SERVER,
+                                             sizeof *lvb);
+                        rc = req_capsule_server_pack(pill);
+                        if (rc == 0) {
+                                struct cl_lock *cll = olck->ols_cl.cls_lock;
+
+                                lvb = req_capsule_server_get(pill,
+                                                             &RMF_DLM_LVB);
+                                rc = cl_object_glimpse(env,
+                                                       cll->cll_descr.cld_obj,
+                                                       lvb);
+                        }
+                        osc_ast_data_put(env, olck);
+                }
+                cl_env_put(env, &refcheck);
+        }
+        req->rq_status = rc;
+        return rc;
+}
+
+static unsigned long osc_lock_weigh(const struct lu_env *env,
+                                    const struct cl_lock_slice *slice)
+{
+        /*
+         * Weight of the lock is the page count (coh_pages) kept in the
+         * header of the object the slice belongs to. There is no need to
+         * grab coh_page_guard, since we don't care about the exact number
+         * of pages.
+         */
+        return cl_object_header(slice->cls_obj)->coh_pages;
+}
+
+/**
+ * Weigh-ast of \a dlmlock: computes the weight used for early cancellation.
+ *
+ * Returns 0 when no environment can be obtained or when no osc lock is
+ * attached to \a dlmlock; otherwise returns cl_lock_weigh() of the attached
+ * cl_lock, evaluated under that lock's mutex.
+ *
+ * XXX: it should return the pages covered by this \a dlmlock.
+ */
+static unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock)
+{
+        struct lu_env   *env;
+        struct osc_lock *olck;
+        struct cl_lock  *clk;
+        void            *cookie;
+        unsigned long    weight;
+        int              refcheck;
+        ENTRY;
+
+        might_sleep();
+        /*
+         * This callback runs in a complicated context: it may be triggered
+         * by lock cancellation or by user input. A separate environment is
+         * therefore set up for it. Re-using the caller's environment would
+         * probably work, as cl_lock_put() does not modify environment
+         * variables, but we stay on the safe side.
+         */
+        cookie = cl_env_reenter();
+        env = cl_env_get(&refcheck);
+        if (IS_ERR(env)) {
+                /*
+                 * Most likely out of memory: report the lowest weight so
+                 * that this lock tends to be eliminated.
+                 */
+                cl_env_reexit(cookie);
+                RETURN(0);
+        }
+
+        LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT);
+        olck = osc_ast_data_get(dlmlock);
+        if (olck == NULL) {
+                /*
+                 * The cl_lock is already gone (e.g., destroyed under memory
+                 * pressure); such a lock deserves a low cost.
+                 */
+                GOTO(out, weight = 0);
+        }
+
+        clk = olck->ols_cl.cls_lock;
+        cl_lock_mutex_get(env, clk);
+        weight = cl_lock_weigh(env, clk);
+        cl_lock_mutex_put(env, clk);
+        /* drop the reference taken by osc_ast_data_get() */
+        osc_ast_data_put(env, olck);
+        EXIT;
+
+out:
+        cl_env_put(env, &refcheck);
+        cl_env_reexit(cookie);
+        return weight;
+}
+
+/**
+ * Fills \a einfo, the ldlm enqueue descriptor, from the description of
+ * \a clock. \a lock is stored as callback data (it ends up in ->l_ast_data).
+ */
+static void osc_lock_build_einfo(const struct lu_env *env,
+                                 const struct cl_lock *clock,
+                                 struct osc_lock *lock,
+                                 struct ldlm_enqueue_info *einfo)
+{
+        enum cl_lock_mode clmode = clock->cll_descr.cld_mode;
+
+        /*
+         * Glimpse (phantom) locks are enqueued in read mode for now. Later a
+         * client might prefer an LCK_PW lock when glimpsing a file that is
+         * opened for write.
+         */
+        if (clmode == CLM_PHANTOM)
+                clmode = CLM_READ;
+
+        einfo->ei_cbdata = lock; /* value to be put into ->l_ast_data */
+        einfo->ei_cb_wg  = osc_ldlm_weigh_ast;
+        einfo->ei_cb_gl  = osc_ldlm_glimpse_ast;
+        einfo->ei_cb_cp  = osc_ldlm_completion_ast;
+        einfo->ei_cb_bl  = osc_ldlm_blocking_ast;
+        einfo->ei_mode   = osc_cl_lock2ldlm(clmode);
+        einfo->ei_type   = LDLM_EXTENT;
+}
+
+/**
+ * Cancels \a conflict lock and waits until it reached CLS_FREEING state. This
+ * is called as a part of enqueuing to cancel conflicting locks early.
+ *
+ * Both \a lock and \a conflict must be mutexed by the caller (asserted
+ * below). If \a conflict is already in CLS_FREEING state nothing is done and
+ * 0 is returned.
+ *
+ * \param canwait  when non-zero, the mutex of \a lock may be dropped while
+ *                 waiting in cl_lock_state_wait() for \a conflict to change
+ *                 state; the mutex is re-acquired before returning.
+ *
+ * \retval            0: success, \a conflict was cancelled and destroyed.
+ *
+ * \retval   CLO_REPEAT: \a conflict was cancelled, but \a lock mutex was
+ *                       released in the process. Repeat enqueuing.
+ *
+ * \retval -EWOULDBLOCK: \a conflict cannot be cancelled immediately, and
+ *                       either \a lock is non-blocking, or current thread
+ *                       holds other locks, that prevent it from waiting
+ *                       for cancel to complete.
+ *
+ * \retval          -ve: other error, including -EINTR.
+ */
+static int osc_lock_cancel_wait(const struct lu_env *env, struct cl_lock *lock,
+                                struct cl_lock *conflict, int canwait)
+{
+        int rc;
+
+        LASSERT(cl_lock_is_mutexed(lock));
+        LASSERT(cl_lock_is_mutexed(conflict));
+
+        rc = 0;
+        if (conflict->cll_state != CLS_FREEING) {
+                cl_lock_cancel(env, conflict);
+                cl_lock_delete(env, conflict);
+                /*
+                 * If either flag is still set after cancel + delete, the
+                 * conflicting lock has not been destroyed yet.
+                 */
+                if (conflict->cll_flags & (CLF_CANCELPEND|CLF_DOOMED)) {
+                        rc = -EWOULDBLOCK;
+                        if (cl_lock_nr_mutexed(env) > 2)
+                                /*
+                                 * If mutices of locks other than @lock and
+                                 * @conflict are held by the current thread,
+                                 * it cannot wait on @conflict state change in
+                                 * a dead-lock safe manner, so simply skip
+                                 * early cancellation in this case.
+                                 *
+                                 * This means that early cancellation doesn't
+                                 * work when there is even slight mutex
+                                 * contention, as top-lock's mutex is usually
+                                 * held at this time.
+                                 */
+                                ;
+                        else if (canwait) {
+                                /* Waiting for @conflict to be destroyed */
+                                cl_lock_mutex_put(env, lock);
+                                do {
+                                        rc = cl_lock_state_wait(env, conflict);
+                                } while (!rc &&
+                                         conflict->cll_state < CLS_FREEING);
+                                /* mutex was released, repeat enqueue. */
+                                rc = rc ?: CLO_REPEAT;
+                                cl_lock_mutex_get(env, lock);
+                        }
+                }
+                /* rc == 0 is only allowed if @conflict really is freeing */
+                LASSERT(ergo(!rc, conflict->cll_state == CLS_FREEING));
+                CDEBUG(D_INFO, "lock %p was %s freed now, rc (%d)\n",
+                       conflict, rc ? "not":"", rc);
+        }
+        return rc;
+}
+
+/**
+ * Cancel all conflicting locks and wait for them to be destroyed.
+ *
+ * This function is used for two purposes:
+ *
+ *     - early cancel all conflicting locks before starting IO, and
+ *
+ *     - guarantee that pages added to the page cache by lockless IO are never
+ *       covered by locks other than lockless IO lock, and, hence, are not
+ *       visible to other threads.
+ *
+ * The lock of \a olck must be mutexed and in CLS_QUEUING state (asserted
+ * below).
+ *
+ * \retval 0           the scan finished, or early cancellation was skipped
+ *                     (-EWOULDBLOCK from osc_lock_cancel_wait() and
+ *                     CLO_REPEAT in the !canwait case are both mapped to 0).
+ *
+ * \retval otherwise   the first non-zero result of cl_lock_closure_build()
+ *                     or osc_lock_cancel_wait(), which stops the scan.
+ */
+static int osc_lock_enqueue_wait(const struct lu_env *env,
+                                 const struct osc_lock *olck)
+{
+        struct cl_lock          *lock    = olck->ols_cl.cls_lock;
+        struct cl_lock_descr    *descr   = &lock->cll_descr;
+        struct cl_object_header *hdr     = cl_object_header(descr->cld_obj);
+        struct cl_lock_closure  *closure = &osc_env_info(env)->oti_closure;
+        struct cl_lock          *scan;
+        struct cl_lock          *temp;
+        int lockless                     = osc_lock_is_lockless(olck);
+        int rc                           = 0;
+        int canwait;
+        int stop;
+        ENTRY;
+
+        LASSERT(cl_lock_is_mutexed(lock));
+        LASSERT(lock->cll_state == CLS_QUEUING);
+
+        /*
+         * XXX This function could be sped up if we had asynchronous
+         * cancellation.
+         */
+
+        /*
+         * Waiting is only allowed for a blocking enqueue, and only when the
+         * mutex of @lock is the sole cl_lock mutex held by this thread.
+         */
+        canwait =
+                !(olck->ols_flags & LDLM_FL_BLOCK_NOWAIT) &&
+                cl_lock_nr_mutexed(env) == 1;
+        cl_lock_closure_init(env, closure, lock, canwait);
+        spin_lock(&hdr->coh_lock_guard);
+        list_for_each_entry_safe(scan, temp, &hdr->coh_locks, cll_linkage) {
+                if (scan == lock)
+                        continue;
+
+                /*
+                 * Skip locks that are not yet queued, are already being
+                 * freed, or do not overlap the extent of @lock.
+                 */
+                if (scan->cll_state < CLS_QUEUING ||
+                    scan->cll_state == CLS_FREEING ||
+                    scan->cll_descr.cld_start > descr->cld_end ||
+                    scan->cll_descr.cld_end < descr->cld_start)
+                        continue;
+
+                /* overlapped and living locks. */
+                /* A tricky case for lockless pages:
+                 * We need to cancel the compatible locks if we're enqueuing
+                 * a lockless lock, for example:
+                 * imagine that client has PR lock on [0, 1000], and thread T0
+                 * is doing lockless IO in [500, 1500] region. Concurrent
+                 * thread T1 can see lockless data in [500, 1000], which is
+                 * wrong, because these data are possibly stale.
+                 */
+                if (!lockless && cl_lock_compatible(scan, lock))
+                        continue;
+
+                /* Now @scan is conflicting with @lock, this means current
+                 * thread has to sleep until @scan is destroyed. */
+                /*
+                 * Pin @scan, and the next entry @temp unless it is the list
+                 * head, before the spin-lock is dropped.
+                 */
+                cl_lock_get_trust(scan);
+                if (&temp->cll_linkage != &hdr->coh_locks)
+                        cl_lock_get_trust(temp);
+                spin_unlock(&hdr->coh_lock_guard);
+                lu_ref_add(&scan->cll_reference, "cancel-wait", lock);
+
+                LASSERT(list_empty(&closure->clc_list));
+                rc = cl_lock_closure_build(env, scan, closure);
+                if (rc == 0) {
+                        rc = osc_lock_cancel_wait(env, lock, scan, canwait);
+                        cl_lock_disclosure(env, closure);
+                        /* failing to cancel early is not an error */
+                        if (rc == -EWOULDBLOCK)
+                                rc = 0;
+                }
+                if (rc == CLO_REPEAT && !canwait)
+                        /* cannot wait... no early cancellation. */
+                        rc = 0;
+
+                lu_ref_del(&scan->cll_reference, "cancel-wait", lock);
+                cl_lock_put(env, scan);
+                spin_lock(&hdr->coh_lock_guard);
+                /*
+                 * Lock list could have been modified, while spin-lock was
+                 * released. Check that it is safe to continue.
+                 */
+                stop = list_empty(&temp->cll_linkage);
+                if (&temp->cll_linkage != &hdr->coh_locks)
+                        cl_lock_put(env, temp);
+                if (stop || rc != 0)
+                        break;
+        }
+        spin_unlock(&hdr->coh_lock_guard);
+        cl_lock_closure_fini(closure);
+        RETURN(rc);
+}
+
+/**
+ * Deadlock avoidance for osc_lock_enqueue(). The scenario guarded against:
+ *
+ *     - Thread0: obtains PR:[0, 10]. Lock is busy.
+ *
+ *     - Thread1: enqueues PW:[5, 50]. A blocking ast is sent to
+ *       PR:[0, 10], but cancellation of the busy lock is postponed.
+ *
+ *     - Thread0: enqueues PR:[30, 40]. The lock is locally matched to
+ *       PW:[5, 50], and thread0 waits for lock completion without ever
+ *       releasing PR:[0, 10]---deadlock.
+ *
+ * The second PR lock can be a glimpse (ll_glimpse_size() has a second
+ * argument precisely for this situation: it prevents local match of
+ * not-yet-granted locks, see bug 10295). A similar situation is possible
+ * with a memory mapped user level buffer.
+ *
+ * To prevent this, detect that the current "thread" or "io" already holds a
+ * lock on this object, and then either add LDLM_FL_BLOCK_GRANTED to
+ * ols->ols_flags, or prevent local match with PW locks.
+ *
+ * \retval 1 some other lock on the same object is owned by the current io
+ * \retval 0 otherwise
+ */
+static int osc_deadlock_is_possible(const struct lu_env *env,
+                                    struct cl_lock *lock)
+{
+        struct cl_object_header *head;
+        struct cl_lock          *scan;
+        struct osc_io           *oio;
+        int                      result = 0;
+
+        ENTRY;
+
+        LASSERT(cl_lock_is_mutexed(lock));
+
+        oio  = osc_env_io(env);
+        head = cl_object_header(lock->cll_descr.cld_obj);
+
+        spin_lock(&head->coh_lock_guard);
+        list_for_each_entry(scan, &head->coh_locks, cll_linkage) {
+                struct osc_lock *oscan;
+
+                /* the lock being enqueued does not conflict with itself */
+                if (scan == lock)
+                        continue;
+
+                oscan = osc_lock_at(scan);
+                LASSERT(oscan != NULL);
+                if (oscan->ols_owner == oio) {
+                        result = 1;
+                        break;
+                }
+        }
+        spin_unlock(&head->coh_lock_guard);
+        RETURN(result);
+}
+
+/**
+ * Implementation of cl_lock_operations::clo_enqueue() method for osc
+ * layer. This initiates ldlm enqueue:
+ *
+ *     - checks for possible dead-lock conditions (osc_deadlock_is_possible());
+ *
+ *     - cancels conflicting locks early (osc_lock_enqueue_wait());
+ *
+ *     - calls osc_enqueue_base() to do actual enqueue.
+ *
+ * osc_enqueue_base() is supplied with an upcall function that is executed
+ * when lock is received either after a local cached ldlm lock is matched, or
+ * when a reply from the server is received.
+ *
+ * This function does not wait for the network communication to complete.
+ *
+ * The io argument (named `_') is not used by this method. \a enqflags are
+ * converted to ldlm flags by osc_enq2ldlm_flags().
+ *
+ * \retval non-zero result of osc_lock_enqueue_wait(), if any; otherwise the
+ *         result of osc_enqueue_base().
+ */
+static int osc_lock_enqueue(const struct lu_env *env,
+                            const struct cl_lock_slice *slice,
+                            struct cl_io *_, __u32 enqflags)
+{
+        struct osc_lock          *ols     = cl2osc_lock(slice);
+        struct cl_lock           *lock    = ols->ols_cl.cls_lock;
+        struct osc_object        *obj     = cl2osc(slice->cls_obj);
+        struct osc_thread_info   *info    = osc_env_info(env);
+        struct ldlm_res_id       *resname = &info->oti_resname;
+        ldlm_policy_data_t       *policy  = &info->oti_policy;
+        struct ldlm_enqueue_info *einfo   = &ols->ols_einfo;
+        int result;
+        ENTRY;
+
+        LASSERT(cl_lock_is_mutexed(lock));
+        LASSERT(lock->cll_state == CLS_QUEUING);
+        LASSERT(ols->ols_state == OLS_NEW);
+
+        /* resource name and policy are built in per-thread scratch space */
+        osc_lock_build_res(env, obj, resname);
+        osc_lock_build_policy(env, lock, policy);
+        ols->ols_flags = osc_enq2ldlm_flags(enqflags);
+        if (ols->ols_locklessable)
+                ols->ols_flags |= LDLM_FL_DENY_ON_CONTENTION;
+        if (osc_deadlock_is_possible(env, lock))
+                ols->ols_flags |= LDLM_FL_BLOCK_GRANTED;
+        /* an enqueue carrying an intent is treated as a glimpse */
+        if (ols->ols_flags & LDLM_FL_HAS_INTENT)
+                ols->ols_glimpse = 1;
+
+        result = osc_lock_enqueue_wait(env, ols);
+        if (result == 0) {
+                /* a reference for lock, passed as an upcall cookie */
+                cl_lock_get(lock);
+                lu_ref_add(&lock->cll_reference, "upcall", lock);
+                ols->ols_state = OLS_ENQUEUED;
+
+                /*
+                 * XXX: this is possible blocking point as
+                 * ldlm_lock_match(LDLM_FL_LVB_READY) waits for
+                 * LDLM_CP_CALLBACK.
+                 */
+                result = osc_enqueue_base(osc_export(obj), resname,
+                                          &ols->ols_flags, policy,
+                                          &ols->ols_lvb,
+                                          obj->oo_oinfo->loi_kms_valid,
+                                          osc_lock_upcall,
+                                          ols, einfo, &ols->ols_handle,
+                                          PTLRPCD_SET, 1);
+                if (result != 0) {
+                        /*
+                         * Enqueue failed: drop the reference taken above
+                         * for the upcall.
+                         */
+                        lu_ref_del(&lock->cll_reference, "upcall", lock);
+                        cl_lock_put(env, lock);
+                }
+        }
+
+        RETURN(result);
+}
+
+/**
+ * clo_wait() method of the osc layer.
+ *
+ * \retval 0         a glimpse lock whose upcall was already received, or a
+ *                   lock that reached at least OLS_GRANTED without error
+ * \retval CLO_WAIT  no error, but the lock is not granted yet
+ * \retval other     lock->cll_error, when it is set
+ */
+static int osc_lock_wait(const struct lu_env *env,
+                         const struct cl_lock_slice *slice)
+{
+        struct osc_lock *olck = cl2osc_lock(slice);
+        struct cl_lock  *lock = olck->ols_cl.cls_lock;
+        int              error;
+
+        LINVRNT(osc_lock_invariant(olck));
+        if (olck->ols_glimpse && olck->ols_state >= OLS_UPCALL_RECEIVED)
+                return 0;
+
+        LASSERT(equi(olck->ols_state >= OLS_UPCALL_RECEIVED &&
+                     lock->cll_error == 0, olck->ols_lock != NULL));
+
+        /* an error recorded in the lock takes precedence over its state */
+        error = lock->cll_error;
+        if (error != 0)
+                return error;
+        if (olck->ols_state >= OLS_GRANTED)
+                return 0;
+        return CLO_WAIT;
+}
+
+/**
+ * An implementation of cl_lock_operations::clo_use() method that pins a
+ * cached lock.
+ *
+ * \retval 0         a reference on the ldlm lock was acquired
+ * \retval CLO_WAIT  ldlm_lock_addref_try() failed; ols_ast_wait is set so
+ *                   that the blocking ast signals the lock
+ */
+static int osc_lock_use(const struct lu_env *env,
+                        const struct cl_lock_slice *slice)
+{
+        struct osc_lock *olck = cl2osc_lock(slice);
+        struct cl_lock  *lock;
+
+        LASSERT(!olck->ols_hold);
+        /*
+         * ldlm_lock_addref_try() atomically checks LDLM_FL_CBPENDING and
+         * takes a reference only when the flag is clear, which guards
+         * against a concurrent blocking ast.
+         */
+        if (ldlm_lock_addref_try(&olck->ols_handle,
+                                 olck->ols_einfo.ei_mode) == 0) {
+                olck->ols_has_ref = 1;
+                olck->ols_hold = 1;
+                olck->ols_state = OLS_GRANTED;
+                return 0;
+        }
+
+        /*
+         * The lock is in the middle of cancellation inside
+         * ldlm_handle_bl_callback(): LDLM_FL_CBPENDING is set already, but
+         * osc_ldlm_blocking_ast() has not acquired the cl_lock mutex yet.
+         */
+        lock = slice->cls_lock;
+        LASSERT(lock->cll_state == CLS_CACHED);
+        LASSERT(lock->cll_users > 0);
+        LASSERT(olck->ols_lock->l_flags & LDLM_FL_CBPENDING);
+        /* ask osc_dlm_blocking_ast0() to signal the lock. */
+        olck->ols_ast_wait = 1;
+        return CLO_WAIT;
+}
+
+/**
+ * Calls cl_lock_page_out() on the cl_lock of \a ols in a nested environment,
+ * passing \a discard through.
+ *
+ * On success ols->ols_flush is set.
+ *
+ * \retval 0    success
+ * \retval -ve  environment could not be obtained, or cl_lock_page_out()
+ *              failed
+ */
+static int osc_lock_flush(struct osc_lock *ols, int discard)
+{
+        struct cl_env_nest  nest;
+        struct lu_env      *env;
+        int                 result;
+
+        env = cl_env_nested_get(&nest);
+        if (IS_ERR(env))
+                return PTR_ERR(env);
+
+        result = cl_lock_page_out(env, ols->ols_cl.cls_lock, discard);
+        cl_env_nested_put(&nest, env);
+        if (result == 0)
+                ols->ols_flush = 1;
+        return result;
+}
+
+/**
+ * Implements cl_lock_operations::clo_cancel() method for osc layer. This is
+ * called (as part of cl_lock_cancel()) when lock is canceled either
+ * voluntarily (LRU pressure, early cancellation, umount, etc.) or due to a
+ * conflict with some other lock somewhere in the cluster. This function does
+ * the following:
+ *
+ *     - invalidates all pages protected by this lock (after sending dirty
+ *       ones to the server, as necessary);
+ *
+ *     - decref's underlying ldlm lock;
+ *
+ *     - cancels ldlm lock (ldlm_cli_cancel()).
+ *
+ * The method cannot return an error, so failures of the flush and of the
+ * ldlm cancel are both reported through CL_LOCK_DEBUG(D_ERROR, ...) and
+ * cancellation proceeds regardless. The lock always ends up in
+ * OLS_CANCELLED state and detached.
+ */
+static void osc_lock_cancel(const struct lu_env *env,
+                            const struct cl_lock_slice *slice)
+{
+        struct cl_lock   *lock    = slice->cls_lock;
+        struct osc_lock  *olck    = cl2osc_lock(slice);
+        struct ldlm_lock *dlmlock = olck->ols_lock;
+        int               result;
+        int               discard;
+
+        LASSERT(cl_lock_is_mutexed(lock));
+        LINVRNT(osc_lock_invariant(olck));
+
+        if (dlmlock != NULL) {
+                discard = dlmlock->l_flags & LDLM_FL_DISCARD_DATA;
+                result = osc_lock_flush(olck, discard);
+                /*
+                 * Do not lose a flush failure silently: result is
+                 * overwritten by ldlm_cli_cancel() below.
+                 */
+                if (result != 0)
+                        CL_LOCK_DEBUG(D_ERROR, env, lock,
+                                      "lock %p flush failure with error(%d)\n",
+                                      lock, result);
+                if (olck->ols_hold)
+                        osc_lock_unuse(env, slice);
+                LASSERT(dlmlock->l_readers == 0 && dlmlock->l_writers == 0);
+                result = ldlm_cli_cancel(&olck->ols_handle);
+                if (result < 0)
+                        CL_LOCK_DEBUG(D_ERROR, env, lock,
+                                      "lock %p cancel failure with error(%d)\n",
+                                      lock, result);
+        }
+        olck->ols_state = OLS_CANCELLED;
+        osc_lock_detach(env, olck);
+}
+
+/*
+ * Prototype only: the definition is not in the visible part of this file.
+ * osc_lock_has_pages() below calls it on the list of pages found under a
+ * lock.
+ */
+void cl_lock_page_list_fixup(const struct lu_env *env,
+                             struct cl_io *io, struct cl_lock *lock,
+                             struct cl_page_list *queue);
+
+#ifdef INVARIANT_CHECK
+/**
+ * Returns true iff there are pages under \a olck not protected by other
+ * locks.
+ *
+ * Debugging helper, compiled only with INVARIANT_CHECK; otherwise it is
+ * replaced by a macro evaluating to 0. Pages that are found are dumped with
+ * CL_PAGE_DEBUG(). If no environment can be obtained, 0 is returned.
+ */
+static int osc_lock_has_pages(struct osc_lock *olck)
+{
+        struct cl_lock       *lock;
+        struct cl_lock_descr *descr;
+        struct cl_object     *obj;
+        struct osc_object    *oob;
+        struct cl_page_list  *plist;
+        struct cl_page       *page;
+        struct cl_env_nest    nest;
+        struct cl_io         *io;
+        struct lu_env        *env;
+        int                   result;
+
+        env = cl_env_nested_get(&nest);
+        if (!IS_ERR(env)) {
+                obj   = olck->ols_cl.cls_obj;
+                oob   = cl2osc(obj);
+                io    = &oob->oo_debug_io;
+                lock  = olck->ols_cl.cls_lock;
+                descr = &lock->cll_descr;
+                plist = &osc_env_info(env)->oti_plist;
+                cl_page_list_init(plist);
+
+                /* the per-object debug io is used under oo_debug_mutex */
+                mutex_lock(&oob->oo_debug_mutex);
+
+                io->ci_obj = cl_object_top(obj);
+                cl_io_init(env, io, CIT_MISC, io->ci_obj);
+                /* collect pages in the lock extent into plist */
+                cl_page_gang_lookup(env, obj, io,
+                                    descr->cld_start, descr->cld_end, plist);
+                cl_lock_page_list_fixup(env, io, lock, plist);
+                if (plist->pl_nr > 0) {
+                        CL_LOCK_DEBUG(D_ERROR, env, lock, "still has pages\n");
+                        cl_page_list_for_each(page, plist)
+                                CL_PAGE_DEBUG(D_ERROR, env, page, "\n");
+                }
+                /* record the answer before the list is torn down */
+                result = plist->pl_nr > 0;
+                cl_page_list_disown(env, io, plist);
+                cl_page_list_fini(env, plist);
+                cl_io_fini(env, io);
+                mutex_unlock(&oob->oo_debug_mutex);
+                cl_env_nested_put(&nest, env);
+        } else
+                result = 0;
+        return result;
+}
+#else
+# define osc_lock_has_pages(olck) (0)
+#endif /* INVARIANT_CHECK */
+
+/**
+ * clo_delete() method of the osc layer: releases the hold on the lock, if
+ * there is one, and detaches the osc lock.
+ */
+static void osc_lock_delete(const struct lu_env *env,
+                            const struct cl_lock_slice *slice)
+{
+        struct osc_lock *olck = cl2osc_lock(slice);
+
+        LINVRNT(osc_lock_invariant(olck));
+        LINVRNT(!osc_lock_has_pages(olck));
+
+        if (olck->ols_hold)
+                osc_lock_unuse(env, slice);
+
+        osc_lock_detach(env, olck);
+}
+
+/**
+ * Implements cl_lock_operations::clo_state() method for osc layer.
+ *
+ * Keeps osc_lock::ols_owner up to date: the field is set to the io of the
+ * current environment when the lock enters CLS_HELD from another state, and
+ * reset to NULL on a transition to any state other than CLS_HELD.
+ *
+ * This relies on the lock always entering CLS_HELD in the same IO context as
+ * the one that requested the lock. That should not be a problem, because
+ * the context is by definition shared by all activity pertaining to the
+ * same high-level IO.
+ */
+static void osc_lock_state(const struct lu_env *env,
+                           const struct cl_lock_slice *slice,
+                           enum cl_lock_state state)
+{
+        struct osc_lock *lock = cl2osc_lock(slice);
+        struct osc_io   *oio  = osc_env_io(env);
+
+        /*
+         * XXX multiple io contexts can use the lock at the same time.
+         */
+        LINVRNT(osc_lock_invariant(lock));
+        if (state != CLS_HELD) {
+                lock->ols_owner = NULL;
+                return;
+        }
+
+        /* CLS_HELD -> CLS_HELD leaves the owner untouched */
+        if (slice->cls_lock->cll_state != CLS_HELD) {
+                LASSERT(lock->ols_owner == NULL);
+                lock->ols_owner = oio;
+        }
+}
+
+/**
+ * clo_print() method: prints the ldlm lock pointer, flags, handle cookie,
+ * state and owner of the osc lock through \a p, followed by its lvb.
+ * Always returns 0.
+ */
+static int osc_lock_print(const struct lu_env *env, void *cookie,
+                          lu_printer_t p, const struct cl_lock_slice *slice)
+{
+        struct osc_lock *ols = cl2osc_lock(slice);
+
+        /*
+         * XXX print ldlm lock and einfo properly.
+         */
+        p(env, cookie, "%p %08x "LPU64" %d %p ",
+          ols->ols_lock, ols->ols_flags, ols->ols_handle.cookie,
+          ols->ols_state, ols->ols_owner);
+        osc_lvb_print(env, cookie, p, &ols->ols_lvb);
+
+        return 0;
+}
+
+/** Lock operations installed by osc_lock_init() for regular (ldlm) locks. */
+static const struct cl_lock_operations osc_lock_ops = {
+        .clo_enqueue = osc_lock_enqueue,
+        .clo_wait    = osc_lock_wait,
+        .clo_use     = osc_lock_use,
+        .clo_unuse   = osc_lock_unuse,
+        .clo_cancel  = osc_lock_cancel,
+        .clo_delete  = osc_lock_delete,
+        .clo_fini    = osc_lock_fini,
+        .clo_state   = osc_lock_state,
+        .clo_weigh   = osc_lock_weigh,
+        .clo_print   = osc_lock_print,
+};
+
+/**
+ * clo_enqueue() method for lockless locks: cancels conflicting locks through
+ * osc_lock_enqueue_wait() and, on success, marks the lock OLS_GRANTED.
+ * Neither the io argument nor \a enqflags is used.
+ */
+static int osc_lock_lockless_enqueue(const struct lu_env *env,
+                                     const struct cl_lock_slice *slice,
+                                     struct cl_io *_, __u32 enqflags)
+{
+        struct osc_lock *ols  = cl2osc_lock(slice);
+        struct cl_lock  *lock = ols->ols_cl.cls_lock;
+        int              rc;
+
+        LASSERT(cl_lock_is_mutexed(lock));
+        LASSERT(lock->cll_state == CLS_QUEUING);
+        LASSERT(ols->ols_state == OLS_NEW);
+
+        rc = osc_lock_enqueue_wait(env, ols);
+        if (rc != 0)
+                return rc;
+
+        ols->ols_state = OLS_GRANTED;
+        return 0;
+}
+
+/**
+ * clo_unuse() method for lockless locks: the lock is cancelled and deleted
+ * right away. Always returns 0.
+ */
+static int osc_lock_lockless_unuse(const struct lu_env *env,
+                                   const struct cl_lock_slice *slice)
+{
+        struct osc_lock *ols = cl2osc_lock(slice);
+        struct cl_lock  *clk = slice->cls_lock;
+
+        LASSERT(ols->ols_state == OLS_GRANTED);
+        LINVRNT(osc_lock_invariant(ols));
+
+        cl_lock_cancel(env, clk);
+        cl_lock_delete(env, clk);
+
+        return 0;
+}
+
+/**
+ * clo_cancel() method for lockless locks: flushes pages under the lock
+ * without discarding them (a failure is only logged) and moves the lock to
+ * OLS_CANCELLED.
+ */
+static void osc_lock_lockless_cancel(const struct lu_env *env,
+                                     const struct cl_lock_slice *slice)
+{
+        struct osc_lock *ols = cl2osc_lock(slice);
+        int              rc;
+
+        rc = osc_lock_flush(ols, 0);
+        if (rc != 0)
+                CERROR("Pages for lockless lock %p were not purged(%d)\n",
+                       ols, rc);
+
+        ols->ols_state = OLS_CANCELLED;
+}
+
+/**
+ * clo_wait() method for lockless locks: never waits, it just reports the
+ * error recorded in the cl_lock. The osc lock has to be at least in
+ * OLS_UPCALL_RECEIVED state (asserted).
+ */
+static int osc_lock_lockless_wait(const struct lu_env *env,
+                                  const struct cl_lock_slice *slice)
+{
+        struct osc_lock *olck = cl2osc_lock(slice);
+
+        LINVRNT(osc_lock_invariant(olck));
+        LASSERT(olck->ols_state >= OLS_UPCALL_RECEIVED);
+
+        return olck->ols_cl.cls_lock->cll_error;
+}
+
+/**
+ * clo_state() method for lockless locks. On a transition to CLS_HELD the io
+ * of the current environment becomes the owner and is flagged as lockless;
+ * on any other transition the owner is cleared.
+ */
+static void osc_lock_lockless_state(const struct lu_env *env,
+                                    const struct cl_lock_slice *slice,
+                                    enum cl_lock_state state)
+{
+        struct osc_lock *lock = cl2osc_lock(slice);
+        struct osc_io   *oio  = osc_env_io(env);
+
+        LINVRNT(osc_lock_invariant(lock));
+        if (state != CLS_HELD) {
+                lock->ols_owner = NULL;
+                return;
+        }
+
+        LASSERT(lock->ols_owner == NULL);
+        lock->ols_owner = oio;
+        oio->oi_lockless = 1;
+}
+
+/**
+ * clo_fits_into() method for lockless locks: unconditionally returns 0,
+ * whatever \a need and \a io are.
+ *
+ * NOTE(review): presumably this keeps a lockless lock from being matched
+ * and re-used for another request; confirm against the caller of
+ * clo_fits_into().
+ */
+static int osc_lock_lockless_fits_into(const struct lu_env *env,
+                                       const struct cl_lock_slice *slice,
+                                       const struct cl_lock_descr *need,
+                                       const struct cl_io *io)
+{
+        return 0;
+}
+
+/** Lock operations installed by osc_lock_init() for lockless locks. */
+static const struct cl_lock_operations osc_lock_lockless_ops = {
+        .clo_enqueue   = osc_lock_lockless_enqueue,
+        .clo_wait      = osc_lock_lockless_wait,
+        .clo_unuse     = osc_lock_lockless_unuse,
+        .clo_cancel    = osc_lock_lockless_cancel,
+        .clo_fini      = osc_lock_fini,
+        .clo_state     = osc_lock_lockless_state,
+        .clo_fits_into = osc_lock_lockless_fits_into,
+        .clo_print     = osc_lock_print,
+};
+
+/**
+ * Allocates an osc_lock from osc_lock_kmem, initializes it and adds it as a
+ * slice to \a lock, choosing between the regular (osc_lock_ops) and the
+ * lockless (osc_lock_lockless_ops) operation vectors.
+ *
+ * \retval 0        success
+ * \retval -ENOMEM  the osc_lock could not be allocated
+ */
+int osc_lock_init(const struct lu_env *env,
+                  struct cl_object *obj, struct cl_lock *lock,
+                  const struct cl_io *io)
+{
+        const struct cl_lock_operations *ops = &osc_lock_ops;
+        const struct osc_device         *osd;
+        struct obd_connect_data         *ocd;
+        struct osc_lock                 *clk;
+        struct osc_io                   *oio = osc_env_io(env);
+        struct osc_object               *oob = cl2osc(obj);
+        int                              lockless;
+
+        OBD_SLAB_ALLOC_PTR(clk, osc_lock_kmem);
+        if (clk == NULL)
+                return -ENOMEM;
+
+        osd = lu2osc_dev(obj->co_lu.lo_dev);
+        osc_lock_build_einfo(env, lock, clk, &clk->ols_einfo);
+        clk->ols_state = OLS_NEW;
+
+        /*
+         * Decide whether this lock is served by lockless IO. All of the
+         * following have to hold:
+         * - the current IO must be locklessable;
+         * - the stripe is in contention;
+         * - requested lock is not a glimpse.
+         *
+         * Otherwise the locklessable flag is inherited by the osc_lock, and
+         * the decision is left to the ost.
+         *
+         * Additional policy can be implemented here, e.g., never do
+         * lockless-io for large extents.
+         */
+        LASSERT(io->ci_lockreq == CILR_MANDATORY ||
+                io->ci_lockreq == CILR_MAYBE ||
+                io->ci_lockreq == CILR_NEVER);
+        ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data;
+        clk->ols_locklessable = (io->ci_type != CIT_TRUNC) &&
+                        (io->ci_lockreq == CILR_MAYBE) &&
+                        (ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK);
+
+        if (io->ci_lockreq == CILR_NEVER)
+                lockless = 1;
+        else if (clk->ols_locklessable && osc_object_is_contended(oob))
+                /* lockless IO */
+                lockless = 1;
+        else
+                /* lockless truncate */
+                lockless = io->ci_type == CIT_TRUNC &&
+                           (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) &&
+                           osd->od_lockless_truncate;
+
+        if (lockless) {
+                ops = &osc_lock_lockless_ops;
+                oio->oi_lockless      = 1;
+                clk->ols_locklessable = 1;
+        }
+
+        cl_lock_slice_add(lock, &clk->ols_cl, obj, ops);
+        return 0;
+}
+
+
+/** @} osc */
diff --git a/lustre/osc/osc_object.c b/lustre/osc/osc_object.c

new file mode 100644 (file)

index 0000000..1f099b8
--- /dev/null
+++ b/lustre/osc/osc_object.c
@@ -0,0 +1,243 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_object for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+/** \addtogroup osc osc @{ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+/*****************************************************************************
+ *
+ * Type conversions.
+ *
+ */
+
+/**
+ * Returns the lu_object embedded in \a osc, i.e. the co_lu member of its
+ * oo_cl cl_object. Inverse of lu2osc().
+ */
+static struct lu_object *osc2lu(struct osc_object *osc)
+{
+        return &osc->oo_cl.co_lu;
+}
+
+/**
+ * Converts an lu_object back to the osc_object that contains it (through
+ * oo_cl.co_lu). The invariant check verifies osc_is_object(obj) first.
+ */
+static struct osc_object *lu2osc(const struct lu_object *obj)
+{
+        LINVRNT(osc_is_object(obj));
+        return container_of0(obj, struct osc_object, oo_cl.co_lu);
+}
+
+/*****************************************************************************
+ *
+ * Object operations.
+ *
+ */
+
+/**
+ * lu_object_operations::loo_object_init() method for the osc layer.
+ *
+ * Takes the lov_oinfo pointer from the cl_object_conf (u.coc_oinfo), and
+ * initialises the oo_seatbelt spin-lock and one oo_inflight list head per
+ * request type (CRT_NR of them). With INVARIANT_CHECK the debugging mutex is
+ * initialised too.
+ *
+ * \retval 0 always
+ */
+static int osc_object_init(const struct lu_env *env, struct lu_object *obj,
+                           const struct lu_object_conf *conf)
+{
+        struct osc_object           *osc   = lu2osc(obj);
+        const struct cl_object_conf *cconf = lu2cl_conf(conf);
+        int i;
+
+        osc->oo_oinfo = cconf->u.coc_oinfo;
+#ifdef INVARIANT_CHECK
+        mutex_init(&osc->oo_debug_mutex);
+#endif
+        spin_lock_init(&osc->oo_seatbelt);
+        for (i = 0; i < CRT_NR; ++i)
+                CFS_INIT_LIST_HEAD(&osc->oo_inflight[i]);
+        return 0;
+}
+
+/**
+ * lu_object_operations::loo_object_free() method for the osc layer.
+ *
+ * Asserts that no page is left on any of the per-request-type in-flight
+ * lists, finalises the lu_object and returns the osc_object to the
+ * osc_object_kmem slab.
+ */
+static void osc_object_free(const struct lu_env *env, struct lu_object *obj)
+{
+        struct osc_object *osc = lu2osc(obj);
+        int i;
+
+        for (i = 0; i < CRT_NR; ++i)
+                LASSERT(list_empty(&osc->oo_inflight[i]));
+
+        lu_object_fini(obj);
+        OBD_SLAB_FREE_PTR(osc, osc_object_kmem);
+}
+
+/**
+ * Prints the size, mtime, atime, ctime and blocks fields of \a lvb through
+ * the printer \a p.
+ *
+ * \return whatever the printer call returns
+ */
+int osc_lvb_print(const struct lu_env *env, void *cookie,
+                  lu_printer_t p, const struct ost_lvb *lvb)
+{
+        return (*p)(env, cookie, "size: "LPU64" mtime: "LPU64" atime: "LPU64" "
+                    "ctime: "LPU64" blocks: "LPU64,
+                    lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime,
+                    lvb->lvb_ctime, lvb->lvb_blocks);
+}
+
+/**
+ * lu_object_operations::loo_object_print() method for the osc layer.
+ *
+ * Prints the identification and state kept in the object's lov_oinfo (id,
+ * group, OST index and generation, kms and its validity flag) together with
+ * the async error state (loi_ar), followed by the cached lvb via
+ * osc_lvb_print().
+ *
+ * \retval 0 always; the values returned by the printer calls are discarded
+ */
+static int osc_object_print(const struct lu_env *env, void *cookie,
+                            lu_printer_t p, const struct lu_object *obj)
+{
+        struct osc_object   *osc   = lu2osc(obj);
+        struct lov_oinfo    *oinfo = osc->oo_oinfo;
+        struct osc_async_rc *ar    = &oinfo->loi_ar;
+
+        (*p)(env, cookie, "id: "LPU64" gr: "LPU64" "
+             "idx: %d gen: %d kms_valid: %u kms "LPU64" "
+             "rc: %d force_sync: %d min_xid: "LPU64" ",
+             oinfo->loi_id, oinfo->loi_gr, oinfo->loi_ost_idx,
+             oinfo->loi_ost_gen, oinfo->loi_kms_valid, oinfo->loi_kms,
+             ar->ar_rc, ar->ar_force_sync, ar->ar_min_xid);
+        osc_lvb_print(env, cookie, p, &oinfo->loi_lvb);
+        return 0;
+}
+
+
+/**
+ * cl_object_operations::coo_attr_get() method for the osc layer.
+ *
+ * Fills \a attr from the lvb cached in the object's lov_oinfo. The known
+ * minimum size is reported only while it is marked valid; otherwise 0 is
+ * reported.
+ *
+ * \retval 0 always
+ */
+static int osc_attr_get(const struct lu_env *env, struct cl_object *obj,
+                        struct cl_attr *attr)
+{
+        struct osc_object *osc = cl2osc(obj);
+        struct lov_oinfo  *loi = osc->oo_oinfo;
+
+        cl_lvb2attr(attr, &loi->loi_lvb);
+        if (loi->loi_kms_valid)
+                attr->cat_kms = loi->loi_kms;
+        else
+                attr->cat_kms = 0;
+        return 0;
+}
+
+/**
+ * cl_object_operations::coo_attr_set() method for the osc layer.
+ *
+ * Copies into the lvb cached in the object's lov_oinfo those fields of
+ * \a attr that are selected by the CAT_* bits of \a valid. The known minimum
+ * size is not stored directly but updated through loi_kms_set().
+ *
+ * \retval 0 always
+ */
+int osc_attr_set(const struct lu_env *env, struct cl_object *obj,
+                 const struct cl_attr *attr, unsigned valid)
+{
+        struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+        struct ost_lvb   *lvb   = &oinfo->loi_lvb;
+
+        if (valid & CAT_SIZE)
+                lvb->lvb_size = attr->cat_size;
+        if (valid & CAT_MTIME)
+                lvb->lvb_mtime = attr->cat_mtime;
+        if (valid & CAT_ATIME)
+                lvb->lvb_atime = attr->cat_atime;
+        if (valid & CAT_CTIME)
+                lvb->lvb_ctime = attr->cat_ctime;
+        if (valid & CAT_BLOCKS)
+                lvb->lvb_blocks = attr->cat_blocks;
+        if (valid & CAT_KMS)
+                loi_kms_set(oinfo, attr->cat_kms);
+        return 0;
+}
+
+/**
+ * cl_object_operations::coo_glimpse() method for the osc layer.
+ *
+ * Reports the locally known minimum size as the size and the cached block
+ * count as the blocks of \a lvb; other lvb fields are left untouched.
+ *
+ * NOTE(review): unlike osc_attr_get(), loi_kms is used here without
+ * consulting loi_kms_valid -- confirm that this is intended.
+ *
+ * \retval 0 always
+ */
+static int osc_object_glimpse(const struct lu_env *env,
+                              const struct cl_object *obj, struct ost_lvb *lvb)
+{
+        struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo;
+
+        ENTRY;
+        lvb->lvb_size   = oinfo->loi_kms;
+        lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks;
+        RETURN(0);
+}
+
+
+/**
+ * Marks \a obj as contended, recording the current time as the moment the
+ * contention was observed. The time-stamp is stored before the flag; note
+ * that the memory barrier between the two stores is commented out, so no
+ * ordering is enforced here.
+ */
+void osc_object_set_contended(struct osc_object *obj)
+{
+        obj->oo_contention_time = cfs_time_current();
+        /* mb(); */
+        obj->oo_contended = 1;
+}
+
+/**
+ * Clears the contended flag of \a obj. The recorded contention time is left
+ * as is.
+ */
+void osc_object_clear_contended(struct osc_object *obj)
+{
+        obj->oo_contended = 0;
+}
+
+/**
+ * Checks whether \a obj is currently considered contended.
+ *
+ * The object stays contended for od_contention_time seconds (a per-device
+ * setting) after the time recorded by osc_object_set_contended(); once that
+ * period has passed the flag is cleared as a side effect of this call.
+ *
+ * \retval 1 the object is contended, or the OBD_FAIL_OSC_OBJECT_CONTENTION
+ *           fail-point is active
+ * \retval 0 the object is not contended, or its contention period expired
+ */
+int osc_object_is_contended(struct osc_object *obj)
+{
+        struct osc_device *dev  = lu2osc_dev(obj->oo_cl.co_lu.lo_dev);
+        int osc_contention_time = dev->od_contention_time;
+        cfs_time_t cur_time     = cfs_time_current();
+        cfs_time_t retry_time;
+
+        /* fault injection: pretend that every object is contended */
+        if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION))
+                return 1;
+
+        if (!obj->oo_contended)
+                return 0;
+
+        /*
+         * I like copy-paste. the code is copied from
+         * ll_file_is_contended.
+         */
+        retry_time = cfs_time_add(obj->oo_contention_time,
+                                  cfs_time_seconds(osc_contention_time));
+        if (cfs_time_after(cur_time, retry_time)) {
+                /* contention period is over, forget about it */
+                osc_object_clear_contended(obj);
+                return 0;
+        }
+        return 1;
+}
+
+/**
+ * cl_object method table of the osc layer; installed into every object by
+ * osc_object_alloc().
+ */
+static const struct cl_object_operations osc_ops = {
+        .coo_page_init = osc_page_init,
+        .coo_lock_init = osc_lock_init,
+        .coo_io_init   = osc_io_init,
+        .coo_attr_get  = osc_attr_get,
+        .coo_attr_set  = osc_attr_set,
+        .coo_glimpse   = osc_object_glimpse
+};
+
+/**
+ * lu_object method table of the osc layer; installed into every object by
+ * osc_object_alloc(). Only init, free and print are provided, the remaining
+ * methods are explicitly left NULL.
+ */
+static const struct lu_object_operations osc_lu_obj_ops = {
+        .loo_object_init      = osc_object_init,
+        .loo_object_delete    = NULL,
+        .loo_object_release   = NULL,
+        .loo_object_free      = osc_object_free,
+        .loo_object_print     = osc_object_print,
+        .loo_object_invariant = NULL
+};
+
+/**
+ * Allocates a new osc_object from the osc_object_kmem slab and prepares the
+ * lu_object embedded in it: the lu_object is initialised against \a dev with
+ * no header, and both the cl_object and the lu_object method tables are
+ * installed. The header argument is not used.
+ *
+ * \return the embedded lu_object, or NULL when the allocation failed
+ */
+struct lu_object *osc_object_alloc(const struct lu_env *env,
+                                   const struct lu_object_header *_,
+                                   struct lu_device *dev)
+{
+        struct osc_object *osc;
+        struct lu_object  *lu;
+
+        OBD_SLAB_ALLOC_PTR(osc, osc_object_kmem);
+        if (osc == NULL)
+                return NULL;
+
+        lu = osc2lu(osc);
+        lu_object_init(lu, NULL, dev);
+        osc->oo_cl.co_ops = &osc_ops;
+        lu->lo_ops = &osc_lu_obj_ops;
+        return lu;
+}
+
+/** @} osc */
diff --git a/lustre/osc/osc_page.c b/lustre/osc/osc_page.c

new file mode 100644 (file)

index 0000000..d42e4f9
--- /dev/null
+++ b/lustre/osc/osc_page.c
@@ -0,0 +1,522 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Implementation of cl_page for OSC layer.
+ *
+ *   Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+/** \addtogroup osc osc @{ */
+
+#define DEBUG_SUBSYSTEM S_OSC
+
+#include "osc_cl_internal.h"
+
+/**
+ * Checks whether the page of \a opg is covered by a DLM extent lock.
+ *
+ * A test-only match (LDLM_FL_TEST_LOCK) is done against the resource of the
+ * page's object, for an extent that spans exactly this one page. LCK_PW is
+ * always or-ed into the set of acceptable modes, in addition to the DLM mode
+ * corresponding to \a mode.
+ *
+ * The resource name, policy and lock handle are scratch areas taken from the
+ * per-environment osc_thread_info.
+ *
+ * \param pending  when non-zero, LDLM_FL_CBPENDING is added to the match
+ *                 flags
+ * \param unref    passed through to osc_match_base()
+ *
+ * \return the value returned by osc_match_base()
+ */
+static int osc_page_is_dlocked(const struct lu_env *env,
+                               const struct osc_page *opg,
+                               enum cl_lock_mode mode, int pending, int unref)
+{
+        struct cl_page         *page;
+        struct osc_object      *obj;
+        struct osc_thread_info *info;
+        struct ldlm_res_id     *resname;
+        struct lustre_handle   *lockh;
+        ldlm_policy_data_t     *policy;
+        ldlm_mode_t             dlmmode;
+        int                     flags;
+
+        info = osc_env_info(env);
+        resname = &info->oti_resname;
+        policy = &info->oti_policy;
+        lockh = &info->oti_handle;
+        page = opg->ops_cl.cpl_page;
+        obj = cl2osc(opg->ops_cl.cpl_obj);
+
+        flags = LDLM_FL_TEST_LOCK | LDLM_FL_BLOCK_GRANTED;
+        if (pending)
+                flags |= LDLM_FL_CBPENDING;
+
+        dlmmode = osc_cl_lock2ldlm(mode) | LCK_PW;
+        osc_lock_build_res(env, obj, resname);
+        osc_index2policy(policy, page->cp_obj, page->cp_index, page->cp_index);
+        return osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy,
+                              dlmmode, &flags, NULL, lockh, unref);
+}
+
+/**
+ * Checks whether the page of \a opg is protected by a lock; used in the
+ * invariant checks of this file. Must not be called for temporary pages
+ * (ops_temp).
+ *
+ * The DLM is asked first, see osc_page_is_dlocked(). If no DLM lock matches,
+ * the list of cl_locks of the object is scanned, under coh_lock_guard, for a
+ * lock in HELD or CACHED state whose extent matches the page. The scan stops
+ * at the first such lock, and the page counts as protected only if that lock
+ * is lockless.
+ *
+ * \return non-zero when the page is protected, 0 otherwise
+ */
+static int osc_page_protected(const struct lu_env *env,
+                              const struct osc_page *opg,
+                              enum cl_lock_mode mode, int unref)
+{
+        struct cl_object_header *hdr;
+        struct cl_lock          *scan;
+        struct cl_page          *page;
+        struct cl_lock_descr    *descr;
+        int result;
+
+        LINVRNT(!opg->ops_temp);
+
+        result = osc_page_is_dlocked(env, opg, mode, 1, unref);
+        if (result == 0) {
+                /* maybe this page is a part of a lockless io? */
+                hdr = cl_object_header(opg->ops_cl.cpl_obj);
+                page = opg->ops_cl.cpl_page;
+                /* single-page descriptor, built in per-env scratch space */
+                descr = &osc_env_info(env)->oti_descr;
+                descr->cld_mode = mode;
+                descr->cld_start = page->cp_index;
+                descr->cld_end   = page->cp_index;
+                spin_lock(&hdr->coh_lock_guard);
+                list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) {
+                        /*
+                         * Lock-less sub-lock has to be either in HELD state
+                         * (when io is actively going on), or in CACHED state,
+                         * when top-lock is being unlocked:
+                         * cl_io_unlock()->cl_unuse()->...->lov_lock_unuse().
+                         */
+                        if ((scan->cll_state == CLS_HELD ||
+                             scan->cll_state == CLS_CACHED) &&
+                            cl_lock_ext_match(&scan->cll_descr, descr)) {
+                                struct osc_lock *olck;
+
+                                olck = osc_lock_at(scan);
+                                result = osc_lock_is_lockless(olck);
+                                break;
+                        }
+                }
+                spin_unlock(&hdr->coh_lock_guard);
+        }
+        return result;
+}
+
+/*****************************************************************************
+ *
+ * Page operations.
+ *
+ */
+/**
+ * cl_page_operations::cpo_fini() method: returns the osc_page that contains
+ * \a slice to the osc_page_kmem slab.
+ */
+static void osc_page_fini(const struct lu_env *env,
+                          struct cl_page_slice *slice)
+{
+        struct osc_page *opg = cl2osc_page(slice);
+        CDEBUG(D_TRACE, "%p\n", opg);
+        OBD_SLAB_FREE_PTR(opg, osc_page_kmem);
+}
+
+/**
+ * Pins the top-level cl_page of \a opg for the duration of a transfer: takes
+ * a page reference, registers it in the page's lu_ref under \a label and sets
+ * ops_transfer_pinned. The page must not be pinned already.
+ *
+ * Callers in this file pass labels of the form "transfer\0...": read as a C
+ * string they are all "transfer", which is the label used when the reference
+ * is dropped (osc_page_transfer_put(), osc_completion()).
+ */
+static void osc_page_transfer_get(struct osc_page *opg, const char *label)
+{
+        struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page);
+
+        LASSERT(!opg->ops_transfer_pinned);
+        cl_page_get(page);
+        lu_ref_add_atomic(&page->cp_reference, label, page);
+        opg->ops_transfer_pinned = 1;
+}
+
+/**
+ * Undoes osc_page_transfer_get(): if \a opg is pinned for a transfer, removes
+ * the "transfer" lu_ref entry, clears ops_transfer_pinned and drops the
+ * reference on the top-level page. Does nothing when the page is not pinned.
+ */
+static void osc_page_transfer_put(const struct lu_env *env,
+                                  struct osc_page *opg)
+{
+        struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page);
+
+        if (opg->ops_transfer_pinned) {
+                lu_ref_del(&page->cp_reference, "transfer", page);
+                opg->ops_transfer_pinned = 0;
+                cl_page_put(env, page);
+        }
+}
+
+/**
+ * This is called once for every page when it is submitted for a transfer
+ * either opportunistic (osc_page_cache_add()), or immediate
+ * (osc_page_submit()).
+ *
+ * Under the object's oo_seatbelt spin-lock, the page is linked into the
+ * in-flight list for request type \a crt and the current thread is recorded
+ * as its submitter.
+ */
+static void osc_page_transfer_add(const struct lu_env *env,
+                                  struct osc_page *opg, enum cl_req_type crt)
+{
+        struct osc_object *obj;
+
+        obj = cl2osc(opg->ops_cl.cpl_obj);
+        spin_lock(&obj->oo_seatbelt);
+        list_add(&opg->ops_inflight, &obj->oo_inflight[crt]);
+        opg->ops_submitter = cfs_current();
+        spin_unlock(&obj->oo_seatbelt);
+}
+
+/**
+ * cl_page_operations::cpo_cache_add() method for writes: queues the page for
+ * asynchronous write-out through osc_queue_async_io().
+ *
+ * The page is pinned before it is queued; if queueing fails the pin is
+ * dropped again, otherwise the page is put on the object's in-flight list
+ * for writes. When the current io is lockless, OBD_BRW_SRVLOCK is passed as
+ * the brw flags.
+ *
+ * \return result of osc_queue_async_io()
+ */
+static int osc_page_cache_add(const struct lu_env *env,
+                              const struct cl_page_slice *slice,
+                              struct cl_io *_)
+{
+        struct osc_page   *opg = cl2osc_page(slice);
+        struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj);
+        struct osc_io     *oio = osc_env_io(env);
+        int result;
+        int brw_flags;
+
+        LINVRNT(osc_page_protected(env, opg, CLM_WRITE, 0));
+        ENTRY;
+
+        /* Set the OBD_BRW_SRVLOCK before the page is queued. */
+        brw_flags = oio->oi_lockless ? OBD_BRW_SRVLOCK : 0;
+
+        /* as a C string the label is "transfer"; "cache" follows the NUL */
+        osc_page_transfer_get(opg, "transfer\0cache");
+        result = osc_queue_async_io(env, osc_export(obj), NULL, obj->oo_oinfo,
+                                    &opg->ops_oap, OBD_BRW_WRITE,
+                                    0, 0, brw_flags, 0);
+        if (result != 0)
+                osc_page_transfer_put(env, opg);
+        else
+                osc_page_transfer_add(env, opg, CRT_WRITE);
+        RETURN(result);
+}
+
+/**
+ * Fills \a policy with the byte extent that corresponds to the page index
+ * range [\a start, \a end] of \a obj. Both indices are inclusive: the extent
+ * ends at the last byte of page \a end. The rest of \a policy is zeroed.
+ */
+void osc_index2policy(ldlm_policy_data_t *policy, const struct cl_object *obj,
+                      pgoff_t start, pgoff_t end)
+{
+        memset(policy, 0, sizeof *policy);
+        policy->l_extent.start = cl_offset(obj, start);
+        policy->l_extent.end   = cl_offset(obj, end + 1) - 1;
+}
+
+/**
+ * cl_page_operations::cpo_is_under_lock() method for the osc layer.
+ *
+ * Looks up a cl_lock covering the page with cl_lock_at_page(); a lock that
+ * is found is released again right away, only its existence matters.
+ *
+ * \retval -EBUSY   the page is covered by a lock
+ * \retval -ENODATA no covering lock was found
+ */
+static int osc_page_is_under_lock(const struct lu_env *env,
+                                  const struct cl_page_slice *slice,
+                                  struct cl_io *_)
+{
+        struct cl_lock *covering;
+
+        ENTRY;
+        covering = cl_lock_at_page(env, slice->cpl_obj, slice->cpl_page,
+                                   NULL, 1, 0);
+        if (covering == NULL)
+                RETURN(-ENODATA);
+
+        cl_lock_put(env, covering);
+        RETURN(-EBUSY);
+}
+
+/**
+ * Placeholder for page operations that must never be invoked; installed as
+ * cpo_cache_add for reads in osc_page_ops. Reaching it is a bug, hence
+ * LBUG().
+ */
+static int osc_page_fail(const struct lu_env *env,
+                         const struct cl_page_slice *slice, struct cl_io *_)
+{
+        /*
+         * Cached read?
+         */
+        LBUG();
+        return 0;
+}
+
+
+/**
+ * Debugging helper for osc_page_print(): maps a list head to a one character
+ * marker, "-" for an empty list and "+" for a non-empty one.
+ */
+static const char *osc_list(struct list_head *head)
+{
+        if (list_empty(head))
+                return "-";
+        return "+";
+}
+
+/**
+ * cl_page_operations::cpo_print() method: dumps the state of the
+ * osc_async_page embedded in the osc_page through \a printer. List
+ * membership (pending, urgent, rpc) is shown with the markers produced by
+ * osc_list().
+ *
+ * \return whatever the printer call returns
+ */
+static int osc_page_print(const struct lu_env *env,
+                          const struct cl_page_slice *slice,
+                          void *cookie, lu_printer_t printer)
+{
+        struct osc_page       *opg = cl2osc_page(slice);
+        struct osc_async_page *oap = &opg->ops_oap;
+
+        return (*printer)(env, cookie, LUSTRE_OSC_NAME"-page@%p: "
+                          "%#x %d %u %s %s %s %llu %u %#x %p %p %p %p %p\n",
+                          opg, oap->oap_magic, oap->oap_cmd,
+                          oap->oap_interrupted,
+                          osc_list(&oap->oap_pending_item),
+                          osc_list(&oap->oap_urgent_item),
+                          osc_list(&oap->oap_rpc_item),
+                          oap->oap_obj_off, oap->oap_page_off,
+                          oap->oap_async_flags, oap->oap_request,
+                          oap->oap_cli, oap->oap_loi, oap->oap_caller_ops,
+                          oap->oap_caller_data);
+}
+
+/**
+ * cl_page_operations::cpo_delete() method for the osc layer.
+ *
+ * Drops the transfer pin if one is still held, tears down the async page
+ * (which is asserted to succeed) and finally unlinks the page from the
+ * object's in-flight list under oo_seatbelt.
+ */
+static void osc_page_delete(const struct lu_env *env,
+                            const struct cl_page_slice *slice)
+{
+        struct osc_page       *opg = cl2osc_page(slice);
+        struct osc_object     *obj = cl2osc(opg->ops_cl.cpl_obj);
+        struct osc_async_page *oap = &opg->ops_oap;
+        int rc;
+
+        /* temporary pages are exempt from the lock-coverage check */
+        LINVRNT(opg->ops_temp || osc_page_protected(env, opg, CLM_READ, 1));
+
+        ENTRY;
+        CDEBUG(D_TRACE, "%p\n", opg);
+        osc_page_transfer_put(env, opg);
+        rc = osc_teardown_async_page(osc_export(obj), NULL, obj->oo_oinfo, oap);
+        LASSERTF(rc == 0, "%i\n", rc);
+        spin_lock(&obj->oo_seatbelt);
+        list_del_init(&opg->ops_inflight);
+        spin_unlock(&obj->oo_seatbelt);
+        EXIT;
+}
+
+/**
+ * cl_page_operations::cpo_clip() method: records [\a from, \a to) as the
+ * part of the page to be transferred (osc_io_submit_page() later uses
+ * to - from as the byte count) and marks the count as stable by setting
+ * ASYNC_COUNT_STABLE.
+ */
+void osc_page_clip(const struct lu_env *env, const struct cl_page_slice *slice,
+                   int from, int to)
+{
+        struct osc_page       *opg = cl2osc_page(slice);
+        struct osc_async_page *oap = &opg->ops_oap;
+
+        LINVRNT(osc_page_protected(env, opg, CLM_READ, 0));
+
+        opg->ops_from = from;
+        opg->ops_to   = to;
+        oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+}
+
+/**
+ * cl_page_operations::cpo_cancel() method for the osc layer.
+ *
+ * Under the client's cl_loi_list_lock, interrupts the transfer of the page
+ * if the page is still pinned for one. A page that is not pinned (transfer
+ * completed, or never queued) is left alone and 0 is returned.
+ *
+ * \return 0 when there was nothing to cancel, otherwise the result of
+ *         osc_oap_interrupted(); success implies the page is no longer
+ *         pinned (asserted)
+ */
+static int osc_page_cancel(const struct lu_env *env,
+                           const struct cl_page_slice *slice)
+{
+        struct osc_page *opg       = cl2osc_page(slice);
+        struct osc_async_page *oap = &opg->ops_oap;
+        int rc = 0;
+
+        LINVRNT(osc_page_protected(env, opg, CLM_READ, 0));
+
+        client_obd_list_lock(&oap->oap_cli->cl_loi_list_lock);
+        /* Check if the transferring against this page
+         * is completed, or not even queued. */
+        if (opg->ops_transfer_pinned)
+                /* FIXME: may not be interrupted.. */
+                rc = osc_oap_interrupted(env, oap);
+        LASSERT(ergo(rc == 0, opg->ops_transfer_pinned == 0));
+        client_obd_list_unlock(&oap->oap_cli->cl_loi_list_lock);
+        return rc;
+}
+
+/**
+ * cl_page method table of the osc layer; attached to every page slice by
+ * osc_page_init(). Adding a page to the cache is supported for writes only:
+ * the read slot points to osc_page_fail(), which LBUG()s.
+ */
+static const struct cl_page_operations osc_page_ops = {
+        .cpo_fini          = osc_page_fini,
+        .cpo_print         = osc_page_print,
+        .cpo_delete        = osc_page_delete,
+        .cpo_is_under_lock = osc_page_is_under_lock,
+        .io = {
+                [CRT_READ] = {
+                        .cpo_cache_add = osc_page_fail
+                },
+                [CRT_WRITE] = {
+                        .cpo_cache_add = osc_page_cache_add
+                }
+        },
+        .cpo_clip           = osc_page_clip,
+        .cpo_cancel         = osc_page_cancel
+};
+
+/**
+ * obd_async_page_ops::ap_make_ready() call-back. \a data is the osc_page that
+ * was registered with osc_prep_async_page(). Only writes are expected here;
+ * the request is forwarded to cl_page_make_ready() on the top-level page.
+ *
+ * \return result of cl_page_make_ready()
+ */
+static int osc_make_ready(const struct lu_env *env, void *data, int cmd)
+{
+        struct osc_page *opg  = data;
+        struct cl_page  *page = cl_page_top(opg->ops_cl.cpl_page);
+        int result;
+
+        LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */
+        LINVRNT(osc_page_protected(env, opg, CLM_WRITE, 1));
+
+        ENTRY;
+        result = cl_page_make_ready(env, page, CRT_WRITE);
+        RETURN(result);
+}
+
+/**
+ * obd_async_page_ops::ap_refresh_count() call-back: computes how many bytes
+ * of the page have to be written, given the current known minimum size (kms)
+ * of the object. \a data is the osc_page registered with
+ * osc_prep_async_page(). Must not be called for reads.
+ *
+ * \retval 0              the page lies entirely at or beyond kms
+ * \retval CFS_PAGE_SIZE  the page lies entirely below kms
+ * \retval other positive kms falls inside the page; number of bytes below it
+ * \retval -ve            cl_object_attr_get() failed
+ */
+static int osc_refresh_count(const struct lu_env *env, void *data, int cmd)
+{
+        struct cl_page   *page;
+        struct osc_page  *osc = data;
+        struct cl_object *obj;
+        struct cl_attr   *attr = &osc_env_info(env)->oti_attr;
+
+        int result;
+        loff_t kms;
+
+        /*
+         * Check the pointer before anything dereferences it: the invariant
+         * below looks into the page.
+         */
+        LASSERT(osc != NULL);
+        LINVRNT(osc_page_protected(env, osc, CLM_READ, 1));
+
+        /* readpage queues with _COUNT_STABLE, shouldn't get here. */
+        LASSERT(!(cmd & OBD_BRW_READ));
+        page = osc->ops_cl.cpl_page;
+        obj = osc->ops_cl.cpl_obj;
+
+        cl_object_attr_lock(obj);
+        result = cl_object_attr_get(env, obj, attr);
+        cl_object_attr_unlock(obj);
+        if (result < 0)
+                return result;
+        kms = attr->cat_kms;
+        if (cl_offset(obj, page->cp_index) >= kms)
+                /* catch race with truncate */
+                return 0;
+        else if (cl_offset(obj, page->cp_index + 1) > kms)
+                /* catch sub-page write at end of file */
+                return kms % CFS_PAGE_SIZE;
+        else
+                return CFS_PAGE_SIZE;
+}
+
+/**
+ * obd_async_page_ops::ap_completion() call-back, invoked when the transfer
+ * of a page has finished with result \a rc. \a data is the osc_page
+ * registered with osc_prep_async_page().
+ *
+ * Detaches the page from its cl_req (if any), clears the async flags and the
+ * transfer pin, removes the page from the object's in-flight list, notifies
+ * the upper layers through cl_page_completion(), accounts lockless transfers
+ * in the device statistics and finally drops the reference taken by
+ * osc_page_transfer_get().
+ *
+ * \retval 0 always
+ */
+static int osc_completion(const struct lu_env *env,
+                          void *data, int cmd, struct obdo *oa, int rc)
+{
+        struct osc_page       *opg  = data;
+        struct osc_async_page *oap  = &opg->ops_oap;
+        struct cl_page        *page = cl_page_top(opg->ops_cl.cpl_page);
+        struct osc_object     *obj  = cl2osc(opg->ops_cl.cpl_obj);
+        enum cl_req_type crt;
+
+        LINVRNT(osc_page_protected(env, opg, CLM_READ, 1));
+
+        ENTRY;
+
+        /* NOQUOTA is a modifier, strip it to get the plain command */
+        cmd &= ~OBD_BRW_NOQUOTA;
+        LASSERT(equi(page->cp_state == CPS_PAGEIN,  cmd == OBD_BRW_READ));
+        LASSERT(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE));
+        LASSERT(opg->ops_transfer_pinned);
+
+        /*
+         * page->cp_req can be NULL if io submission failed before
+         * cl_req was allocated.
+         */
+        if (page->cp_req != NULL)
+                cl_req_page_done(env, page);
+        LASSERT(page->cp_req == NULL);
+
+        /* As the transfer for this page is being done, clear the flags */
+        oap->oap_async_flags = 0;
+
+        crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE;
+        /* Clear opg->ops_transfer_pinned before VM lock is released. */
+        opg->ops_transfer_pinned = 0;
+
+        spin_lock(&obj->oo_seatbelt);
+        LASSERT(opg->ops_submitter != NULL);
+        LASSERT(!list_empty(&opg->ops_inflight));
+        list_del_init(&opg->ops_inflight);
+        spin_unlock(&obj->oo_seatbelt);
+
+        cl_page_completion(env, page, crt, rc);
+
+        /* statistic */
+        if (rc == 0 && oap->oap_brw_flags & OBD_BRW_SRVLOCK) {
+                struct lu_device *ld    = opg->ops_cl.cpl_obj->co_lu.lo_dev;
+                struct osc_stats *stats = &lu2osc_dev(ld)->od_stats;
+                int bytes = opg->ops_to - opg->ops_from;
+
+                if (crt == CRT_READ)
+                        stats->os_lockless_reads += bytes;
+                else
+                        stats->os_lockless_writes += bytes;
+        }
+
+        /*
+         * This has to be the last operation with the page, as locks are
+         * released in cl_page_completion() and nothing except for the
+         * reference counter protects page from concurrent reclaim.
+         */
+        lu_ref_del(&page->cp_reference, "transfer", page);
+        /*
+         * As page->cp_obj is pinned by a reference from page->cp_req, it is
+         * safe to call cl_page_put() without risking object destruction in a
+         * non-blocking context.
+         */
+        cl_page_put(env, page);
+        RETURN(0);
+}
+
+/**
+ * Call-backs through which the generic async page code drives osc pages;
+ * registered for every page by osc_page_init() via osc_prep_async_page().
+ */
+static const struct obd_async_page_ops osc_async_page_ops = {
+        .ap_make_ready    = osc_make_ready,
+        .ap_refresh_count = osc_refresh_count,
+        .ap_completion    = osc_completion
+};
+
+struct cl_page *osc_page_init(const struct lu_env *env,
+                              struct cl_object *obj,
+                              struct cl_page *page, cfs_page_t *vmpage)
+{
+        struct osc_object *osc = cl2osc(obj);
+        struct osc_page   *opg;
+        int result;
+
+        OBD_SLAB_ALLOC_PTR(opg, osc_page_kmem);
+        if (opg != NULL) {
+                void *oap = &opg->ops_oap;
+
+                opg->ops_from = 0;
+                opg->ops_to   = CFS_PAGE_SIZE;
+                opg->ops_ignore_quota = !!cfs_capable(CFS_CAP_SYS_RESOURCE);
+
+                result = osc_prep_async_page(osc_export(osc),
+                                             NULL, osc->oo_oinfo, vmpage,
+                                             cl_offset(obj, page->cp_index),
+                                             &osc_async_page_ops,
+                                             opg, (void **)&oap, 1, NULL);
+                if (result == 0)
+                        cl_page_slice_add(page, &opg->ops_cl, obj,
+                                          &osc_page_ops);
+                /*
+                 * Cannot assert osc_page_protected() here as read-ahead
+                 * creates temporary pages outside of a lock.
+                 */
+#ifdef INVARIANT_CHECK
+                opg->ops_temp = !osc_page_protected(env, opg, CLM_READ, 1);
+#endif
+                CFS_INIT_LIST_HEAD(&opg->ops_inflight);
+        } else
+                result = -ENOMEM;
+        return ERR_PTR(result);
+}
+
+/**
+ * Prepares the page of \a opg for an immediate transfer of type \a crt and
+ * moves it to the pending state.
+ *
+ * Sets up the async page command (with OBD_BRW_NOQUOTA added when the page
+ * ignores quota), the async flags, and the offset and count taken from the
+ * clip range of the page. OBD_BRW_SRVLOCK is added when the io is lockless.
+ * For a write whose page does not yet have OBD_BRW_FROM_GRANT set,
+ * osc_enter_cache_try() is called. The page is then queued as pending,
+ * pinned and put on the in-flight list of the object.
+ */
+void osc_io_submit_page(const struct lu_env *env,
+                        struct osc_io *oio, struct osc_page *opg,
+                        enum cl_req_type crt)
+{
+        struct osc_async_page *oap = &opg->ops_oap;
+        struct client_obd     *cli = oap->oap_cli;
+
+        LINVRNT(osc_page_protected(env, opg,
+                                   crt == CRT_WRITE ? CLM_WRITE : CLM_READ, 1));
+
+        oap->oap_cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ;
+        if (opg->ops_ignore_quota)
+                oap->oap_cmd |= OBD_BRW_NOQUOTA;
+
+        oap->oap_async_flags |= OSC_FLAGS;
+        if (oap->oap_cmd & OBD_BRW_READ)
+                /* the byte count of a read never has to be refreshed */
+                oap->oap_async_flags |= ASYNC_COUNT_STABLE;
+        else if (!(oap->oap_brw_page.flag & OBD_BRW_FROM_GRANT))
+                osc_enter_cache_try(env, cli, oap->oap_loi, oap, 1);
+
+        oap->oap_page_off   = opg->ops_from;
+        oap->oap_count      = opg->ops_to - opg->ops_from;
+        oap->oap_brw_flags |= oio->oi_lockless ? OBD_BRW_SRVLOCK : 0;
+
+        osc_oap_to_pending(oap);
+        /* as a C string the label is "transfer"; "imm" follows the NUL */
+        osc_page_transfer_get(opg, "transfer\0imm");
+        osc_page_transfer_add(env, opg, crt);
+}
+
+/** @} osc */
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c

index 299b3c7..2097a34 100644 (file)
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -61,7 +61,6 @@
  #include <lustre_log.h>
  #include <lustre_debug.h>
  #include <lustre_param.h>
-#include <lustre_cache.h>
  #include "osc_internal.h"
  
  static quota_interface_t *quota_interface = NULL;
@@ -399,7 +398,7 @@ static int osc_setattr_async(struct obd_export *exp, struct obd_info *oinfo,
          /* do mds to ost setattr asynchronously */
          if (!rqset) {
                  /* Do not wait for response. */
-                ptlrpcd_add_req(req);
+                ptlrpcd_add_req(req, PSCOPE_OTHER);
          } else {
                  req->rq_interpret_reply =
                          (ptlrpc_interpterer_t)osc_setattr_interpret;
@@ -501,7 +500,7 @@ out:
  
  static int osc_punch_interpret(const struct lu_env *env,
                                 struct ptlrpc_request *req,
-                               struct osc_async_args *aa, int rc)
+                               struct osc_punch_args *aa, int rc)
  {
          struct ost_body *body;
          ENTRY;
@@ -513,32 +512,28 @@ static int osc_punch_interpret(const struct lu_env *env,
          if (body == NULL)
                  GOTO(out, rc = -EPROTO);
  
-        *aa->aa_oi->oi_oa = body->oa;
+        *aa->pa_oa = body->oa;
  out:
-        rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc);
+        rc = aa->pa_upcall(aa->pa_cookie, rc);
          RETURN(rc);
  }
  
-static int osc_punch(struct obd_export *exp, struct obd_info *oinfo,
-                     struct obd_trans_info *oti,
-                     struct ptlrpc_request_set *rqset)
+int osc_punch_base(struct obd_export *exp, struct obdo *oa,
+                   struct obd_capa *capa,
+                   obd_enqueue_update_f upcall, void *cookie,
+                   struct ptlrpc_request_set *rqset)
  {
          struct ptlrpc_request *req;
-        struct osc_async_args *aa;
+        struct osc_punch_args *aa;
          struct ost_body       *body;
          int                    rc;
          ENTRY;
  
-        if (!oinfo->oi_oa) {
-                CDEBUG(D_INFO, "oa NULL\n");
-                RETURN(-EINVAL);
-        }
-
          req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH);
          if (req == NULL)
                  RETURN(-ENOMEM);
  
-        osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa);
+        osc_set_capa_size(req, &RMF_CAPA1, capa);
          rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH);
          if (rc) {
                  ptlrpc_request_free(req);
@@ -546,26 +541,40 @@ static int osc_punch(struct obd_export *exp, struct obd_info *oinfo,
          }
          req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */
          ptlrpc_at_set_req_timeout(req);
-        osc_pack_req_body(req, oinfo);
  
-        /* overload the size and blocks fields in the oa with start/end */
          body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
          LASSERT(body);
-        body->oa.o_size = oinfo->oi_policy.l_extent.start;
-        body->oa.o_blocks = oinfo->oi_policy.l_extent.end;
-        body->oa.o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS);
+        body->oa = *oa;
+        osc_pack_capa(req, body, capa);
+
          ptlrpc_request_set_replen(req);
  
  
          req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_punch_interpret;
          CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args));
          aa = ptlrpc_req_async_args(req);
-        aa->aa_oi = oinfo;
-        ptlrpc_set_add_req(rqset, req);
+        aa->pa_oa     = oa;
+        aa->pa_upcall = upcall;
+        aa->pa_cookie = cookie;
+        if (rqset == PTLRPCD_SET)
+                ptlrpcd_add_req(req, PSCOPE_OTHER);
+        else
+                ptlrpc_set_add_req(rqset, req);
  
          RETURN(0);
  }
  
+/**
+ * obd-method wrapper around osc_punch_base().
+ *
+ * Overloads the size and blocks fields of the caller's obdo with the start
+ * and end of the extent to punch (oi_policy.l_extent), marks both fields
+ * valid and passes the request on, with oi_cb_up as the upcall and \a oinfo
+ * itself as its cookie. \a oti is not used.
+ *
+ * NOTE(review): oinfo->oi_oa is dereferenced unconditionally here; the
+ * NULL check (-EINVAL) of the previous implementation is gone -- confirm
+ * that all callers supply an obdo.
+ *
+ * \return result of osc_punch_base()
+ */
+static int osc_punch(struct obd_export *exp, struct obd_info *oinfo,
+                     struct obd_trans_info *oti,
+                     struct ptlrpc_request_set *rqset)
+{
+        oinfo->oi_oa->o_size   = oinfo->oi_policy.l_extent.start;
+        oinfo->oi_oa->o_blocks = oinfo->oi_policy.l_extent.end;
+        oinfo->oi_oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS;
+        return osc_punch_base(exp, oinfo->oi_oa, oinfo->oi_capa,
+                              oinfo->oi_cb_up, oinfo, rqset);
+}
+
  static int osc_sync(struct obd_export *exp, struct obdo *oa,
                      struct lov_stripe_md *md, obd_size start, obd_size end,
                      void *capa)
@@ -739,7 +748,7 @@ static int osc_destroy(struct obd_export *exp, struct obdo *oa,
          }
  
          /* Do not wait for response */
-        ptlrpcd_add_req(req);
+        ptlrpcd_add_req(req, PSCOPE_OTHER);
          RETURN(0);
  }
  
@@ -753,13 +762,16 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
          oa->o_valid |= bits;
          client_obd_list_lock(&cli->cl_loi_list_lock);
          oa->o_dirty = cli->cl_dirty;
-        if (cli->cl_dirty > cli->cl_dirty_max) {
-                CERROR("dirty %lu > dirty_max %lu\n",
-                       cli->cl_dirty, cli->cl_dirty_max);
+        if (cli->cl_dirty - cli->cl_dirty_transit > cli->cl_dirty_max) {
+                CERROR("dirty %lu - %lu > dirty_max %lu\n",
+                       cli->cl_dirty, cli->cl_dirty_transit, cli->cl_dirty_max);
                  oa->o_undirty = 0;
-        } else if (atomic_read(&obd_dirty_pages) > obd_max_dirty_pages) {
-                CERROR("dirty %d > system dirty_max %d\n",
-                       atomic_read(&obd_dirty_pages), obd_max_dirty_pages);
+        } else if (atomic_read(&obd_dirty_pages) -
+                   atomic_read(&obd_dirty_transit_pages) > obd_max_dirty_pages){
+                CERROR("dirty %d - %d > system dirty_max %d\n",
+                       atomic_read(&obd_dirty_pages),
+                       atomic_read(&obd_dirty_transit_pages),
+                       obd_max_dirty_pages);
                  oa->o_undirty = 0;
          } else if (cli->cl_dirty_max - cli->cl_dirty > 0x7fffffff) {
                  CERROR("dirty %lu - dirty_max %lu too big???\n",
@@ -782,6 +794,7 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa,
  static void osc_consume_write_grant(struct client_obd *cli,
                                      struct brw_page *pga)
  {
+        LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT));
          atomic_inc(&obd_dirty_pages);
          cli->cl_dirty += CFS_PAGE_SIZE;
          cli->cl_avail_grant -= CFS_PAGE_SIZE;
@@ -807,6 +820,11 @@ static void osc_release_write_grant(struct client_obd *cli,
          pga->flag &= ~OBD_BRW_FROM_GRANT;
          atomic_dec(&obd_dirty_pages);
          cli->cl_dirty -= CFS_PAGE_SIZE;
+        if (pga->flag & OBD_BRW_NOCACHE) {
+                pga->flag &= ~OBD_BRW_NOCACHE;
+                atomic_dec(&obd_dirty_transit_pages);
+                cli->cl_dirty_transit -= CFS_PAGE_SIZE;
+        }
          if (!sent) {
                  cli->cl_lost_grant += CFS_PAGE_SIZE;
                  CDEBUG(D_CACHE, "lost grant: %lu avail grant: %lu dirty: %lu\n",
@@ -977,7 +995,7 @@ static int check_write_rcs(struct ptlrpc_request *req,
  static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2)
  {
          if (p1->flag != p2->flag) {
-                unsigned mask = ~OBD_BRW_FROM_GRANT;
+                unsigned mask = ~(OBD_BRW_FROM_GRANT|OBD_BRW_NOCACHE);
  
                  /* warn if we try to combine flags that we don't know to be
                   * safe to combine */
@@ -1538,63 +1556,6 @@ int osc_brw_redo_request(struct ptlrpc_request *request,
          RETURN(0);
  }
  
-static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa,
-                          struct lov_stripe_md *lsm, obd_count page_count,
-                          struct brw_page **pga, struct ptlrpc_request_set *set,
-                          struct obd_capa *ocapa)
-{
-        struct ptlrpc_request     *req;
-        struct client_obd         *cli = &exp->exp_obd->u.cli;
-        int                        rc, i;
-        struct osc_brw_async_args *aa;
-        ENTRY;
-
-        /* Consume write credits even if doing a sync write -
-         * otherwise we may run out of space on OST due to grant. */
-        if (cmd == OBD_BRW_WRITE) {
-                spin_lock(&cli->cl_loi_list_lock);
-                for (i = 0; i < page_count; i++) {
-                        if (cli->cl_avail_grant >= CFS_PAGE_SIZE)
-                                osc_consume_write_grant(cli, pga[i]);
-                }
-                spin_unlock(&cli->cl_loi_list_lock);
-        }
-
-        rc = osc_brw_prep_request(cmd, cli, oa, lsm, page_count, pga,
-                                  &req, ocapa);
-
-        aa = ptlrpc_req_async_args(req);
-        if (cmd == OBD_BRW_READ) {
-                lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count);
-                lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight);
-        } else {
-                lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count);
-                lprocfs_oh_tally(&cli->cl_write_rpc_hist,
-                                 cli->cl_w_in_flight);
-        }
-        ptlrpc_lprocfs_brw(req, aa->aa_requested_nob);
-
-        LASSERT(list_empty(&aa->aa_oaps));
-        if (rc == 0) {
-                req->rq_interpret_reply = brw_interpret;
-                ptlrpc_set_add_req(set, req);
-                client_obd_list_lock(&cli->cl_loi_list_lock);
-                if (cmd == OBD_BRW_READ)
-                        cli->cl_r_in_flight++;
-                else
-                        cli->cl_w_in_flight++;
-                client_obd_list_unlock(&cli->cl_loi_list_lock);
-                OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DIO_PAUSE, 3);
-        } else if (cmd == OBD_BRW_WRITE) {
-                client_obd_list_lock(&cli->cl_loi_list_lock);
-                for (i = 0; i < page_count; i++)
-                        osc_release_write_grant(cli, pga[i], 0);
-                osc_wake_cache_waiters(cli);
-                client_obd_list_unlock(&cli->cl_loi_list_lock);
-        }
-        RETURN (rc);
-}
-
  /*
   * ugh, we want disk allocation on the target to happen in offset order.  we'll
   * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do
@@ -1743,76 +1704,6 @@ out:
          RETURN(rc);
  }
  
-static int osc_brw_async(int cmd, struct obd_export *exp,
-                         struct obd_info *oinfo, obd_count page_count,
-                         struct brw_page *pga, struct obd_trans_info *oti,
-                         struct ptlrpc_request_set *set)
-{
-        struct brw_page **ppga, **orig;
-        struct client_obd *cli = &exp->exp_obd->u.cli;
-        int page_count_orig;
-        int rc = 0;
-        ENTRY;
-
-        if (cmd & OBD_BRW_CHECK) {
-                struct obd_import *imp = class_exp2cliimp(exp);
-                /* The caller just wants to know if there's a chance that this
-                 * I/O can succeed */
-
-                if (imp == NULL || imp->imp_invalid)
-                        RETURN(-EIO);
-                RETURN(0);
-        }
-
-        orig = ppga = osc_build_ppga(pga, page_count);
-        if (ppga == NULL)
-                RETURN(-ENOMEM);
-        page_count_orig = page_count;
-
-        sort_brw_pages(ppga, page_count);
-        while (page_count) {
-                struct brw_page **copy;
-                obd_count pages_per_brw;
-
-                pages_per_brw = min_t(obd_count, page_count,
-                                      cli->cl_max_pages_per_rpc);
-
-                pages_per_brw = max_unfragmented_pages(ppga, pages_per_brw);
-
-                /* use ppga only if single RPC is going to fly */
-                if (pages_per_brw != page_count_orig || ppga != orig) {
-                        OBD_ALLOC(copy, sizeof(*copy) * pages_per_brw);
-                        if (copy == NULL)
-                                GOTO(out, rc = -ENOMEM);
-                        memcpy(copy, ppga, sizeof(*copy) * pages_per_brw);
-                } else
-                        copy = ppga;
-
-                rc = async_internal(cmd, exp, oinfo->oi_oa, oinfo->oi_md,
-                                    pages_per_brw, copy, set, oinfo->oi_capa);
-
-                if (rc != 0) {
-                        if (copy != ppga)
-                                OBD_FREE(copy, sizeof(*copy) * pages_per_brw);
-                        break;
-                }
-                if (copy == orig) {
-                        /* we passed it to async_internal() which is
-                         * now responsible for releasing memory */
-                        orig = NULL;
-                }
-
-                page_count -= pages_per_brw;
-                ppga += pages_per_brw;
-        }
-out:
-        if (orig)
-                osc_release_ppga(orig, page_count_orig);
-        RETURN(rc);
-}
-
-static void osc_check_rpcs(struct client_obd *cli);
-
  /* The companion to osc_enter_cache(), called when @oap is no longer part of
   * the dirty accounting.  Writeback completes or truncate happens before
   * writing starts.  Must be called with the loi lock held. */
@@ -1883,7 +1774,7 @@ static void on_list(struct list_head *item, struct list_head *list,
  
  /* maintain the loi's cli list membership invariants so that osc_send_oap_rpc
   * can find pages to build into rpcs quickly */
-static void loi_list_maint(struct client_obd *cli, struct lov_oinfo *loi)
+void loi_list_maint(struct client_obd *cli, struct lov_oinfo *loi)
  {
          on_list(&loi->loi_cli_item, &cli->cl_loi_ready_list,
                  lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE) ||
@@ -1906,34 +1797,35 @@ static void lop_update_pending(struct client_obd *cli,
                  cli->cl_pending_r_pages += delta;
  }
  
-/* this is called when a sync waiter receives an interruption.  Its job is to
+/**
+ * this is called when a sync waiter receives an interruption.  Its job is to
   * get the caller woken as soon as possible.  If its page hasn't been put in an
   * rpc yet it can dequeue immediately.  Otherwise it has to mark the rpc as
   * desiring interruption which will forcefully complete the rpc once the rpc
- * has timed out */
-static void osc_occ_interrupted(struct oig_callback_context *occ)
+ * has timed out.
+ */
+int osc_oap_interrupted(const struct lu_env *env, struct osc_async_page *oap)
  {
-        struct osc_async_page *oap;
          struct loi_oap_pages *lop;
          struct lov_oinfo *loi;
+        int rc = -EBUSY;
          ENTRY;
  
-        /* XXX member_of() */
-        oap = list_entry(occ, struct osc_async_page, oap_occ);
-
-        client_obd_list_lock(&oap->oap_cli->cl_loi_list_lock);
-
+        LASSERT(!oap->oap_interrupted);
          oap->oap_interrupted = 1;
  
          /* ok, it's been put in an rpc. only one oap gets a request reference */
          if (oap->oap_request != NULL) {
                  ptlrpc_mark_interrupted(oap->oap_request);
                  ptlrpcd_wake(oap->oap_request);
-                GOTO(unlock, 0);
+                ptlrpc_req_finished(oap->oap_request);
+                oap->oap_request = NULL;
          }
  
-        /* we don't get interruption callbacks until osc_trigger_group_io()
-         * has been called and put the sync oaps in the pending/urgent lists.*/
+        /*
+         * page completion may be called only if ->cpo_prep() was executed by
+         * osc_io_submit(), which also adds the page to the pending list
+         */
          if (!list_empty(&oap->oap_pending_item)) {
                  list_del_init(&oap->oap_pending_item);
                  list_del_init(&oap->oap_urgent_item);
@@ -1943,13 +1835,12 @@ static void osc_occ_interrupted(struct oig_callback_context *occ)
                          &loi->loi_write_lop : &loi->loi_read_lop;
                  lop_update_pending(oap->oap_cli, lop, oap->oap_cmd, -1);
                  loi_list_maint(oap->oap_cli, oap->oap_loi);
-
-                oig_complete_one(oap->oap_oig, &oap->oap_occ, -EINTR);
-                oap->oap_oig = NULL;
+                rc = oap->oap_caller_ops->ap_completion(env,
+                                          oap->oap_caller_data,
+                                          oap->oap_cmd, NULL, -EINTR);
          }
  
-unlock:
-        client_obd_list_unlock(&oap->oap_cli->cl_loi_list_lock);
+        RETURN(rc);
  }
  
  /* this is trying to propogate async writeback errors back up to the
@@ -1974,7 +1865,7 @@ static void osc_process_ar(struct osc_async_rc *ar, __u64 xid,
                  ar->ar_force_sync = 0;
  }
  
-static void osc_oap_to_pending(struct osc_async_page *oap)
+void osc_oap_to_pending(struct osc_async_page *oap)
  {
          struct loi_oap_pages *lop;
  
@@ -1991,7 +1882,8 @@ static void osc_oap_to_pending(struct osc_async_page *oap)
  
  /* this must be called holding the loi list lock to give coverage to exit_cache,
   * async_flag maintenance, and oap_request */
-static void osc_ap_completion(struct client_obd *cli, struct obdo *oa,
+static void osc_ap_completion(const struct lu_env *env,
+                              struct client_obd *cli, struct obdo *oa,
                                struct osc_async_page *oap, int sent, int rc)
  {
          __u64 xid = 0;
@@ -2022,15 +1914,7 @@ static void osc_ap_completion(struct client_obd *cli, struct obdo *oa,
                          oap->oap_loi->loi_lvb.lvb_ctime = oa->o_ctime;
          }
  
-        if (oap->oap_oig) {
-                osc_exit_cache(cli, oap, sent);
-                oig_complete_one(oap->oap_oig, &oap->oap_occ, rc);
-                oap->oap_oig = NULL;
-                EXIT;
-                return;
-        }
-
-        rc = oap->oap_caller_ops->ap_completion(oap->oap_caller_data,
+        rc = oap->oap_caller_ops->ap_completion(env, oap->oap_caller_data,
                                                  oap->oap_cmd, oa, rc);
  
          /* ll_ap_completion (from llite) drops PG_locked. so, a new
@@ -2049,6 +1933,7 @@ static int brw_interpret(const struct lu_env *env,
  {
          struct osc_brw_async_args *aa = data;
          struct client_obd *cli;
+        int async;
          ENTRY;
  
          rc = osc_brw_fini_request(req, rc);
@@ -2071,13 +1956,14 @@ static int brw_interpret(const struct lu_env *env,
          else
                  cli->cl_r_in_flight--;
  
-        if (!list_empty(&aa->aa_oaps)) { /* from osc_send_oap_rpc() */
+        async = list_empty(&aa->aa_oaps);
+        if (!async) { /* from osc_send_oap_rpc() */
                  struct osc_async_page *oap, *tmp;
                  /* the caller may re-use the oap after the completion call so
                   * we need to clean it up a little */
                  list_for_each_entry_safe(oap, tmp, &aa->aa_oaps, oap_rpc_item) {
                          list_del_init(&oap->oap_rpc_item);
-                        osc_ap_completion(cli, aa->aa_oa, oap, 1, rc);
+                        osc_ap_completion(env, cli, aa->aa_oa, oap, 1, rc);
                  }
                  OBDO_FREE(aa->aa_oa);
          } else { /* from async_internal() */
@@ -2086,14 +1972,16 @@ static int brw_interpret(const struct lu_env *env,
                          osc_release_write_grant(aa->aa_cli, aa->aa_ppga[i], 1);
          }
          osc_wake_cache_waiters(cli);
-        osc_check_rpcs(cli);
+        osc_check_rpcs(env, cli);
          client_obd_list_unlock(&cli->cl_loi_list_lock);
-
+        if (!async)
+                cl_req_completion(env, aa->aa_clerq, rc);
          osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
          RETURN(rc);
  }
  
-static struct ptlrpc_request *osc_build_req(struct client_obd *cli,
+static struct ptlrpc_request *osc_build_req(const struct lu_env *env,
+                                            struct client_obd *cli,
                                              struct list_head *rpc_list,
                                              int page_count, int cmd)
  {
@@ -2101,19 +1989,24 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli,
          struct brw_page **pga = NULL;
          struct osc_brw_async_args *aa;
          struct obdo *oa = NULL;
-        struct obd_async_page_ops *ops = NULL;
+        const struct obd_async_page_ops *ops = NULL;
          void *caller_data = NULL;
-        struct obd_capa *ocapa;
          struct osc_async_page *oap;
+        struct osc_async_page *tmp;
+        struct ost_body *body;
+        struct cl_req *clerq = NULL;
+        enum cl_req_type crt = (cmd & OBD_BRW_WRITE) ? CRT_WRITE : CRT_READ;
          struct ldlm_lock *lock = NULL;
+        struct cl_req_attr crattr;
          int i, rc;
  
          ENTRY;
          LASSERT(!list_empty(rpc_list));
  
+        memset(&crattr, 0, sizeof crattr);
          OBD_ALLOC(pga, sizeof(*pga) * page_count);
          if (pga == NULL)
-                RETURN(ERR_PTR(-ENOMEM));
+                GOTO(out, req = ERR_PTR(-ENOMEM));
  
          OBDO_ALLOC(oa);
          if (oa == NULL)
@@ -2121,9 +2014,16 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli,
  
          i = 0;
          list_for_each_entry(oap, rpc_list, oap_rpc_item) {
+                struct cl_page *page = osc_oap2cl_page(oap);
                  if (ops == NULL) {
                          ops = oap->oap_caller_ops;
                          caller_data = oap->oap_caller_data;
+
+                        clerq = cl_req_alloc(env, page, crt,
+                                             1 /* only 1-object rpcs for
+                                                * now */);
+                        if (IS_ERR(clerq))
+                                GOTO(out, req = (void *)clerq);
                          lock = oap->oap_ldlm_lock;
                  }
                  pga[i] = &oap->oap_brw_page;
@@ -2131,21 +2031,28 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli,
                  CDEBUG(0, "put page %p index %lu oap %p flg %x to pga\n",
                         pga[i]->pg, cfs_page_index(oap->oap_page), oap, pga[i]->flag);
                  i++;
+                cl_req_page_add(env, clerq, page);
          }
  
          /* always get the data for the obdo for the rpc */
          LASSERT(ops != NULL);
-        ops->ap_fill_obdo(caller_data, cmd, oa);
-        ocapa = ops->ap_lookup_capa(caller_data, cmd);
+        crattr.cra_oa = oa;
+        crattr.cra_capa = NULL;
+        cl_req_attr_set(env, clerq, &crattr, ~0ULL);
          if (lock) {
                  oa->o_handle = lock->l_remote_handle;
                  oa->o_valid |= OBD_MD_FLHANDLE;
          }
  
+        rc = cl_req_prep(env, clerq);
+        if (rc != 0) {
+                CERROR("cl_req_prep failed: %d\n", rc);
+                GOTO(out, req = ERR_PTR(rc));
+        }
+
          sort_brw_pages(pga, page_count);
          rc = osc_brw_prep_request(cmd, cli, oa, NULL, page_count,
-                                  pga, &req, ocapa);
-        capa_put(ocapa);
+                                  pga, &req, crattr.cra_capa);
          if (rc != 0) {
                  CERROR("prep_req failed: %d\n", rc);
                  GOTO(out, req = ERR_PTR(rc));
@@ -2156,27 +2063,45 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli,
           * later setattr before earlier BRW (as determined by the request xid),
           * the OST will not use BRW timestamps.  Sadly, there is no obvious
           * way to do this in a single call.  bug 10150 */
-        ops->ap_update_obdo(caller_data, cmd, oa,
-                            OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME);
+        body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY);
+        cl_req_attr_set(env, clerq, &crattr,
+                        OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME);
  
          CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
          aa = ptlrpc_req_async_args(req);
          CFS_INIT_LIST_HEAD(&aa->aa_oaps);
          list_splice(rpc_list, &aa->aa_oaps);
          CFS_INIT_LIST_HEAD(rpc_list);
-
+        aa->aa_clerq = clerq;
  out:
+        capa_put(crattr.cra_capa);
          if (IS_ERR(req)) {
                  if (oa)
                          OBDO_FREE(oa);
                  if (pga)
                          OBD_FREE(pga, sizeof(*pga) * page_count);
+                /* this should happen rarely and is pretty bad, it makes the
+                 * pending list not follow the dirty order */
+                client_obd_list_lock(&cli->cl_loi_list_lock);
+                list_for_each_entry_safe(oap, tmp, rpc_list, oap_rpc_item) {
+                        list_del_init(&oap->oap_rpc_item);
+
+                        /* queued sync pages can be torn down while the pages
+                         * were between the pending list and the rpc */
+                        if (oap->oap_interrupted) {
+                                CDEBUG(D_INODE, "oap %p interrupted\n", oap);
+                                osc_ap_completion(env, cli, NULL, oap, 0,
+                                                  oap->oap_count);
+                                continue;
+                        }
+                        osc_ap_completion(env, cli, NULL, oap, 0, PTR_ERR(req));
+                }
+                if (clerq && !IS_ERR(clerq))
+                        cl_req_completion(env, clerq, PTR_ERR(req));
          }
          RETURN(req);
  }
  
-/* the loi lock is held across this function but it's allowed to release
- * and reacquire it during its work */
  /**
   * prepare pages for ASYNC io and put pages in send queue.
   *
@@ -2188,18 +2113,21 @@ out:
   * \return zero if pages successfully add to send queue.
   * \return not zere if error occurring.
   */
-static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
-                            int cmd, struct loi_oap_pages *lop)
+static int
+osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
+                 struct lov_oinfo *loi,
+                 int cmd, struct loi_oap_pages *lop)
  {
          struct ptlrpc_request *req;
          obd_count page_count = 0;
          struct osc_async_page *oap = NULL, *tmp;
          struct osc_brw_async_args *aa;
-        struct obd_async_page_ops *ops;
+        const struct obd_async_page_ops *ops;
          CFS_LIST_HEAD(rpc_list);
          unsigned int ending_offset;
          unsigned  starting_offset = 0;
          int srvlock = 0;
+        struct cl_object *clob = NULL;
          ENTRY;
  
          /* first we find the pages we're allowed to work with */
@@ -2209,6 +2137,13 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
  
                  LASSERT(oap->oap_magic == OAP_MAGIC);
  
+                if (clob == NULL) {
+                        /* pin object in memory, so that completion call-backs
+                         * can be safely called under client_obd_list lock. */
+                        clob = osc_oap2cl_page(oap)->cp_obj;
+                        cl_object_get(clob);
+                }
+
                  if (page_count != 0 &&
                      srvlock != !!(oap->oap_brw_flags & OBD_BRW_SRVLOCK)) {
                          CDEBUG(D_PAGE, "SRVLOCK flag mismatch,"
@@ -2226,7 +2161,8 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
                   * will still be on the dirty list).  we could call in
                   * at the end of ll_file_write to process the queue again. */
                  if (!(oap->oap_async_flags & ASYNC_READY)) {
-                        int rc = ops->ap_make_ready(oap->oap_caller_data, cmd);
+                        int rc = ops->ap_make_ready(env, oap->oap_caller_data,
+                                                    cmd);
                          if (rc < 0)
                                  CDEBUG(D_INODE, "oap %p page %p returned %d "
                                                  "instead of ready\n", oap,
@@ -2264,11 +2200,20 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
                   * ->ap_make_ready() or by higher layers.
                   */
  #if defined(__KERNEL__) && defined(__linux__)
-                 if(!(PageLocked(oap->oap_page) &&
-                     (CheckWriteback(oap->oap_page, cmd) || oap->oap_oig !=NULL))) {
-                       CDEBUG(D_PAGE, "page %p lost wb %lx/%x\n",
-                               oap->oap_page, (long)oap->oap_page->flags, oap->oap_async_flags);
-                        LBUG();
+                {
+                        struct cl_page *page;
+
+                        page = osc_oap2cl_page(oap);
+
+                        if (page->cp_type == CPT_CACHEABLE &&
+                            !(PageLocked(oap->oap_page) &&
+                              (CheckWriteback(oap->oap_page, cmd)))) {
+                                CDEBUG(D_PAGE, "page %p lost wb %lx/%x\n",
+                                       oap->oap_page,
+                                       (long)oap->oap_page->flags,
+                                       oap->oap_async_flags);
+                                LBUG();
+                        }
                  }
  #endif
                  /* If there is a gap at the start of this page, it can't merge
@@ -2287,13 +2232,17 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
                                            (PTLRPC_MAX_BRW_SIZE - 1);
  
                  /* ask the caller for the size of the io as the rpc leaves. */
-                if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE))
+                if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) {
                          oap->oap_count =
-                                ops->ap_refresh_count(oap->oap_caller_data,cmd);
+                                ops->ap_refresh_count(env, oap->oap_caller_data,
+                                                      cmd);
+                        LASSERT(oap->oap_page_off + oap->oap_count <= CFS_PAGE_SIZE);
+                }
                  if (oap->oap_count <= 0) {
                          CDEBUG(D_CACHE, "oap %p count %d, completing\n", oap,
                                 oap->oap_count);
-                        osc_ap_completion(cli, NULL, oap, 0, oap->oap_count);
+                        osc_ap_completion(env, cli, NULL,
+                                          oap, 0, oap->oap_count);
                          continue;
                  }
  
@@ -2322,31 +2271,21 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
  
          osc_wake_cache_waiters(cli);
  
-        if (page_count == 0)
-                RETURN(0);
-
          loi_list_maint(cli, loi);
  
          client_obd_list_unlock(&cli->cl_loi_list_lock);
  
-        req = osc_build_req(cli, &rpc_list, page_count, cmd);
-        if (IS_ERR(req)) {
-                /* this should happen rarely and is pretty bad, it makes the
-                 * pending list not follow the dirty order */
+        if (clob != NULL)
+                cl_object_put(env, clob);
+
+        if (page_count == 0) {
                  client_obd_list_lock(&cli->cl_loi_list_lock);
-                list_for_each_entry_safe(oap, tmp, &rpc_list, oap_rpc_item) {
-                        list_del_init(&oap->oap_rpc_item);
+                RETURN(0);
+        }
  
-                        /* queued sync pages can be torn down while the pages
-                         * were between the pending list and the rpc */
-                        if (oap->oap_interrupted) {
-                                CDEBUG(D_INODE, "oap %p interrupted\n", oap);
-                                osc_ap_completion(cli, NULL, oap, 0,
-                                                  oap->oap_count);
-                                continue;
-                        }
-                        osc_ap_completion(cli, NULL, oap, 0, PTR_ERR(req));
-                }
+        req = osc_build_req(env, cli, &rpc_list, page_count, cmd);
+        if (IS_ERR(req)) {
+                LASSERT(list_empty(&rpc_list));
                  loi_list_maint(cli, loi);
                  RETURN(PTR_ERR(req));
          }
@@ -2394,7 +2333,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
                    page_count, aa, cli->cl_r_in_flight, cli->cl_w_in_flight);
  
          req->rq_interpret_reply = brw_interpret;
-        ptlrpcd_add_req(req);
+        ptlrpcd_add_req(req, PSCOPE_BRW);
          RETURN(1);
  }
  
@@ -2441,7 +2380,7 @@ struct lov_oinfo *osc_next_loi(struct client_obd *cli)
  }
  
  /* called with the loi list lock held */
-static void osc_check_rpcs(struct client_obd *cli)
+void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli)
  {
          struct lov_oinfo *loi;
          int rc = 0, race_counter = 0;
@@ -2460,7 +2399,7 @@ static void osc_check_rpcs(struct client_obd *cli)
                   * partial read pending queue when we're given this object to
                   * do io on writes while there are cache waiters */
                  if (lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE)) {
-                        rc = osc_send_oap_rpc(cli, loi, OBD_BRW_WRITE,
+                        rc = osc_send_oap_rpc(env, cli, loi, OBD_BRW_WRITE,
                                                &loi->loi_write_lop);
                          if (rc < 0)
                                  break;
@@ -2470,7 +2409,7 @@ static void osc_check_rpcs(struct client_obd *cli)
                                  race_counter++;
                  }
                  if (lop_makes_rpc(cli, &loi->loi_read_lop, OBD_BRW_READ)) {
-                        rc = osc_send_oap_rpc(cli, loi, OBD_BRW_READ,
+                        rc = osc_send_oap_rpc(env, cli, loi, OBD_BRW_READ,
                                                &loi->loi_read_lop);
                          if (rc < 0)
                                  break;
@@ -2520,9 +2459,32 @@ static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw)
          RETURN(rc);
  };
  
+/**
+ * Non-blocking version of osc_enter_cache(): consumes one page of grant only
+ * if it is available. Returns non-zero iff grant was consumed.
+ */
+int osc_enter_cache_try(const struct lu_env *env,
+                        struct client_obd *cli, struct lov_oinfo *loi,
+                        struct osc_async_page *oap, int transient)
+{
+        int has_grant;
+
+        has_grant = cli->cl_avail_grant >= CFS_PAGE_SIZE;
+        if (has_grant) {
+                osc_consume_write_grant(cli, &oap->oap_brw_page);
+                if (transient) {
+                        cli->cl_dirty_transit += CFS_PAGE_SIZE;
+                        atomic_inc(&obd_dirty_transit_pages);
+                        oap->oap_brw_flags |= OBD_BRW_NOCACHE;
+                }
+        }
+        return has_grant;
+}
+
  /* Caller must hold loi_list_lock - we drop/regain it if we need to wait for
   * grant or cache space. */
-static int osc_enter_cache(struct client_obd *cli, struct lov_oinfo *loi,
+static int osc_enter_cache(const struct lu_env *env,
+                           struct client_obd *cli, struct lov_oinfo *loi,
                             struct osc_async_page *oap)
  {
          struct osc_cache_waiter ocw;
@@ -2542,13 +2504,10 @@ static int osc_enter_cache(struct client_obd *cli, struct lov_oinfo *loi,
                  RETURN(-EDQUOT);
  
          /* Hopefully normal case - cache space and write credits available */
-        if ((cli->cl_dirty + CFS_PAGE_SIZE <= cli->cl_dirty_max) &&
-            (atomic_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages) &&
-            (cli->cl_avail_grant >= CFS_PAGE_SIZE)) {
-                /* account for ourselves */
-                osc_consume_write_grant(cli, &oap->oap_brw_page);
+        if (cli->cl_dirty + CFS_PAGE_SIZE <= cli->cl_dirty_max &&
+            atomic_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages &&
+            osc_enter_cache_try(env, cli, loi, oap, 0))
                  RETURN(0);
-        }
  
          /* Make sure that there are write rpcs in flight to wait for.  This
           * is a little silly as this object may not have any pending but
@@ -2560,7 +2519,7 @@ static int osc_enter_cache(struct client_obd *cli, struct lov_oinfo *loi,
                  ocw.ocw_rc = 0;
  
                  loi_list_maint(cli, loi);
-                osc_check_rpcs(cli);
+                osc_check_rpcs(env, cli);
                  client_obd_list_unlock(&cli->cl_loi_list_lock);
  
                  CDEBUG(D_CACHE, "sleeping for cache space\n");
@@ -2577,84 +2536,15 @@ static int osc_enter_cache(struct client_obd *cli, struct lov_oinfo *loi,
          RETURN(-EDQUOT);
  }
  
-/**
- * Checks if requested extent lock is compatible with a lock under the page.
- *
- * Checks if the lock under \a page is compatible with a read or write lock
- * (specified by \a rw) for an extent [\a start , \a end].
- *
- * \param exp osc export
- * \param lsm striping information for the file
- * \param res osc_async_page placeholder
- * \param rw OBD_BRW_READ if requested for reading,
- *           OBD_BRW_WRITE if requested for writing
- * \param start start of the requested extent
- * \param end end of the requested extent
- * \param cookie transparent parameter for passing locking context
- *
- * \post result == 1, *cookie == context, appropriate lock is referenced or
- * \post result == 0
- *
- * \retval 1 owned lock is reused for the request
- * \retval 0 no lock reused for the request
- *
- * \see osc_release_short_lock
- */
-static int osc_reget_short_lock(struct obd_export *exp,
-                                struct lov_stripe_md *lsm,
-                                void **res, int rw,
-                                obd_off start, obd_off end,
-                                void **cookie)
-{
-        struct osc_async_page *oap = *res;
-        int rc;
-
-        ENTRY;
-
-        spin_lock(&oap->oap_lock);
-        rc = ldlm_lock_fast_match(oap->oap_ldlm_lock, rw,
-                                  start, end, cookie);
-        spin_unlock(&oap->oap_lock);
-
-        RETURN(rc);
-}
-
-/**
- * Releases a reference to a lock taken in a "fast" way.
- *
- * Releases a read or a write (specified by \a rw) lock
- * referenced by \a cookie.
- *
- * \param exp osc export
- * \param lsm striping information for the file
- * \param end end of the locked extent
- * \param rw OBD_BRW_READ if requested for reading,
- *           OBD_BRW_WRITE if requested for writing
- * \param cookie transparent parameter for passing locking context
- *
- * \post appropriate lock is dereferenced
- *
- * \see osc_reget_short_lock
- */
-static int osc_release_short_lock(struct obd_export *exp,
-                                  struct lov_stripe_md *lsm, obd_off end,
-                                  void *cookie, int rw)
-{
-        ENTRY;
-        ldlm_lock_fast_release(cookie, rw);
-        /* no error could have happened at this layer */
-        RETURN(0);
-}
  
  int osc_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm,
                          struct lov_oinfo *loi, cfs_page_t *page,
-                        obd_off offset, struct obd_async_page_ops *ops,
+                        obd_off offset, const struct obd_async_page_ops *ops,
                          void *data, void **res, int nocache,
                          struct lustre_handle *lockh)
  {
          struct osc_async_page *oap;
-        struct ldlm_res_id oid;
-        int rc = 0;
+
          ENTRY;
  
          if (!page)
@@ -2671,27 +2561,14 @@ int osc_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm,
          oap->oap_page = page;
          oap->oap_obj_off = offset;
  
+        LASSERT(!(offset & ~CFS_PAGE_MASK));
+
          CFS_INIT_LIST_HEAD(&oap->oap_pending_item);
          CFS_INIT_LIST_HEAD(&oap->oap_urgent_item);
          CFS_INIT_LIST_HEAD(&oap->oap_rpc_item);
          CFS_INIT_LIST_HEAD(&oap->oap_page_list);
  
-        oap->oap_occ.occ_interrupted = osc_occ_interrupted;
-
          spin_lock_init(&oap->oap_lock);
-
-        /* If the page was marked as notcacheable - don't add to any locks */
-        if (!nocache) {
-                osc_build_res_name(loi->loi_id, loi->loi_gr, &oid);
-                /* This is the only place where we can call cache_add_extent
-                   without oap_lock, because this page is locked now, and
-                   the lock we are adding it to is referenced, so cannot lose
-                   any pages either. */
-                rc = cache_add_extent(oap->oap_cli->cl_cache, &oid, oap, lockh);
-                if (rc)
-                        RETURN(rc);
-        }
-
          CDEBUG(D_CACHE, "oap %p page %p obj off "LPU64"\n", oap, page, offset);
          RETURN(0);
  }
@@ -2704,10 +2581,11 @@ struct osc_async_page *oap_from_cookie(void *cookie)
          return oap;
  };
  
-static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm,
-                              struct lov_oinfo *loi, void *cookie,
-                              int cmd, obd_off off, int count,
-                              obd_flag brw_flags, enum async_flags async_flags)
+int osc_queue_async_io(const struct lu_env *env,
+                       struct obd_export *exp, struct lov_stripe_md *lsm,
+                       struct lov_oinfo *loi, void *cookie,
+                       int cmd, obd_off off, int count,
+                       obd_flag brw_flags, enum async_flags async_flags)
  {
          struct client_obd *cli = &exp->exp_obd->u.cli;
          struct osc_async_page *oap;
@@ -2728,21 +2606,19 @@ static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm,
  
          /* check if the file's owner/group is over quota */
  #ifdef HAVE_QUOTA_SUPPORT
-        if ((cmd & OBD_BRW_WRITE) && !(cmd & OBD_BRW_NOQUOTA)){
-                struct obd_async_page_ops *ops;
-                struct obdo *oa;
+        if ((cmd & OBD_BRW_WRITE) && !(cmd & OBD_BRW_NOQUOTA)) {
+                struct cl_object *obj;
+                struct cl_attr    attr; /* XXX put attr into thread info */
  
-                OBDO_ALLOC(oa);
-                if (oa == NULL)
-                        RETURN(-ENOMEM);
+                obj = cl_object_top(osc_oap2cl_page(oap)->cp_obj);
  
-                ops = oap->oap_caller_ops;
-                ops->ap_fill_obdo(oap->oap_caller_data, cmd, oa);
-                if (lquota_chkdq(quota_interface, cli, oa->o_uid, oa->o_gid) ==
-                    NO_QUOTA)
-                        rc = -EDQUOT;
+                cl_object_attr_lock(obj);
+                rc = cl_object_attr_get(env, obj, &attr);
+                cl_object_attr_unlock(obj);
  
-                OBDO_FREE(oa);
+                if (rc == 0 && lquota_chkdq(quota_interface, cli, attr.cat_uid,
+                                            attr.cat_gid) == NO_QUOTA)
+                        rc = -EDQUOT;
                  if (rc)
                          RETURN(rc);
          }
@@ -2753,6 +2629,7 @@ static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm,
  
          client_obd_list_lock(&cli->cl_loi_list_lock);
  
+        LASSERT(off + count <= CFS_PAGE_SIZE);
          oap->oap_cmd = cmd;
          oap->oap_page_off = off;
          oap->oap_count = count;
@@ -2760,7 +2637,7 @@ static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm,
          oap->oap_async_flags = async_flags;
  
          if (cmd & OBD_BRW_WRITE) {
-                rc = osc_enter_cache(cli, loi, oap);
+                rc = osc_enter_cache(env, cli, loi, oap);
                  if (rc) {
                          client_obd_list_unlock(&cli->cl_loi_list_lock);
                          RETURN(rc);
@@ -2773,7 +2650,7 @@ static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm,
          LOI_DEBUG(loi, "oap %p page %p added for cmd %d\n", oap, oap->oap_page,
                    cmd);
  
-        osc_check_rpcs(cli);
+        osc_check_rpcs(env, cli);
          client_obd_list_unlock(&cli->cl_loi_list_lock);
  
          RETURN(0);
@@ -2782,50 +2659,27 @@ static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm,
  /* aka (~was & now & flag), but this is more clear :) */
  #define SETTING(was, now, flag) (!(was & flag) && (now & flag))
  
-static int osc_set_async_flags(struct obd_export *exp,
-                               struct lov_stripe_md *lsm,
-                               struct lov_oinfo *loi, void *cookie,
-                               obd_flag async_flags)
+int osc_set_async_flags_base(struct client_obd *cli,
+                             struct lov_oinfo *loi, struct osc_async_page *oap,
+                             obd_flag async_flags)
  {
-        struct client_obd *cli = &exp->exp_obd->u.cli;
          struct loi_oap_pages *lop;
-        struct osc_async_page *oap;
-        int rc = 0;
          ENTRY;
  
-        oap = oap_from_cookie(cookie);
-        if (IS_ERR(oap))
-                RETURN(PTR_ERR(oap));
-
-        /*
-         * bug 7311: OST-side locking is only supported for liblustre for now
-         * (and liblustre never calls obd_set_async_flags(). I hope.), generic
-         * implementation has to handle case where OST-locked page was picked
-         * up by, e.g., ->writepage().
-         */
-        LASSERT(!(oap->oap_brw_flags & OBD_BRW_SRVLOCK));
-        LASSERT(!LIBLUSTRE_CLIENT); /* check that liblustre angels do fear to
-                                     * tread here. */
-
          if (cli->cl_import == NULL || cli->cl_import->imp_invalid)
                  RETURN(-EIO);
  
-        if (loi == NULL)
-                loi = lsm->lsm_oinfo[0];
-
          if (oap->oap_cmd & OBD_BRW_WRITE) {
                  lop = &loi->loi_write_lop;
          } else {
                  lop = &loi->loi_read_lop;
          }
  
-        client_obd_list_lock(&cli->cl_loi_list_lock);
-
          if (list_empty(&oap->oap_pending_item))
-                GOTO(out, rc = -EINVAL);
+                RETURN(-EINVAL);
  
          if ((oap->oap_async_flags & async_flags) == async_flags)
-                GOTO(out, rc = 0);
+                RETURN(0);
  
          if (SETTING(oap->oap_async_flags, async_flags, ASYNC_READY))
                  oap->oap_async_flags |= ASYNC_READY;
@@ -2839,106 +2693,12 @@ static int osc_set_async_flags(struct obd_export *exp,
  
          LOI_DEBUG(loi, "oap %p page %p has flags %x\n", oap, oap->oap_page,
                          oap->oap_async_flags);
-out:
-        osc_check_rpcs(cli);
-        client_obd_list_unlock(&cli->cl_loi_list_lock);
-        RETURN(rc);
-}
-
-static int osc_queue_group_io(struct obd_export *exp, struct lov_stripe_md *lsm,
-                             struct lov_oinfo *loi,
-                             struct obd_io_group *oig, void *cookie,
-                             int cmd, obd_off off, int count,
-                             obd_flag brw_flags,
-                             obd_flag async_flags)
-{
-        struct client_obd *cli = &exp->exp_obd->u.cli;
-        struct osc_async_page *oap;
-        struct loi_oap_pages *lop;
-        int rc = 0;
-        ENTRY;
-
-        oap = oap_from_cookie(cookie);
-        if (IS_ERR(oap))
-                RETURN(PTR_ERR(oap));
-
-        if (cli->cl_import == NULL || cli->cl_import->imp_invalid)
-                RETURN(-EIO);
-
-        if (!list_empty(&oap->oap_pending_item) ||
-            !list_empty(&oap->oap_urgent_item) ||
-            !list_empty(&oap->oap_rpc_item))
-                RETURN(-EBUSY);
-
-        if (loi == NULL)
-                loi = lsm->lsm_oinfo[0];
-
-        client_obd_list_lock(&cli->cl_loi_list_lock);
-
-        oap->oap_cmd = cmd;
-        oap->oap_page_off = off;
-        oap->oap_count = count;
-        oap->oap_brw_flags = brw_flags;
-        oap->oap_async_flags = async_flags;
-
-        if (cmd & OBD_BRW_WRITE)
-                lop = &loi->loi_write_lop;
-        else
-                lop = &loi->loi_read_lop;
-
-        list_add_tail(&oap->oap_pending_item, &lop->lop_pending_group);
-        if (oap->oap_async_flags & ASYNC_GROUP_SYNC) {
-                oap->oap_oig = oig;
-                rc = oig_add_one(oig, &oap->oap_occ);
-        }
-
-        LOI_DEBUG(loi, "oap %p page %p on group pending: rc %d\n",
-                  oap, oap->oap_page, rc);
-
-        client_obd_list_unlock(&cli->cl_loi_list_lock);
-
-        RETURN(rc);
-}
-
-static void osc_group_to_pending(struct client_obd *cli, struct lov_oinfo *loi,
-                                 struct loi_oap_pages *lop, int cmd)
-{
-        struct list_head *pos, *tmp;
-        struct osc_async_page *oap;
-
-        list_for_each_safe(pos, tmp, &lop->lop_pending_group) {
-                oap = list_entry(pos, struct osc_async_page, oap_pending_item);
-                list_del(&oap->oap_pending_item);
-                osc_oap_to_pending(oap);
-        }
-        loi_list_maint(cli, loi);
-}
-
-static int osc_trigger_group_io(struct obd_export *exp,
-                                struct lov_stripe_md *lsm,
-                                struct lov_oinfo *loi,
-                                struct obd_io_group *oig)
-{
-        struct client_obd *cli = &exp->exp_obd->u.cli;
-        ENTRY;
-
-        if (loi == NULL)
-                loi = lsm->lsm_oinfo[0];
-
-        client_obd_list_lock(&cli->cl_loi_list_lock);
-
-        osc_group_to_pending(cli, loi, &loi->loi_write_lop, OBD_BRW_WRITE);
-        osc_group_to_pending(cli, loi, &loi->loi_read_lop, OBD_BRW_READ);
-
-        osc_check_rpcs(cli);
-        client_obd_list_unlock(&cli->cl_loi_list_lock);
-
          RETURN(0);
  }
  
-static int osc_teardown_async_page(struct obd_export *exp,
-                                   struct lov_stripe_md *lsm,
-                                   struct lov_oinfo *loi, void *cookie)
+int osc_teardown_async_page(struct obd_export *exp,
+                            struct lov_stripe_md *lsm,
+                            struct lov_oinfo *loi, void *cookie)
  {
          struct client_obd *cli = &exp->exp_obd->u.cli;
          struct loi_oap_pages *lop;
@@ -2976,85 +2736,44 @@ static int osc_teardown_async_page(struct obd_export *exp,
                  lop_update_pending(cli, lop, oap->oap_cmd, -1);
          }
          loi_list_maint(cli, loi);
-        cache_remove_extent(cli->cl_cache, oap);
-
          LOI_DEBUG(loi, "oap %p page %p torn down\n", oap, oap->oap_page);
  out:
          client_obd_list_unlock(&cli->cl_loi_list_lock);
          RETURN(rc);
  }
  
-int osc_extent_blocking_cb(struct ldlm_lock *lock,
-                           struct ldlm_lock_desc *new, void *data,
-                           int flag)
+/**
+ * Stores einfo->ei_cbdata as the AST data (l_ast_data) of \a lock.
+ *
+ * Asserts that \a lock is non-NULL and that its blocking, completion and
+ * glimpse ASTs and its resource type equal the ones recorded in \a einfo.
+ * The store itself is done between lock_res_and_lock() and
+ * unlock_res_and_lock(), with the osc_ast_guard spinlock held, after
+ * asserting that l_ast_data is either NULL or already equal to the new
+ * value.
+ *
+ * \param lock  lock to update
+ * \param einfo enqueue info supplying the callbacks, type and cbdata
+ * \param flags not used in the body
+ */
+static void osc_set_lock_data_with_check(struct ldlm_lock *lock,
+                                         struct ldlm_enqueue_info *einfo,
+                                         int flags)
  {
-        struct lustre_handle lockh = { 0 };
-        int rc;
-        ENTRY;
-
-        if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) {
-                LDLM_ERROR(lock, "cancelling lock with bad data %p", data);
-                LBUG();
-        }
+        void *data = einfo->ei_cbdata;
  
-        switch (flag) {
-        case LDLM_CB_BLOCKING:
-                ldlm_lock2handle(lock, &lockh);
-                rc = ldlm_cli_cancel(&lockh);
-                if (rc != ELDLM_OK)
-                        CERROR("ldlm_cli_cancel failed: %d\n", rc);
-                break;
-        case LDLM_CB_CANCELING: {
-
-                ldlm_lock2handle(lock, &lockh);
-                /* This lock wasn't granted, don't try to do anything */
-                if (lock->l_req_mode != lock->l_granted_mode)
-                        RETURN(0);
+        LASSERT(lock != NULL);
+        LASSERT(lock->l_blocking_ast == einfo->ei_cb_bl);
+        LASSERT(lock->l_resource->lr_type == einfo->ei_type);
+        LASSERT(lock->l_completion_ast == einfo->ei_cb_cp);
+        LASSERT(lock->l_glimpse_ast == einfo->ei_cb_gl);
  
-                cache_remove_lock(lock->l_conn_export->exp_obd->u.cli.cl_cache,
-                                  &lockh);
-
-                if (lock->l_conn_export->exp_obd->u.cli.cl_ext_lock_cancel_cb)
-                        lock->l_conn_export->exp_obd->u.cli.cl_ext_lock_cancel_cb(
-                                                          lock, new, data,flag);
-                break;
-        }
-        default:
-                LBUG();
-        }
-
-        RETURN(0);
+        lock_res_and_lock(lock);
+        spin_lock(&osc_ast_guard);
+        LASSERT(lock->l_ast_data == NULL || lock->l_ast_data == data);
+        lock->l_ast_data = data;
+        spin_unlock(&osc_ast_guard);
+        unlock_res_and_lock(lock);
  }
-EXPORT_SYMBOL(osc_extent_blocking_cb);
  
+/**
+ * Handle-based wrapper around osc_set_lock_data_with_check().
+ *
+ * Looks up the lock for \a lockh with ldlm_handle2lock(). When a lock is
+ * found, \a einfo and \a flags are passed on to
+ * osc_set_lock_data_with_check() and the looked-up lock is released with
+ * LDLM_LOCK_PUT(). When no lock is found, only an error is logged.
+ *
+ * \param lockh handle of the lock to update
+ * \param einfo enqueue info whose ei_cbdata is to be attached
+ * \param flags passed through unchanged
+ */
-static void osc_set_data_with_check(struct lustre_handle *lockh, void *data,
+static void osc_set_data_with_check(struct lustre_handle *lockh,
+                                    struct ldlm_enqueue_info *einfo,
                                      int flags)
  {
          struct ldlm_lock *lock = ldlm_handle2lock(lockh);
  
-        if (lock == NULL) {
-                CERROR("lockh %p, data %p - client evicted?\n", lockh, data);
-                return;
-        }
-        lock_res_and_lock(lock);
-#if defined (__KERNEL__) && defined (__linux__)
-        /* Liang XXX: Darwin and Winnt checking should be added */
-        if (lock->l_ast_data && lock->l_ast_data != data) {
-                struct inode *new_inode = data;
-                struct inode *old_inode = lock->l_ast_data;
-                if (!(old_inode->i_state & I_FREEING))
-                        LDLM_ERROR(lock, "inconsistent l_ast_data found");
-                LASSERTF(old_inode->i_state & I_FREEING,
-                         "Found existing inode %p/%lu/%u state %lu in lock: "
-                         "setting data to %p/%lu/%u\n", old_inode,
-                         old_inode->i_ino, old_inode->i_generation,
-                         old_inode->i_state,
-                         new_inode, new_inode->i_ino, new_inode->i_generation);
-        }
-#endif
-        lock->l_ast_data = data;
-        unlock_res_and_lock(lock);
-        LDLM_LOCK_PUT(lock);
+        if (lock != NULL) {
+                osc_set_lock_data_with_check(lock, einfo, flags);
+                LDLM_LOCK_PUT(lock);
+        } else
+                CERROR("lockh %p, data %p - client evicted?\n",
+                       lockh, einfo->ei_cbdata);
  }
  
  static int osc_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm,
@@ -3068,9 +2787,11 @@ static int osc_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm,
          return 0;
  }
  
-static int osc_enqueue_fini(struct obd_device *obd, struct ptlrpc_request *req,
-                            struct obd_info *oinfo, int intent, int rc)
+static int osc_enqueue_fini(struct ptlrpc_request *req, struct ost_lvb *lvb,
+                            obd_enqueue_update_f upcall, void *cookie,
+                            int *flags, int rc)
  {
+        int intent = *flags & LDLM_FL_HAS_INTENT;
          ENTRY;
  
          if (intent) {
@@ -3087,17 +2808,13 @@ static int osc_enqueue_fini(struct obd_device *obd, struct ptlrpc_request *req,
          }
  
          if ((intent && rc == ELDLM_LOCK_ABORTED) || !rc) {
+                *flags |= LDLM_FL_LVB_READY;
                  CDEBUG(D_INODE,"got kms "LPU64" blocks "LPU64" mtime "LPU64"\n",
-                       oinfo->oi_md->lsm_oinfo[0]->loi_lvb.lvb_size,
-                       oinfo->oi_md->lsm_oinfo[0]->loi_lvb.lvb_blocks,
-                       oinfo->oi_md->lsm_oinfo[0]->loi_lvb.lvb_mtime);
+                       lvb->lvb_size, lvb->lvb_blocks, lvb->lvb_mtime);
          }
  
-        if (!rc)
-                cache_add_lock(obd->u.cli.cl_cache, oinfo->oi_lockh);
-
          /* Call the update callback. */
-        rc = oinfo->oi_cb_up(oinfo, rc);
+        rc = (*upcall)(cookie, rc);
          RETURN(rc);
  }
  
@@ -3105,36 +2822,87 @@ static int osc_enqueue_interpret(const struct lu_env *env,
                                   struct ptlrpc_request *req,
                                   struct osc_enqueue_args *aa, int rc)
  {
-        int intent = aa->oa_oi->oi_flags & LDLM_FL_HAS_INTENT;
-        struct lov_stripe_md *lsm = aa->oa_oi->oi_md;
          struct ldlm_lock *lock;
+        struct lustre_handle handle;
+        __u32 mode;
+
+        /* Make a local copy of a lock handle and a mode, because aa->oa_*
+         * might be freed anytime after lock upcall has been called. */
+        lustre_handle_copy(&handle, aa->oa_lockh);
+        mode = aa->oa_ei->ei_mode;
  
          /* ldlm_cli_enqueue is holding a reference on the lock, so it must
           * be valid. */
-        lock = ldlm_handle2lock(aa->oa_oi->oi_lockh);
+        lock = ldlm_handle2lock(&handle);
+
+        /* Take an additional reference so that a blocking AST that
+         * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed
+         * to arrive after an upcall has been executed by
+         * osc_enqueue_fini(). */
+        ldlm_lock_addref(&handle, mode);
  
          /* Complete obtaining the lock procedure. */
          rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_ei->ei_type, 1,
-                                   aa->oa_ei->ei_mode,
-                                   &aa->oa_oi->oi_flags,
-                                   &lsm->lsm_oinfo[0]->loi_lvb,
-                                   sizeof(lsm->lsm_oinfo[0]->loi_lvb),
-                                   lustre_swab_ost_lvb,
-                                   aa->oa_oi->oi_lockh, rc);
-
+                                   mode, aa->oa_flags, aa->oa_lvb,
+                                   sizeof(*aa->oa_lvb), lustre_swab_ost_lvb,
+                                   &handle, rc);
          /* Complete osc stuff. */
-        rc = osc_enqueue_fini(aa->oa_exp->exp_obd, req, aa->oa_oi, intent, rc);
-
+        rc = osc_enqueue_fini(req, aa->oa_lvb,
+                              aa->oa_upcall, aa->oa_cookie, aa->oa_flags, rc);
          /* Release the lock for async request. */
-        if (lustre_handle_is_used(aa->oa_oi->oi_lockh) && rc == ELDLM_OK)
-                ldlm_lock_decref(aa->oa_oi->oi_lockh, aa->oa_ei->ei_mode);
+        if (lustre_handle_is_used(&handle) && rc == ELDLM_OK)
+                /*
+                 * Releases a reference taken by ldlm_cli_enqueue(), if it is
+                 * not already released by
+                 * ldlm_cli_enqueue_fini()->failed_lock_cleanup()
+                 */
+                ldlm_lock_decref(&handle, mode);
  
          LASSERTF(lock != NULL, "lockh %p, req %p, aa %p - client evicted?\n",
-                 aa->oa_oi->oi_lockh, req, aa);
+                 aa->oa_lockh, req, aa);
+        ldlm_lock_decref(&handle, mode);
          LDLM_LOCK_PUT(lock);
          return rc;
  }
  
+/**
+ * Updates \a loi from the LVB delivered by a finished enqueue.
+ *
+ * For ELDLM_OK the LVB is copied into \a loi; the LVB size, clamped to one
+ * byte past the end of the lock extent when it lies beyond that end, is
+ * passed to loi_kms_set() unless it is below the current loi_kms; finally
+ * ldlm_lock_allow_match() is called on the lock. For ELDLM_LOCK_ABORTED
+ * on an intent enqueue only the LVB is copied and loi_kms is left alone.
+ * Any other \a rc leaves \a loi untouched.
+ *
+ * \param lov_lockhp handle of the enqueued lock; must resolve to a lock
+ *                   when \a rc is ELDLM_OK (asserted)
+ * \param loi        object info whose loi_lvb / loi_kms are updated
+ * \param flags      enqueue flags; only LDLM_FL_HAS_INTENT is examined
+ * \param lvb        LVB to copy into \a loi
+ * \param mode       lock mode; not used in the body
+ * \param rc         enqueue result selecting which update is done
+ */
+void osc_update_enqueue(struct lustre_handle *lov_lockhp,
+                        struct lov_oinfo *loi, int flags,
+                        struct ost_lvb *lvb, __u32 mode, int rc)
+{
+        if (rc == ELDLM_OK) {
+                struct ldlm_lock *lock = ldlm_handle2lock(lov_lockhp);
+                __u64 tmp;
+
+                LASSERT(lock != NULL);
+                loi->loi_lvb = *lvb;
+                tmp = loi->loi_lvb.lvb_size;
+                /* Extend KMS up to the end of this lock and no further.
+                 * A lock on [x,y] means a KMS of up to y + 1 bytes! */
+                if (tmp > lock->l_policy_data.l_extent.end)
+                        tmp = lock->l_policy_data.l_extent.end + 1;
+                if (tmp >= loi->loi_kms) {
+                        LDLM_DEBUG(lock, "lock acquired, setting rss="LPU64
+                                   ", kms="LPU64, loi->loi_lvb.lvb_size, tmp);
+                        loi_kms_set(loi, tmp);
+                } else {
+                        LDLM_DEBUG(lock, "lock acquired, setting rss="
+                                   LPU64"; leaving kms="LPU64", end="LPU64,
+                                   loi->loi_lvb.lvb_size, loi->loi_kms,
+                                   lock->l_policy_data.l_extent.end);
+                }
+                ldlm_lock_allow_match(lock);
+                LDLM_LOCK_PUT(lock);
+        } else if (rc == ELDLM_LOCK_ABORTED && (flags & LDLM_FL_HAS_INTENT)) {
+                /* Glimpse: only the LVB is refreshed. rc is a by-value
+                 * argument of a void function, so it is not rewritten. */
+                loi->loi_lvb = *lvb;
+                CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving"
+                       " kms="LPU64"\n", loi->loi_lvb.lvb_size, loi->loi_kms);
+        }
+}
+EXPORT_SYMBOL(osc_update_enqueue);
+
+/* Distinguished pointer value (1) used as a marker: osc_enqueue_base()
+ * checks whether its rqset argument equals PTLRPCD_SET before queueing
+ * the request.
+ * NOTE(review): presumably selects handing the request to ptlrpcd instead
+ * of adding it to a caller-supplied set - confirm against the caller. */
+struct ptlrpc_request_set *PTLRPCD_SET = (void *)1;
+
  /* When enqueuing asynchronously, locks are not ordered, we can obtain a lock
   * from the 2nd OSC before a lock from the 1st one. This does not deadlock with
   * other synchronous requests, however keeping some locks and trying to obtain
@@ -3142,28 +2910,33 @@ static int osc_enqueue_interpret(const struct lu_env *env,
   * when other sync requests do not get released lock from a client, the client
   * is excluded from the cluster -- such scenarious make the life difficult, so
   * release locks just after they are obtained. */
-static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo,
-                       struct ldlm_enqueue_info *einfo,
-                       struct ptlrpc_request_set *rqset)
+int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+                     int *flags, ldlm_policy_data_t *policy,
+                     struct ost_lvb *lvb, int kms_valid,
+                     obd_enqueue_update_f upcall, void *cookie,
+                     struct ldlm_enqueue_info *einfo,
+                     struct lustre_handle *lockh,
+                     struct ptlrpc_request_set *rqset, int async)
  {
-        struct ldlm_res_id res_id;
          struct obd_device *obd = exp->exp_obd;
          struct ptlrpc_request *req = NULL;
-        int intent = oinfo->oi_flags & LDLM_FL_HAS_INTENT;
+        int intent = *flags & LDLM_FL_HAS_INTENT;
          ldlm_mode_t mode;
          int rc;
          ENTRY;
  
-
-        osc_build_res_name(oinfo->oi_md->lsm_object_id,
-                           oinfo->oi_md->lsm_object_gr, &res_id);
          /* Filesystem lock extents are extended to page boundaries so that
           * dealing with the page cache is a little smoother.  */
-        oinfo->oi_policy.l_extent.start -=
-                oinfo->oi_policy.l_extent.start & ~CFS_PAGE_MASK;
-        oinfo->oi_policy.l_extent.end |= ~CFS_PAGE_MASK;
+        policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK;
+        policy->l_extent.end |= ~CFS_PAGE_MASK;
  
-        if (oinfo->oi_md->lsm_oinfo[0]->loi_kms_valid == 0)
+        /*
+         * kms is not valid when either object is completely fresh (so that no
+         * locks are cached), or object was evicted. In the latter case cached
+         * lock cannot be used, because it would prime inode state with
+         * potentially stale LVB.
+         */
+        if (!kms_valid)
                  goto no_match;
  
          /* Next, search for already existing extent locks that will cover us */
@@ -3182,32 +2955,37 @@ static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo,
          if (einfo->ei_mode == LCK_PR)
                  mode |= LCK_PW;
          mode = ldlm_lock_match(obd->obd_namespace,
-                               oinfo->oi_flags | LDLM_FL_LVB_READY, &res_id,
-                               einfo->ei_type, &oinfo->oi_policy, mode,
-                               oinfo->oi_lockh);
+                               *flags | LDLM_FL_LVB_READY, res_id,
+                               einfo->ei_type, policy, mode, lockh, 0);
          if (mode) {
-                /* addref the lock only if not async requests and PW lock is
-                 * matched whereas we asked for PR. */
-                if (!rqset && einfo->ei_mode != mode)
-                        ldlm_lock_addref(oinfo->oi_lockh, LCK_PR);
-                osc_set_data_with_check(oinfo->oi_lockh, einfo->ei_cbdata,
-                                        oinfo->oi_flags);
-                if (intent) {
-                        /* I would like to be able to ASSERT here that rss <=
-                         * kms, but I can't, for reasons which are explained in
-                         * lov_enqueue() */
-                }
-
-                /* We already have a lock, and it's referenced */
-                oinfo->oi_cb_up(oinfo, ELDLM_OK);
+                struct ldlm_lock *matched = ldlm_handle2lock(lockh);
+
+                if (matched->l_ast_data == NULL ||
+                    matched->l_ast_data == einfo->ei_cbdata) {
+                        /* addref the lock only if not async requests and PW
+                         * lock is matched whereas we asked for PR. */
+                        if (!rqset && einfo->ei_mode != mode)
+                                ldlm_lock_addref(lockh, LCK_PR);
+                        osc_set_lock_data_with_check(matched, einfo, *flags);
+                        if (intent) {
+                                /* I would like to be able to ASSERT here that
+                                 * rss <= kms, but I can't, for reasons which
+                                 * are explained in lov_enqueue() */
+                        }
  
-                /* For async requests, decref the lock. */
-                if (einfo->ei_mode != mode)
-                        ldlm_lock_decref(oinfo->oi_lockh, LCK_PW);
-                else if (rqset)
-                        ldlm_lock_decref(oinfo->oi_lockh, einfo->ei_mode);
+                        /* We already have a lock, and it's referenced */
+                        (*upcall)(cookie, ELDLM_OK);
  
-                RETURN(ELDLM_OK);
+                        /* For async requests, decref the lock. */
+                        if (einfo->ei_mode != mode)
+                                ldlm_lock_decref(lockh, LCK_PW);
+                        else if (rqset)
+                                ldlm_lock_decref(lockh, einfo->ei_mode);
+                        LDLM_LOCK_PUT(matched);
+                        RETURN(ELDLM_OK);
+                } else
+                        ldlm_lock_decref(lockh, mode);
+                LDLM_LOCK_PUT(matched);
          }
  
   no_match:
@@ -3223,56 +3001,76 @@ static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo,
                          RETURN(rc);
  
                  req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER,
-                                     sizeof(oinfo->oi_md->lsm_oinfo[0]->loi_lvb));
+                                     sizeof *lvb);
                  ptlrpc_request_set_replen(req);
          }
  
          /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */
-        oinfo->oi_flags &= ~LDLM_FL_BLOCK_GRANTED;
+        *flags &= ~LDLM_FL_BLOCK_GRANTED;
  
-        rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id,
-                              &oinfo->oi_policy, &oinfo->oi_flags,
-                              &oinfo->oi_md->lsm_oinfo[0]->loi_lvb,
-                              sizeof(oinfo->oi_md->lsm_oinfo[0]->loi_lvb),
-                              lustre_swab_ost_lvb, oinfo->oi_lockh,
-                              rqset ? 1 : 0);
+        rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb,
+                              sizeof(*lvb), lustre_swab_ost_lvb, lockh, async);
          if (rqset) {
                  if (!rc) {
                          struct osc_enqueue_args *aa;
                          CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args));
                          aa = ptlrpc_req_async_args(req);
-                        aa->oa_oi = oinfo;
                          aa->oa_ei = einfo;
                          aa->oa_exp = exp;
+                        aa->oa_flags  = flags;
+                        aa->oa_upcall = upcall;
+                        aa->oa_cookie = cookie;
+                        aa->oa_lvb    = lvb;
+                        aa->oa_lockh  = lockh;
  
                          req->rq_interpret_reply =
                                  (ptlrpc_interpterer_t)osc_enqueue_interpret;
-                        ptlrpc_set_add_req(rqset, req);
+                        if (rqset == PTLRPCD_SET)
+                                ptlrpcd_add_req(req, PSCOPE_OTHER);
+                        else
+                                ptlrpc_set_add_req(rqset, req);
                  } else if (intent) {
                          ptlrpc_req_finished(req);
                  }
                  RETURN(rc);
          }
  
-        rc = osc_enqueue_fini(obd, req, oinfo, intent, rc);
+        rc = osc_enqueue_fini(req, lvb, upcall, cookie, flags, rc);
          if (intent)
                  ptlrpc_req_finished(req);
  
          RETURN(rc);
  }
  
-static int osc_match(struct obd_export *exp, struct lov_stripe_md *lsm,
-                     __u32 type, ldlm_policy_data_t *policy, __u32 mode,
-                     int *flags, void *data, struct lustre_handle *lockh)
+/*
+ * obd_ops enqueue entry point (installed as .o_enqueue in osc_obd_ops).
+ *
+ * Builds the resource name from the object id and group found in
+ * oinfo->oi_md and hands the request over to osc_enqueue_base(), passing
+ * the LVB and kms_valid flag of lsm_oinfo[0], oinfo->oi_cb_up as the
+ * completion upcall and oinfo itself as the upcall's cookie.  The last
+ * argument is non-zero exactly when a request set was supplied.
+ *
+ * Returns whatever osc_enqueue_base() returns.
+ */
+static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo,
+                       struct ldlm_enqueue_info *einfo,
+                       struct ptlrpc_request_set *rqset)
  {
          struct ldlm_res_id res_id;
+        int rc;
+        ENTRY;
+
+        osc_build_res_name(oinfo->oi_md->lsm_object_id,
+                           oinfo->oi_md->lsm_object_gr, &res_id);
+
+        rc = osc_enqueue_base(exp, &res_id, &oinfo->oi_flags, &oinfo->oi_policy,
+                              &oinfo->oi_md->lsm_oinfo[0]->loi_lvb,
+                              oinfo->oi_md->lsm_oinfo[0]->loi_kms_valid,
+                              oinfo->oi_cb_up, oinfo, einfo, oinfo->oi_lockh,
+                              rqset, rqset != NULL);
+        RETURN(rc);
+}
+
+int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id,
+                   __u32 type, ldlm_policy_data_t *policy, __u32 mode,
+                   int *flags, void *data, struct lustre_handle *lockh,
+                   int unref)
+{
          struct obd_device *obd = exp->exp_obd;
          int lflags = *flags;
          ldlm_mode_t rc;
          ENTRY;
  
-        osc_build_res_name(lsm->lsm_object_id, lsm->lsm_object_gr, &res_id);
-
          if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH))
                  RETURN(-EIO);
  
@@ -3289,9 +3087,10 @@ static int osc_match(struct obd_export *exp, struct lov_stripe_md *lsm,
          if (mode == LCK_PR)
                  rc |= LCK_PW;
          rc = ldlm_lock_match(obd->obd_namespace, lflags | LDLM_FL_LVB_READY,
-                             &res_id, type, policy, rc, lockh);
+                             res_id, type, policy, rc, lockh, unref);
          if (rc) {
-                osc_set_data_with_check(lockh, data, lflags);
+                if (data != NULL)
+                        osc_set_data_with_check(lockh, data, lflags);
                  if (!(lflags & LDLM_FL_TEST_LOCK) && mode != rc) {
                          ldlm_lock_addref(lockh, LCK_PR);
                          ldlm_lock_decref(lockh, LCK_PW);
@@ -3301,8 +3100,7 @@ static int osc_match(struct obd_export *exp, struct lov_stripe_md *lsm,
          RETURN(rc);
  }
  
-static int osc_cancel(struct obd_export *exp, struct lov_stripe_md *md,
-                      __u32 mode, struct lustre_handle *lockh)
+int osc_cancel_base(struct lustre_handle *lockh, __u32 mode)
  {
          ENTRY;
  
@@ -3314,6 +3112,13 @@ static int osc_cancel(struct obd_export *exp, struct lov_stripe_md *md,
          RETURN(0);
  }
  
+/*
+ * obd_ops cancel entry point (installed as .o_cancel in osc_obd_ops).
+ * Only lockh and mode are used: they are forwarded to osc_cancel_base()
+ * and its result is returned.  exp and md are ignored.
+ */
+static int osc_cancel(struct obd_export *exp, struct lov_stripe_md *md,
+                      __u32 mode, struct lustre_handle *lockh)
+{
+        ENTRY;
+        RETURN(osc_cancel_base(lockh, mode));
+}
+
  static int osc_cancel_unused(struct obd_export *exp,
                               struct lov_stripe_md *lsm, int flags,
                               void *opaque)
@@ -3981,16 +3786,23 @@ static int osc_import_event(struct obd_device *obd,
          }
          case IMP_EVENT_INVALIDATE: {
                  struct ldlm_namespace *ns = obd->obd_namespace;
+                struct lu_env         *env;
+                int                    refcheck;
+
+                env = cl_env_get(&refcheck);
+                if (!IS_ERR(env)) {
+                        /* Reset grants */
+                        cli = &obd->u.cli;
+                        client_obd_list_lock(&cli->cl_loi_list_lock);
+                        /* all pages go to failing rpcs due to the invalid
+                         * import */
+                        osc_check_rpcs(env, cli);
+                        client_obd_list_unlock(&cli->cl_loi_list_lock);
  
-                /* Reset grants */
-                cli = &obd->u.cli;
-                client_obd_list_lock(&cli->cl_loi_list_lock);
-                /* all pages go to failing rpcs due to the invalid import */
-                osc_check_rpcs(cli);
-                client_obd_list_unlock(&cli->cl_loi_list_lock);
-
-                ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
-
+                        ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
+                        cl_env_put(env, &refcheck);
+                } else
+                        rc = PTR_ERR(env);
                  break;
          }
          case IMP_EVENT_ACTIVE: {
@@ -4059,11 +3871,6 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
                          ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2,
                                              OST_MAXREQSIZE,
                                              ptlrpc_add_rqs_to_pool);
-                cli->cl_cache = cache_create(obd);
-                if (!cli->cl_cache) {
-                        osc_cleanup(obd);
-                        rc = -ENOMEM;
-                }
          }
  
          RETURN(rc);
@@ -4130,53 +3937,14 @@ int osc_cleanup(struct obd_device *obd)
          /* free memory of osc quota cache */
          lquota_cleanup(quota_interface, obd);
  
-        cache_destroy(obd->u.cli.cl_cache);
          rc = client_obd_cleanup(obd);
  
          ptlrpcd_decref();
          RETURN(rc);
  }
  
-static int osc_register_page_removal_cb(struct obd_export *exp,
-                                        obd_page_removal_cb_t func,
-                                        obd_pin_extent_cb pin_cb)
-{
-        return cache_add_extent_removal_cb(exp->exp_obd->u.cli.cl_cache, func,
-                                           pin_cb);
-}
-
-static int osc_unregister_page_removal_cb(struct obd_export *exp,
-                                          obd_page_removal_cb_t func)
-{
-        return cache_del_extent_removal_cb(exp->exp_obd->u.cli.cl_cache, func);
-}
-
-static int osc_register_lock_cancel_cb(struct obd_export *exp,
-                                       obd_lock_cancel_cb cb)
-{
-        LASSERT(exp->exp_obd->u.cli.cl_ext_lock_cancel_cb == NULL);
-
-        exp->exp_obd->u.cli.cl_ext_lock_cancel_cb = cb;
-        return 0;
-}
-
-static int osc_unregister_lock_cancel_cb(struct obd_export *exp,
-                                         obd_lock_cancel_cb cb)
-{
-        if (exp->exp_obd->u.cli.cl_ext_lock_cancel_cb != cb) {
-                CERROR("Unregistering cancel cb %p, while only %p was "
-                       "registered\n", cb,
-                       exp->exp_obd->u.cli.cl_ext_lock_cancel_cb);
-                RETURN(-EINVAL);
-        }
-
-        exp->exp_obd->u.cli.cl_ext_lock_cancel_cb = NULL;
-        return 0;
-}
-
-static int osc_process_config(struct obd_device *obd, obd_count len, void *buf)
+int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg)
  {
-        struct lustre_cfg *lcfg = buf;
          struct lprocfs_static_vars lvars = { 0 };
          int rc = 0;
  
@@ -4195,6 +3963,11 @@ static int osc_process_config(struct obd_device *obd, obd_count len, void *buf)
          return(rc);
  }
  
+/*
+ * obd_ops wrapper (installed as .o_process_config in osc_obd_ops): passes
+ * buf on to osc_process_config_base() as its struct lustre_cfg argument
+ * and returns that function's result.  len is not used.
+ */
+static int osc_process_config(struct obd_device *obd, obd_count len, void *buf)
+{
+        return osc_process_config_base(obd, buf);
+}
+
  struct obd_ops osc_obd_ops = {
          .o_owner                = THIS_MODULE,
          .o_setup                = osc_setup,
@@ -4217,19 +3990,9 @@ struct obd_ops osc_obd_ops = {
          .o_setattr              = osc_setattr,
          .o_setattr_async        = osc_setattr_async,
          .o_brw                  = osc_brw,
-        .o_brw_async            = osc_brw_async,
-        .o_prep_async_page      = osc_prep_async_page,
-        .o_reget_short_lock     = osc_reget_short_lock,
-        .o_release_short_lock   = osc_release_short_lock,
-        .o_queue_async_io       = osc_queue_async_io,
-        .o_set_async_flags      = osc_set_async_flags,
-        .o_queue_group_io       = osc_queue_group_io,
-        .o_trigger_group_io     = osc_trigger_group_io,
-        .o_teardown_async_page  = osc_teardown_async_page,
          .o_punch                = osc_punch,
          .o_sync                 = osc_sync,
          .o_enqueue              = osc_enqueue,
-        .o_match                = osc_match,
          .o_change_cbdata        = osc_change_cbdata,
          .o_cancel               = osc_cancel,
          .o_cancel_unused        = osc_cancel_unused,
@@ -4240,18 +4003,25 @@ struct obd_ops osc_obd_ops = {
          .o_llog_init            = osc_llog_init,
          .o_llog_finish          = osc_llog_finish,
          .o_process_config       = osc_process_config,
-        .o_register_page_removal_cb = osc_register_page_removal_cb,
-        .o_unregister_page_removal_cb = osc_unregister_page_removal_cb,
-        .o_register_lock_cancel_cb = osc_register_lock_cancel_cb,
-        .o_unregister_lock_cancel_cb = osc_unregister_lock_cancel_cb,
  };
  
+extern struct lu_kmem_descr  osc_caches[];
+extern spinlock_t            osc_ast_guard;
+extern struct lock_class_key osc_ast_guard_class;
+
  int __init osc_init(void)
  {
          struct lprocfs_static_vars lvars = { 0 };
          int rc;
          ENTRY;
  
+        /* print an address of _any_ initialized kernel symbol from this
+         * module, to allow debugging with gdb that doesn't support data
+         * symbols from modules.*/
+        CDEBUG(D_CONSOLE, "Lustre OSC module (%p).\n", &osc_caches);
+
+        rc = lu_kmem_init(osc_caches);
+
          lprocfs_osc_init_vars(&lvars);
  
          request_module("lquota");
@@ -4260,24 +4030,31 @@ int __init osc_init(void)
          init_obd_quota_ops(quota_interface, &osc_obd_ops);
  
          rc = class_register_type(&osc_obd_ops, NULL, lvars.module_vars,
-                                 LUSTRE_OSC_NAME, NULL);
+                                 LUSTRE_OSC_NAME, &osc_device_type);
          if (rc) {
                  if (quota_interface)
                          PORTAL_SYMBOL_PUT(osc_quota_interface);
+                lu_kmem_fini(osc_caches);
                  RETURN(rc);
          }
  
+        spin_lock_init(&osc_ast_guard);
+        lockdep_set_class(&osc_ast_guard, &osc_ast_guard_class);
+
          RETURN(rc);
  }
  
  #ifdef __KERNEL__
  static void /*__exit*/ osc_exit(void)
  {
+        /* Tear down what osc_init() set up: the lu device type, the quota
+         * interface reference, the obd type registration and the caches. */
+        lu_device_type_fini(&osc_device_type);
+
          lquota_exit(quota_interface);
          if (quota_interface)
                  PORTAL_SYMBOL_PUT(osc_quota_interface);
  
          class_unregister_type(LUSTRE_OSC_NAME);
+        /* counterpart of lu_kmem_init(osc_caches) in osc_init() */
+        lu_kmem_fini(osc_caches);
  }
  
  MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
diff --git a/lustre/osd/osd_handler.c b/lustre/osd/osd_handler.c

index 98a872f..422d255 100644 (file)
--- a/lustre/osd/osd_handler.c
+++ b/lustre/osd/osd_handler.c
@@ -217,11 +217,11 @@ static struct thandle     *osd_trans_start  (const struct lu_env *env,
  static journal_t          *osd_journal      (const struct osd_device *dev);
  
  static const struct lu_device_type_operations osd_device_type_ops;
-static struct lu_device_type            osd_device_type;
+static       struct lu_device_type            osd_device_type;
  static const struct lu_object_operations      osd_lu_obj_ops;
-static struct obd_ops                   osd_obd_device_ops;
+static       struct obd_ops                   osd_obd_device_ops;
  static const struct lu_device_operations      osd_lu_ops;
-static struct lu_context_key            osd_key;
+static       struct lu_context_key            osd_key;
  static const struct dt_object_operations      osd_obj_ops;
  static const struct dt_body_operations        osd_body_ops;
  static const struct dt_index_operations       osd_index_ops;
@@ -433,7 +433,7 @@ static int osd_inode_remove(const struct lu_env *env, struct osd_object *obj)
          struct thandle         *th;
          int result;
  
-        txn_param_init(prm, OSD_TXN_OI_DELETE_CREDITS + 
+        txn_param_init(prm, OSD_TXN_OI_DELETE_CREDITS +
                              OSD_TXN_INODE_DELETE_CREDITS);
          th = osd_trans_start(env, &osd->od_dt_dev, prm);
          if (!IS_ERR(th)) {
@@ -532,7 +532,7 @@ int osd_statfs(const struct lu_env *env, struct dt_device *d,
          }
  
          if (likely(result == 0))
-                *sfs = osd->od_kstatfs; 
+                *sfs = osd->od_kstatfs;
          spin_unlock(&osd->od_osfs_lock);
  
          return result;
@@ -572,7 +572,7 @@ static int osd_param_is_sane(const struct osd_device *dev,
  static void osd_trans_commit_cb(struct journal_callback *jcb, int error)
  {
          struct osd_thandle *oh = container_of0(jcb, struct osd_thandle, ot_jcb);
-        struct thandle     *th = &oh->ot_super;
+        struct thandle     *th  = &oh->ot_super;
          struct dt_device   *dev = th->th_dev;
          struct lu_device   *lud = &dev->dd_lu_dev;
  
@@ -790,7 +790,7 @@ static const int osd_dto_credits[DTO_NR] = {
          /* credits for inode change during write */
          [DTO_WRITE_BASE]    = 3,
          /* credits for single block write */
-        [DTO_WRITE_BLOCK]   = 12 
+        [DTO_WRITE_BLOCK]   = 12
  };
  
  static int osd_credit_get(const struct lu_env *env, struct dt_device *d,
@@ -824,8 +824,8 @@ static void osd_object_read_lock(const struct lu_env *env,
          LASSERT(obj->oo_owner != env);
          down_read_nested(&obj->oo_sem, role);
  
-                LASSERT(obj->oo_owner == NULL);
-                oti->oti_r_locks++;
+        LASSERT(obj->oo_owner == NULL);
+        oti->oti_r_locks++;
  }
  
  static void osd_object_write_lock(const struct lu_env *env,
@@ -839,21 +839,21 @@ static void osd_object_write_lock(const struct lu_env *env,
          LASSERT(obj->oo_owner != env);
          down_write_nested(&obj->oo_sem, role);
  
-                LASSERT(obj->oo_owner == NULL);
-                obj->oo_owner = env;
-                oti->oti_w_locks++;
+        LASSERT(obj->oo_owner == NULL);
+        obj->oo_owner = env;
+        oti->oti_w_locks++;
  }
  
  static void osd_object_read_unlock(const struct lu_env *env,
                                     struct dt_object *dt)
  {
          struct osd_object *obj = osd_dt_obj(dt);
-                struct osd_thread_info *oti = osd_oti_get(env);
+        struct osd_thread_info *oti = osd_oti_get(env);
  
          LINVRNT(osd_invariant(obj));
  
-                LASSERT(oti->oti_r_locks > 0);
-                oti->oti_r_locks--;
+        LASSERT(oti->oti_r_locks > 0);
+        oti->oti_r_locks--;
          up_read(&obj->oo_sem);
  }
  
@@ -861,14 +861,14 @@ static void osd_object_write_unlock(const struct lu_env *env,
                                      struct dt_object *dt)
  {
          struct osd_object *obj = osd_dt_obj(dt);
-                struct osd_thread_info *oti = osd_oti_get(env);
+        struct osd_thread_info *oti = osd_oti_get(env);
  
          LINVRNT(osd_invariant(obj));
  
-                LASSERT(obj->oo_owner == env);
-                LASSERT(oti->oti_w_locks > 0);
-                oti->oti_w_locks--;
-                obj->oo_owner = NULL;
+        LASSERT(obj->oo_owner == env);
+        LASSERT(oti->oti_w_locks > 0);
+        oti->oti_w_locks--;
+        obj->oo_owner = NULL;
          up_write(&obj->oo_sem);
  }
  
@@ -1241,7 +1241,7 @@ static void osd_ah_init(const struct lu_env *env, struct dt_allocation_hint *ah,
   * Concurrency: @dt is write locked.
   */
  static int osd_object_create(const struct lu_env *env, struct dt_object *dt,
-                             struct lu_attr *attr, 
+                             struct lu_attr *attr,
                               struct dt_allocation_hint *hint,
                               struct thandle *th)
  {
@@ -2235,11 +2235,15 @@ static int osd_device_init(const struct lu_env *env, struct lu_device *d,
                             const char *name, struct lu_device *next)
  {
          int rc;
+        struct lu_context *ctx;
+
          /* context for commit hooks */
-        rc = lu_context_init(&osd_dev(d)->od_env_for_commit.le_ctx,
-                             LCT_MD_THREAD);
-        if (rc == 0)
+        ctx = &osd_dev(d)->od_env_for_commit.le_ctx;
+        rc = lu_context_init(ctx, LCT_MD_THREAD|LCT_REMEMBER|LCT_NOREF);
+        if (rc == 0) {
                  rc = osd_procfs_init(osd_dev(d), name);
+                ctx->lc_cookie = 0x3;
+        }
          return rc;
  }
  
diff --git a/lustre/osd/osd_internal.h b/lustre/osd/osd_internal.h

index 4a476db..bcfcd91 100644 (file)
--- a/lustre/osd/osd_internal.h
+++ b/lustre/osd/osd_internal.h
@@ -98,7 +98,7 @@ struct osd_device {
          __u32                     od_capa_alg;
          struct lustre_capa_key   *od_capa_keys;
          struct hlist_head        *od_capa_hash;
-        
+
          cfs_proc_dir_entry_t     *od_proc_entry;
          struct lprocfs_stats     *od_stats;
          /*
diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c

index 4ff4665..b694ee3 100644 (file)
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -1223,9 +1223,9 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                  }
  
                  /* ptlrpc_queue_wait->l_wait_event guarantees that rq_intr
-                 * will only be set after rq_timedout, but the oig waiting
-                 * path sets rq_intr irrespective of whether ptlrpcd has
-                 * seen a timeout.  our policy is to only interpret
+                 * will only be set after rq_timedout, but the synchronous IO
+                 * waiting path sets rq_intr irrespective of whether ptlrpcd
+                 * has seen a timeout.  our policy is to only interpret
                   * interrupted rpcs after they have timed out */
                  if (req->rq_intr && (req->rq_timedout || req->rq_waiting ||
                                       req->rq_wait_ctx)) {
@@ -1401,7 +1401,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                  if (req->rq_interpret_reply != NULL) {
                          ptlrpc_interpterer_t interpreter =
                                  req->rq_interpret_reply;
-                        req->rq_status = interpreter(NULL, req,
+                        req->rq_status = interpreter(env, req,
                                                       &req->rq_async_args,
                                                       req->rq_status);
                  }
@@ -1902,7 +1902,6 @@ void ptlrpc_free_committed(struct obd_import *imp)
                  EXIT;
                  return;
          }
-
          CDEBUG(D_RPCTRACE, "%s: committing for last_committed "LPU64" gen %d\n",
                 imp->imp_obd->obd_name, imp->imp_peer_committed_transno,
                 imp->imp_generation);
@@ -2193,7 +2192,6 @@ restart:
          rc = ptl_send_rpc(req, 0);
          if (rc)
                  DEBUG_REQ(D_HA, req, "send failed (%d); recovering", rc);
-
  repeat:
          timeoutl = req->rq_deadline - cfs_time_current_sec();
          timeout = (timeoutl <= 0 || rc) ? CFS_TICK :
@@ -2306,7 +2304,7 @@ struct ptlrpc_replay_async_args {
  
  static int ptlrpc_replay_interpret(const struct lu_env *env,
                                     struct ptlrpc_request *req,
-                                    void * data, int rc)
+                                   void * data, int rc)
  {
          struct ptlrpc_replay_async_args *aa = data;
          struct obd_import *imp = req->rq_import;
@@ -2397,7 +2395,7 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)
          atomic_inc(&req->rq_import->imp_replay_inflight);
          ptlrpc_request_addref(req); /* ptlrpcd needs a ref */
  
-        ptlrpcd_add_req(req);
+        ptlrpcd_add_req(req, PSCOPE_OTHER);
          RETURN(0);
  }
  
diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c

index a1cf611..3747f07 100644 (file)
--- a/lustre/ptlrpc/events.c
+++ b/lustre/ptlrpc/events.c
@@ -39,10 +39,12 @@
  #ifndef __KERNEL__
  # include <liblustre.h>
  #else
+# include <libcfs/libcfs.h>
  # ifdef __mips64__
  #  include <linux/kernel.h>
  # endif
  #endif
+
  #include <obd_class.h>
  #include <lustre_net.h>
  #include <lustre_sec.h>
@@ -679,14 +681,14 @@ void
  liblustre_wait_idle(void)
  {
          static int recursed = 0;
-        
+
          struct list_head               *tmp;
          struct liblustre_wait_callback *llwc;
          int                             idle = 0;
  
          LASSERT(!recursed);
          recursed = 1;
-        
+
          do {
                  liblustre_wait_event(0);
  
@@ -695,13 +697,13 @@ liblustre_wait_idle(void)
                  list_for_each(tmp, &liblustre_idle_callbacks) {
                          llwc = list_entry(tmp, struct liblustre_wait_callback,
                                            llwc_list);
-                        
+
                          if (!llwc->llwc_fn(llwc->llwc_arg)) {
                                  idle = 0;
                                  break;
                          }
                  }
-                        
+
          } while (!idle);
  
          recursed = 0;
@@ -722,11 +724,12 @@ int ptlrpc_init_portals(void)
                  liblustre_register_wait_callback("liblustre_check_services",
                                                   &liblustre_check_services,
                                                   NULL);
+        init_completion_module(liblustre_wait_event);
  #endif
          rc = ptlrpcd_addref();
          if (rc == 0)
                  return 0;
-        
+
          CERROR("rpcd initialisation failed\n");
  #ifndef __KERNEL__
          liblustre_deregister_wait_callback(liblustre_services_callback);
diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c

index 18fb0cf..e3891d1 100644 (file)
--- a/lustre/ptlrpc/import.c
+++ b/lustre/ptlrpc/import.c
@@ -694,7 +694,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid)
                                          MSG_CONNECT_TRANSNO);
  
          DEBUG_REQ(D_RPCTRACE, request, "(re)connect request");
-        ptlrpcd_add_req(request);
+        ptlrpcd_add_req(request, PSCOPE_OTHER);
          rc = 0;
  out:
          if (rc != 0) {
@@ -1132,7 +1132,7 @@ out:
  
  static int completed_replay_interpret(const struct lu_env *env,
                                        struct ptlrpc_request *req,
-                                    void * data, int rc)
+                                      void * data, int rc)
  {
          ENTRY;
          atomic_dec(&req->rq_import->imp_replay_inflight);
@@ -1170,7 +1170,7 @@ static int signal_completed_replay(struct obd_import *imp)
          req->rq_timeout *= 3;
          req->rq_interpret_reply = completed_replay_interpret;
  
-        ptlrpcd_add_req(req);
+        ptlrpcd_add_req(req, PSCOPE_OTHER);
          RETURN(0);
  }
  
diff --git a/lustre/ptlrpc/layout.c b/lustre/ptlrpc/layout.c

index 562335c..f975d74 100644 (file)
--- a/lustre/ptlrpc/layout.c
+++ b/lustre/ptlrpc/layout.c
@@ -781,7 +781,7 @@ EXPORT_SYMBOL(RMF_REC_JOINFILE);
  const struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1, NULL);
  EXPORT_SYMBOL(RMF_EADATA);
  
-const struct req_msg_field RMF_ACL = 
+const struct req_msg_field RMF_ACL =
          DEFINE_MSGF("acl", 0, LUSTRE_POSIX_ACL_MAX_SIZE, NULL);
  EXPORT_SYMBOL(RMF_ACL);
  
@@ -799,7 +799,7 @@ const struct req_msg_field RMF_CAPA2 =
                      lustre_swab_lustre_capa);
  EXPORT_SYMBOL(RMF_CAPA2);
  
-/* 
+/*
   * OST request field.
   */
  const struct req_msg_field RMF_OST_BODY =
@@ -863,11 +863,11 @@ DEFINE_REQ_FMT(name, client, ARRAY_SIZE(client), server, ARRAY_SIZE(server))
  const struct req_format RQF_OBD_PING =
          DEFINE_REQ_FMT0("OBD_PING", empty, empty);
  EXPORT_SYMBOL(RQF_OBD_PING);
- 
+
  const struct req_format RQF_SEC_CTX =
          DEFINE_REQ_FMT0("SEC_CTX", empty, empty);
  EXPORT_SYMBOL(RQF_SEC_CTX);
- 
+
  const struct req_format RQF_MGS_TARGET_REG =
          DEFINE_REQ_FMT0("MGS_TARGET_REG", mgs_target_info_only,
                           mgs_target_info_only);
@@ -1002,11 +1002,11 @@ EXPORT_SYMBOL(RQF_MDS_CONNECT);
  const struct req_format RQF_MDS_DISCONNECT =
          DEFINE_REQ_FMT0("MDS_DISCONNECT", empty, empty);
  EXPORT_SYMBOL(RQF_MDS_DISCONNECT);
- 
+
  const struct req_format RQF_MDS_SET_INFO =
          DEFINE_REQ_FMT0("MDS_SET_INFO", mds_set_info_client, empty);
  EXPORT_SYMBOL(RQF_MDS_SET_INFO);
- 
+
  const struct req_format RQF_LDLM_ENQUEUE =
          DEFINE_REQ_FMT0("LDLM_ENQUEUE",
                          ldlm_enqueue_client, ldlm_enqueue_lvb_server);
@@ -1305,7 +1305,7 @@ int req_capsule_filled_sizes(struct req_capsule *pill,
  
          for (i = 0; i < fmt->rf_fields[loc].nr; ++i) {
                  if (pill->rc_area[loc][i] == -1) {
-                        pill->rc_area[loc][i] = 
+                        pill->rc_area[loc][i] =
                                              fmt->rf_fields[loc].d[i]->rmf_size;
                          if (pill->rc_area[loc][i] == -1) {
                                  /* skip the following fields */
diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c

index b03990c..ff856cd 100644 (file)
--- a/lustre/ptlrpc/pack_generic.c
+++ b/lustre/ptlrpc/pack_generic.c
@@ -90,7 +90,7 @@ int lustre_msg_check_version(struct lustre_msg *msg, __u32 version)
          case LUSTRE_MSG_MAGIC_V1:
          case LUSTRE_MSG_MAGIC_V1_SWABBED:
                  CERROR("msg v1 not supported - please upgrade you system\n");
-                return -EINVAL; 
+                return -EINVAL;
          case LUSTRE_MSG_MAGIC_V2:
          case LUSTRE_MSG_MAGIC_V2_SWABBED:
                  return lustre_msg_check_version_v2(msg, version);
@@ -516,7 +516,7 @@ static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len)
                          len, m->lm_bufcount);
                  return -EINVAL;
          }
-        
+
          for (i = 0; i < m->lm_bufcount; i++) {
                  if (flipped)
                          __swab32s(&m->lm_buflens[i]);
diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c

index a1042d1..da417a5 100644 (file)
--- a/lustre/ptlrpc/pinger.c
+++ b/lustre/ptlrpc/pinger.c
@@ -69,7 +69,7 @@ int ptlrpc_ping(struct obd_import *imp)
                    imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd));
          req->rq_no_resend = req->rq_no_delay = 1;
          ptlrpc_request_set_replen(req);
-        ptlrpcd_add_req(req);
+        ptlrpcd_add_req(req, PSCOPE_OTHER);
  
          RETURN(0);
  }
diff --git a/lustre/ptlrpc/ptlrpcd.c b/lustre/ptlrpc/ptlrpcd.c

index 175fa50..30c2823 100644 (file)
--- a/lustre/ptlrpc/ptlrpcd.c
+++ b/lustre/ptlrpc/ptlrpcd.c
@@ -51,10 +51,44 @@
  #include <lustre_ha.h>
  #include <obd_class.h>   /* for obd_zombie */
  #include <obd_support.h> /* for OBD_FAIL_CHECK */
+#include <cl_object.h> /* cl_env_{get,put}() */
  #include <lprocfs_status.h>
  
-static struct ptlrpcd_ctl ptlrpcd_pc;
-static struct ptlrpcd_ctl ptlrpcd_recovery_pc;
+/*
+ * Index of a thread within one ptlrpcd scope.  ptlrpcd_add_req() selects
+ * PT_NORMAL for requests whose rq_send_state is LUSTRE_IMP_FULL and
+ * PT_RECOVERY for all other requests.
+ */
+enum pscope_thread {
+        PT_NORMAL,
+        PT_RECOVERY,
+        PT_NR   /* number of entries; sizes pscope_thread[] arrays */
+};
+
+/*
+ * State of one ptlrpcd scope: a named control block for every
+ * enum pscope_thread value.
+ */
+struct ptlrpcd_scope_ctl {
+        struct ptlrpcd_thread {
+                const char        *pt_name; /* name string, set statically
+                                             * in ptlrpcd_scopes[] */
+                struct ptlrpcd_ctl pt_ctl;  /* control block handed to
+                                             * ptlrpc_set_add_new_req() */
+        } pscope_thread[PT_NR];
+};
+
+/*
+ * Table of ptlrpcd control blocks, indexed first by enum ptlrpcd_scope and
+ * then by enum pscope_thread (this is how ptlrpcd_add_req() looks up the
+ * control block for a request).  Only the names are initialised statically
+ * here.
+ */
+static struct ptlrpcd_scope_ctl ptlrpcd_scopes[PSCOPE_NR] = {
+        [PSCOPE_BRW] = {
+                .pscope_thread = {
+                        [PT_NORMAL] = {
+                                .pt_name = "ptlrpcd-brw"
+                        },
+                        [PT_RECOVERY] = {
+                                .pt_name = "ptlrpcd-brw-rcv"
+                        }
+                }
+        },
+        [PSCOPE_OTHER] = {
+                .pscope_thread = {
+                        [PT_NORMAL] = {
+                                .pt_name = "ptlrpcd"
+                        },
+                        [PT_RECOVERY] = {
+                                .pt_name = "ptlrpcd-rcv"
+                        }
+                }
+        }
+};
  
  struct semaphore ptlrpcd_sem;
  static int ptlrpcd_users = 0;
@@ -68,24 +102,26 @@ void ptlrpcd_wake(struct ptlrpc_request *req)
          cfs_waitq_signal(&rq_set->set_waitq);
  }
  
-/* 
+/*
   * Requests that are added to the ptlrpcd queue are sent via
   * ptlrpcd_check->ptlrpc_check_set().
   */
-void ptlrpcd_add_req(struct ptlrpc_request *req)
+void ptlrpcd_add_req(struct ptlrpc_request *req, enum ptlrpcd_scope scope)
  {
          struct ptlrpcd_ctl *pc;
+        enum pscope_thread  pt;
          int rc;
  
-        if (req->rq_send_state == LUSTRE_IMP_FULL)
-                pc = &ptlrpcd_pc;
-        else
-                pc = &ptlrpcd_recovery_pc;
-
+        LASSERT(scope < PSCOPE_NR);
+        pt = req->rq_send_state == LUSTRE_IMP_FULL ? PT_NORMAL : PT_RECOVERY;
+        pc = &ptlrpcd_scopes[scope].pscope_thread[pt].pt_ctl;
          rc = ptlrpc_set_add_new_req(pc, req);
-        if (rc) {
+        /*
+         * XXX disable this for CLIO: environment is needed for interpreter.
+         */
+        if (rc && 0) {
                  ptlrpc_interpterer_t interpreter;
-                                   
+
                  interpreter = req->rq_interpret_reply;
  
                  /*
@@ -118,8 +154,8 @@ static int ptlrpcd_check(const struct lu_env *env, struct ptlrpcd_ctl *pc)
                  req = list_entry(pos, struct ptlrpc_request, rq_set_chain);
                  list_del_init(&req->rq_set_chain);
                  ptlrpc_set_add_req(pc->pc_set, req);
-                /* 
-                 * Need to calculate its timeout. 
+                /*
+                 * Need to calculate its timeout.
                   */
                  rc = 1;
          }
@@ -128,9 +164,9 @@ static int ptlrpcd_check(const struct lu_env *env, struct ptlrpcd_ctl *pc)
          if (pc->pc_set->set_remaining) {
                  rc = rc | ptlrpc_check_set(env, pc->pc_set);
  
-                /* 
+                /*
                   * XXX: our set never completes, so we prune the completed
-                 * reqs after each iteration. boy could this be smarter. 
+                 * reqs after each iteration. boy could this be smarter.
                   */
                  list_for_each_safe(pos, tmp, &pc->pc_set->set_requests) {
                          req = list_entry(pos, struct ptlrpc_request,
@@ -145,8 +181,8 @@ static int ptlrpcd_check(const struct lu_env *env, struct ptlrpcd_ctl *pc)
          }
  
          if (rc == 0) {
-                /* 
-                 * If new requests have been added, make sure to wake up. 
+                /*
+                 * If new requests have been added, make sure to wake up.
                   */
                  spin_lock(&pc->pc_set->set_new_req_lock);
                  rc = !list_empty(&pc->pc_set->set_new_requests);
@@ -157,7 +193,7 @@ static int ptlrpcd_check(const struct lu_env *env, struct ptlrpcd_ctl *pc)
  }
  
  #ifdef __KERNEL__
-/* 
+/*
   * ptlrpc's code paths like to execute in process context, so we have this
   * thread which spins on a set which contains the io rpcs. llite specifies
   * ptlrpcd's set when it pushes pages down into the oscs.
@@ -165,34 +201,60 @@ static int ptlrpcd_check(const struct lu_env *env, struct ptlrpcd_ctl *pc)
  static int ptlrpcd(void *arg)
  {
          struct ptlrpcd_ctl *pc = arg;
+        struct lu_env env = { .le_ses = NULL };
          int rc;
          ENTRY;
  
-        if ((rc = cfs_daemonize_ctxt(pc->pc_name))) {
-                complete(&pc->pc_starting);
-                goto out;
+        rc = cfs_daemonize_ctxt(pc->pc_name);
+        if (rc == 0) {
+                /*
+                 * XXX So far only "client" ptlrpcd uses an environment. In
+                 * the future, ptlrpcd thread (or a thread-set) has to be given
+                 * an argument, describing its "scope".
+                 */
+                rc = lu_context_init(&env.le_ctx,
+                                     LCT_CL_THREAD|LCT_REMEMBER|LCT_NOREF);
          }
  
          complete(&pc->pc_starting);
  
-        /* 
+        if (rc != 0)
+                RETURN(rc);
+        env.le_ctx.lc_cookie = 0x7;
+        /*
           * This mainloop strongly resembles ptlrpc_set_wait() except that our
           * set never completes.  ptlrpcd_check() calls ptlrpc_check_set() when
-         * there are requests in the set. New requests come in on the set's 
-         * new_req_list and ptlrpcd_check() moves them into the set. 
+         * there are requests in the set. New requests come in on the set's
+         * new_req_list and ptlrpcd_check() moves them into the set.
           */
          while (1) {
                  struct l_wait_info lwi;
                  int timeout;
  
+                rc = lu_env_refill(&env);
+                if (rc != 0) {
+                        /*
+                         * XXX This is a very awkward situation, because
+                         * execution can neither continue (request
+                         * interpreters assume that env is set up), nor repeat
+                         * the loop (as this potentially results in a tight
+                         * loop of -ENOMEM's).
+                         *
+                         * Fortunately, refill only ever does something when
+                         * new modules are loaded, i.e., early during boot up.
+                         */
+                        CERROR("Failure to refill session: %d\n", rc);
+                        continue;
+                }
+
                  timeout = ptlrpc_set_next_timeout(pc->pc_set);
-                lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1), 
+                lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1),
                                    ptlrpc_expired_set, pc->pc_set);
  
-                lu_context_enter(&pc->pc_env.le_ctx);
+                lu_context_enter(&env.le_ctx);
                  l_wait_event(pc->pc_set->set_waitq,
-                             ptlrpcd_check(&pc->pc_env, pc), &lwi);
-                lu_context_exit(&pc->pc_env.le_ctx);
+                             ptlrpcd_check(&env, pc), &lwi);
+                lu_context_exit(&env.le_ctx);
  
                  /*
                   * Abort inflight rpcs for forced stop case.
@@ -204,14 +266,14 @@ static int ptlrpcd(void *arg)
                          break;
          }
  
-        /* 
-         * Wait for inflight requests to drain. 
+        /*
+         * Wait for inflight requests to drain.
           */
          if (!list_empty(&pc->pc_set->set_requests))
                  ptlrpc_set_wait(pc->pc_set);
-
+        lu_context_fini(&env.le_ctx);
          complete(&pc->pc_finishing);
-out:
+
          clear_bit(LIOD_START, &pc->pc_flags);
          clear_bit(LIOD_STOP, &pc->pc_flags);
          return 0;
@@ -222,10 +284,10 @@ out:
  int ptlrpcd_check_async_rpcs(void *arg)
  {
          struct ptlrpcd_ctl *pc = arg;
-        int                  rc = 0;
+        int                 rc = 0;
  
-        /* 
-         * Single threaded!! 
+        /*
+         * Single threaded!!
           */
          pc->pc_recurred++;
  
@@ -235,10 +297,10 @@ int ptlrpcd_check_async_rpcs(void *arg)
                  lu_context_exit(&pc->pc_env.le_ctx);
                  if (!rc)
                          ptlrpc_expired_set(pc->pc_set);
-                /* 
-                 * XXX: send replay requests. 
+                /*
+                 * XXX: send replay requests.
                   */
-                if (pc == &ptlrpcd_recovery_pc)
+                if (test_bit(LIOD_RECOVERY, &pc->pc_flags))
                          rc = ptlrpcd_check(&pc->pc_env, pc);
          }
  
@@ -256,13 +318,13 @@ int ptlrpcd_idle(void *arg)
  
  #endif
  
-int ptlrpcd_start(char *name, struct ptlrpcd_ctl *pc)
+int ptlrpcd_start(const char *name, struct ptlrpcd_ctl *pc)
  {
          int rc;
          ENTRY;
- 
-        /* 
-         * Do not allow start second thread for one pc. 
+
+        /*
+         * Do not allow start second thread for one pc.
           */
          if (test_and_set_bit(LIOD_START, &pc->pc_flags)) {
                  CERROR("Starting second thread (%s) for same pc %p\n",
@@ -332,28 +394,52 @@ void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force)
          ptlrpc_set_destroy(pc->pc_set);
  }
  
-int ptlrpcd_addref(void)
+void ptlrpcd_fini(void)
  {
-        int rc = 0;
+        int i;
+        int j;
+
          ENTRY;
  
-        mutex_down(&ptlrpcd_sem);
-        if (++ptlrpcd_users != 1)
-                GOTO(out, rc);
+        for (i = 0; i < PSCOPE_NR; ++i) {
+                for (j = 0; j < PT_NR; ++j) {
+                        struct ptlrpcd_ctl *pc;
  
-        rc = ptlrpcd_start("ptlrpcd", &ptlrpcd_pc);
-        if (rc) {
-                --ptlrpcd_users;
-                GOTO(out, rc);
+                        pc = &ptlrpcd_scopes[i].pscope_thread[j].pt_ctl;
+
+                        if (test_bit(LIOD_START, &pc->pc_flags))
+                                ptlrpcd_stop(pc, 0);
+                }
          }
+        EXIT;
+}
  
-        rc = ptlrpcd_start("ptlrpcd-recov", &ptlrpcd_recovery_pc);
-        if (rc) {
-                ptlrpcd_stop(&ptlrpcd_pc, 0);
-                --ptlrpcd_users;
-                GOTO(out, rc);
+int ptlrpcd_addref(void)
+{
+        int rc = 0;
+        int i;
+        int j;
+        ENTRY;
+
+        mutex_down(&ptlrpcd_sem);
+        if (++ptlrpcd_users == 1) {
+                for (i = 0; rc == 0 && i < PSCOPE_NR; ++i) {
+                        for (j = 0; rc == 0 && j < PT_NR; ++j) {
+                                struct ptlrpcd_thread *pt;
+                                struct ptlrpcd_ctl    *pc;
+
+                                pt = &ptlrpcd_scopes[i].pscope_thread[j];
+                                pc = &pt->pt_ctl;
+                                if (j == PT_RECOVERY)
+                                        set_bit(LIOD_RECOVERY, &pc->pc_flags);
+                                rc = ptlrpcd_start(pt->pt_name, pc);
+                        }
+                }
+                if (rc != 0) {
+                        --ptlrpcd_users;
+                        ptlrpcd_fini();
+                }
          }
-out:
          mutex_up(&ptlrpcd_sem);
          RETURN(rc);
  }
@@ -361,9 +447,7 @@ out:
  void ptlrpcd_decref(void)
  {
          mutex_down(&ptlrpcd_sem);
-        if (--ptlrpcd_users == 0) {
-                ptlrpcd_stop(&ptlrpcd_pc, 0);
-                ptlrpcd_stop(&ptlrpcd_recovery_pc, 0);
-        }
+        if (--ptlrpcd_users == 0)
+                ptlrpcd_fini();
          mutex_up(&ptlrpcd_sem);
  }
diff --git a/lustre/ptlrpc/recov_thread.c b/lustre/ptlrpc/recov_thread.c

index 01918db..497d281 100644 (file)
--- a/lustre/ptlrpc/recov_thread.c
+++ b/lustre/ptlrpc/recov_thread.c
@@ -76,7 +76,7 @@ enum {
          LLOG_LCM_FL_EXIT        = 1 << 1
  };
  
-/** 
+/**
   * Allocate new llcd from cache, init it and return to caller.
   * Bumps number of objects allocated.
   */
@@ -85,10 +85,10 @@ static struct llog_canceld_ctxt *llcd_alloc(void)
          struct llog_canceld_ctxt *llcd;
          int llcd_size;
  
-        /* 
+        /*
           * Payload of lustre_msg V2 is bigger.
           */
-        llcd_size = CFS_PAGE_SIZE - 
+        llcd_size = CFS_PAGE_SIZE -
                  lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL);
          llcd_size += offsetof(struct llog_canceld_ctxt, llcd_cookies);
          OBD_SLAB_ALLOC(llcd, llcd_cache, CFS_ALLOC_STD, llcd_size);
@@ -114,10 +114,10 @@ static void llcd_free(struct llog_canceld_ctxt *llcd)
  /**
   * Copy passed @cookies to @llcd.
   */
-static void llcd_copy(struct llog_canceld_ctxt *llcd, 
+static void llcd_copy(struct llog_canceld_ctxt *llcd,
                        struct llog_cookie *cookies)
  {
-        memcpy((char *)llcd->llcd_cookies + llcd->llcd_cookiebytes, 
+        memcpy((char *)llcd->llcd_cookies + llcd->llcd_cookiebytes,
                cookies, sizeof(*cookies));
          llcd->llcd_cookiebytes += sizeof(*cookies);
  }
@@ -129,12 +129,12 @@ static void llcd_copy(struct llog_canceld_ctxt *llcd,
  static int llcd_fit(struct llog_canceld_ctxt *llcd,
                   struct llog_cookie *cookies)
  {
-        return (llcd->llcd_size - 
+        return (llcd->llcd_size -
                  llcd->llcd_cookiebytes) >= sizeof(*cookies);
  }
  
-static void llcd_print(struct llog_canceld_ctxt *llcd, 
-                       const char *func, int line) 
+static void llcd_print(struct llog_canceld_ctxt *llcd,
+                       const char *func, int line)
  {
          CDEBUG(D_RPCTRACE, "Llcd (%p) at %s:%d:\n", llcd, func, line);
          CDEBUG(D_RPCTRACE, "  size: %d\n", llcd->llcd_size);
@@ -148,7 +148,7 @@ static void llcd_print(struct llog_canceld_ctxt *llcd,
   * sending result. Error is passed in @rc. Note, that this will be called
   * in cleanup time when all inflight rpcs aborted.
   */
-static int 
+static int
  llcd_interpret(const struct lu_env *env,
                 struct ptlrpc_request *req, void *noused, int rc)
  {
@@ -157,10 +157,10 @@ llcd_interpret(const struct lu_env *env,
          llcd_free(llcd);
          return 0;
  }
- 
+
  /**
   * Send @llcd to remote node. Free llcd uppon completion or error. Sending
- * is performed in async style so this function will return asap without 
+ * is performed in async style so this function will return asap without
   * blocking.
   */
  static int llcd_send(struct llog_canceld_ctxt *llcd)
@@ -175,7 +175,7 @@ static int llcd_send(struct llog_canceld_ctxt *llcd)
  
          ctxt = llcd->llcd_ctxt;
          if (!ctxt) {
-                CERROR("Invalid llcd with NULL ctxt found (%p)\n", 
+                CERROR("Invalid llcd with NULL ctxt found (%p)\n",
                         llcd);
                  llcd_print(llcd, __FUNCTION__, __LINE__);
                  LBUG();
@@ -186,10 +186,10 @@ static int llcd_send(struct llog_canceld_ctxt *llcd)
                  GOTO(exit, rc = 0);
  
          lcm = llcd->llcd_lcm;
-        
-        /* 
+
+        /*
           * Check if we're in exit stage. Do not send llcd in
-         * this case. 
+         * this case.
           */
          if (test_bit(LLOG_LCM_FL_EXIT, &lcm->lcm_flags))
                  GOTO(exit, rc = -ENODEV);
@@ -197,9 +197,9 @@ static int llcd_send(struct llog_canceld_ctxt *llcd)
          CDEBUG(D_RPCTRACE, "Sending llcd %p\n", llcd);
  
          import = llcd->llcd_ctxt->loc_imp;
-        if (!import || (import == LP_POISON) || 
+        if (!import || (import == LP_POISON) ||
              (import->imp_client == LP_POISON)) {
-                CERROR("Invalid import %p for llcd %p\n", 
+                CERROR("Invalid import %p for llcd %p\n",
                         import, llcd);
                  GOTO(exit, rc = -ENODEV);
          }
@@ -207,12 +207,12 @@ static int llcd_send(struct llog_canceld_ctxt *llcd)
          OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_RECOV, 10);
  
          /*
-         * No need to get import here as it is already done in 
+         * No need to get import here as it is already done in
           * llog_receptor_accept().
           */
          req = ptlrpc_request_alloc(import, &RQF_LOG_CANCEL);
          if (req == NULL) {
-                CERROR("Can't allocate request for sending llcd %p\n", 
+                CERROR("Can't allocate request for sending llcd %p\n",
                         llcd);
                  GOTO(exit, rc = -ENOMEM);
          }
@@ -292,8 +292,8 @@ static struct llog_canceld_ctxt *llcd_detach(struct llog_ctxt *ctxt)
          }
          atomic_dec(&lcm->lcm_count);
          ctxt->loc_llcd = NULL;
-        
-        CDEBUG(D_RPCTRACE, "Detach llcd %p from ctxt %p (%d)\n", 
+
+        CDEBUG(D_RPCTRACE, "Detach llcd %p from ctxt %p (%d)\n",
                 llcd, ctxt, atomic_read(&lcm->lcm_count));
  
          llog_ctxt_put(ctxt);
@@ -344,7 +344,7 @@ static int llcd_push(struct llog_ctxt *ctxt)
          int rc;
  
          /*
-         * Make sure that this llcd will not be sent again as we detach 
+         * Make sure that this llcd will not be sent again as we detach
           * it from ctxt.
           */
          llcd = llcd_detach(ctxt);
@@ -353,7 +353,7 @@ static int llcd_push(struct llog_ctxt *ctxt)
                  llcd_print(llcd, __FUNCTION__, __LINE__);
                  LBUG();
          }
-        
+
          rc = llcd_send(llcd);
          if (rc)
                  CERROR("Couldn't send llcd %p (%d)\n", llcd, rc);
@@ -372,7 +372,7 @@ int llog_recov_thread_start(struct llog_commit_master *lcm)
  
          rc = ptlrpcd_start(lcm->lcm_name, &lcm->lcm_pc);
          if (rc) {
-                CERROR("Error %d while starting recovery thread %s\n", 
+                CERROR("Error %d while starting recovery thread %s\n",
                         rc, lcm->lcm_name);
                  RETURN(rc);
          }
@@ -390,7 +390,7 @@ void llog_recov_thread_stop(struct llog_commit_master *lcm, int force)
          ENTRY;
  
          /**
-         * Let all know that we're stopping. This will also make 
+         * Let all know that we're stopping. This will also make
           * llcd_send() refuse any new llcds.
           */
          set_bit(LLOG_LCM_FL_EXIT, &lcm->lcm_flags);
@@ -427,7 +427,7 @@ struct llog_commit_master *llog_recov_thread_init(char *name)
          /*
           * Try to create threads with unique names.
           */
-        snprintf(lcm->lcm_name, sizeof(lcm->lcm_name), 
+        snprintf(lcm->lcm_name, sizeof(lcm->lcm_name),
                   "ll_log_commit_%s", name);
  
          strncpy(lcm->lcm_name, name, sizeof(lcm->lcm_name));
@@ -457,7 +457,7 @@ void llog_recov_thread_fini(struct llog_commit_master *lcm, int force)
  }
  EXPORT_SYMBOL(llog_recov_thread_fini);
  
-static int llog_recov_thread_replay(struct llog_ctxt *ctxt, 
+static int llog_recov_thread_replay(struct llog_ctxt *ctxt,
                                      void *cb, void *arg)
  {
          struct obd_device *obd = ctxt->loc_obd;
@@ -486,7 +486,7 @@ static int llog_recov_thread_replay(struct llog_ctxt *ctxt,
                  OBD_FREE_PTR(lpca);
                  RETURN(-ENODEV);
          }
-        rc = cfs_kernel_thread(llog_cat_process_thread, lpca, 
+        rc = cfs_kernel_thread(llog_cat_process_thread, lpca,
                                 CLONE_VM | CLONE_FILES);
          if (rc < 0) {
                  CERROR("Error starting llog_cat_process_thread(): %d\n", rc);
@@ -507,14 +507,14 @@ int llog_obd_repl_connect(struct llog_ctxt *ctxt,
          int rc;
          ENTRY;
  
-        /* 
+        /*
           * Send back cached llcd from llog before recovery if we have any.
           * This is void is nothing cached is found there.
           */
          llog_sync(ctxt, NULL);
  
-        /* 
-         * Start recovery in separate thread. 
+        /*
+         * Start recovery in separate thread.
           */
          mutex_down(&ctxt->loc_sem);
          ctxt->loc_gen = *gen;
@@ -525,7 +525,7 @@ int llog_obd_repl_connect(struct llog_ctxt *ctxt,
  }
  EXPORT_SYMBOL(llog_obd_repl_connect);
  
-/** 
+/**
   * Deleted objects have a commit callback that cancels the MDS
   * log record for the deletion. The commit callback calls this
   * function.
@@ -543,7 +543,7 @@ int llog_obd_repl_cancel(struct llog_ctxt *ctxt,
  
          mutex_down(&ctxt->loc_sem);
          lcm = ctxt->loc_lcm;
-        
+
          /*
           * Let's check if we have all structures alive. We also check for
           * possible shutdown. Do nothing if we're stopping.
@@ -559,7 +559,7 @@ int llog_obd_repl_cancel(struct llog_ctxt *ctxt,
          }
  
          if (test_bit(LLOG_LCM_FL_EXIT, &lcm->lcm_flags)) {
-                CDEBUG(D_RPCTRACE, "Commit thread is stopping for ctxt %p\n", 
+                CDEBUG(D_RPCTRACE, "Commit thread is stopping for ctxt %p\n",
                         ctxt);
                  GOTO(out, rc = -ENODEV);
          }
@@ -568,7 +568,7 @@ int llog_obd_repl_cancel(struct llog_ctxt *ctxt,
  
          if (count > 0 && cookies != NULL) {
                  /*
-                 * Get new llcd from ctxt if required. 
+                 * Get new llcd from ctxt if required.
                   */
                  if (!llcd) {
                          llcd = llcd_get(ctxt);
@@ -583,8 +583,8 @@ int llog_obd_repl_cancel(struct llog_ctxt *ctxt,
                  }
  
                  /*
-                 * Llcd does not have enough room for @cookies. Let's push 
-                 * it out and allocate new one. 
+                 * Llcd does not have enough room for @cookies. Let's push
+                 * it out and allocate new one.
                   */
                  if (!llcd_fit(llcd, cookies)) {
                          rc = llcd_push(ctxt);
@@ -663,7 +663,7 @@ int llog_recov_init(void)
  {
          int llcd_size;
  
-        llcd_size = CFS_PAGE_SIZE - 
+        llcd_size = CFS_PAGE_SIZE -
                  lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL);
          llcd_size += offsetof(struct llog_canceld_ctxt, llcd_cookies);
          llcd_cache = cfs_mem_cache_create("llcd_cache", llcd_size, 0, 0);
@@ -680,7 +680,7 @@ int llog_recov_init(void)
  void llog_recov_fini(void)
  {
          /*
-         * Kill llcd cache when thread is stopped and we're sure no 
+         * Kill llcd cache when thread is stopped and we're sure no
           * llcd in use left.
           */
          if (llcd_cache) {
@@ -688,7 +688,7 @@ void llog_recov_fini(void)
                   * In 2.6.22 cfs_mem_cache_destroy() will not return error
                   * for busy resources. Let's check it another way.
                   */
-                LASSERTF(atomic_read(&llcd_count) == 0, 
+                LASSERTF(atomic_read(&llcd_count) == 0,
                           "Can't destroy llcd cache! Number of "
                           "busy llcds: %d\n", atomic_read(&llcd_count));
                  cfs_mem_cache_destroy(llcd_cache);
diff --git a/lustre/ptlrpc/sec.c b/lustre/ptlrpc/sec.c

index 56de3ee..9db1780 100644 (file)
--- a/lustre/ptlrpc/sec.c
+++ b/lustre/ptlrpc/sec.c
@@ -2278,7 +2278,7 @@ struct ll_crypto_cipher *ll_crypto_alloc_blkcipher(const char * algname,
  {
          char        buf[CRYPTO_MAX_ALG_NAME + 1];
          const char *pan = algname;
-        u32         flag = 0; 
+        u32         flag = 0;
  
          if (strncmp("cbc(", algname, 4) == 0)
                  flag |= CRYPTO_TFM_MODE_CBC;
diff --git a/lustre/ptlrpc/sec_plain.c b/lustre/ptlrpc/sec_plain.c

index fd4e723..4f164c1 100644 (file)
--- a/lustre/ptlrpc/sec_plain.c
+++ b/lustre/ptlrpc/sec_plain.c
@@ -318,7 +318,7 @@ struct ptlrpc_sec *plain_create_sec(struct obd_import *imp,
          /*
           * initialize plain_sec
           */
-        plsec->pls_lock = RW_LOCK_UNLOCKED;
+        rwlock_init(&plsec->pls_lock);
          plsec->pls_ctx = NULL;
  
          sec = &plsec->pls_base;
diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c

index 76de4bc..2381e72 100644 (file)
--- a/lustre/ptlrpc/service.c
+++ b/lustre/ptlrpc/service.c
@@ -1099,12 +1099,14 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc,
                                      at_get(&svc->srv_at_estimate));
          }
  
-        rc = lu_context_init(&request->rq_session, LCT_SESSION);
+        rc = lu_context_init(&request->rq_session,
+                             LCT_SESSION|LCT_REMEMBER|LCT_NOREF);
          if (rc) {
                  CERROR("Failure to initialize session: %d\n", rc);
                  goto out_req;
          }
          request->rq_session.lc_thread = thread;
+        request->rq_session.lc_cookie = 0x5;
          lu_context_enter(&request->rq_session);
  
          CDEBUG(D_NET, "got req "LPU64"\n", request->rq_xid);
@@ -1457,12 +1459,14 @@ static int ptlrpc_main(void *arg)
                          goto out;
          }
  
-        rc = lu_context_init(&env.le_ctx, svc->srv_ctx_tags);
+        rc = lu_context_init(&env.le_ctx,
+                             svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF);
          if (rc)
                  goto out_srv_fini;
  
          thread->t_env = &env;
          env.le_ctx.lc_thread = thread;
+        env.le_ctx.lc_cookie = 0x6;
  
          /* Alloc reply state structure for this one */
          OBD_ALLOC_GFP(rs, svc->srv_max_reply_size, CFS_ALLOC_STD);
diff --git a/lustre/quota/quota_context.c b/lustre/quota/quota_context.c

index 2b7635f..92313d2 100644 (file)
--- a/lustre/quota/quota_context.c
+++ b/lustre/quota/quota_context.c
@@ -82,7 +82,7 @@ int should_translate_quota (struct obd_import *imp)
          ENTRY;
  
          LASSERT(imp);
-        if ((imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_QUOTA64) && 
+        if ((imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_QUOTA64) &&
              !OBD_FAIL_CHECK(OBD_FAIL_QUOTA_QD_COUNT_32BIT))
                  RETURN(0);
          else
@@ -161,7 +161,7 @@ int compute_remquota(struct obd_device *obd,
                  RETURN(QUOTA_RET_NOLIMIT);
  
          OBD_ALLOC_PTR(qctl);
-        if (qctl == NULL) 
+        if (qctl == NULL)
                  RETURN(-ENOMEM);
  
          /* get fs quota usage & limit */
@@ -173,7 +173,7 @@ int compute_remquota(struct obd_device *obd,
                  if (ret == -ESRCH)      /* no limit */
                          ret = QUOTA_RET_NOLIMIT;
                  else
-                        CDEBUG(D_QUOTA, "can't get fs quota usage! (rc:%d)", 
+                        CDEBUG(D_QUOTA, "can't get fs quota usage! (rc:%d)",
                                 ret);
                  GOTO(out, ret);
          }
@@ -395,10 +395,10 @@ static int split_before_schedule_dqacq(struct obd_device *obd, struct lustre_quo
          QDATA_DEBUG(qdata, "%s quota split.\n",
                      (qdata->qd_flags & QUOTA_IS_BLOCK) ? "block" : "inode");
          if (qdata->qd_flags & QUOTA_IS_BLOCK)
-                factor = MAX_QUOTA_COUNT32 / qctxt->lqc_bunit_sz * 
+                factor = MAX_QUOTA_COUNT32 / qctxt->lqc_bunit_sz *
                          qctxt->lqc_bunit_sz;
          else
-                factor = MAX_QUOTA_COUNT32 / qctxt->lqc_iunit_sz * 
+                factor = MAX_QUOTA_COUNT32 / qctxt->lqc_iunit_sz *
                          qctxt->lqc_iunit_sz;
  
          if (qctxt->lqc_import && should_translate_quota(qctxt->lqc_import) &&
@@ -470,17 +470,17 @@ dqacq_completion(struct obd_device *obd,
  
                  switch (opc) {
                  case QUOTA_DQACQ:
-                        CDEBUG(D_QUOTA, "%s(acq):count: %d, hardlimt: "LPU64 
-                               ",type: %s.\n", obd->obd_name, count, *hardlimit, 
+                        CDEBUG(D_QUOTA, "%s(acq):count: %d, hardlimt: "LPU64
+                               ",type: %s.\n", obd->obd_name, count, *hardlimit,
                                 qdata_type ? "grp": "usr");
                          INC_QLIMIT(*hardlimit, count);
                          break;
                  case QUOTA_DQREL:
-                        CDEBUG(D_QUOTA, "%s(rel):count: %d, hardlimt: "LPU64 
-                               ",type: %s.\n", obd->obd_name, count, *hardlimit, 
+                        CDEBUG(D_QUOTA, "%s(rel):count: %d, hardlimt: "LPU64
+                               ",type: %s.\n", obd->obd_name, count, *hardlimit,
                                 qdata_type ? "grp": "usr");
-                        LASSERTF(count < *hardlimit, 
-                                 "count: %d, hardlimit: "LPU64".\n", 
+                        LASSERTF(count < *hardlimit,
+                                 "count: %d, hardlimit: "LPU64".\n",
                                   count, *hardlimit);
                          *hardlimit -= count;
                          break;
@@ -538,7 +538,7 @@ out:
           *   - local dqacq/dqrel.
           *   - local disk io failure.
           */
-        if (err || (rc && rc != -EBUSY) || 
+        if (err || (rc && rc != -EBUSY) ||
              is_master(obd, qctxt, qdata->qd_id, qdata_type))
                  RETURN(err);
  
@@ -683,18 +683,18 @@ schedule_dqacq(struct obd_device *obd,
          }
  
         if (qdata->qd_flags & QUOTA_IS_BLOCK)
-               factor = MAX_QUOTA_COUNT32 / qctxt->lqc_bunit_sz * 
+               factor = MAX_QUOTA_COUNT32 / qctxt->lqc_bunit_sz *
                           qctxt->lqc_bunit_sz;
          else
-                factor = MAX_QUOTA_COUNT32 / qctxt->lqc_iunit_sz * 
+                factor = MAX_QUOTA_COUNT32 / qctxt->lqc_iunit_sz *
                           qctxt->lqc_iunit_sz;
  
-        LASSERT(!should_translate_quota(qctxt->lqc_import) || 
+        LASSERT(!should_translate_quota(qctxt->lqc_import) ||
                  qdata->qd_count <= factor);
          if (should_translate_quota(qctxt->lqc_import))
          {
                  struct qunit_data_old *reqdata_old, *tmp;
-                        
+
                  reqdata_old = req_capsule_client_get(&req->rq_pill,
                                                       &RMF_QUNIT_DATA);
  
@@ -720,7 +720,7 @@ schedule_dqacq(struct obd_device *obd,
          aa->aa_qunit = qunit;
  
          req->rq_interpret_reply = dqacq_interpret;
-        ptlrpcd_add_req(req);
+        ptlrpcd_add_req(req, PSCOPE_OTHER);
  
          QDATA_DEBUG(qdata, "%s scheduled.\n",
                      opc == QUOTA_DQACQ ? "DQACQ" : "DQREL");
@@ -755,7 +755,7 @@ qctxt_adjust_qunit(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
                  qdata[i].qd_id = id[i];
                  qdata[i].qd_flags = 0;
                  qdata[i].qd_flags |= i;
-                qdata[i].qd_flags |= isblk ? QUOTA_IS_BLOCK : 0;        
+                qdata[i].qd_flags |= isblk ? QUOTA_IS_BLOCK : 0;
                  qdata[i].qd_count = 0;
  
                  ret = check_cur_qunit(obd, qctxt, &qdata[i]);
@@ -763,7 +763,7 @@ qctxt_adjust_qunit(struct obd_device *obd, struct lustre_quota_ctxt *qctxt,
                          int opc;
                          /* need acquire or release */
                          opc = ret == 1 ? QUOTA_DQACQ : QUOTA_DQREL;
-                        ret = split_before_schedule_dqacq(obd, qctxt, &qdata[i], 
+                        ret = split_before_schedule_dqacq(obd, qctxt, &qdata[i],
                                                            opc, wait);
                          if (!rc)
                                  rc = ret;
@@ -907,7 +907,7 @@ static int qslave_recovery_main(void *arg)
  
                  LASSERT(dqopt->files[type] != NULL);
                  CFS_INIT_LIST_HEAD(&id_list);
-#ifndef KERNEL_SUPPORTS_QUOTA_READ 
+#ifndef KERNEL_SUPPORTS_QUOTA_READ
                  rc = fsfilt_qids(obd, dqopt->files[type], NULL, type, &id_list);
  #else
                  rc = fsfilt_qids(obd, NULL, dqopt->files[type], type, &id_list);
diff --git a/lustre/tests/fsx.c b/lustre/tests/fsx.c

index dc97fd9..af93599 100644 (file)
--- a/lustre/tests/fsx.c
+++ b/lustre/tests/fsx.c
@@ -69,7 +69,7 @@ struct log_entry {
         int     args[3];
  };
  
-#define        LOGSIZE 1000
+#define        LOGSIZE 100000
  
  struct log_entry       oplog[LOGSIZE]; /* the log */
  int                    logptr = 0;     /* current position in log */
@@ -299,7 +299,7 @@ save_buffer(char *buffer, off_t bufferlength, int fd)
                         prterr("save_buffer: lseek eof");
                 else if (bufferlength > size_by_seek) {
                         warn("save_buffer: .fsxgood file too short... will"
-                               "save 0x%llx bytes instead of 0x%llx\n", 
+                               "save 0x%llx bytes instead of 0x%llx\n",
                                 (unsigned long long)size_by_seek,
                                 (unsigned long long)bufferlength);
                         bufferlength = size_by_seek;
@@ -402,7 +402,7 @@ enum fd_iteration_policy {
  int fd_policy = FD_RANDOM;
  int fd_last = 0;
  
-struct test_file * 
+struct test_file *
  get_tf(void)
  {
         unsigned index = 0;
@@ -471,7 +471,7 @@ open_test_files(char **argv, int argc)
         for (i = 0, tf = test_files; i < num_test_files; i++, tf++) {
  
                 tf->path = argv[i];
-               tf->fd = open(tf->path, O_RDWR|(lite ? 0 : O_CREAT|O_TRUNC), 
+               tf->fd = open(tf->path, O_RDWR|(lite ? 0 : O_CREAT|O_TRUNC),
                                 0666);
                 if (tf->fd < 0) {
                         prterr(tf->path);
@@ -575,7 +575,7 @@ alloc_tf_buf(void)
         }
  }
  
-char * 
+char *
  fill_tf_buf(struct test_file *tf)
  {
         if (tf_buf == NULL)
@@ -586,7 +586,7 @@ fill_tf_buf(struct test_file *tf)
  }
  
  void
-output_line(struct test_file *tf, int op, unsigned offset, 
+output_line(struct test_file *tf, int op, unsigned offset,
                 unsigned size, struct timeval *tv)
  {
         char *tf_num = "";
@@ -613,7 +613,7 @@ output_line(struct test_file *tf, int op, unsigned offset,
  
         prt("%06lu %lu.%06lu %.*s%-10s %#08x %s %#08x\t(0x%x bytes)\n",
                 testcalls, tv->tv_sec, tv->tv_usec, max_tf_len,
-               tf_num, ops[op], 
+               tf_num, ops[op],
                 offset, op == OP_TRUNCATE ? " to " : "thru",
                 offset + size - 1, size);
  }
@@ -972,7 +972,7 @@ writefileimage()
                         prterr("writefileimage: write");
                 else
                         prt("short write: 0x%lx bytes instead of 0x%llx\n",
-                           (unsigned long)iret, 
+                           (unsigned long)iret,
                             (unsigned long long)file_size);
                 report_failure(172);
         }
@@ -1185,7 +1185,7 @@ main(int argc, char **argv)
  
         setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */
  
-       while ((ch = getopt(argc, argv, 
+       while ((ch = getopt(argc, argv,
                                 "b:c:dl:m:no:p:qr:s:t:w:D:I:LN:OP:RS:W"))
                != EOF)
                 switch (ch) {
diff --git a/lustre/tests/it_test.c b/lustre/tests/it_test.c

index 0b394a0..714828a 100644 (file)
--- a/lustre/tests/it_test.c
+++ b/lustre/tests/it_test.c
@@ -90,7 +90,7 @@ static enum interval_iter cb(struct interval_node *n, void *args)
                  error("duplicate node accessing found\n");
                  return INTERVAL_ITER_STOP;
          }
-        
+
          if (node->valid == 0) {
                  error("A deleted node "__S" being accessed\n",
                         __F(&n->in_extent));
@@ -128,23 +128,23 @@ static int it_test_search(struct interval_node *root)
                  interval_search(root, &ext, cb, NULL);
  
                  dprintf("\nverifing ...");
-        
+
                  /* verify */
                  for (i = 0; i < it_count; i++) {
                          n = &it_array[i];
                          if (n->valid == 0)
                                  continue;
  
-                        if (extent_overlapped(&ext, &n->node.in_extent) && 
+                        if (extent_overlapped(&ext, &n->node.in_extent) &&
                              n->hit == 0)
                                  error("node "__S" overlaps" __S","
-                                      "but never to be hit.\n", 
+                                      "but never to be hit.\n",
                                        __F(&n->node.in_extent),
                                        __F(&ext));
  
-                        if (!extent_overlapped(&ext, &n->node.in_extent) && 
+                        if (!extent_overlapped(&ext, &n->node.in_extent) &&
                              n->hit)
-                                error("node "__S" overlaps" __S", but hit.\n", 
+                                error("node "__S" overlaps" __S", but hit.\n",
                                        __F(&n->node.in_extent),
                                        __F(&ext));
                  }
@@ -285,7 +285,7 @@ err:
          }
          if (nr)
                  error("wrong tree, unbalanced!\n");
-        
+
          return 0;
  }
  
@@ -341,7 +341,7 @@ static int it_test_search_hole(struct interval_node *root)
          return 0;
  }
  
-static int contended_count = 0; 
+static int contended_count = 0;
  #define LOOP_COUNT 1000
  static enum interval_iter perf_cb(struct interval_node *n, void *args)
  {
@@ -356,7 +356,7 @@ static inline long tv_delta(struct timeval *s, struct timeval *e)
          long c = e->tv_sec - s->tv_sec;
          c *= 1000;
          c += (long int)(e->tv_usec - s->tv_usec) / 1000;
-        dprintf("\tStart: %lu:%lu -> End: %lu:%lu\n", 
+        dprintf("\tStart: %lu:%lu -> End: %lu:%lu\n",
                  s->tv_sec, s->tv_usec, e->tv_sec, e->tv_usec);
          return c;
  }
@@ -368,7 +368,7 @@ static int it_test_performance(struct interval_node *root, unsigned long len)
          struct it_node *n;
          struct timeval start, end;
          unsigned long count;
-        
+
          ext.start = (random() % (max_count - len)) & ALIGN_MASK;
          ext.end = (ext.start + len) & ALIGN_MASK;
          if (have_wide_lock) {
@@ -422,7 +422,7 @@ static struct interval_node *it_test_helper(struct interval_node *root)
                  if (n->valid) {
                          if (!interval_find(root, &n->node.in_extent))
                                  error("Cannot find an existent node\n");
-                        dprintf("Erasing a node "__S"\n", 
+                        dprintf("Erasing a node "__S"\n",
                                  __F(&n->node.in_extent));
                          interval_erase(&n->node, &root);
                          n->valid = 0;
@@ -436,7 +436,7 @@ static struct interval_node *it_test_helper(struct interval_node *root)
                          interval_set(&n->node, low, high);
                          while (interval_insert(&n->node, &root))
                                  interval_set(&n->node, low, ++high);
-                        dprintf("Adding a node "__S"\n", 
+                        dprintf("Adding a node "__S"\n",
                                  __F(&n->node.in_extent));
                          n->valid = 1;
                          list_add(&n->list, &header);
diff --git a/lustre/tests/kbuild b/lustre/tests/kbuild

new file mode 100755 (executable)

index 0000000..4630d82
--- /dev/null
+++ b/lustre/tests/kbuild
@@ -0,0 +1,312 @@
+#! /bin/bash
+
+#
+#  lustre/lustre/tests/kbuild
+#
+#  Copyright (C) 2005 Cluster File Systems, Inc.
+#
+#  Author: Nikita Danilov <nikita@clusterfs.com>
+#
+#  This file is part of Lustre, http://www.lustre.org.
+#
+#         Lustre is free  software; you can  redistribute it and/or  modify it
+#         under the terms of  version 2 of  the GNU General Public License  as
+#         published by the Free Software Foundation.
+#
+#         Lustre  is distributed  in the  hope  that it  will  be useful,  but
+#         WITHOUT  ANY   WARRANTY; without  even    the  implied  warranty  of
+#         MERCHANTABILITY or FITNESS FOR   A PARTICULAR PURPOSE.  See the  GNU
+#         General Public License for more details.
+#
+#         You  should have received a copy  of the  GNU General Public License
+#         along with  Lustre; if not, write to  the Free  Software Foundation,
+#         Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+#
+#  kbuild is a swiss-army linux kernel build script. Its purpose is to run
+#  automated kernel builds on given target file system (presumably Lustre) to
+#  measure file system performance and, occasionally, correctness.
+#
+#  A usual kernel build does not stress the file system, because the bottleneck
+#  is CPU consumption by the user level (compiler). To work around this,
+#  kbuild uses ccache(1) that eliminates most of CPU load by the compiler,
+#  once the cache is primed.
+#
+#  Options:
+
+function usage()
+{       # Print the help text (defaults are read from the script variables), exit 1.
+        cat <<EOF
+       $pname --- builds a kernel.
+
+Usage: $pname [-s <source>]         \\
+              [-t <target>]         \\
+              [-m <make-options>]   \\
+              [-i <iterations>]     \\
+              [-v]                  \\
+              [-c <config-target>]  \\
+              [-S]                  \\
+              [-C <config-file>]
+
+   -s <source>        source of kernel to build. This can be:
+
+                        . path to directory;
+
+                        . tar.gz, .tgz, or .tar.bz2 archive;
+
+                        . ftp or http URL to the source archive;
+
+                      defaults to "$src".
+
+   -t <target>        target directory, where build process takes place.
+                      Defaults to "$tgt".
+
+   -m <make-options>  additional options supplied to each make invocation.
+                      Defaults to "$mopt"
+
+   -c <config-target> kernel makefile target to invoke to configure kernel
+                      (defconfig, allyesconfig, allmodconfig, etc.). This
+                      option conflicts with -C <config-file>. Defaults to
+                      "$mconfig".
+
+   -C <config-file>   use given .config file as kernel configuration. Not
+                      used by default.
+
+   -S                 skip kernel copying: kernel source is already unpacked
+                      in <target>. Defaults to false.
+
+   -v                 increase verbosity level (may be repeated).
+
+Examples:
+
+  $pname -s /usr/src/linux-2.6.10-base.tar.gz -t /mnt/lustre2 \\
+         -m -j4 -C /usr/src/.config.fc3
+
+  $pname -s ftp://ftp.clusterfs.com/pub/kernels/fc3-2.6/linux-2.6.10-base.tgz \\
+         -m -j4 -c defconfig -vvv
+
+EOF
+        exit 1
+}
+
+#
+#  Results:
+#
+#  The output of kbuild are times as reported by time. First line is for build
+#  that fills the ccache cache (that is also located on the target file
+#  system). Subsequent lines are repeated builds that reuse the ccache
+#  cache. The number of iterations is set with the -i option. Example output:
+#  
+#  R 783.757 S 319.615 U 281.720
+#  R 540.823 S 277.387 U 54.168
+#  R 557.762 S 263.566 U 53.222
+#  R 543.877 S 278.569 U 54.412
+#  R 544.455 S 279.096 U 53.697
+#  R 545.445 S 280.546 U 53.943
+#
+#  Notes:
+#
+#  Kernel builds can be quite slow as example output above shows. Create your
+#  own .config file to build smaller kernel.
+#
+#
+
+LOG_CRIT=0
+LOG_ERROR=1
+LOG_WARN=2
+LOG_INFO=3
+LOG_PROGRESS=4
+LOG_TRACE=5
+LOG_ALL=6
+LOG_DEBUG=7
+
+# Defaults are assigned before getopt runs because usage() prints them.
+src=/usr/src/linux
+tgt=/mnt/lustre
+verbose=$LOG_CRIT
+pname=$(basename "$0")
+
+mopt=""
+mconfig=allyesconfig
+it=3
+lfile=/tmp/$pname-tmp-log.$$
+skip_copy=0
+conf_file=""
+
+OPTVAL=`getopt -o s:m:i:t:vc:SC:h -n "$pname" -- "$@"` || usage
+
+# Note the quotes around `$OPTVAL': they are essential!
+eval set -- "$OPTVAL"
+
+while : ;do
+        case "$1" in
+                -s)
+                        src="$2"
+                        shift 2
+                ;;
+                -t)
+                        tgt="$2"
+                        shift 2
+                ;;
+                -m)
+                        mopt="$2"
+                        shift 2
+                ;;
+                -C)
+                        conf_file="$2"
+                        shift 2
+                ;;
+                -i)
+                        it="$2"
+                        shift 2
+                ;;
+                -c)
+                        mconfig="$2"
+                        shift 2
+                ;;
+                -S)
+                        skip_copy=1
+                        shift
+                ;;
+                -v)
+                        verbose=$(($verbose + 1))
+                        shift
+                ;;
+                -h)
+                        usage
+                ;;
+                --)
+                        shift
+                        break
+                ;;
+                *)
+                        echo "Internal error!"
+                        usage
+                ;;
+        esac
+done
+
+[ $verbose -ge $LOG_ALL ] && set -x
+
+
+function warning()
+{       # Print the arguments as one warning line; quoted so they are not globbed.
+        echo "WARNING $pname: $*"
+}
+
+function fail()
+{
+        # $1 is the exit status; the remaining arguments form the message.
+        local rc=$1
+
+        shift
+        warning "$* ... failing."
+        exit $rc
+}
+
+function log()
+{
+        # $1 is the level of this message; the rest is the message itself.
+        # Nothing is printed unless the current verbosity reaches that level.
+        local level=$1
+
+        shift
+        [ $verbose -ge $level ] || return 0
+        echo $*
+}
+
+function doquiet()
+{
+        # Run the given command with its output appended to $lfile; fail on error.
+        local cmd arg
+        cmd="$*"
+        echo >> "$lfile"
+        echo "---- start: $(date +"%Y-%m-%d %H:%M:%S") ----" >> "$lfile"
+        for arg in "$@" ;do
+                echo "ARG: $arg" >> "$lfile"
+        done
+        log $LOG_PROGRESS "Running '$cmd'..."
+        "$@" >>"$lfile" 2>&1 || \
+                fail 1 "Errors while running '$cmd'. See $lfile for transcript"
+        log $LOG_PROGRESS "Finished '$cmd'."
+        echo "---- done: $(date +"%Y-%m-%d %H:%M:%S") ----" >> "$lfile"
+}
+
+function dotime()
+{
+        # Time the given command; TIMEFORMAT selects the "R .. S .. U .." line.
+        local timed="$*"
+
+        export TIMEFORMAT="R %3R S %3S U %3U"
+        time $timed
+}
+
+ccache_dir=$tgt/ccache_dir
+cc_script=$tgt/cc_script
+
+which ccache >/dev/null || fail 2 "No ccache found"
+mkdir -p "$ccache_dir" || fail 3 "Cannot create $ccache_dir"
+
+export CCACHE_DIR=$ccache_dir
+
+# Everything below runs inside the target directory.
+
+cd "$tgt" || fail 4 "Cannot cd into $tgt"
+
+echo '#! /bin/sh'     >  "$cc_script" || fail 5 "Cannot write into $cc_script"
+echo 'ccache cc "$@"' >> "$cc_script" || fail 6 "Cannot append to $cc_script"
+chmod u+rx "$cc_script" || fail 7 "Cannot chmod u+rx $cc_script"
+
+cc_opt="CC=$cc_script"
+
+[ $verbose -ge $LOG_TRACE ] && vopt=-v
+
+if [ $skip_copy -eq 0 ] ;then
+        case "$src" in
+        ftp://*|http://*)
+                wget -c "$src" || fail 9 "Cannot fetch $src"
+                src=$(basename "$src")
+                ;;
+        esac
+        [ -d "$src" ] && src="${src%/}/"   # directories match the */ case below
+        case "$src" in
+        */)
+                log $LOG_PROGRESS "Copying directory $src into $tgt"
+                cp -a $vopt "$src" . || fail 11 "Cannot copy $src"
+                ;;
+        *.tar.gz|*.tgz)
+                tar xzf "$src" $vopt || fail 12 "Cannot unpack $src"
+                ;;
+        *.tar.bz2)
+                tar xjf "$src" $vopt || fail 12 "Cannot unpack $src"
+                ;;
+        *)
+                fail 10 "No $src"
+                ;;
+        esac
+fi
+
+cd linux-* || fail 20 "Cannot change to linux-* from $PWD"
+
+function dokernel()
+{       # One full cycle: clean the tree, configure it, then build it.
+        doquiet make $mopt mrproper
+        if [ -z "$conf_file" ] ;then
+                doquiet make $mopt $mconfig
+        else
+                cp "$conf_file" .config || fail 8 "Cannot copy $conf_file"
+                ls -l .config
+                doquiet make $mopt oldconfig
+        fi
+        # Only the compile step is timed; it is the one given the ccache wrapper.
+        dotime doquiet make $mopt $cc_opt bzImage modules
+}
+
+log $LOG_PROGRESS "Fill the cache..."
+
+dokernel
+
+for pass in $(seq 1 $it) ;do
+        log $LOG_PROGRESS "Iteration $pass..."
+        dokernel
+done
diff --git a/lustre/tests/lockorder.sh b/lustre/tests/lockorder.sh

index 4f1ca4b..4d4e7e1 100644 (file)
--- a/lustre/tests/lockorder.sh
+++ b/lustre/tests/lockorder.sh
@@ -42,7 +42,7 @@ while [ $MINRES -gt $MAXRES ]; do
                 MAXDIR=$DIRTMP
                 MAXRES=$DIRRES
         fi
-       if [ $FILERES -lt $MINRES ]; then
+       if [ $FILERES -lt $MINRES -o -z "$MINFILE" ]; then
                 [ -f "$MINFILE" ] && rm $MINFILE
                 MINFILE=$FILETMP
                 MINRES=$FILERES
diff --git a/lustre/tests/multifstat.c b/lustre/tests/multifstat.c

index c305acc..91c0d73 100644 (file)
--- a/lustre/tests/multifstat.c
+++ b/lustre/tests/multifstat.c
@@ -84,7 +84,7 @@ int main(int argc, char **argv)
  
          if ( st1.st_size != st2.st_size ) {
                  printf("Sizes don't match %lu, %lu\n",
-                       (unsigned long)st1.st_size, 
+                       (unsigned long)st1.st_size,
                        (unsigned long)st2.st_size);
                  return 1;
          }
diff --git a/lustre/tests/sanity-nano.sh b/lustre/tests/sanity-nano.sh

new file mode 100755 (executable)

index 0000000..2005b0b
--- /dev/null
+++ b/lustre/tests/sanity-nano.sh
@@ -0,0 +1,29 @@
+#! /bin/bash
+#
+# Extremely minimal regression test set for clio.
+#
+
+MOUNT=${MOUNT:-"/mnt/lustre"}
+
+function cmpcheck() {
+    # Compare every regular file under /etc with its copy under $MOUNT.
+    find /etc/ -type f | while IFS= read -r f ;do
+        echo -n .
+        cmp "$f" "$MOUNT/$f"
+    done
+}
+
+cp -vax /etc "$MOUNT"                                 || exit 1
+cmpcheck
+
+export OSTCOUNT=2
+#export LOV="27c 27d 27e 27f 27g 27j 27k 27l 27m 27s 27t 27w 34f 51d 56 56g 56h"
+#export JOIN="75a 75b 57c 75d 75e 75f 75g"
+#export CHKSUM="77a 77d 77e 77f"
+#export DIO="69 77d 77e 77f 78 119a 119b 119c"
+#export EXCEPT="69 78 118a 129 $JOIN $CHKSUM $DIO"
+#export EXCEPT="77f"
+export SLOW="yes"
+
+sh sanity.sh
+#umount $MOUNT                                        || exit 2
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh

index 95a6ac5..f3ba16d 100644 (file)
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -3163,6 +3163,7 @@ test_72() { # bug 5695 - Test that on 2.6 remove_suid works properly
         cancel_lru_locks mdc
         test -u $DIR/f72 -o -g $DIR/f72 && error "S/gid is not dropped on MDS"
         true
+       rm -f $DIR/f72
  }
  run_test 72 "Test that remove suid works properly (bug5695) ===="
  
@@ -3205,6 +3206,7 @@ test_74a() { # bug 6149, 6184
         ls $DIR/f74a
         lctl set_param fail_loc=0
         true
+       rm -f $DIR/f74a
  }
  run_test 74a "ldlm_enqueue freed-export error path, ls (shouldn't LBUG)"
  
@@ -3218,6 +3220,7 @@ test_74b() { # bug 13310
         touch $DIR/f74b
         lctl set_param fail_loc=0
         true
+       rm -f $DIR/f74b
  }
  run_test 74b "ldlm_enqueue freed-export error path, touch (shouldn't LBUG)"
  
@@ -3410,6 +3413,7 @@ test_77a() { # bug 10889
         set_checksums 1
         dd if=$F77_TMP of=$DIR/$tfile bs=1M count=$F77SZ || error "dd error"
         set_checksums 0
+       rm -f $DIR/$tfile
  }
  run_test 77a "normal checksum read/write operation ============="
  
@@ -3422,6 +3426,7 @@ test_77b() { # bug 10889
                 error "dd error: $?"
         lctl set_param fail_loc=0
         set_checksums 0
+       rm -f $DIR/f77b
  }
  run_test 77b "checksum error on client write ===================="
  
@@ -3552,7 +3557,12 @@ test_78() { # bug 10901
         echo "MemTotal: $MEMTOTAL"
  # reserve 256MB of memory for the kernel and other running processes,
  # and then take 1/2 of the remaining memory for the read/write buffers.
-       MEMTOTAL=$(((MEMTOTAL - 256 ) / 2))
+    if [ $MEMTOTAL -gt 512 ] ;then
+        MEMTOTAL=$(((MEMTOTAL - 256 ) / 2))
+    else
+        # for those poor memory-starved high-end clusters...
+        MEMTOTAL=$((MEMTOTAL / 2))
+    fi
         echo "Mem to use for directio: $MEMTOTAL"
         [ $F78SIZE -gt $MEMTOTAL ] && F78SIZE=$MEMTOTAL
         [ $F78SIZE -gt 512 ] && F78SIZE=512
@@ -3562,11 +3572,12 @@ test_78() { # bug 10901
         [ $SMALLESTOST -lt 10240 ] && \
                 skip "too small OSTSIZE, useless to run large O_DIRECT test" && return 0
  
-       [ $F78SIZE -gt $((SMALLESTOST * $OSTCOUNT / 1024 - 5)) ] && \
-               F78SIZE=$((SMALLESTOST * $OSTCOUNT / 1024 - 5))
+       [ $F78SIZE -gt $((SMALLESTOST * $OSTCOUNT / 1024 - 80)) ] && \
+               F78SIZE=$((SMALLESTOST * $OSTCOUNT / 1024 - 80))
+
         [ "$SLOW" = "no" ] && NSEQ=1 && [ $F78SIZE -gt 32 ] && F78SIZE=32
         echo "File size: $F78SIZE"
-       $SETSTRIPE $DIR/$tfile -c -1 || error "setstripe failed"
+       $SETSTRIPE $DIR/$tfile -c $OSTCOUNT || error "setstripe failed"
         for i in `seq 1 $NSEQ`
         do
                 FSIZE=$(($F78SIZE / ($NSEQ - $i + 1)))
@@ -3619,6 +3630,7 @@ test_80() { # bug 10718
                  error "elapsed for 1M@1T = $DIFF"
          fi
          true
+       rm -f $DIR/$tfile
  }
  run_test 80 "Page eviction is equally fast at high offsets too  ===="
  
@@ -3685,6 +3697,7 @@ test_99f() {
         [ ! -d $DIR/d99cvsroot ] && test_99d
         cd $DIR/d99reposname
         $RUNAS cvs commit -m 'nomsg' foo99
+    rm -fr $DIR/d99cvsroot
  }
  run_test 99f "cvs commit ======================================="
  
@@ -3787,6 +3800,7 @@ cleanup_test101() {
         [ "$SETUP_TEST101" = "yes" ] || return
         trap 0
         rm -rf $DIR/$tdir
+    rm -f $DIR/$tfile
         SETUP_TEST101=no
  }
  
@@ -3948,6 +3962,7 @@ test_102b() {
         local stripe_count=`grep "count"  $tmp_file| awk '{print $2}'`
         [ "$stripe_size" -eq 65536 ] || error "stripe size $stripe_size != 65536"
         [ "$stripe_count" -eq 2 ] || error "stripe count $stripe_count != 2"
+       rm -f $DIR/$tfile
  }
  run_test 102b "getfattr/setfattr for trusted.lov EAs ============"
  
@@ -4148,6 +4163,7 @@ test_102h() { # bug 15777
                 error "$XBIG different after growing $XSML"
         fi
         log "$XBIG still valid after growing $XSML"
+       rm -f $file
  }
  run_test 102h "grow xattr from inside inode to external block"
  
@@ -4225,6 +4241,7 @@ test_104() {
         lfs df || error "lfs df with deactivated OSC failed"
         lctl --device %$OSC recover
         lfs df || error "lfs df with reactivated OSC failed"
+       rm -f $DIR/$tfile
  }
  run_test 104 "lfs df [-ih] [path] test ========================="
  
@@ -4237,6 +4254,7 @@ test_105a() {
          else
                  flocks_test 1 off -f $DIR/$tfile || error "fail flock off"
          fi
+       rm -f $DIR/$tfile
  }
  run_test 105a "flock when mounted without -o flock test ========"
  
@@ -4248,6 +4266,7 @@ test_105b() {
          else
                  flocks_test 1 off -c $DIR/$tfile || error "fail flock off"
          fi
+       rm -f $DIR/$tfile
  }
  run_test 105b "fcntl when mounted without -o flock test ========"
  
@@ -4259,6 +4278,7 @@ test_105c() {
          else
                  flocks_test 1 off -l $DIR/$tfile || error "fail flock off"
          fi
+       rm -f $DIR/$tfile
  }
  run_test 105c "lockf when mounted without -o flock test ========"
  
@@ -4320,6 +4340,7 @@ test_110() {
         touch $DIR/d110/yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy && error ""create with 256 char should fail, but not
  
         ls -l $DIR/d110
+    rm -fr $DIR/d110
  }
  run_test 110 "filename length checking"
  
@@ -4462,6 +4483,7 @@ test_117() # bug 10891
          > $DIR/$tfile || error "truncate failed"
          lctl set_param fail_loc=0
          echo "Truncate succeeded."
+       rm -f $DIR/$tfile
  }
  run_test 117 "verify fsfilt_extend =========="
  
@@ -4498,6 +4520,7 @@ test_118a() #bug 11710
                 error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK"
                 return 1;
          fi
+       rm -f $DIR/$tfile
  }
  run_test 118a "verify O_SYNC works =========="
  
@@ -5646,6 +5669,49 @@ test_130e() {
  }
  run_test 130e "FIEMAP (test continuation FIEMAP calls)"
  
+# Test for writev/readv
+test_131a() {
+       rwv -f $DIR/$tfile -w -n 3 524288 1048576 1572864 || \
+       error "writev test failed"
+       rwv -f $DIR/$tfile -r -v -n 2 1572864 1048576 || \
+       error "readv failed"
+       rm -f $DIR/$tfile
+}
+run_test 131a "test iov's crossing stripe boundary for writev/readv"
+
+test_131b() {
+       rwv -f $DIR/$tfile -w -a -n 3 524288 1048576 1572864 || \
+       error "append writev test failed"
+       rwv -f $DIR/$tfile -w -a -n 2 1572864 1048576 || \
+       error "append writev test failed"
+       rm -f $DIR/$tfile
+}
+run_test 131b "test append writev"
+
+test_131c() {
+       rwv -f $DIR/$tfile -w -d -n 1 1048576 || return 0
+       error "NOT PASS"
+}
+run_test 131c "test read/write on file w/o objects"
+
+test_131d() { # ask for 2MB in 3 iovecs from a 1.5MB file
+       rwv -f $DIR/$tfile -w -n 1 1572864 || error "write failed"
+       NOB=`rwv -f $DIR/$tfile -r -n 3 524288 524288 1048576 | awk '/error/ {print $6}'`
+       if [ "$NOB" != 1572864 ]; then
+               error "Short read failed: read $NOB bytes instead of 1572864"
+       fi
+       rm -f $DIR/$tfile
+}
+run_test 131d "test short read"
+
+test_131e() { # the read below is only meaningful if the setup write succeeded
+       rwv -f $DIR/$tfile -w -s 1048576 -n 1 1048576 || error "write failed"
+       rwv -f $DIR/$tfile -r -z -s 0 -n 1 524288 || \
+       error "read hitting hole failed"
+       rm -f $DIR/$tfile
+}
+run_test 131e "test read hitting hole"
+
  test_140() { #bug-17379
          mkdir -p $DIR/$tdir || error "Creating dir $DIR/$tdir"
          cd $DIR/$tdir || error "Changing to $DIR/$tdir"
@@ -5915,6 +5981,15 @@ test_200i() {
  }
  run_test 200i "Remove a pool ============================================"
  
+test_212() { # copy a file of 1..8192 KB (size derived from the clock) via sendfile
+       local size=`date +%s`
+       size=$((size % 8192 + 1))
+       dd if=/dev/urandom of=$DIR/f212 bs=1k count=$size || error "dd failed"
+       sendfile $DIR/f212 $DIR/f212.xyz || error "sendfile wrong"
+       rm -f $DIR/f212 $DIR/f212.xyz
+}
+run_test 212 "Sendfile test ============================================"
+
  TMPDIR=$OLDTMPDIR
  TMP=$OLDTMP
  HOME=$OLDHOME
diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh

index d4b2ac8..6c773fd 100644 (file)
--- a/lustre/tests/sanityN.sh
+++ b/lustre/tests/sanityN.sh
@@ -629,35 +629,35 @@ run_test 31b "voluntary OST cancel / blocking ast race=============="
  
  # enable/disable lockless truncate feature, depending on the arg 0/1
  enable_lockless_truncate() {
-        lctl set_param -n llite.*.lockless_truncate $1
+        lctl set_param -n osc.*.lockless_truncate $1
  }
  
  test_32a() { # bug 11270
          local p="$TMP/sanityN-$TESTNAME.parameters"
-        save_lustre_params $HOSTNAME llite.*.lockless_truncate > $p
+        save_lustre_params $HOSTNAME osc.*.lockless_truncate > $p
          cancel_lru_locks osc
-        clear_llite_stats
+        clear_osc_stats
          enable_lockless_truncate 1
          dd if=/dev/zero of=$DIR1/$tfile count=10 bs=1M > /dev/null 2>&1
  
          log "checking cached lockless truncate"
          $TRUNCATE $DIR1/$tfile 8000000
          $CHECKSTAT -s 8000000 $DIR2/$tfile || error "wrong file size"
-        [ $(calc_llite_stats lockless_truncate) -eq 0 ] ||
+        [ $(calc_osc_stats lockless_truncate) -eq 0 ] ||
                  error "lockless truncate doesn't use cached locks"
  
          log "checking not cached lockless truncate"
          $TRUNCATE $DIR2/$tfile 5000000
          $CHECKSTAT -s 5000000 $DIR1/$tfile || error "wrong file size"
-        [ $(calc_llite_stats lockless_truncate) -ne 0 ] ||
+        [ $(calc_osc_stats lockless_truncate) -ne 0 ] ||
                  error "not cached trancate isn't lockless"
  
          log "disabled lockless truncate"
          enable_lockless_truncate 0
-        clear_llite_stats
+        clear_osc_stats
          $TRUNCATE $DIR2/$tfile 3000000
          $CHECKSTAT -s 3000000 $DIR1/$tfile || error "wrong file size"
-        [ $(calc_llite_stats lockless_truncate) -eq 0 ] ||
+        [ $(calc_osc_stats lockless_truncate) -eq 0 ] ||
                  error "lockless truncate disabling failed"
          rm $DIR1/$tfile
          # restore lockless_truncate default values
@@ -671,36 +671,36 @@ test_32b() { # bug 11270
  
          local node
          local p="$TMP/sanityN-$TESTNAME.parameters"
-        save_lustre_params $HOSTNAME "llite.*.contention_seconds" > $p
+        save_lustre_params $HOSTNAME "osc.*.contention_seconds" > $p
          for node in $(osts_nodes); do
                  save_lustre_params $node "ldlm.namespaces.filter-*.max_nolock_bytes" >> $p
                  save_lustre_params $node "ldlm.namespaces.filter-*.contended_locks" >> $p
                  save_lustre_params $node "ldlm.namespaces.filter-*.contention_seconds" >> $p
          done
-        clear_llite_stats
+        clear_osc_stats
          # agressive lockless i/o settings 
          for node in $(osts_nodes); do
                  do_node $node 'lctl set_param -n ldlm.namespaces.filter-*.max_nolock_bytes 2000000; lctl set_param -n ldlm.namespaces.filter-*.contended_locks 0; lctl set_param -n ldlm.namespaces.filter-*.contention_seconds 60'
          done
-        lctl set_param -n llite.*.contention_seconds 60
+        lctl set_param -n osc.*.contention_seconds 60
          for i in $(seq 5); do
                  dd if=/dev/zero of=$DIR1/$tfile bs=4k count=1 conv=notrunc > /dev/null 2>&1
                  dd if=/dev/zero of=$DIR2/$tfile bs=4k count=1 conv=notrunc > /dev/null 2>&1
          done
-        [ $(calc_llite_stats lockless_write_bytes) -ne 0 ] || error "lockless i/o was not triggered" 
+        [ $(calc_osc_stats lockless_write_bytes) -ne 0 ] || error "lockless i/o was not triggered" 
          # disable lockless i/o (it is disabled by default)
          for node in $(osts_nodes); do
                  do_node $node 'lctl set_param -n ldlm.namespaces.filter-*.max_nolock_bytes 0; lctl set_param -n ldlm.namespaces.filter-*.contended_locks 32; lctl set_param -n ldlm.namespaces.filter-*.contention_seconds 0'
          done
          # set contention_seconds to 0 at client too, otherwise Lustre still
          # remembers lock contention
-        lctl set_param -n llite.*.contention_seconds 0
-        clear_llite_stats
-        for i in $(seq 5); do
+        lctl set_param -n osc.*.contention_seconds 0
+        clear_osc_stats
+        for i in $(seq 1); do
                  dd if=/dev/zero of=$DIR1/$tfile bs=4k count=1 conv=notrunc > /dev/null 2>&1
                  dd if=/dev/zero of=$DIR2/$tfile bs=4k count=1 conv=notrunc > /dev/null 2>&1
          done
-        [ $(calc_llite_stats lockless_write_bytes) -eq 0 ] ||
+        [ $(calc_osc_stats lockless_write_bytes) -eq 0 ] ||
                  error "lockless i/o works when disabled" 
          rm -f $DIR1/$tfile
          restore_lustre_params <$p
diff --git a/lustre/tests/sendfile.c b/lustre/tests/sendfile.c

index 21ae58a..5cfa110 100644 (file)
--- a/lustre/tests/sendfile.c
+++ b/lustre/tests/sendfile.c
@@ -1,3 +1,38 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
  
  #include <stdio.h>
  #include <unistd.h>
@@ -20,85 +55,99 @@
  
  int main(int argc, char *argv[])
  {
-       char *sfile, *tfile;
-       struct stat stbuf;
-       int size;
-       int infd, outfd;
-       int sd[2];
-       int rc;
-       char *buf;
-       char cmd[1024];
-       int page_size = sysconf(_SC_PAGESIZE);
-       loff_t pos;
-
-       if (argc < 3) {
-               fprintf(stderr, "%s <source file> <dest file>\n", argv[0]);
-               exit(-1);
-       }
-
-       sfile = argv[1];
-       tfile = argv[2];
-
-       if (stat(sfile, &stbuf) < 0) {
-               if (errno == ENOENT) {
-                       /* assume doing non-object file testing */
-                       infd = open(sfile, O_LOV_DELAY_CREATE|O_CREAT|O_RDWR,
-                                   0644);
-                       if (infd < 0)
-                               syserr("open source file:");
-
-                       size = random() % (1 * 1024 * 1024) + 1024;
-                       if (ftruncate(infd, (off_t)size) < 0)
-                               syserr("truncate file error:");
-               } else {
-                       syserr("stat file: ");
-               }
-       } else if (S_ISREG(stbuf.st_mode)) {
-               size = (int)stbuf.st_size;
-               infd = open(sfile, O_RDONLY, 0644);
-               if (infd < 0)
-                       syserr("Open an existing file error:");
-       } else {
-               fprintf(stderr, "%s is not a regular file\n", sfile);
-               exit(-1);
-       }
-
-       outfd = open(tfile, O_WRONLY|O_TRUNC|O_CREAT, 0666);
-       if (outfd < 0)
-               syserr("open dest file:");
-
-       rc = socketpair(AF_LOCAL, SOCK_STREAM, 0, sd);
-       if (rc < 0)
-               syserr("socketpair");
-
-       pos = 0;
-       while (size > 0) {
-               int rc2;
-               size_t seg_size;
-
-               seg_size = (size < page_size) ? size : (random() % size + 1);
-               if (seg_size > 4 * page_size)
-                       seg_size = 4 * page_size;
-               rc = sendfile(sd[0], infd, &pos, seg_size);
-               if (rc < 0)
-                       syserr("sendfile:");
-
-               size -= seg_size;
-               if (size == 0)
-                       close(sd[0]);
-
-               buf = malloc(seg_size);
-               rc = read(sd[1], buf, seg_size);
-               if (rc != seg_size)
-                       syserr("read from socket:");
-
-               rc2 = write(outfd, buf, rc);
-               if (rc2 != rc)
-                       syserr("write dest file error:");
-               free(buf);
-       }
-       close(sd[1]), close(infd), close(outfd);
-
-       sprintf(cmd, "cmp %s %s\n", sfile, tfile);
-       return system(cmd);
+        char *sfile, *tfile;
+        struct stat stbuf;
+        int size;
+        unsigned long bufsize = 1024 * 1024;
+        int infd, outfd;
+        int sd[2];
+        int rc;
+        char *buf;
+        char cmd[1024];
+        loff_t pos;
+
+        if (argc < 3) {
+                fprintf(stderr, "%s <source file> <dest file>\n", argv[0]);
+                exit(-1);
+        }
+
+        sfile = argv[1];
+        tfile = argv[2];
+
+        if (stat(sfile, &stbuf) < 0) {
+                if (errno == ENOENT) {
+                        /* assume doing non-object file testing */
+                        infd = open(sfile, O_LOV_DELAY_CREATE|O_CREAT|O_RDWR,
+                                    0644);
+                        if (infd < 0)
+                                syserr("open source file:");
+
+                        size = random() % (1 * 1024 * 1024) + 1024;
+                        if (ftruncate(infd, (off_t)size) < 0)
+                                syserr("truncate file error:");
+                } else {
+                        syserr("stat file: ");
+                }
+        } else if (S_ISREG(stbuf.st_mode)) {
+                size = (int)stbuf.st_size;
+                infd = open(sfile, O_RDONLY, 0644);
+                if (infd < 0)
+                        syserr("Open an existing file error:");
+        } else {
+                fprintf(stderr, "%s is not a regular file\n", sfile);
+                exit(-1);
+        }
+
+        outfd = open(tfile, O_WRONLY|O_TRUNC|O_CREAT, 0666);
+        if (outfd < 0)
+                syserr("open dest file:");
+
+        rc = socketpair(AF_LOCAL, SOCK_STREAM, 0, sd);
+        if (rc < 0)
+                syserr("socketpair");
+
+        rc = fcntl(sd[0], F_SETFL, O_NONBLOCK);
+        if (rc < 0)
+                syserr("fcntl");
+
+        rc = setsockopt(sd[0], SOL_SOCKET, SO_SNDBUF,
+                        &bufsize, sizeof(bufsize));
+        if (rc)
+                syserr("setsockopt");
+
+        srandom(time(NULL));
+
+        pos = 0;
+        while (size > 0) {
+                int rc2;
+                size_t seg_size;
+
+                seg_size = random() % bufsize + 1;
+                if (seg_size > size)
+                        seg_size = size;
+
+                while (seg_size) {
+                        rc = sendfile(sd[0], infd, &pos, seg_size);
+                        if (rc < 0)
+                                syserr("sendfile:");
+
+                        seg_size -= rc;
+                        size -= rc;
+                        if (size == 0)
+                                close(sd[0]);
+
+                        buf = malloc(rc);
+                        if (read(sd[1], buf, rc) < 0)
+                                syserr("read from socket:");
+
+                        rc2 = write(outfd, buf, rc);
+                        if (rc2 != rc)
+                                syserr("write dest file error:");
+                        free(buf);
+                }
+        }
+        close(sd[1]), close(infd), close(outfd);
+
+        sprintf(cmd, "cmp %s %s\n", sfile, tfile);
+        return system(cmd);
  }
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh

index 3ef0ab2..f1616c2 100644 (file)
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -251,8 +251,8 @@ load_modules() {
  
      load_module llite/lustre
      load_module llite/llite_lloop
-    rm -f $TMP/ogdb-$HOSTNAME
-    OGDB=$TMP
+    OGDB=${OGDB:-$TMP}
+    rm -f $OGDB/ogdb-$HOSTNAME
      [ -d /r ] && OGDB="/r/tmp"
      $LCTL modules > $OGDB/ogdb-$HOSTNAME
  
@@ -2060,6 +2060,18 @@ calc_llite_stats() {
          echo $res
  }
  
+# reset osc stat counters
+clear_osc_stats(){
+        lctl set_param -n osc.*.osc_stats 0
+}
+
+# sum osc stat items
+calc_osc_stats() {
+        local res=$(lctl get_param -n osc.*.osc_stats |
+                    awk 'BEGIN {s = 0} END {print s} /^'"$1"'/ {s += $2}')
+        echo $res
+}
+
  calc_sum () {
          awk 'BEGIN {s = 0}; {s += $1}; END {print s}'
  }
diff --git a/lustre/utils/gss/lsupport.c b/lustre/utils/gss/lsupport.c

index ca964ff..82b7b8e 100644 (file)
--- a/lustre/utils/gss/lsupport.c
+++ b/lustre/utils/gss/lsupport.c
@@ -71,7 +71,7 @@
  #endif
  #include "lsupport.h"
  
-const char * lustre_svc_name[] = 
+const char * lustre_svc_name[] =
  {
          [LUSTRE_GSS_SVC_MDS]    = "MDS",
          [LUSTRE_GSS_SVC_OSS]    = "OSS",
diff --git a/lustre/utils/obdiolib.c b/lustre/utils/obdiolib.c

index 64f0ce4..fa272db 100644 (file)
--- a/lustre/utils/obdiolib.c
+++ b/lustre/utils/obdiolib.c
@@ -152,6 +152,8 @@ obdio_pwrite (struct obdio_conn *conn, __u64 oid,
          conn->oc_data.ioc_obdo1.o_valid =
                  OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE;
  
+        conn->oc_data.ioc_pbuf1 = (void*)1;
+        conn->oc_data.ioc_plen1 = 1;
          conn->oc_data.ioc_pbuf2 = buffer;
          conn->oc_data.ioc_plen2 = count;
          conn->oc_data.ioc_count = count;
author	nikita <nikita>
	Fri, 7 Nov 2008 23:54:43 +0000 (23:54 +0000)
committer	nikita <nikita>
	Fri, 7 Nov 2008 23:54:43 +0000 (23:54 +0000)
lustre/ChangeLog		patch \| blob \| history
lustre/autoMakefile.am		patch \| blob \| history
lustre/autoconf/lustre-core.m4		patch \| blob \| history
lustre/cmm/cmm_device.c		patch \| blob \| history
lustre/cmm/cmm_object.c		patch \| blob \| history
lustre/cmm/mdc_device.c		patch \| blob \| history
lustre/cmm/mdc_object.c		patch \| blob \| history
lustre/doc/lock-ordering	[new file with mode: 0644]	patch \| blob
lustre/fld/fld_request.c		patch \| blob \| history
lustre/include/Makefile.am		patch \| blob \| history
lustre/include/cl_object.h	[new file with mode: 0644]	patch \| blob
lustre/include/dt_object.h		patch \| blob \| history
lustre/include/lclient.h	[new file with mode: 0644]	patch \| blob
lustre/include/liblustre.h		patch \| blob \| history
lustre/include/linux/lustre_acl.h		patch \| blob \| history
lustre/include/linux/lustre_compat25.h		patch \| blob \| history
lustre/include/linux/obd_support.h		patch \| blob \| history
lustre/include/lu_object.h		patch \| blob \| history
lustre/include/lustre/lustre_idl.h		patch \| blob \| history
lustre/include/lustre_cache.h	[deleted file]	patch \| blob \| history
lustre/include/lustre_dlm.h		patch \| blob \| history
lustre/include/lustre_lite.h		patch \| blob \| history
lustre/include/lustre_net.h		patch \| blob \| history
lustre/include/md_object.h		patch \| blob \| history
lustre/include/obd.h		patch \| blob \| history
lustre/include/obd_class.h		patch \| blob \| history
lustre/include/obd_ost.h		patch \| blob \| history
lustre/include/obd_support.h		patch \| blob \| history
lustre/kernel_patches/patches/2.6-rhel4-kgdb-ga.patch		patch \| blob \| history
lustre/kernel_patches/patches/2.6-rhel5-kgdb-ga.patch	[new file with mode: 0644]	patch \| blob
lustre/kernel_patches/patches/kgdb-2.6.18-vanilla.patch	[new file with mode: 0644]	patch \| blob
lustre/kernel_patches/patches/lockdep_chains-2.6.18-vanilla.patch	[new file with mode: 0644]	patch \| blob
lustre/kernel_patches/series/2.6-rhel4.series		patch \| blob \| history
lustre/kernel_patches/series/2.6.18-vanilla.series		patch \| blob \| history
lustre/lclient/Makefile.am	[new file with mode: 0644]	patch \| blob
lustre/lclient/glimpse.c	[new file with mode: 0644]	patch \| blob
lustre/lclient/lcommon_cl.c	[new file with mode: 0644]	patch \| blob
lustre/ldlm/ldlm_extent.c		patch \| blob \| history
lustre/ldlm/ldlm_flock.c		patch \| blob \| history
lustre/ldlm/ldlm_internal.h		patch \| blob \| history
lustre/ldlm/ldlm_lib.c		patch \| blob \| history
lustre/ldlm/ldlm_lock.c		patch \| blob \| history
lustre/ldlm/ldlm_lockd.c		patch \| blob \| history
lustre/ldlm/ldlm_pool.c		patch \| blob \| history
lustre/ldlm/ldlm_request.c		patch \| blob \| history
lustre/liblustre/Makefile.am		patch \| blob \| history
lustre/liblustre/dir.c		patch \| blob \| history
lustre/liblustre/file.c		patch \| blob \| history
lustre/liblustre/llite_cl.c	[new file with mode: 0644]	patch \| blob
lustre/liblustre/llite_lib.c		patch \| blob \| history
lustre/liblustre/llite_lib.h		patch \| blob \| history
lustre/liblustre/namei.c		patch \| blob \| history
lustre/liblustre/rw.c		patch \| blob \| history
lustre/liblustre/super.c		patch \| blob \| history
lustre/liblustre/tests/sanity.c		patch \| blob \| history
lustre/llite/Makefile.in		patch \| blob \| history
lustre/llite/autoMakefile.am		patch \| blob \| history
lustre/llite/dcache.c		patch \| blob \| history
lustre/llite/file.c		patch \| blob \| history
lustre/llite/llite_capa.c		patch \| blob \| history
lustre/llite/llite_close.c		patch \| blob \| history
lustre/llite/llite_internal.h		patch \| blob \| history
lustre/llite/llite_lib.c		patch \| blob \| history
lustre/llite/llite_mmap.c		patch \| blob \| history
lustre/llite/lloop.c		patch \| blob \| history
lustre/llite/lproc_llite.c		patch \| blob \| history
lustre/llite/namei.c		patch \| blob \| history
lustre/llite/rw.c		patch \| blob \| history
lustre/llite/rw26.c		patch \| blob \| history
lustre/llite/super25.c		patch \| blob \| history
lustre/llite/symlink.c		patch \| blob \| history
lustre/llite/vvp_dev.c	[new file with mode: 0644]	patch \| blob
lustre/llite/vvp_internal.h	[new file with mode: 0644]	patch \| blob
lustre/llite/vvp_io.c	[new file with mode: 0644]	patch \| blob
lustre/llite/vvp_lock.c	[new file with mode: 0644]	patch \| blob
lustre/llite/vvp_object.c	[new file with mode: 0644]	patch \| blob
lustre/llite/vvp_page.c	[new file with mode: 0644]	patch \| blob
lustre/lmv/lmv_internal.h		patch \| blob \| history
lustre/lmv/lmv_obd.c		patch \| blob \| history
lustre/lmv/lproc_lmv.c		patch \| blob \| history
lustre/lov/Makefile.in		patch \| blob \| history
lustre/lov/autoMakefile.am		patch \| blob \| history
lustre/lov/lov_cl_internal.h	[new file with mode: 0644]	patch \| blob
lustre/lov/lov_dev.c	[new file with mode: 0644]	patch \| blob
lustre/lov/lov_ea.c		patch \| blob \| history
lustre/lov/lov_internal.h		patch \| blob \| history
lustre/lov/lov_io.c	[new file with mode: 0644]	patch \| blob
lustre/lov/lov_lock.c	[new file with mode: 0644]	patch \| blob
lustre/lov/lov_merge.c		patch \| blob \| history
lustre/lov/lov_obd.c		patch \| blob \| history
lustre/lov/lov_object.c	[new file with mode: 0644]	patch \| blob
lustre/lov/lov_page.c	[new file with mode: 0644]	patch \| blob
lustre/lov/lov_request.c		patch \| blob \| history
lustre/lov/lovsub_dev.c	[new file with mode: 0644]	patch \| blob
lustre/lov/lovsub_io.c	[moved from lustre/include/obd_echo.h with 58% similarity]	patch \| blob \| history
lustre/lov/lovsub_lock.c	[new file with mode: 0644]	patch \| blob
lustre/lov/lovsub_object.c	[new file with mode: 0644]	patch \| blob
lustre/lov/lovsub_page.c	[new file with mode: 0644]	patch \| blob
lustre/lvfs/fsfilt_ext3.c		patch \| blob \| history
lustre/lvfs/lvfs_linux.c		patch \| blob \| history
lustre/lvfs/quotafmt_test.c		patch \| blob \| history
lustre/mdc/mdc_locks.c		patch \| blob \| history
lustre/mdc/mdc_reint.c		patch \| blob \| history
lustre/mdc/mdc_request.c		patch \| blob \| history
lustre/mdd/mdd_device.c		patch \| blob \| history
lustre/mdd/mdd_dir.c		patch \| blob \| history
lustre/mdd/mdd_internal.h		patch \| blob \| history
lustre/mdd/mdd_lov.c		patch \| blob \| history
lustre/mdd/mdd_object.c		patch \| blob \| history
lustre/mdd/mdd_permission.c		patch \| blob \| history
lustre/mds/lproc_mds.c		patch \| blob \| history
lustre/mds/mds_fs.c		patch \| blob \| history
lustre/mds/mds_lov.c		patch \| blob \| history
lustre/mdt/mdt_capa.c		patch \| blob \| history
lustre/mdt/mdt_handler.c		patch \| blob \| history
lustre/mdt/mdt_recovery.c		patch \| blob \| history
lustre/mdt/mdt_reint.c		patch \| blob \| history
lustre/mgs/mgs_llog.c		patch \| blob \| history
lustre/obdclass/Makefile.in		patch \| blob \| history
lustre/obdclass/autoMakefile.am		patch \| blob \| history
lustre/obdclass/cl_internal.h	[new file with mode: 0644]	patch \| blob
lustre/obdclass/cl_io.c	[new file with mode: 0644]	patch \| blob
lustre/obdclass/cl_lock.c	[new file with mode: 0644]	patch \| blob
lustre/obdclass/cl_object.c	[new file with mode: 0644]	patch \| blob
lustre/obdclass/cl_page.c	[new file with mode: 0644]	patch \| blob
lustre/obdclass/class_obd.c		patch \| blob \| history
lustre/obdclass/genops.c		patch \| blob \| history
lustre/obdclass/linux/linux-module.c		patch \| blob \| history
lustre/obdclass/linux/linux-obdo.c		patch \| blob \| history
lustre/obdclass/llog_cat.c		patch \| blob \| history
lustre/obdclass/llog_lvfs.c		patch \| blob \| history
lustre/obdclass/llog_obd.c		patch \| blob \| history
lustre/obdclass/lprocfs_status.c		patch \| blob \| history
lustre/obdclass/lu_object.c		patch \| blob \| history
lustre/obdclass/lu_time.c		patch \| blob \| history
lustre/obdclass/obd_mount.c		patch \| blob \| history
lustre/obdecho/autoMakefile.am		patch \| blob \| history
lustre/obdecho/echo.c		patch \| blob \| history
lustre/obdecho/echo_client.c		patch \| blob \| history
lustre/obdecho/echo_internal.h	[new file with mode: 0644]	patch \| blob
lustre/obdfilter/filter_io.c		patch \| blob \| history
lustre/osc/Makefile.in		patch \| blob \| history
lustre/osc/autoMakefile.am		patch \| blob \| history
lustre/osc/cache.c	[deleted file]	patch \| blob \| history
lustre/osc/lproc_osc.c		patch \| blob \| history
lustre/osc/osc_cl_internal.h	[new file with mode: 0644]	patch \| blob
lustre/osc/osc_create.c		patch \| blob \| history
lustre/osc/osc_dev.c	[new file with mode: 0644]	patch \| blob
lustre/osc/osc_internal.h		patch \| blob \| history
lustre/osc/osc_io.c	[new file with mode: 0644]	patch \| blob
lustre/osc/osc_lock.c	[new file with mode: 0644]	patch \| blob
lustre/osc/osc_object.c	[new file with mode: 0644]	patch \| blob
lustre/osc/osc_page.c	[new file with mode: 0644]	patch \| blob
lustre/osc/osc_request.c		patch \| blob \| history
lustre/osd/osd_handler.c		patch \| blob \| history
lustre/osd/osd_internal.h		patch \| blob \| history
lustre/ptlrpc/client.c		patch \| blob \| history
lustre/ptlrpc/events.c		patch \| blob \| history
lustre/ptlrpc/import.c		patch \| blob \| history
lustre/ptlrpc/layout.c		patch \| blob \| history
lustre/ptlrpc/pack_generic.c		patch \| blob \| history
lustre/ptlrpc/pinger.c		patch \| blob \| history
lustre/ptlrpc/ptlrpcd.c		patch \| blob \| history
lustre/ptlrpc/recov_thread.c		patch \| blob \| history
lustre/ptlrpc/sec.c		patch \| blob \| history
lustre/ptlrpc/sec_plain.c		patch \| blob \| history
lustre/ptlrpc/service.c		patch \| blob \| history
lustre/quota/quota_context.c		patch \| blob \| history
lustre/tests/fsx.c		patch \| blob \| history
lustre/tests/it_test.c		patch \| blob \| history
lustre/tests/kbuild	[new file with mode: 0755]	patch \| blob
lustre/tests/lockorder.sh		patch \| blob \| history
lustre/tests/multifstat.c		patch \| blob \| history
lustre/tests/sanity-nano.sh	[new file with mode: 0755]	patch \| blob
lustre/tests/sanity.sh		patch \| blob \| history
lustre/tests/sanityN.sh		patch \| blob \| history
lustre/tests/sendfile.c		patch \| blob \| history
lustre/tests/test-framework.sh		patch \| blob \| history
lustre/utils/gss/lsupport.c		patch \| blob \| history
lustre/utils/obdiolib.c		patch \| blob \| history